grafana/cluster/instance-alerts.json
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 1,
"id": 2,
"links": [],
"panels": [
{
"alert": {
"alertRuleTags": {
"service": "node",
"severity": "error"
},
"conditions": [
{
"evaluator": {
"params": [
8
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A",
"10m",
"now"
]
},
"reducer": {
"params": [],
"type": "max"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"for": "0s",
"frequency": "60s",
"handler": 1,
"name": "high cpu load",
"noDataState": "no_data",
"notifications": [
{
"id": 1
}
]
},
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "monitor-timescale",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"alias": "$tag_instance",
"format": "time_series",
"group": [],
"groupBy": [
{
"params": [
"${__interval}"
],
"type": "time"
},
{
"params": [
"instance"
],
"type": "tag"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "node_load1",
"metricColumn": "none",
"orderByTime": "ASC",
"policy": "default",
"rawQuery": true,
"rawSql": "SELECT\n l.labels->>'nodename' AS \"metric\",\n $__timeGroup(time, ${__interval}) AS \"time\",\n MAX(\"value\") AS \"value\"\nFROM metrics AS m\nJOIN metrics_labels AS l\nON l.metric_name = 'node_uname_info' AND\n m.labels->>'instance' = l.labels->>'instance'\nWHERE\n $__timeFilter(\"time\") AND\n name = 'node_load1' AND\n value != 'NaN'\nGROUP BY $__timeGroup(time, ${__interval}), metric\nORDER BY $__timeGroup(time, ${__interval}), metric;",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "column"
}
]
],
"table": "metrics_values",
"tags": [],
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [],
"type": "macro"
}
]
}
],
"thresholds": [
{
"colorMode": "critical",
"fill": true,
"line": true,
"op": "gt",
"value": 8
}
],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Load",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"alert": {
"alertRuleTags": {},
"conditions": [
{
"evaluator": {
"params": [
0.2
],
"type": "lt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"B",
"5m",
"now"
]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"for": "0m",
"frequency": "60s",
"handler": 1,
"name": "low memory",
"noDataState": "alerting",
"notifications": [
{
"id": 1
}
]
},
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "monitor-timescale",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
},
"id": 4,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": false,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"alias": "$tag_instance avail",
"format": "time_series",
"group": [],
"groupBy": [
{
"params": [
"${__interval}"
],
"type": "time"
},
{
"params": [
"instance"
],
"type": "tag"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "node_memory_MemAvailable",
"metricColumn": "none",
"orderByTime": "ASC",
"policy": "default",
"rawQuery": true,
"rawSql": "SELECT\n l.labels->>'nodename' AS \"metric\",\n $__timeGroup(m.time, ${__interval}) AS \"time\",\n MIN(m.value) AS \"value\"\nFROM metrics AS m\nJOIN metrics_labels AS l\n ON l.metric_name = 'node_uname_info' AND\n m.labels->>'instance' = l.labels->>'instance'\nWHERE\n $__timeFilter(m.time) AND\n m.name = 'node_memory_free_pct' AND\n m.value != 'NaN'\nGROUP BY metric, $__timeGroup(m.time, ${__interval})\nORDER BY metric, time;",
"refId": "B",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "column"
}
]
],
"table": "metrics_values",
"tags": [],
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [],
"type": "macro"
}
]
}
],
"thresholds": [
{
"colorMode": "critical",
"fill": true,
"line": true,
"op": "lt",
"value": 0.2,
"yaxis": "left"
}
],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Memory",
"tooltip": {
"shared": true,
"sort": 1,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": "1",
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"alert": {
"alertRuleTags": {},
"conditions": [
{
"evaluator": {
"params": [
0.2
],
"type": "lt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"B",
"5m",
"now"
]
},
"reducer": {
"params": [],
"type": "min"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"for": "5m",
"frequency": "1m",
"handler": 1,
"name": "node root free",
"noDataState": "no_data",
"notifications": []
},
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "monitor-timescale",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 16
},
"id": 6,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": false,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"alias": "$tag_instance avail",
"format": "time_series",
"group": [],
"groupBy": [
{
"params": [
"${__interval}"
],
"type": "time"
},
{
"params": [
"instance"
],
"type": "tag"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "node_memory_MemAvailable",
"metricColumn": "none",
"orderByTime": "ASC",
"policy": "default",
"rawQuery": true,
"rawSql": "SELECT\n l.labels->>'nodename' AS \"metric\",\n $__timeGroup(m.time, ${__interval}),\n MIN(m.value) AS \"value\"\nFROM metrics AS m\nJOIN metrics_labels AS l\n ON l.metric_name = 'node_uname_info' AND \n m.labels->>'instance' = l.labels->>'instance'\nWHERE\n $__timeFilter(m.time) AND\n m.name = 'node_filesystem_free_pct' AND\n m.value != 'NaN'\nGROUP BY metric, $__timeGroup(time, ${__interval})\nORDER BY metric, time;",
"refId": "B",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "column"
}
]
],
"table": "metrics_values",
"tags": [],
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [],
"type": "macro"
}
]
}
],
"thresholds": [
{
"colorMode": "critical",
"fill": true,
"line": true,
"op": "lt",
"value": 0.2
}
],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Root",
"tooltip": {
"shared": true,
"sort": 1,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": "1",
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "monitor-timescale",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 24
},
"id": 5,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sort": "max",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/sda$/",
"linewidth": 2
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"alias": "$tag_instance",
"format": "time_series",
"group": [],
"groupBy": [
{
"params": [
"${__interval}"
],
"type": "time"
},
{
"params": [
"instance"
],
"type": "tag"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "node_load1",
"metricColumn": "none",
"orderByTime": "ASC",
"policy": "default",
"rawQuery": true,
"rawSql": "-- this query gets disk latency\nSELECT\n metric,\n time,\n rate_time(value, lag(value) OVER w, '${__interval}') AS \"value\"\nFROM (\n SELECT\n CONCAT(l.labels->>'nodename', ' ', m.labels->>'device') AS \"metric\",\n $__timeGroupAlias(\"time\", ${__interval}, previous),\n MAX(value) AS \"value\"\n FROM metrics AS m\n JOIN metrics_labels AS l\n ON l.metric_name = 'node_uname_info' AND\n m.labels->>'instance' = l.labels->>'instance'\n WHERE\n $__timeFilter(\"time\") AND\n name='node_disk_write_time_seconds_total'\n GROUP BY metric, $__timeGroup(\"time\", ${__interval})\n ORDER BY metric, time\n) AS metrics\nWINDOW w as (PARTITION BY metric ORDER BY time)\nORDER BY metric, time;",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "column"
}
]
],
"table": "metrics_values",
"tags": [],
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [],
"type": "macro"
}
]
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Disk Latency",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "30s",
"schemaVersion": 20,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-3h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Instance Alerts",
"uid": "434n4WTmz",
"version": 30
}