grafana: update resources
Changed files: alerts/Merlin7/Login002_Load_High.yaml,alerts/Merlin7/LoginNodeLoadHigh.yaml,alerts/Merlin7/SlurmNodeUnstable.yaml,alerts/Tier3/UiNodeLoadHigh.yaml
This commit is contained in:
@@ -0,0 +1,102 @@
|
||||
{
|
||||
"metadata": {
|
||||
"name": null,
|
||||
"namespace": "default",
|
||||
"uid": null,
|
||||
"resourceVersion": "13",
|
||||
"labels": {
|
||||
"grafana.app/folder": "bf7geueotdmgwc",
|
||||
"grafana.com/group": "5m-check-interval",
|
||||
"grafana.com/group-index": "2"
|
||||
},
|
||||
"annotations": {
|
||||
"grafana.app/folder": "bf7geueotdmgwc",
|
||||
"grafana.app/updatedBy": "ff52p5ojz4b28c",
|
||||
"grafana.app/updatedTimestamp": "2026-05-20T14:36:25Z",
|
||||
"grafana.com/provenance": "",
|
||||
"grafana.com/updateTimestamp": "2026-05-20T14:36:25Z",
|
||||
"grafana.com/updatedBy": "ff52p5ojz4b28c"
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"title": "Login002 Load High",
|
||||
"paused": true,
|
||||
"trigger": {
|
||||
"interval": "5m"
|
||||
},
|
||||
"labels": {
|
||||
"component": "linux",
|
||||
"type": "load"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "The node login002 is experiencing unusually high load (over 50.0 for 15m)!"
|
||||
},
|
||||
"for": "5m0s",
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"notificationSettings": {
|
||||
"receiver": "Merlin Alarms"
|
||||
},
|
||||
"expressions": {
|
||||
"A": {
|
||||
"relativeTimeRange": {
|
||||
"from": "30m0s",
|
||||
"to": "0s"
|
||||
},
|
||||
"datasourceUID": "merlin-mimir",
|
||||
"model": {
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "merlin-mimir"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "node_load15{instance=\"login002\"}",
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
"C": {
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
50
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
},
|
||||
"source": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"status": {}
|
||||
}
|
||||
@@ -0,0 +1,107 @@
|
||||
{
|
||||
"metadata": {
|
||||
"name": null,
|
||||
"namespace": "default",
|
||||
"uid": null,
|
||||
"resourceVersion": "14",
|
||||
"labels": {
|
||||
"grafana.app/folder": "bf7geueotdmgwc",
|
||||
"grafana.com/group": "5m-check-interval",
|
||||
"grafana.com/group-index": "1"
|
||||
},
|
||||
"annotations": {
|
||||
"grafana.app/folder": "bf7geueotdmgwc",
|
||||
"grafana.app/updatedBy": "ff52p5ojz4b28c",
|
||||
"grafana.app/updatedTimestamp": "2026-05-20T14:36:25Z",
|
||||
"grafana.com/provenance": "",
|
||||
"grafana.com/updateTimestamp": "2026-05-20T14:36:25Z",
|
||||
"grafana.com/updatedBy": "ff52p5ojz4b28c"
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"title": "LoginNodeLoadHigh",
|
||||
"trigger": {
|
||||
"interval": "5m"
|
||||
},
|
||||
"labels": {
|
||||
"component": "linux",
|
||||
"type": "load"
|
||||
},
|
||||
"annotations": {
|
||||
"__dashboardUid__": "hacmpzr",
|
||||
"__panelId__": "3",
|
||||
"summary": "The login node is experiencing unusually high load (over 50.0 for 15m)!"
|
||||
},
|
||||
"for": "15m0s",
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"notificationSettings": {
|
||||
"receiver": "Merlin Alarms"
|
||||
},
|
||||
"expressions": {
|
||||
"A": {
|
||||
"relativeTimeRange": {
|
||||
"from": "30m0s",
|
||||
"to": "0s"
|
||||
},
|
||||
"datasourceUID": "merlin-mimir",
|
||||
"model": {
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "merlin-mimir"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "node_load15{instance=~\"login.*\"}",
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
"C": {
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
50
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
},
|
||||
"source": true
|
||||
}
|
||||
},
|
||||
"panelRef": {
|
||||
"dashboardUID": "hacmpzr",
|
||||
"panelID": 3
|
||||
}
|
||||
},
|
||||
"status": {}
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"metadata": {
|
||||
"name": null,
|
||||
"namespace": "default",
|
||||
"uid": null,
|
||||
"resourceVersion": "2",
|
||||
"labels": {
|
||||
"grafana.app/folder": "bf7geueotdmgwc",
|
||||
"grafana.com/group": "10m-check-interval",
|
||||
"grafana.com/group-index": "1"
|
||||
},
|
||||
"annotations": {
|
||||
"grafana.app/folder": "bf7geueotdmgwc",
|
||||
"grafana.app/updatedBy": "ff52p5ojz4b28c",
|
||||
"grafana.app/updatedTimestamp": "2026-05-20T14:34:40Z",
|
||||
"grafana.com/provenance": "",
|
||||
"grafana.com/updateTimestamp": "2026-05-20T14:34:40Z",
|
||||
"grafana.com/updatedBy": "ff52p5ojz4b28c"
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"title": "SlurmNodeUnstable",
|
||||
"trigger": {
|
||||
"interval": "10m"
|
||||
},
|
||||
"labels": {
|
||||
"component": "slurm",
|
||||
"type": "state"
|
||||
},
|
||||
"annotations": {
|
||||
"__dashboardUid__": "de7rpsq1merlin7slurm1",
|
||||
"__panelId__": "7",
|
||||
"description": "A node or group of nodes is in an unstable state and requires intervention by an administrator. This could be due to a hardware issue, leading to state notifications like down or unknown, or it can be caused by a scheduling issue, such as a misbehaving job.",
|
||||
"summary": "A node or group of nodes is in an unstable state!"
|
||||
},
|
||||
"for": "10m0s",
|
||||
"noDataState": "KeepLast",
|
||||
"execErrState": "Error",
|
||||
"notificationSettings": {
|
||||
"receiver": "Merlin Alarms"
|
||||
},
|
||||
"expressions": {
|
||||
"A": {
|
||||
"relativeTimeRange": {
|
||||
"from": "10m0s",
|
||||
"to": "0s"
|
||||
},
|
||||
"datasourceUID": "merlin-mimir",
|
||||
"model": {
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "merlin-mimir"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(slurm_node_state{state=~\"inval|drain|drng|fail|failg|down|unk\"}) by (node,cluster,state)",
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
"C": {
|
||||
"queryType": "expression",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
},
|
||||
"source": true
|
||||
}
|
||||
},
|
||||
"panelRef": {
|
||||
"dashboardUID": "de7rpsq1merlin7slurm1",
|
||||
"panelID": 7
|
||||
}
|
||||
},
|
||||
"status": {}
|
||||
}
|
||||
@@ -0,0 +1,107 @@
|
||||
{
|
||||
"metadata": {
|
||||
"name": null,
|
||||
"namespace": "default",
|
||||
"uid": null,
|
||||
"resourceVersion": "2",
|
||||
"labels": {
|
||||
"grafana.app/folder": "afbmpbce6xa80b",
|
||||
"grafana.com/group": "5m-check-interval",
|
||||
"grafana.com/group-index": "1"
|
||||
},
|
||||
"annotations": {
|
||||
"grafana.app/folder": "afbmpbce6xa80b",
|
||||
"grafana.app/updatedBy": "ff52p5ojz4b28c",
|
||||
"grafana.app/updatedTimestamp": "2026-05-20T14:57:21Z",
|
||||
"grafana.com/provenance": "",
|
||||
"grafana.com/updateTimestamp": "2026-05-20T14:57:21Z",
|
||||
"grafana.com/updatedBy": "ff52p5ojz4b28c"
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"title": "UiNodeLoadHigh",
|
||||
"trigger": {
|
||||
"interval": "5m"
|
||||
},
|
||||
"labels": {
|
||||
"component": "linux",
|
||||
"type": "load"
|
||||
},
|
||||
"annotations": {
|
||||
"__dashboardUid__": "013cf482-687c-4b7b-951f-4d88ef78514f",
|
||||
"__panelId__": "18",
|
||||
"summary": "The login node is experiencing unusually high load (over 50.0 for 15m)!"
|
||||
},
|
||||
"for": "5m0s",
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"notificationSettings": {
|
||||
"receiver": "Teir3 Alerts"
|
||||
},
|
||||
"expressions": {
|
||||
"A": {
|
||||
"relativeTimeRange": {
|
||||
"from": "30m0s",
|
||||
"to": "0s"
|
||||
},
|
||||
"datasourceUID": "tier3-mimir",
|
||||
"model": {
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tier3-mimir"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "node_load15{instance=~\"t3ui.*\"}",
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
"C": {
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
50
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
},
|
||||
"source": true
|
||||
}
|
||||
},
|
||||
"panelRef": {
|
||||
"dashboardUID": "013cf482-687c-4b7b-951f-4d88ef78514f",
|
||||
"panelID": 18
|
||||
}
|
||||
},
|
||||
"status": {}
|
||||
}
|
||||
Reference in New Issue
Block a user