grafana: update resources

Changed files: alerts/Merlin7/Login002_Load_High.yaml,alerts/Merlin7/LoginNodeLoadHigh.yaml,alerts/Merlin7/SlurmNodeUnstable.yaml,alerts/Tier3/UiNodeLoadHigh.yaml
This commit is contained in:
gitea-actions[bot]
2026-05-21 11:26:23 +00:00
parent 708c9ba69f
commit 264d31a902
4 changed files with 425 additions and 0 deletions
+102
View File
@@ -0,0 +1,102 @@
{
"metadata": {
"name": null,
"namespace": "default",
"uid": null,
"resourceVersion": "13",
"labels": {
"grafana.app/folder": "bf7geueotdmgwc",
"grafana.com/group": "5m-check-interval",
"grafana.com/group-index": "2"
},
"annotations": {
"grafana.app/folder": "bf7geueotdmgwc",
"grafana.app/updatedBy": "ff52p5ojz4b28c",
"grafana.app/updatedTimestamp": "2026-05-20T14:36:25Z",
"grafana.com/provenance": "",
"grafana.com/updateTimestamp": "2026-05-20T14:36:25Z",
"grafana.com/updatedBy": "ff52p5ojz4b28c"
}
},
"spec": {
"title": "Login002 Load High",
"paused": true,
"trigger": {
"interval": "5m"
},
"labels": {
"component": "linux",
"type": "load"
},
"annotations": {
"summary": "The node login002 is experiencing unusually high load (over 50.0 for 15m)!"
},
"for": "5m0s",
"noDataState": "NoData",
"execErrState": "Error",
"notificationSettings": {
"receiver": "Merlin Alarms"
},
"expressions": {
"A": {
"relativeTimeRange": {
"from": "30m0s",
"to": "0s"
},
"datasourceUID": "merlin-mimir",
"model": {
"datasource": {
"type": "prometheus",
"uid": "merlin-mimir"
},
"editorMode": "code",
"expr": "node_load15{instance=\"login002\"}",
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A"
}
},
"C": {
"model": {
"conditions": [
{
"evaluator": {
"params": [
50
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"C"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "threshold"
},
"source": true
}
}
},
"status": {}
}
+107
View File
@@ -0,0 +1,107 @@
{
"metadata": {
"name": null,
"namespace": "default",
"uid": null,
"resourceVersion": "14",
"labels": {
"grafana.app/folder": "bf7geueotdmgwc",
"grafana.com/group": "5m-check-interval",
"grafana.com/group-index": "1"
},
"annotations": {
"grafana.app/folder": "bf7geueotdmgwc",
"grafana.app/updatedBy": "ff52p5ojz4b28c",
"grafana.app/updatedTimestamp": "2026-05-20T14:36:25Z",
"grafana.com/provenance": "",
"grafana.com/updateTimestamp": "2026-05-20T14:36:25Z",
"grafana.com/updatedBy": "ff52p5ojz4b28c"
}
},
"spec": {
"title": "LoginNodeLoadHigh",
"trigger": {
"interval": "5m"
},
"labels": {
"component": "linux",
"type": "load"
},
"annotations": {
"__dashboardUid__": "hacmpzr",
"__panelId__": "3",
"summary": "The login node is experiencing unusually high load (over 50.0 for 15m)!"
},
"for": "15m0s",
"noDataState": "NoData",
"execErrState": "Error",
"notificationSettings": {
"receiver": "Merlin Alarms"
},
"expressions": {
"A": {
"relativeTimeRange": {
"from": "30m0s",
"to": "0s"
},
"datasourceUID": "merlin-mimir",
"model": {
"datasource": {
"type": "prometheus",
"uid": "merlin-mimir"
},
"editorMode": "code",
"expr": "node_load15{instance=~\"login.*\"}",
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A"
}
},
"C": {
"model": {
"conditions": [
{
"evaluator": {
"params": [
50
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"C"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "threshold"
},
"source": true
}
},
"panelRef": {
"dashboardUID": "hacmpzr",
"panelID": 3
}
},
"status": {}
}
+109
View File
@@ -0,0 +1,109 @@
{
"metadata": {
"name": null,
"namespace": "default",
"uid": null,
"resourceVersion": "2",
"labels": {
"grafana.app/folder": "bf7geueotdmgwc",
"grafana.com/group": "10m-check-interval",
"grafana.com/group-index": "1"
},
"annotations": {
"grafana.app/folder": "bf7geueotdmgwc",
"grafana.app/updatedBy": "ff52p5ojz4b28c",
"grafana.app/updatedTimestamp": "2026-05-20T14:34:40Z",
"grafana.com/provenance": "",
"grafana.com/updateTimestamp": "2026-05-20T14:34:40Z",
"grafana.com/updatedBy": "ff52p5ojz4b28c"
}
},
"spec": {
"title": "SlurmNodeUnstable",
"trigger": {
"interval": "10m"
},
"labels": {
"component": "slurm",
"type": "state"
},
"annotations": {
"__dashboardUid__": "de7rpsq1merlin7slurm1",
"__panelId__": "7",
"description": "A node or group of nodes is in an unstable state and requires intervention by an administrator. This could be due to a hardware issue, leading to state notifications like down or unknown, or it can be caused by some scheduling issue, like a misbehaving job.",
"summary": "A node or group of nodes is in an unstable state!"
},
"for": "10m0s",
"noDataState": "KeepLast",
"execErrState": "Error",
"notificationSettings": {
"receiver": "Merlin Alarms"
},
"expressions": {
"A": {
"relativeTimeRange": {
"from": "10m0s",
"to": "0s"
},
"datasourceUID": "merlin-mimir",
"model": {
"datasource": {
"type": "prometheus",
"uid": "merlin-mimir"
},
"editorMode": "code",
"expr": "sum(slurm_node_state{state=~\"inval|drain|drng|fail|failg|down|unk\"}) by (node,cluster,state)",
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A"
}
},
"C": {
"queryType": "expression",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"C"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "threshold"
},
"source": true
}
},
"panelRef": {
"dashboardUID": "de7rpsq1merlin7slurm1",
"panelID": 7
}
},
"status": {}
}
+107
View File
@@ -0,0 +1,107 @@
{
"metadata": {
"name": null,
"namespace": "default",
"uid": null,
"resourceVersion": "2",
"labels": {
"grafana.app/folder": "afbmpbce6xa80b",
"grafana.com/group": "5m-check-interval",
"grafana.com/group-index": "1"
},
"annotations": {
"grafana.app/folder": "afbmpbce6xa80b",
"grafana.app/updatedBy": "ff52p5ojz4b28c",
"grafana.app/updatedTimestamp": "2026-05-20T14:57:21Z",
"grafana.com/provenance": "",
"grafana.com/updateTimestamp": "2026-05-20T14:57:21Z",
"grafana.com/updatedBy": "ff52p5ojz4b28c"
}
},
"spec": {
"title": "UiNodeLoadHigh",
"trigger": {
"interval": "5m"
},
"labels": {
"component": "linux",
"type": "load"
},
"annotations": {
"__dashboardUid__": "013cf482-687c-4b7b-951f-4d88ef78514f",
"__panelId__": "18",
"summary": "The login node is experiencing unusually high load (over 50.0 for 15m)!"
},
"for": "5m0s",
"noDataState": "NoData",
"execErrState": "Error",
"notificationSettings": {
"receiver": "Teir3 Alerts"
},
"expressions": {
"A": {
"relativeTimeRange": {
"from": "30m0s",
"to": "0s"
},
"datasourceUID": "tier3-mimir",
"model": {
"datasource": {
"type": "prometheus",
"uid": "tier3-mimir"
},
"editorMode": "code",
"expr": "node_load15{instance=~\"t3ui.*\"}",
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A"
}
},
"C": {
"model": {
"conditions": [
{
"evaluator": {
"params": [
50
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"C"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "threshold"
},
"source": true
}
},
"panelRef": {
"dashboardUID": "013cf482-687c-4b7b-951f-4d88ef78514f",
"panelID": 18
}
},
"status": {}
}