110 lines
2.9 KiB
JSON
110 lines
2.9 KiB
JSON
{
|
|
"metadata": {
|
|
"name": null,
|
|
"namespace": "default",
|
|
"uid": null,
|
|
"resourceVersion": "2",
|
|
"labels": {
|
|
"grafana.app/folder": "bf7geueotdmgwc",
|
|
"grafana.com/group": "10m-check-interval",
|
|
"grafana.com/group-index": "1"
|
|
},
|
|
"annotations": {
|
|
"grafana.app/folder": "bf7geueotdmgwc",
|
|
"grafana.app/updatedBy": "ff52p5ojz4b28c",
|
|
"grafana.app/updatedTimestamp": "2026-05-20T14:34:40Z",
|
|
"grafana.com/provenance": "",
|
|
"grafana.com/updateTimestamp": "2026-05-20T14:34:40Z",
|
|
"grafana.com/updatedBy": "ff52p5ojz4b28c"
|
|
}
|
|
},
|
|
"spec": {
|
|
"title": "SlurmNodeUnstable",
|
|
"trigger": {
|
|
"interval": "10m"
|
|
},
|
|
"labels": {
|
|
"component": "slurm",
|
|
"type": "state"
|
|
},
|
|
"annotations": {
|
|
"__dashboardUid__": "de7rpsq1merlin7slurm1",
|
|
"__panelId__": "7",
|
|
"description": "A node or group of nodes is in an unstable state and requires intervention by an administrator. This could be due to a hardware issue, leading to state notifications like down or unknown, or it can be caused by some scheduling issue, like a misbehaving job.",
|
|
"summary": "A node or group of nodes is in an unstable state!"
|
|
},
|
|
"for": "10m0s",
|
|
"noDataState": "KeepLast",
|
|
"execErrState": "Error",
|
|
"notificationSettings": {
|
|
"receiver": "Merlin Alarms"
|
|
},
|
|
"expressions": {
|
|
"A": {
|
|
"relativeTimeRange": {
|
|
"from": "10m0s",
|
|
"to": "0s"
|
|
},
|
|
"datasourceUID": "merlin-mimir",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "merlin-mimir"
|
|
},
|
|
"editorMode": "code",
|
|
"expr": "sum(slurm_node_state{state=~\"inval|drain|drng|fail|failg|down|unk\"}) by (node,cluster,state)",
|
|
"instant": true,
|
|
"intervalMs": 1000,
|
|
"legendFormat": "__auto",
|
|
"maxDataPoints": 43200,
|
|
"range": false,
|
|
"refId": "A"
|
|
}
|
|
},
|
|
"C": {
|
|
"queryType": "expression",
|
|
"model": {
|
|
"conditions": [
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
0
|
|
],
|
|
"type": "gt"
|
|
},
|
|
"operator": {
|
|
"type": "and"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"C"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "last"
|
|
},
|
|
"type": "query"
|
|
}
|
|
],
|
|
"datasource": {
|
|
"type": "__expr__",
|
|
"uid": "__expr__"
|
|
},
|
|
"expression": "A",
|
|
"intervalMs": 1000,
|
|
"maxDataPoints": 43200,
|
|
"refId": "C",
|
|
"type": "threshold"
|
|
},
|
|
"source": true
|
|
}
|
|
},
|
|
"panelRef": {
|
|
"dashboardUID": "de7rpsq1merlin7slurm1",
|
|
"panelID": 7
|
|
}
|
|
},
|
|
"status": {}
|
|
}
|