diff --git a/alerts/Merlin7/Login002_Load_High.yaml b/alerts/Merlin7/Login002_Load_High.yaml new file mode 100644 index 0000000..1c9c8a1 --- /dev/null +++ b/alerts/Merlin7/Login002_Load_High.yaml @@ -0,0 +1,102 @@ +{ + "metadata": { + "name": null, + "namespace": "default", + "uid": null, + "resourceVersion": "13", + "labels": { + "grafana.app/folder": "bf7geueotdmgwc", + "grafana.com/group": "5m-check-interval", + "grafana.com/group-index": "2" + }, + "annotations": { + "grafana.app/folder": "bf7geueotdmgwc", + "grafana.app/updatedBy": "ff52p5ojz4b28c", + "grafana.app/updatedTimestamp": "2026-05-20T14:36:25Z", + "grafana.com/provenance": "", + "grafana.com/updateTimestamp": "2026-05-20T14:36:25Z", + "grafana.com/updatedBy": "ff52p5ojz4b28c" + } + }, + "spec": { + "title": "Login002 Load High", + "paused": true, + "trigger": { + "interval": "5m" + }, + "labels": { + "component": "linux", + "type": "load" + }, + "annotations": { + "summary": "The node login002 is experiencing unusually high load (over 50.0 for 15m)!" + }, + "for": "5m0s", + "noDataState": "NoData", + "execErrState": "Error", + "notificationSettings": { + "receiver": "Merlin Alarms" + }, + "expressions": { + "A": { + "relativeTimeRange": { + "from": "30m0s", + "to": "0s" + }, + "datasourceUID": "merlin-mimir", + "model": { + "datasource": { + "type": "prometheus", + "uid": "merlin-mimir" + }, + "editorMode": "code", + "expr": "node_load15{instance=\"login002\"}", + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A" + } + }, + "C": { + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 50 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + }, + "source": true + } + } + }, + "status": {} +} diff --git a/alerts/Merlin7/LoginNodeLoadHigh.yaml b/alerts/Merlin7/LoginNodeLoadHigh.yaml new file mode 100644 index 0000000..96281ec --- /dev/null +++ b/alerts/Merlin7/LoginNodeLoadHigh.yaml @@ -0,0 +1,107 @@ +{ + "metadata": { + "name": null, + "namespace": "default", + "uid": null, + "resourceVersion": "14", + "labels": { + "grafana.app/folder": "bf7geueotdmgwc", + "grafana.com/group": "5m-check-interval", + "grafana.com/group-index": "1" + }, + "annotations": { + "grafana.app/folder": "bf7geueotdmgwc", + "grafana.app/updatedBy": "ff52p5ojz4b28c", + "grafana.app/updatedTimestamp": "2026-05-20T14:36:25Z", + "grafana.com/provenance": "", + "grafana.com/updateTimestamp": "2026-05-20T14:36:25Z", + "grafana.com/updatedBy": "ff52p5ojz4b28c" + } + }, + "spec": { + "title": "LoginNodeLoadHigh", + "trigger": { + "interval": "5m" + }, + "labels": { + "component": "linux", + "type": "load" + }, + "annotations": { + "__dashboardUid__": "hacmpzr", + "__panelId__": "3", + "summary": "The login node is experiencing unusually high load (over 50.0 for 15m)!" + }, + "for": "15m0s", + "noDataState": "NoData", + "execErrState": "Error", + "notificationSettings": { + "receiver": "Merlin Alarms" + }, + "expressions": { + "A": { + "relativeTimeRange": { + "from": "30m0s", + "to": "0s" + }, + "datasourceUID": "merlin-mimir", + "model": { + "datasource": { + "type": "prometheus", + "uid": "merlin-mimir" + }, + "editorMode": "code", + "expr": "node_load15{instance=~\"login.*\"}", + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A" + } + }, + "C": { + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 50 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + }, + "source": true + } + }, + "panelRef": { + "dashboardUID": "hacmpzr", + "panelID": 3 + } + }, + "status": {} +} diff --git a/alerts/Merlin7/SlurmNodeUnstable.yaml b/alerts/Merlin7/SlurmNodeUnstable.yaml new file mode 100644 index 0000000..843698a --- /dev/null +++ b/alerts/Merlin7/SlurmNodeUnstable.yaml @@ -0,0 +1,109 @@ +{ + "metadata": { + "name": null, + "namespace": "default", + "uid": null, + "resourceVersion": "2", + "labels": { + "grafana.app/folder": "bf7geueotdmgwc", + "grafana.com/group": "10m-check-interval", + "grafana.com/group-index": "1" + }, + "annotations": { + "grafana.app/folder": "bf7geueotdmgwc", + "grafana.app/updatedBy": "ff52p5ojz4b28c", + "grafana.app/updatedTimestamp": "2026-05-20T14:34:40Z", + "grafana.com/provenance": "", + "grafana.com/updateTimestamp": "2026-05-20T14:34:40Z", + "grafana.com/updatedBy": "ff52p5ojz4b28c" + } + }, + "spec": { + "title": "SlurmNodeUnstable", + "trigger": { + "interval": "10m" + }, + "labels": { + "component": "slurm", + "type": "state" + }, + "annotations": { + "__dashboardUid__": "de7rpsq1merlin7slurm1", + "__panelId__": "7", + "description": "A node or group of nodes is in an unstable state and requires intervention by an administrator. This could be due to a hardware issue, leading to state notifications like down or unknown, or it can be causes by some scheduling issue, like misbehaving job.", + "summary": "A node or group of nodes is in an unstable state!" + }, + "for": "10m0s", + "noDataState": "KeepLast", + "execErrState": "Error", + "notificationSettings": { + "receiver": "Merlin Alarms" + }, + "expressions": { + "A": { + "relativeTimeRange": { + "from": "10m0s", + "to": "0s" + }, + "datasourceUID": "merlin-mimir", + "model": { + "datasource": { + "type": "prometheus", + "uid": "merlin-mimir" + }, + "editorMode": "code", + "expr": "sum(slurm_node_state{state=~\"inval|drain|drng|fail|failg|down|unk\"}) by (node,cluster,state)", + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A" + } + }, + "C": { + "queryType": "expression", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + }, + "source": true + } + }, + "panelRef": { + "dashboardUID": "de7rpsq1merlin7slurm1", + "panelID": 7 + } + }, + "status": {} +} diff --git a/alerts/Tier3/UiNodeLoadHigh.yaml b/alerts/Tier3/UiNodeLoadHigh.yaml new file mode 100644 index 0000000..4c3a962 --- /dev/null +++ b/alerts/Tier3/UiNodeLoadHigh.yaml @@ -0,0 +1,107 @@ +{ + "metadata": { + "name": null, + "namespace": "default", + "uid": null, + "resourceVersion": "2", + "labels": { + "grafana.app/folder": "afbmpbce6xa80b", + "grafana.com/group": "5m-check-interval", + "grafana.com/group-index": "1" + }, + "annotations": { + "grafana.app/folder": "afbmpbce6xa80b", + "grafana.app/updatedBy": "ff52p5ojz4b28c", + "grafana.app/updatedTimestamp": "2026-05-20T14:57:21Z", + "grafana.com/provenance": "", + "grafana.com/updateTimestamp": "2026-05-20T14:57:21Z", + "grafana.com/updatedBy": "ff52p5ojz4b28c" + } + }, + "spec": { + "title": "UiNodeLoadHigh", + "trigger": { + "interval": "5m" + }, + "labels": { + "component": "linux", + "type": "load" + }, + "annotations": { + "__dashboardUid__": "013cf482-687c-4b7b-951f-4d88ef78514f", + "__panelId__": "18", + "summary": "The login node is experiencing unusually high load (over 50.0 for 15m)!" + }, + "for": "5m0s", + "noDataState": "NoData", + "execErrState": "Error", + "notificationSettings": { + "receiver": "Teir3 Alerts" + }, + "expressions": { + "A": { + "relativeTimeRange": { + "from": "30m0s", + "to": "0s" + }, + "datasourceUID": "tier3-mimir", + "model": { + "datasource": { + "type": "prometheus", + "uid": "tier3-mimir" + }, + "editorMode": "code", + "expr": "node_load15{instance=~\"t3ui.*\"}", + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A" + } + }, + "C": { + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 50 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + }, + "source": true + } + }, + "panelRef": { + "dashboardUID": "013cf482-687c-4b7b-951f-4d88ef78514f", + "panelID": 18 + } + }, + "status": {} +}