{ "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "liveNow": false, "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 12, "title": "Overview", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "description": "This only accounts for burst nodes", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": 0 }, { "color": "orange", "value": 0.97 }, { "color": "green", "value": 0.99 } ] }, "unit": "percentunit" } }, "gridPos": { "h": 9, "w": 8, "x": 0, "y": 1 }, "id": 11, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "\navg_over_time(\n avg(\n probe_success{job=\"prometheus.exporter.blackbox.blackbox/ssh_banner\"}\n )[$__interval:]\n)", "instant": false, "legendFormat": " ", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "avg_over_time(\n avg(\n probe_success{job=\"prometheus.exporter.blackbox.blackbox/ssh_banner\"}\n )[30d:]\n)", "instant": false, "legendFormat": "30 days", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "avg_over_time(\n avg(\n 
probe_success{job=\"prometheus.exporter.blackbox.blackbox/ssh_banner\"}\n )[90d:]\n)", "instant": false, "legendFormat": "90 days", "range": true, "refId": "B" } ], "title": "Cluster Uptime", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "description": "Checks if network jitter rises above a specific threshold", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "noValue": "DOWN", "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": 0 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "PSI <-> CSCS", "scope": "series" }, "properties": [ { "id": "mappings", "value": [ { "options": { "from": 0.001, "result": { "color": "green", "index": 0, "text": "OK" }, "to": 0.026 }, "type": "range" }, { "options": { "from": 0.026, "result": { "color": "orange", "index": 1, "text": "UNSTABLE" } }, "type": "range" } ] } ] }, { "matcher": { "id": "byName", "options": "CSCS SSHOT", "scope": "series" }, "properties": [ { "id": "mappings", "value": [ { "options": { "from": 0.001, "result": { "color": "green", "index": 0, "text": "OK" }, "to": 0.007 }, "type": "range" }, { "options": { "from": 0.007, "result": { "color": "orange", "index": 1, "text": "UNSTABLE" } }, "type": "range" } ] } ] } ] }, "gridPos": { "h": 9, "w": 8, "x": 8, "y": 1 }, "id": 8, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "iperf3_received_jitter_ms{job=\"prometheus.scrape.iperf3_metrics\", instance=\"merlin7-admin01\"}", "instant": false, "legendFormat": "PSI <-> CSCS", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": 
"merlin-mimir" }, "editorMode": "code", "expr": "iperf3_received_jitter_ms{job=\"prometheus.scrape.iperf3_metrics\", instance=\"psistorn00\"}", "instant": false, "legendFormat": "CSCS SSHOT", "range": true, "refId": "B" } ], "title": "Network Status", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "description": "Checks individual node status", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Last", "scope": "series" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "green", "index": 1, "text": "OK" }, "1": { "color": "yellow", "index": 0, "text": "ISSUE" } }, "type": "value" } ] } ] }, { "matcher": { "id": "byName", "options": "Uptime (%)", "scope": "series" }, "properties": [ { "id": "unit", "value": "percentunit" } ] } ] }, "gridPos": { "h": 9, "w": 8, "x": 16, "y": 1 }, "id": 7, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "hpe_redfish_clusterstor_node_health", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "avg_over_time(avg(probe_success{job=\"prometheus.exporter.blackbox.blackbox/lustre\"})[$__interval:])", "instant": false, "legendFormat": "Uptime (%)", "range": true, "refId": "B" } ], "title": "Storage Health", "transformations": [ { "filter": { "id": "byRefId", "options": "/^(?:A)$/" }, "id": "reduce", "options": { "labelsToFields": 
false, "reducers": [ "last" ] } } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "description": "Any yellow/red lines indicate that nodes are not responding to our probes, which probably means that they are down.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "axisPlacement": "auto", "axisWidth": 0, "fillOpacity": 70, "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineWidth": 1, "spanNulls": false }, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": 0 }, { "color": "#EAB839", "value": 0.95 }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 9, "w": 8, "x": 0, "y": 10 }, "id": 1, "options": { "alignValue": "left", "annotations": { "clustering": -1, "multiLane": false }, "legend": { "displayMode": "list", "placement": "bottom", "showLegend": false }, "mergeValues": true, "rowHeight": 0.44, "showValue": "never", "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "sum(probe_success{job=\"prometheus.exporter.blackbox.blackbox/ssh_banner\"}) / count(probe_success{job=\"prometheus.exporter.blackbox.blackbox/ssh_banner\"})", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Cluster Status Timeline", "type": "state-timeline" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "description": "This panel should be empty. If it is not, one or more nodes are not responding to SSH probes and are probably down.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "axisPlacement": "auto", "fillOpacity": 70, "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineWidth": 1, "spanNulls": false }, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": 0 } ] } } }, 
"gridPos": { "h": 9, "w": 8, "x": 8, "y": 10 }, "id": 10, "options": { "alignValue": "left", "annotations": { "clustering": -1, "multiLane": false }, "legend": { "displayMode": "list", "placement": "bottom", "showLegend": false }, "mergeValues": true, "rowHeight": 0.9, "showValue": "never", "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "exemplar": false, "expr": "probe_success{job=\"prometheus.exporter.blackbox.blackbox/ssh_banner\"} == 0", "instant": false, "legendFormat": "{{hostname}}", "range": true, "refId": "A" } ], "title": "Node Status Timeline (DOWN)", "type": "state-timeline" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 19 }, "id": 13, "title": "Node Status", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 }, { "color": "yellow", "value": 60 }, { "color": "red", "value": 100 } ] } } }, "gridPos": { "h": 9, "w": 8, "x": 0, "y": 20 }, "id": 3, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": true, "textMode": "auto", "wideLayout": true }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "node_load1{job=\"integrations/unix\",instance=~\"login.*\"}", "instant": false, "legendFormat": "{{instance}}", "range": true, "refId": "A" } ], "title": "Login Node Load (1m avg)", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { 
"axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "showValues": false, "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": 0 }, { "color": "red", "value": 80 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Max Load", "scope": "series" }, "properties": [ { "id": "custom.fillOpacity", "value": 0 }, { "id": "custom.showPoints", "value": "never" }, { "id": "custom.lineStyle", "value": { "dash": [ 10, 10 ], "fill": "dash" } } ] } ] }, "gridPos": { "h": 9, "w": 8, "x": 8, "y": 20 }, "id": 2, "options": { "annotations": { "clustering": -1, "multiLane": false }, "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "node_load1{job=\"integrations/unix\",instance=~\"cn.*|gpu.*|login.*\"}", "instant": false, "legendFormat": "{{instance}}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "max(count without (cpu) (node_cpu_seconds_total{job=\"integrations/unix\", instance=~\"cn.*|gpu.*|login.*\", mode=\"idle\"}))", "hide": true, "instant": false, "legendFormat": "Max Load", "range": true, "refId": "B" } ], "title": "Node Load (1m)", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 29 }, "id": 
14, "title": "Network Status", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "showValues": false, "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 }, { "color": "red", "value": 80 } ] } } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 }, "id": 4, "options": { "annotations": { "clustering": -1, "multiLane": false }, "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "iperf3_received_jitter_ms{job=\"prometheus.scrape.iperf3_metrics\", node=\"merlin7-admin01\"}", "instant": false, "legendFormat": "{{node}} - PSI", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "iperf3_received_jitter_ms{job=\"prometheus.scrape.iperf3_metrics\", node=\"psistorn00\"}", "instant": false, "legendFormat": "{{node}} - CSCS/SSHOT", "range": true, "refId": "B" } ], "title": "Network Stability (jitter ms)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { 
"axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "showValues": false, "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "area" } }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 }, { "color": "red", "value": 5 } ] } } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 }, "id": 5, "options": { "annotations": { "clustering": -1, "multiLane": false }, "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "iperf3_sent_lost_percent", "instant": false, "legendFormat": "{{node}}", "range": true, "refId": "A" } ], "title": "Lost Packets", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }, "id": 15, "title": "Storage Status", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 25, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "showValues": 
false, "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 }, { "color": "red", "value": 80 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Max Load", "scope": "series" }, "properties": [ { "id": "custom.stacking", "value": { "group": "A", "mode": "none" } }, { "id": "custom.fillOpacity", "value": 0 }, { "id": "custom.lineStyle", "value": { "dash": [ 10, 10 ], "fill": "dash" } } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 39 }, "id": 6, "options": { "annotations": { "clustering": -1, "multiLane": false }, "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "hpe_redfish_clusterstor_node_loadaverage1m", "instant": false, "legendFormat": "{{hostname}}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "hpe_redfish_clusterstor_nodes_total * 32", "instant": false, "legendFormat": "Max Load", "range": true, "refId": "B" } ], "title": "PSIstor Node Load (1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "showValues": false, "spanNulls": 
false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 39 }, "id": 9, "options": { "annotations": { "clustering": -1, "multiLane": false }, "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "13.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "merlin-mimir" }, "editorMode": "code", "expr": "hpe_redfish_clusterstor_node_memoryutilization_percent", "instant": false, "legendFormat": "{{hostname}}", "range": true, "refId": "A" } ], "title": "Memory Utilisation (%)", "type": "timeseries" } ], "preload": false, "refresh": "", "schemaVersion": 42, "tags": [ "storage", "compute", "network" ], "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ] }, "timezone": "browser", "title": "General Overview", "uid": "hacmpzr", "version": 40 }