#!/usr/bin/env bash
#
# EPICS PV watchdog.
#
# Polls a single EPICS PV with caget. While the PV is "Running" (see CHECK_MODE
# below) nothing is sent; when it stops being "Running" one alert is sent per
# outage by e-mail and/or SMS (through an e-mail-to-SMS gateway).
#
# All settings are the plain shell variables in the configuration section.

# --- configuration ---------------------------------------------------------

# Example of an enum PV (typically returned with quotes by caget, e.g. "\"ON\""):
#   PV_NAME="X12SA-FE-VMMG-0010:PLC_RELAY-D"

PV_NAME="AGEBD-PARAMS:INJECTION-RATE" # EPICS PV to monitor
PHONE_NUMBER="0041793083005"          # phone number(s), space separated (may be empty)
EMAIL="andreas.menzel@psi.ch"         # email address(es), space separated (may be empty)
POLL_INTERVAL=5                       # seconds between polls while Running
ERROR_INTERVAL=60                     # seconds between polls while NOT Running

# Define what "Running" means for the PV:
#   CHECK_MODE="numeric": Running if numeric PV value > OK_THRESHOLD
#   CHECK_MODE="enum":    Running if PV string equals OK_STATE exactly
CHECK_MODE="numeric" # numeric | enum
OK_THRESHOLD=0       # used when CHECK_MODE=numeric
OK_STATE="\"ON\""    # used when CHECK_MODE=enum (match exact caget output)

# Notification delivery
SENDMAIL="/usr/sbin/sendmail"
SMS_GATEWAY_DOMAIN="sms.switch.ch"

# Set to 1 to print the email payload instead of sending it (useful for testing)
DRY_RUN=0

# --- helpers ---------------------------------------------------------------

#######################################
# Print one timestamped log line.
# Arguments: $* - message words; they are joined by the first character of
#                 IFS (a space by default)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%F %T')" "$*"
}

#######################################
# Abort the script unless every named command can be resolved by the shell.
# Arguments: $@ - one or more command names
# Outputs:   an ERROR line via log for the first command that is missing
# Returns:   0 if all commands exist; otherwise the script exits with status 2
#######################################
require_command() {
  # Called without arguments: behave as if an empty name was requested, so the
  # call fails loudly instead of silently succeeding.
  (($# > 0)) || set -- ""

  local cmd
  for cmd in "$@"; do
    if ! command -v -- "$cmd" >/dev/null 2>&1; then
      log "ERROR: required command not found: $cmd"
      exit 2
    fi
  done
}

#######################################
# Decide whether a PV value should be treated as "Running".
# Globals:   CHECK_MODE, OK_THRESHOLD, OK_STATE (read)
# Arguments: $1 - PV value as a string
# Returns:   0 => Running
#            1 => Not Running
#            2 => Configuration error (unknown CHECK_MODE, non-numeric
#                 OK_THRESHOLD)
#######################################
is_running() {
  local v="${1-}"
  # Decimal number: optional sign, digits with optional fraction (or a bare
  # fraction), optional exponent; surrounding whitespace is tolerated.
  local numeric_re='^[[:space:]]*[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?[[:space:]]*$'
  local rc

  case "$CHECK_MODE" in
    numeric)
      if [[ ! "$OK_THRESHOLD" =~ $numeric_re ]]; then
        log "ERROR: OK_THRESHOLD='$OK_THRESHOLD' is not a number"
        return 2
      fi

      # A value that is not a number can never be Running. Without this guard
      # awk would coerce it to 0, which counts as Running whenever
      # OK_THRESHOLD is negative.
      [[ "$v" =~ $numeric_re ]] || return 1

      # Pass the values as data (-v) rather than splicing them into awk code.
      # LC_ALL=C keeps "." as the decimal separator regardless of the locale.
      LC_ALL=C awk -v v="$v" -v t="$OK_THRESHOLD" 'BEGIN { exit !((v+0) > (t+0)) }'
      rc=$?

      # awk's own failure statuses must not leak out: in particular its exit
      # status 2 would be mistaken for "configuration error". A comparison
      # that could not be evaluated is reported as Not Running (fail-safe).
      if ((rc > 1)); then
        log "WARNING: awk failed (exit $rc) while evaluating value '$v'"
        return 1
      fi
      return "$rc"
      ;;
    enum)
      # Exact string comparison (the right-hand side is quoted, so no globbing).
      [[ "$v" == "$OK_STATE" ]]
      ;;
    *)
      log "ERROR: unknown CHECK_MODE='$CHECK_MODE' (expected 'numeric' or 'enum')"
      return 2
      ;;
  esac
}

#######################################
# Send one alert message to all configured recipients.
#
# Recipients containing "@" are used as e-mail addresses as they are; every
# other token is treated as a phone number and addressed as
# <number>@$SMS_GATEWAY_DOMAIN.
#
# Usage: send_notification "<phone numbers>" "<email addresses>" "<message>"
#
# Globals:   PV_NAME, SMS_GATEWAY_DOMAIN, SENDMAIL, DRY_RUN (read)
# Arguments: $1 - phone number(s), space separated (may be empty)
#            $2 - e-mail address(es), space separated (may be empty)
#            $3 - message body; the output of `date` is appended on a new line
# Outputs:   with DRY_RUN != 0 the complete mail payload on stdout
# Returns:   3 if no recipient was given, otherwise the exit status of the
#            send command ("$SENDMAIL -t", or cat in dry-run mode)
#######################################
send_notification() {
  local recipients_raw="${1-} ${2-}" # phone numbers and addresses in one list
  local message="${3-}"
  local subject="[EPICS] $PV_NAME"

  # Split the combined list on whitespace.
  local -a recipients=()
  read -r -a recipients <<<"$recipients_raw"

  # Build ONE comma-separated To: header. RFC 5322 allows the To: field at
  # most once per message, so recipients must not be spread over several
  # "To:" lines.
  local to_list="" tok
  for tok in "${recipients[@]}"; do
    [[ "$tok" == *"@"* ]] || tok="${tok}@${SMS_GATEWAY_DOMAIN}"
    to_list+="${to_list:+, }${tok}"
  done

  if [[ -z "$to_list" ]]; then
    log "ERROR: no recipients configured (PHONE_NUMBER and EMAIL are empty)"
    return 3
  fi

  # Append a timestamp
  local message_with_date
  message_with_date="$message"$'\n'"$(date)"

  # Choose send command (real vs dry-run)
  local -a send_cmd=("$SENDMAIL" -t)
  if ((DRY_RUN)); then
    send_cmd=(cat)
  fi

  # Headers, blank separator line, body. The pipeline's status (and therefore
  # the function's) is that of the send command.
  {
    printf 'To: %s\n' "$to_list"
    printf 'Subject: %s\n' "$subject"
    printf '\n'
    printf '%s\n' "$message_with_date"
  } | "${send_cmd[@]}"
}

# --- main loop ------------------------------------------------------------
#
# Startup checks, then an endless poll loop:
#   - PV Running      -> reset the alert latch, sleep POLL_INTERVAL
#   - PV not Running  -> send one alert per outage, sleep ERROR_INTERVAL
# Exit status 2 is used for every startup/configuration error.

require_command caget

# Validate CHECK_MODE before the first poll so a typo is reported immediately,
# even while the PV cannot be read. Numeric mode compares values with awk.
case "$CHECK_MODE" in
  numeric)
    require_command awk
    ;;
  enum) ;;
  *)
    log "ERROR: unknown CHECK_MODE='$CHECK_MODE' (expected 'numeric' or 'enum')"
    exit 2
    ;;
esac

# Basic startup sanity checks
if [[ -z "${PHONE_NUMBER// /}" && -z "${EMAIL// /}" ]]; then
  log "ERROR: at least one recipient must be configured (PHONE_NUMBER and/or EMAIL)."
  exit 2
fi

if ((!DRY_RUN)); then
  if [[ ! -x "$SENDMAIL" ]]; then
    log "ERROR: SENDMAIL='$SENDMAIL' is not executable (set DRY_RUN=1 to test without sendmail)."
    exit 2
  fi
fi

alert_sent=0 # 0 = no alert sent for current outage, 1 = already sent

log "Starting EPICS monitor for PV '$PV_NAME' (CHECK_MODE=$CHECK_MODE)..."

while true; do
  # Get PV value as a plain string; adjust flags if needed (-S for string PVs).
  # stderr is discarded on purpose: a failure is reported through the exit
  # status handled right below.
  value=$(caget -noname -nounit "$PV_NAME" 2>/dev/null)
  status=$?

  if ((status != 0)); then
    # If caget fails, treat that as "not Running" (e.g. IOC down). The
    # placeholder text is only used in the alert; it is deliberately not
    # evaluated as if it were a PV value.
    log "WARNING: caget failed for '$PV_NAME' (exit $status)"
    value="UNAVAILABLE (caget exit $status)"
    rc=1
  else
    is_running "$value"
    rc=$?
  fi

  if ((rc == 2)); then
    log "ERROR: stopping due to configuration error in CHECK_MODE."
    exit 2
  fi

  if ((rc == 0)); then
    # System is OK (again): report recovery once and re-arm the alert.
    if ((alert_sent == 1)); then
      log "PV '$PV_NAME' back to Running (value='$value')."
    fi
    alert_sent=0
    sleep "$POLL_INTERVAL"
  else
    # System not OK: alert once per outage; a failed send leaves the latch
    # cleared so the alert is retried on the next check.
    if ((alert_sent == 0)); then
      msg="Alert: PV $PV_NAME is '$value'."
      if send_notification "$PHONE_NUMBER" "$EMAIL" "$msg"; then
        alert_sent=1
        log "Alert sent."
      else
        log "ERROR: failed to send alert; will retry on next check."
      fi
    fi
    sleep "$ERROR_INTERVAL"
  fi
done