Refactor EPICS2SMS: explicit SMTP via smtp.psi.ch, no local sendmail

This commit is contained in:
2026-01-22 18:46:53 +01:00
parent 13c8881ec8
commit 499bf058e3
2 changed files with 219 additions and 118 deletions
+86 -118
View File
@@ -1,31 +1,32 @@
#!/usr/bin/env bash
# --- configuration ---------------------------------------------------------
# ---------------------------------------------------------------------------
# EPICS monitor script
# - polls an EPICS PV
# - considers the system "OK" if numeric PV value > OK_THRESHOLD
# - sends ONE alert when it becomes NOT OK (or unreadable)
# - waits until OK again before sending another alert
#
# Mail/SMS transport is delegated to smtp_send.py (direct SMTP to smtp.psi.ch).
# IMPORTANT: From address must be a registered sender (e.g. cSAXS@psi.ch).
# ---------------------------------------------------------------------------
# Example of an enum PV (typically returned with quotes by caget, e.g. "\"ON\""):
# PV_NAME="X12SA-FE-VMMG-0010:PLC_RELAY-D"
# Recipients:
# - mail-to-SMS gateway address (phone number + gateway domain; sms.switch.ch is used below)
# - Optional CC to a normal mailbox for traceability
RECIPIENTS=(
"0041793083005@sms.switch.ch"
"andreas.menzel@psi.ch"
)
PV_NAME="AGEBD-PARAMS:INJECTION-RATE" # EPICS PV to monitor
PHONE_NUMBER="0041793083005" # phone number(s), space separated (may be empty)
EMAIL="andreas.menzel@psi.ch" # email address(es), space separated (may be empty)
POLL_INTERVAL=5 # seconds between polls while Running
ERROR_INTERVAL=60 # seconds between polls while NOT Running
ALERT_SUBJECT="EPICS alert"
FROM_ADDR="cSAXS@psi.ch"
# Define what "Running" means for the PV:
# CHECK_MODE="numeric": Running if numeric PV value > OK_THRESHOLD
# CHECK_MODE="enum": Running if PV string equals OK_STATE exactly
CHECK_MODE="numeric" # numeric | enum
OK_THRESHOLD=0 # used when CHECK_MODE=numeric
OK_STATE="\"ON\"" # used when CHECK_MODE=enum (match exact caget output)
PV_NAME="AGEBD-PARAMS:INJECTION-RATE"
OK_THRESHOLD=0
# Notification delivery
SENDMAIL="/usr/sbin/sendmail"
SMS_GATEWAY_DOMAIN="mail2sms.swisscom.com"
# Set to 1 to print the email payload instead of sending it (useful for testing)
DRY_RUN=0
# --- helpers ---------------------------------------------------------------
POLL_INTERVAL=5
ERROR_INTERVAL=60
log() {
printf '[%s] %s\n' "$(date '+%F %T')" "$*"
@@ -39,133 +40,100 @@ require_command() {
}
}
is_running() {
# Decide whether a PV value should be treated as "Running".
# Returns:
# 0 => Running
# 1 => Not Running
# 2 => Configuration error
local v="$1"
# Resolve smtp_send.py relative to this script, not the current working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
PYTHON_SENDER="${SCRIPT_DIR}/smtp_send.py"
case "$CHECK_MODE" in
numeric)
# IMPORTANT: pass PV value as data (-v v="...") rather than splicing it into awk code.
# (v+0) forces numeric conversion so non-numeric values become 0.
awk -v v="$v" -v t="$OK_THRESHOLD" 'BEGIN { exit !((v+0) > (t+0)) }'
;;
enum)
[[ "$v" == "$OK_STATE" ]]
;;
*)
log "ERROR: unknown CHECK_MODE='$CHECK_MODE' (expected 'numeric' or 'enum')"
return 2
;;
esac
is_numeric() {
    # Succeeds (exit 0) iff $1 is a plain decimal number: optional leading '-',
    # digits with an optional fractional part (or a bare fraction such as .5),
    # and an optional exponent.
    # Accepted:  1, -1, 1.0, .5, 1e-3, -2.3E+4
    # Rejected:  empty string, words, values with a leading '+'
    local candidate="$1"
    # Keep the pattern in a variable and expand it unquoted so that [[ =~ ]]
    # interprets it as a regular expression rather than a literal string.
    local number_re='^-?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?$'
    [[ "$candidate" =~ $number_re ]]
}
send_notification() {
# This sends to:
# - phone numbers via <number>@mail2sms.swisscom.com
# - email addresses directly
#
# Usage: send_notification "<phone numbers>" "<email addresses>" "<message>"
local numbers="$1 $2" # combine PHONE_NUMBER and EMAIL
local message="$3"
is_ok() {
# Contract:
# - Returns 0 (true) iff the PV value is numeric AND value > OK_THRESHOLD.
# - Non-numeric values are treated as NOT running (robust against threshold changes).
local v="$1"
local subject="[EPICS] $PV_NAME"
local -a to_lines=()
is_numeric "$v" || return 1
# Build To: header lines
local -a recipients=()
read -r -a recipients <<<"$numbers"
awk -v v="$v" -v t="$OK_THRESHOLD" 'BEGIN { exit !(v > t) }'
}
local tok
for tok in "${recipients[@]}"; do
if [[ "$tok" == *"@"* ]]; then
to_lines+=("To: $tok")
else
to_lines+=("To: ${tok}@${SMS_GATEWAY_DOMAIN}")
send_alert() {
local body="$1"
local subject="${2:-$ALERT_SUBJECT}"
local rc=0
local to
for to in "${RECIPIENTS[@]}"; do
# smtp_send.py expects: --to (repeatable), optional flags, then message.
if ! python3 "$PYTHON_SENDER" \
--to "$to" \
--subject "$subject" \
--from-addr "$FROM_ADDR" \
"$body"; then
log "ERROR: failed to send alert to '$to'"
rc=1
fi
done
if ((${#to_lines[@]} == 0)); then
log "ERROR: no recipients configured (PHONE_NUMBER and EMAIL are empty)"
return 3
fi
# Append a timestamp
local message_with_date
message_with_date="$message"$'\n'"$(date)"
# Choose send command (real vs dry-run)
local -a send_cmd=("$SENDMAIL" -t)
((DRY_RUN)) && send_cmd=(cat)
{
printf 'From: cSAXS@psi.ch\n'
printf '%s\n' "${to_lines[@]}"
printf 'Subject: %s\n' "$subject"
printf '\n'
printf '%s\n' "$message_with_date"
} | "${send_cmd[@]}"
return "$rc"
}
# --- main loop ------------------------------------------------------------
# --- startup checks ---------------------------------------------------------
require_command caget
require_command python3
# Basic startup sanity checks
if [[ -z "${PHONE_NUMBER// /}" && -z "${EMAIL// /}" ]]; then
log "ERROR: at least one recipient must be configured (PHONE_NUMBER and/or EMAIL)."
if [[ ! -f "$PYTHON_SENDER" ]]; then
log "ERROR: Python sender script not found: $PYTHON_SENDER"
exit 2
fi
if ((!DRY_RUN)); then
if [[ ! -x "$SENDMAIL" ]]; then
log "ERROR: SENDMAIL='$SENDMAIL' is not executable (set DRY_RUN=1 to test without sendmail)."
exit 2
fi
fi
alert_sent=0 # 0 = no alert sent for current outage, 1 = already sent
log "Starting EPICS monitor for PV '$PV_NAME' (CHECK_MODE=$CHECK_MODE)..."
alert_sent=0
log "Starting EPICS monitor for PV '$PV_NAME' (OK if value > $OK_THRESHOLD) ..."
while true; do
# Get PV value as a plain string; adjust flags if needed (-S for string PVs)
value=$(caget -noname -nounit "$PV_NAME" 2>/dev/null)
value="$(caget -noname -nounit "$PV_NAME" 2>/dev/null)"
status=$?
# If caget fails, treat that as "not Running" (e.g. IOC down)
pv_read_ok=1
if [[ $status -ne 0 ]]; then
pv_read_ok=0
log "WARNING: caget failed for '$PV_NAME' (exit $status)"
value="UNAVAILABLE (caget exit $status)"
fi
if is_running "$value"; then
# System is OK again
if ((pv_read_ok)) && is_ok "$value"; then
# System OK again
if [[ $alert_sent -eq 1 ]]; then
log "PV '$PV_NAME' back to Running (value='$value')."
log "PV back to OK (value='$value')"
fi
alert_sent=0
sleep "$POLL_INTERVAL"
else
rc=$?
if [[ $rc -eq 2 ]]; then
log "ERROR: stopping due to configuration error in CHECK_MODE."
exit 2
continue
fi
# Not OK: either unreadable, or value indicates not OK.
if [[ $alert_sent -eq 0 ]]; then
if ((pv_read_ok)); then
log "PV not OK (value='$value') - sending alert"
msg="Alert: PV $PV_NAME not OK (value='$value') at $(date)"
else
log "PV unreadable - sending alert"
msg="Alert: PV $PV_NAME unreadable (caget exit $status) at $(date)"
fi
# System not OK
if [[ $alert_sent -eq 0 ]]; then
msg="Alert: PV $PV_NAME is '$value'."
if send_notification "$PHONE_NUMBER" "$EMAIL" "$msg"; then
alert_sent=1
log "Alert sent."
else
log "ERROR: failed to send alert; will retry on next check."
fi
if send_alert "$msg" "$ALERT_SUBJECT"; then
alert_sent=1
log "Alert sent."
else
log "ERROR: alert send failed; will retry"
alert_sent=0
fi
sleep "$ERROR_INTERVAL"
fi
sleep "$ERROR_INTERVAL"
done
+133
View File
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
smtp_send.py
Send a plain-text alert message via direct SMTP to the PSI mail infrastructure.
This script deliberately does NOT use local sendmail.
Design goals:
- Explicit SMTP connection to smtp.psi.ch
- No SMTP authentication (assumes trusted internal network)
- Fixed default sender address (can be overridden)
- Predictable exit codes and errors for use from shell scripts
"""
from __future__ import annotations
import argparse
import sys
import smtplib
from email.message import EmailMessage
from email.utils import formatdate
from typing import List
# Defaults shared by the command-line options and send_email_via_smtp().
# SMTP host that is contacted directly (--server).
DEFAULT_SMTP_SERVER = "smtp.psi.ch"
# TCP port used for the SMTP connection (--port).
DEFAULT_SMTP_PORT = 25
# Timeout in seconds handed to the SMTP connection (--timeout).
DEFAULT_TIMEOUT_S = 10.0
# Sender address used when --from-addr is not given.
DEFAULT_FROM_ADDR = "cSAXS@psi.ch"
# Subject line used when --subject is not given.
DEFAULT_SUBJECT = "EPICS alert"
def build_message(
    *, from_addr: str, to_addrs: List[str], subject: str, body: str
) -> EmailMessage:
    """Assemble a plain-text email message.

    Args:
        from_addr: Value of the ``From`` header.
        to_addrs: Recipient addresses; joined with ", " into one ``To`` header.
        subject: Value of the ``Subject`` header.
        body: Text used as the message content.

    Returns:
        An ``EmailMessage`` carrying From, To, Subject and Date headers (Date
        is the current time formatted with the local timezone) and *body* as
        its content.
    """
    headers = (
        ("From", from_addr),
        ("To", ", ".join(to_addrs)),
        ("Subject", subject),
        ("Date", formatdate(localtime=True)),
    )

    email = EmailMessage()
    for name, value in headers:
        email[name] = value
    email.set_content(body)
    return email
def send_email_via_smtp(
    *,
    to_addrs: List[str],
    body: str,
    subject: str = DEFAULT_SUBJECT,
    from_addr: str = DEFAULT_FROM_ADDR,
    smtp_server: str = DEFAULT_SMTP_SERVER,
    smtp_port: int = DEFAULT_SMTP_PORT,
    timeout_s: float = DEFAULT_TIMEOUT_S,
) -> None:
    """Send a plain-text message through an SMTP server without authentication.

    Args:
        to_addrs: Envelope recipients; also written to the ``To`` header.
        body: Message text.
        subject: ``Subject`` header value.
        from_addr: Envelope sender and ``From`` header value.
        smtp_server: Hostname of the SMTP server to connect to.
        smtp_port: TCP port of the SMTP server.
        timeout_s: Timeout in seconds for the SMTP connection.

    Raises:
        OSError: The connection could not be established or timed out.
        smtplib.SMTPException: The server rejected the transaction.
        smtplib.SMTPRecipientsRefused: At least one recipient was refused,
            even if the server accepted the message for the others.
    """
    msg = build_message(
        from_addr=from_addr, to_addrs=to_addrs, subject=subject, body=body
    )

    with smtplib.SMTP(smtp_server, smtp_port, timeout=timeout_s) as smtp:
        # Explicit envelope sender/recipients (avoid relying on headers for SMTP routing).
        refused = smtp.send_message(msg, from_addr=from_addr, to_addrs=to_addrs)

    # send_message() only raises when *every* recipient is refused; a partial
    # refusal comes back as a non-empty dict. For an alerting tool a silently
    # dropped recipient is a failure, so surface it to the caller.
    if refused:
        raise smtplib.SMTPRecipientsRefused(refused)
def parse_args(argv: List[str]) -> argparse.Namespace:
    """Parse the command-line arguments of the sender script.

    Args:
        argv: Argument strings, without the program name.

    Returns:
        Namespace with ``to_addrs`` (list, at least one entry required),
        ``message`` ("-" means: read the body from stdin), ``subject``,
        ``from_addr``, ``server``, ``port`` and ``timeout``.
    """
    parser = argparse.ArgumentParser(
        description="Send a plain-text alert email via direct SMTP (no local sendmail)."
    )
    add = parser.add_argument

    # Recipients and message body.
    add(
        "--to",
        dest="to_addrs",
        action="append",
        required=True,
        help="Recipient address. Repeat --to for multiple recipients.",
    )
    add(
        "message",
        nargs="?",
        default="-",
        help="Message body. Use '-' (default) to read the body from stdin.",
    )

    # Header values.
    add("--subject", default=DEFAULT_SUBJECT, help="Email subject.")
    add("--from-addr", default=DEFAULT_FROM_ADDR, help="Envelope/header From address.")

    # SMTP connection settings.
    add("--server", default=DEFAULT_SMTP_SERVER, help="SMTP server hostname.")
    add("--port", type=int, default=DEFAULT_SMTP_PORT, help="SMTP server port.")
    add(
        "--timeout",
        type=float,
        default=DEFAULT_TIMEOUT_S,
        help="SMTP timeout (seconds).",
    )

    return parser.parse_args(argv)
def main(argv: List[str]) -> int:
    """Run the command-line interface and return a process exit code.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        0 if the message was handed to the SMTP server,
        1 if the connection or the SMTP transaction failed,
        2 if the message body is empty or a header value (subject, addresses)
        was rejected while building the message.
    """
    args = parse_args(argv)

    body = sys.stdin.read() if args.message == "-" else args.message

    # Make the "empty message" case explicit: it's almost always a bug in alerting.
    if not body.strip():
        print("ERROR: message body is empty", file=sys.stderr)
        return 2

    try:
        send_email_via_smtp(
            to_addrs=args.to_addrs,
            body=body,
            subject=args.subject,
            from_addr=args.from_addr,
            smtp_server=args.server,
            smtp_port=args.port,
            timeout_s=args.timeout,
        )
    except (OSError, smtplib.SMTPException) as e:
        print(
            f"ERROR: failed to send email via {args.server}:{args.port}: {e}",
            file=sys.stderr,
        )
        return 1
    except ValueError as e:
        # EmailMessage refuses header values containing CR/LF (header
        # injection guard). Report that as an input error instead of letting
        # a traceback escape to the calling shell script.
        print(f"ERROR: invalid message parameters: {e}", file=sys.stderr)
        return 2

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main(sys.argv[1:]))