#!/usr/bin/env bash

# ---------------------------------------------------------------------------
# EPICS monitor script
# - polls an EPICS PV
# - considers the system "OK" if numeric PV value > OK_THRESHOLD
# - sends ONE alert when it becomes NOT OK (or unreadable)
# - waits until OK again before sending another alert
#
# Mail/SMS transport is delegated to smtp_send.py (direct SMTP to smtp.psi.ch).
# IMPORTANT: From address must be a registered sender (e.g. cSAXS@psi.ch).
#
# Exit codes of this script:
#   2  configuration error detected at startup (sender script missing,
#      no recipients, non-numeric OK_THRESHOLD)
# ---------------------------------------------------------------------------

# Recipients:
# - mail-to-SMS gateway address (phone number + gateway domain)
#   NOTE(review): this comment used to name the Swisscom gateway, but the
#   address below uses sms.switch.ch - confirm which gateway is intended.
# - Optional CC to a normal mailbox for traceability
RECIPIENTS=(
  "0041793083005@sms.switch.ch"
  "andreas.menzel@psi.ch"
)

ALERT_SUBJECT="EPICS alert"
FROM_ADDR="cSAXS@psi.ch"

PV_NAME="AGEBD-PARAMS:INJECTION-RATE" # EPICS PV to monitor
OK_THRESHOLD=0                        # system is OK iff PV value > this

POLL_INTERVAL=5   # seconds between polls while OK
ERROR_INTERVAL=60 # seconds between polls while NOT OK

# Recipients for which the most recent send_alert call failed.
# Written by send_alert, read by the main loop to retry only those.
ALERT_FAILED_RECIPIENTS=()

log() {
  # Print a timestamped message to stdout.
  printf '[%s] %s\n' "$(date '+%F %T')" "$*"
}

require_command() {
  # Abort the script unless command "$1" can be found.
  # NOTE(review): the body of this function lies between the two diff hunks
  # and is not visible in the patch; only its closing braces are. The check,
  # message and exit code below are a reconstruction - confirm against the
  # real file before applying.
  command -v "$1" >/dev/null 2>&1 || {
    log "ERROR: required command '$1' not found"
    exit 2
  }
}

# Resolve smtp_send.py relative to this script, not the current working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
PYTHON_SENDER="${SCRIPT_DIR}/smtp_send.py"

is_numeric() {
  # Return 0 iff "$1" is an integer, decimal, or scientific-notation number.
  # Examples: 1, -1, 1.0, .5, 1e-3, -2.3E+4
  local s="$1"
  # Keep the pattern in a variable so bash treats it as a regex verbatim.
  local number_re='^-?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?$'
  [[ "$s" =~ $number_re ]]
}

is_ok() {
  # Contract:
  # - Returns 0 (true) iff the PV value is numeric AND value > OK_THRESHOLD.
  # - Non-numeric values are treated as NOT running (robust against threshold changes).
  local v="$1"

  is_numeric "$v" || return 1

  # Values are passed as data (-v) rather than spliced into the awk program.
  # The "+ 0" forces a numeric comparison; without it awk falls back to a
  # string comparison whenever one operand does not look like a number.
  awk -v v="$v" -v t="$OK_THRESHOLD" 'BEGIN { exit !((v + 0) > (t + 0)) }'
}

send_alert() {
  # Usage: send_alert BODY [SUBJECT [RECIPIENT...]]
  #
  # Sends BODY to every RECIPIENT (default: all of RECIPIENTS), one
  # smtp_send.py invocation per recipient.
  #
  # Returns:
  #   0  every targeted recipient was handed to smtp_send.py successfully
  #   1  at least one recipient failed (listed in ALERT_FAILED_RECIPIENTS)
  #   3  there was nobody to send to
  local body="$1"
  local subject="${2:-$ALERT_SUBJECT}"
  local -a targets=("${@:3}")

  ((${#targets[@]})) || targets=("${RECIPIENTS[@]}")
  ALERT_FAILED_RECIPIENTS=()

  # Without this guard the loop below would run zero times and the function
  # would report success although no alert left the machine.
  if ((${#targets[@]} == 0)); then
    log "ERROR: no recipients configured (RECIPIENTS is empty)"
    return 3
  fi

  local to
  for to in "${targets[@]}"; do
    # smtp_send.py expects: --to (repeatable), optional flags, then message.
    if ! python3 "$PYTHON_SENDER" \
      --to "$to" \
      --subject "$subject" \
      --from-addr "$FROM_ADDR" \
      "$body"; then
      log "ERROR: failed to send alert to '$to'"
      ALERT_FAILED_RECIPIENTS+=("$to")
    fi
  done

  ((${#ALERT_FAILED_RECIPIENTS[@]} == 0))
}

# --- startup checks ---------------------------------------------------------

require_command caget
require_command python3

if [[ ! -f "$PYTHON_SENDER" ]]; then
  log "ERROR: Python sender script not found: $PYTHON_SENDER"
  exit 2
fi

if ((${#RECIPIENTS[@]} == 0)); then
  log "ERROR: at least one recipient must be configured (RECIPIENTS is empty)."
  exit 2
fi

if ! is_numeric "$OK_THRESHOLD"; then
  log "ERROR: OK_THRESHOLD='$OK_THRESHOLD' is not numeric."
  exit 2
fi

# --- main loop --------------------------------------------------------------

alert_sent=0                            # 1 once the current outage has been reported
pending_recipients=("${RECIPIENTS[@]}") # who still needs the alert for this outage

log "Starting EPICS monitor for PV '$PV_NAME' (OK if value > $OK_THRESHOLD) ..."

while true; do
  value="$(caget -noname -nounit "$PV_NAME" 2>/dev/null)"
  status=$?

  pv_read_ok=1
  if [[ $status -ne 0 ]]; then
    pv_read_ok=0
    log "WARNING: caget failed for '$PV_NAME' (exit $status)"
  fi

  if ((pv_read_ok)) && is_ok "$value"; then
    # System OK again
    if [[ $alert_sent -eq 1 ]]; then
      log "PV back to OK (value='$value')"
    fi
    alert_sent=0
    pending_recipients=("${RECIPIENTS[@]}")
    sleep "$POLL_INTERVAL"
    continue
  fi

  # Not OK: either unreadable, or value indicates not OK.
  if [[ $alert_sent -eq 0 ]]; then
    if ((pv_read_ok)); then
      log "PV not OK (value='$value') - sending alert"
      msg="Alert: PV $PV_NAME not OK (value='$value') at $(date)"
    else
      log "PV unreadable - sending alert"
      msg="Alert: PV $PV_NAME unreadable (caget exit $status) at $(date)"
    fi

    if send_alert "$msg" "$ALERT_SUBJECT" "${pending_recipients[@]}"; then
      alert_sent=1
      log "Alert sent."
    else
      # Retry only the recipients that have not received this alert yet, so
      # the ones already notified are not messaged again for the same outage.
      if ((${#ALERT_FAILED_RECIPIENTS[@]})); then
        pending_recipients=("${ALERT_FAILED_RECIPIENTS[@]}")
      fi
      log "ERROR: alert send failed; will retry"
    fi
  fi

  sleep "$ERROR_INTERVAL"
done
"""
smtp_send.py

Send a plain-text alert message via direct SMTP to the PSI mail infrastructure.
This script deliberately does NOT use local sendmail.

Design goals:
- Explicit SMTP connection to smtp.psi.ch
- No SMTP authentication (assumes trusted internal network)
- Fixed default sender address (can be overridden)
- Predictable exit codes and errors for use from shell scripts

Exit codes:
- 0: the send completed without an SMTP/network error
- 1: SMTP or network failure
- 2: invalid input (bad command-line arguments, empty message body,
     header value rejected while building the message)
"""

import argparse
import sys
import smtplib
from email.message import EmailMessage
from email.utils import formatdate
from typing import List


DEFAULT_SMTP_SERVER = "smtp.psi.ch"
DEFAULT_SMTP_PORT = 25
DEFAULT_TIMEOUT_S = 10.0
DEFAULT_FROM_ADDR = "cSAXS@psi.ch"
DEFAULT_SUBJECT = "EPICS alert"


def build_message(
    *, from_addr: str, to_addrs: List[str], subject: str, body: str
) -> EmailMessage:
    """Build the plain-text message with From/To/Subject/Date headers.

    All recipients are listed in a single comma-separated ``To`` header and
    the ``Date`` header carries the local time.

    Raises:
        ValueError: if a header value spans more than one line (the email
            package refuses CR/LF in header values).
    """
    msg = EmailMessage()
    msg["From"] = from_addr
    msg["To"] = ", ".join(to_addrs)
    msg["Subject"] = subject
    msg["Date"] = formatdate(localtime=True)
    msg.set_content(body)
    return msg


def send_email_via_smtp(
    *,
    to_addrs: List[str],
    body: str,
    subject: str = DEFAULT_SUBJECT,
    from_addr: str = DEFAULT_FROM_ADDR,
    smtp_server: str = DEFAULT_SMTP_SERVER,
    smtp_port: int = DEFAULT_SMTP_PORT,
    timeout_s: float = DEFAULT_TIMEOUT_S,
) -> None:
    """Build the message and submit it over an unauthenticated SMTP session.

    The message is built before the connection is opened, so malformed
    header values fail without any network traffic.

    Raises:
        ValueError: propagated from ``build_message`` or from ``smtplib``
            for unusable parameters.
        OSError: connection-level failures; ``smtplib.SMTPException`` is a
            subclass of ``OSError``.
    """
    msg = build_message(
        from_addr=from_addr, to_addrs=to_addrs, subject=subject, body=body
    )

    with smtplib.SMTP(smtp_server, smtp_port, timeout=timeout_s) as smtp:
        # Explicit envelope sender/recipients (avoid relying on headers for SMTP routing)
        smtp.send_message(msg, from_addr=from_addr, to_addrs=to_addrs)


def _port_number(text: str) -> int:
    """argparse ``type``: parse a TCP port and reject out-of-range values."""
    try:
        port = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid port number: {text!r}") from None
    # 0 stays allowed for backward compatibility: smtplib treats a zero port
    # as "use the default SMTP port".
    if not 0 <= port <= 65535:
        raise argparse.ArgumentTypeError(f"port must be in 0-65535, got {port}")
    return port


def _positive_seconds(text: str) -> float:
    """argparse ``type``: parse a finite timeout strictly greater than zero."""
    try:
        seconds = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid timeout: {text!r}") from None
    # The chained comparison also rejects NaN, for which every comparison is False.
    if not 0 < seconds < float("inf"):
        raise argparse.ArgumentTypeError(
            f"timeout must be a finite number > 0, got {text!r}"
        )
    return seconds


def parse_args(argv: List[str]) -> argparse.Namespace:
    """Parse the command line.

    Invalid arguments make argparse print a usage error and raise
    ``SystemExit(2)``.
    """
    p = argparse.ArgumentParser(
        description="Send a plain-text alert email via direct SMTP (no local sendmail)."
    )
    p.add_argument(
        "--to",
        dest="to_addrs",
        action="append",
        required=True,
        help="Recipient address. Repeat --to for multiple recipients.",
    )
    p.add_argument(
        "message",
        nargs="?",
        default="-",
        help="Message body. Use '-' (default) to read the body from stdin.",
    )
    p.add_argument("--subject", default=DEFAULT_SUBJECT, help="Email subject.")
    p.add_argument(
        "--from-addr", default=DEFAULT_FROM_ADDR, help="Envelope/header From address."
    )
    p.add_argument(
        "--server", default=DEFAULT_SMTP_SERVER, help="SMTP server hostname."
    )
    p.add_argument(
        "--port",
        type=_port_number,
        default=DEFAULT_SMTP_PORT,
        help="SMTP server port.",
    )
    p.add_argument(
        "--timeout",
        type=_positive_seconds,
        default=DEFAULT_TIMEOUT_S,
        help="SMTP timeout (seconds).",
    )
    return p.parse_args(argv)


def main(argv: List[str]) -> int:
    """Run the command-line tool and return its exit code (0, 1 or 2)."""
    args = parse_args(argv)

    body = sys.stdin.read() if args.message == "-" else args.message

    # Make the "empty message" case explicit: it is almost always a bug in alerting.
    if not body.strip():
        print("ERROR: message body is empty", file=sys.stderr)
        return 2

    try:
        send_email_via_smtp(
            to_addrs=args.to_addrs,
            body=body,
            subject=args.subject,
            from_addr=args.from_addr,
            smtp_server=args.server,
            smtp_port=args.port,
            timeout_s=args.timeout,
        )
    except ValueError as e:
        # Raised for unusable input such as a subject containing a line break.
        # Report it like the other input errors instead of leaking a traceback
        # to the calling shell script.
        print(f"ERROR: invalid message parameters: {e}", file=sys.stderr)
        return 2
    except (OSError, smtplib.SMTPException) as e:
        print(
            f"ERROR: failed to send email via {args.server}:{args.port}: {e}",
            file=sys.stderr,
        )
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))