Refactor EPICS2SMS: explicit SMTP via smtp.psi.ch, no local sendmail
This commit is contained in:
+86
-118
@@ -1,31 +1,32 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# --- configuration ---------------------------------------------------------
|
||||
# ---------------------------------------------------------------------------
|
||||
# EPICS monitor script
|
||||
# - polls an EPICS PV
|
||||
# - considers the system "OK" if numeric PV value > OK_THRESHOLD
|
||||
# - sends ONE alert when it becomes NOT OK (or unreadable)
|
||||
# - waits until OK again before sending another alert
|
||||
#
|
||||
# Mail/SMS transport is delegated to smtp_send.py (direct SMTP to smtp.psi.ch).
|
||||
# IMPORTANT: From address must be a registered sender (e.g. cSAXS@psi.ch).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Example of an enum PV (typically returned with quotes by caget, e.g. "\"ON\""):
|
||||
# PV_NAME="X12SA-FE-VMMG-0010:PLC_RELAY-D"
|
||||
# Recipients:
|
||||
# - Swisscom mail2sms gateway address (phone number + domain)
|
||||
# - Optional CC to a normal mailbox for traceability
|
||||
RECIPIENTS=(
|
||||
"0041793083005@sms.switch.ch"
|
||||
"andreas.menzel@psi.ch"
|
||||
)
|
||||
|
||||
PV_NAME="AGEBD-PARAMS:INJECTION-RATE" # EPICS PV to monitor
|
||||
PHONE_NUMBER="0041793083005" # phone number(s), space separated (may be empty)
|
||||
EMAIL="andreas.menzel@psi.ch" # email address(es), space separated (may be empty)
|
||||
POLL_INTERVAL=5 # seconds between polls while Running
|
||||
ERROR_INTERVAL=60 # seconds between polls while NOT Running
|
||||
ALERT_SUBJECT="EPICS alert"
|
||||
FROM_ADDR="cSAXS@psi.ch"
|
||||
|
||||
# Define what "Running" means for the PV:
|
||||
# CHECK_MODE="numeric": Running if numeric PV value > OK_THRESHOLD
|
||||
# CHECK_MODE="enum": Running if PV string equals OK_STATE exactly
|
||||
CHECK_MODE="numeric" # numeric | enum
|
||||
OK_THRESHOLD=0 # used when CHECK_MODE=numeric
|
||||
OK_STATE="\"ON\"" # used when CHECK_MODE=enum (match exact caget output)
|
||||
PV_NAME="AGEBD-PARAMS:INJECTION-RATE"
|
||||
OK_THRESHOLD=0
|
||||
|
||||
# Notification delivery
|
||||
SENDMAIL="/usr/sbin/sendmail"
|
||||
SMS_GATEWAY_DOMAIN="mail2sms.swisscom.com"
|
||||
|
||||
# Set to 1 to print the email payload instead of sending it (useful for testing)
|
||||
DRY_RUN=0
|
||||
|
||||
# --- helpers ---------------------------------------------------------------
|
||||
POLL_INTERVAL=5
|
||||
ERROR_INTERVAL=60
|
||||
|
||||
log() {
|
||||
printf '[%s] %s\n' "$(date '+%F %T')" "$*"
|
||||
@@ -39,133 +40,100 @@ require_command() {
|
||||
}
|
||||
}
|
||||
|
||||
is_running() {
|
||||
# Decide whether a PV value should be treated as "Running".
|
||||
# Returns:
|
||||
# 0 => Running
|
||||
# 1 => Not Running
|
||||
# 2 => Configuration error
|
||||
local v="$1"
|
||||
# Resolve smtp_send.py relative to this script, not the current working directory.
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PYTHON_SENDER="${SCRIPT_DIR}/smtp_send.py"
|
||||
|
||||
case "$CHECK_MODE" in
|
||||
numeric)
|
||||
# IMPORTANT: pass PV value as data (-v v="...") rather than splicing it into awk code.
|
||||
# (v+0) forces numeric conversion so non-numeric values become 0.
|
||||
awk -v v="$v" -v t="$OK_THRESHOLD" 'BEGIN { exit !((v+0) > (t+0)) }'
|
||||
;;
|
||||
enum)
|
||||
[[ "$v" == "$OK_STATE" ]]
|
||||
;;
|
||||
*)
|
||||
log "ERROR: unknown CHECK_MODE='$CHECK_MODE' (expected 'numeric' or 'enum')"
|
||||
return 2
|
||||
;;
|
||||
esac
|
||||
is_numeric() {
|
||||
# Accepts integers, decimals, and scientific notation.
|
||||
# Examples: 1, -1, 1.0, .5, 1e-3, -2.3E+4
|
||||
local s="$1"
|
||||
[[ "$s" =~ ^-?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?$ ]]
|
||||
}
|
||||
|
||||
send_notification() {
|
||||
# This sends to:
|
||||
# - phone numbers via <number>@mail2sms.swisscom.com
|
||||
# - email addresses directly
|
||||
#
|
||||
# Usage: send_notification "<phone numbers>" "<email addresses>" "<message>"
|
||||
local numbers="$1 $2" # combine PHONE_NUMBER and EMAIL
|
||||
local message="$3"
|
||||
is_ok() {
|
||||
# Contract:
|
||||
# - Returns 0 (true) iff the PV value is numeric AND value > OK_THRESHOLD.
|
||||
# - Non-numeric values are treated as NOT OK (robust against threshold changes).
|
||||
local v="$1"
|
||||
|
||||
local subject="[EPICS] $PV_NAME"
|
||||
local -a to_lines=()
|
||||
is_numeric "$v" || return 1
|
||||
|
||||
# Build To: header lines
|
||||
local -a recipients=()
|
||||
read -r -a recipients <<<"$numbers"
|
||||
awk -v v="$v" -v t="$OK_THRESHOLD" 'BEGIN { exit !(v > t) }'
|
||||
}
|
||||
|
||||
local tok
|
||||
for tok in "${recipients[@]}"; do
|
||||
if [[ "$tok" == *"@"* ]]; then
|
||||
to_lines+=("To: $tok")
|
||||
else
|
||||
to_lines+=("To: ${tok}@${SMS_GATEWAY_DOMAIN}")
|
||||
send_alert() {
|
||||
local body="$1"
|
||||
local subject="${2:-$ALERT_SUBJECT}"
|
||||
|
||||
local rc=0
|
||||
local to
|
||||
for to in "${RECIPIENTS[@]}"; do
|
||||
# smtp_send.py expects: --to (repeatable), optional flags, then message.
|
||||
if ! python3 "$PYTHON_SENDER" \
|
||||
--to "$to" \
|
||||
--subject "$subject" \
|
||||
--from-addr "$FROM_ADDR" \
|
||||
"$body"; then
|
||||
log "ERROR: failed to send alert to '$to'"
|
||||
rc=1
|
||||
fi
|
||||
done
|
||||
|
||||
if ((${#to_lines[@]} == 0)); then
|
||||
log "ERROR: no recipients configured (PHONE_NUMBER and EMAIL are empty)"
|
||||
return 3
|
||||
fi
|
||||
|
||||
# Append a timestamp
|
||||
local message_with_date
|
||||
message_with_date="$message"$'\n'"$(date)"
|
||||
|
||||
# Choose send command (real vs dry-run)
|
||||
local -a send_cmd=("$SENDMAIL" -t)
|
||||
((DRY_RUN)) && send_cmd=(cat)
|
||||
|
||||
{
|
||||
printf 'From: cSAXS@psi.ch\n'
|
||||
printf '%s\n' "${to_lines[@]}"
|
||||
printf 'Subject: %s\n' "$subject"
|
||||
printf '\n'
|
||||
printf '%s\n' "$message_with_date"
|
||||
} | "${send_cmd[@]}"
|
||||
return "$rc"
|
||||
}
|
||||
|
||||
# --- main loop ------------------------------------------------------------
|
||||
# --- startup checks ---------------------------------------------------------
|
||||
|
||||
require_command caget
|
||||
require_command python3
|
||||
|
||||
# Basic startup sanity checks
|
||||
if [[ -z "${PHONE_NUMBER// /}" && -z "${EMAIL// /}" ]]; then
|
||||
log "ERROR: at least one recipient must be configured (PHONE_NUMBER and/or EMAIL)."
|
||||
if [[ ! -f "$PYTHON_SENDER" ]]; then
|
||||
log "ERROR: Python sender script not found: $PYTHON_SENDER"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if ((!DRY_RUN)); then
|
||||
if [[ ! -x "$SENDMAIL" ]]; then
|
||||
log "ERROR: SENDMAIL='$SENDMAIL' is not executable (set DRY_RUN=1 to test without sendmail)."
|
||||
exit 2
|
||||
fi
|
||||
fi
|
||||
|
||||
alert_sent=0 # 0 = no alert sent for current outage, 1 = already sent
|
||||
|
||||
log "Starting EPICS monitor for PV '$PV_NAME' (CHECK_MODE=$CHECK_MODE)..."
|
||||
alert_sent=0
|
||||
log "Starting EPICS monitor for PV '$PV_NAME' (OK if value > $OK_THRESHOLD) ..."
|
||||
|
||||
while true; do
|
||||
# Get PV value as a plain string; adjust flags if needed (-S for string PVs)
|
||||
value=$(caget -noname -nounit "$PV_NAME" 2>/dev/null)
|
||||
value="$(caget -noname -nounit "$PV_NAME" 2>/dev/null)"
|
||||
status=$?
|
||||
|
||||
# If caget fails, treat that as "not Running" (e.g. IOC down)
|
||||
pv_read_ok=1
|
||||
if [[ $status -ne 0 ]]; then
|
||||
pv_read_ok=0
|
||||
log "WARNING: caget failed for '$PV_NAME' (exit $status)"
|
||||
value="UNAVAILABLE (caget exit $status)"
|
||||
fi
|
||||
|
||||
if is_running "$value"; then
|
||||
# System is OK again
|
||||
if ((pv_read_ok)) && is_ok "$value"; then
|
||||
# System OK again
|
||||
if [[ $alert_sent -eq 1 ]]; then
|
||||
log "PV '$PV_NAME' back to Running (value='$value')."
|
||||
log "PV back to OK (value='$value')"
|
||||
fi
|
||||
alert_sent=0
|
||||
sleep "$POLL_INTERVAL"
|
||||
else
|
||||
rc=$?
|
||||
if [[ $rc -eq 2 ]]; then
|
||||
log "ERROR: stopping due to configuration error in CHECK_MODE."
|
||||
exit 2
|
||||
continue
|
||||
fi
|
||||
|
||||
# Not OK: either unreadable, or value indicates not OK.
|
||||
if [[ $alert_sent -eq 0 ]]; then
|
||||
if ((pv_read_ok)); then
|
||||
log "PV not OK (value='$value') - sending alert"
|
||||
msg="Alert: PV $PV_NAME not OK (value='$value') at $(date)"
|
||||
else
|
||||
log "PV unreadable - sending alert"
|
||||
msg="Alert: PV $PV_NAME unreadable (caget exit $status) at $(date)"
|
||||
fi
|
||||
|
||||
# System not OK
|
||||
if [[ $alert_sent -eq 0 ]]; then
|
||||
msg="Alert: PV $PV_NAME is '$value'."
|
||||
if send_notification "$PHONE_NUMBER" "$EMAIL" "$msg"; then
|
||||
alert_sent=1
|
||||
log "Alert sent."
|
||||
else
|
||||
log "ERROR: failed to send alert; will retry on next check."
|
||||
fi
|
||||
if send_alert "$msg" "$ALERT_SUBJECT"; then
|
||||
alert_sent=1
|
||||
log "Alert sent."
|
||||
else
|
||||
log "ERROR: alert send failed; will retry"
|
||||
alert_sent=0
|
||||
fi
|
||||
sleep "$ERROR_INTERVAL"
|
||||
fi
|
||||
|
||||
sleep "$ERROR_INTERVAL"
|
||||
done
|
||||
|
||||
+133
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
smtp_send.py
|
||||
|
||||
Send a plain-text alert message via direct SMTP to the PSI mail infrastructure.
|
||||
This script deliberately does NOT use local sendmail.
|
||||
|
||||
Design goals:
|
||||
- Explicit SMTP connection to smtp.psi.ch
|
||||
- No SMTP authentication (assumes trusted internal network)
|
||||
- Fixed default sender address (can be overridden)
|
||||
- Predictable exit codes and errors for use from shell scripts
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import smtplib
|
||||
from email.message import EmailMessage
|
||||
from email.utils import formatdate
|
||||
from typing import List
|
||||
|
||||
|
||||
# Defaults used when the corresponding command-line option is not supplied.
# SMTP server hostname (overridable with --server).
DEFAULT_SMTP_SERVER = "smtp.psi.ch"
|
||||
# SMTP server port (overridable with --port).
DEFAULT_SMTP_PORT = 25
|
||||
# SMTP timeout in seconds (overridable with --timeout).
DEFAULT_TIMEOUT_S = 10.0
|
||||
# Envelope/header From address (overridable with --from-addr).
DEFAULT_FROM_ADDR = "cSAXS@psi.ch"
|
||||
# Email subject (overridable with --subject).
DEFAULT_SUBJECT = "EPICS alert"
|
||||
|
||||
|
||||
def build_message(
    *, from_addr: str, to_addrs: List[str], subject: str, body: str
) -> EmailMessage:
    """Assemble a plain-text email message.

    Args:
        from_addr: Value of the ``From`` header.
        to_addrs: Recipient addresses; joined with ", " into a single
            ``To`` header.
        subject: Value of the ``Subject`` header.
        body: Plain-text content of the message.

    Returns:
        An ``EmailMessage`` carrying From, To, Subject and Date headers
        (Date is formatted in local time) with ``body`` as its content.
    """
    email = EmailMessage()

    # Header name/value pairs, applied in the order they should appear.
    headers = (
        ("From", from_addr),
        ("To", ", ".join(to_addrs)),
        ("Subject", subject),
        ("Date", formatdate(localtime=True)),
    )
    for header_name, header_value in headers:
        email[header_name] = header_value

    email.set_content(body)
    return email
|
||||
|
||||
|
||||
def send_email_via_smtp(
    *,
    to_addrs: List[str],
    body: str,
    subject: str = DEFAULT_SUBJECT,
    from_addr: str = DEFAULT_FROM_ADDR,
    smtp_server: str = DEFAULT_SMTP_SERVER,
    smtp_port: int = DEFAULT_SMTP_PORT,
    timeout_s: float = DEFAULT_TIMEOUT_S,
) -> None:
    """Send a plain-text message over an unauthenticated SMTP connection.

    Args:
        to_addrs: Envelope recipients (also used for the ``To`` header).
        body: Plain-text message content.
        subject: Message subject.
        from_addr: Envelope sender and ``From`` header.
        smtp_server: SMTP server hostname.
        smtp_port: SMTP server port.
        timeout_s: Timeout in seconds passed to ``smtplib.SMTP``.

    Raises:
        OSError: Raised by the socket layer (e.g. connection or timeout
            failures).
        smtplib.SMTPException: Raised by smtplib on protocol-level failures.
        smtplib.SMTPRecipientsRefused: Also raised when the server accepted
            the message for some recipients but refused others; the refused
            addresses are available in the exception's ``recipients`` dict.
    """
    msg = build_message(
        from_addr=from_addr, to_addrs=to_addrs, subject=subject, body=body
    )

    with smtplib.SMTP(smtp_server, smtp_port, timeout=timeout_s) as smtp:
        # Explicit envelope sender/recipients (avoid relying on headers for
        # SMTP routing).
        refused = smtp.send_message(msg, from_addr=from_addr, to_addrs=to_addrs)

    # send_message() returns normally when at least one recipient was
    # accepted and reports the rest in its return value. For alerting, a
    # silently dropped recipient is a failure, so surface it as an error.
    if refused:
        raise smtplib.SMTPRecipientsRefused(refused)
|
||||
|
||||
|
||||
def parse_args(argv: List[str]) -> argparse.Namespace:
    """Parse the command-line arguments of smtp_send.py.

    Args:
        argv: Argument list without the program name.

    Returns:
        Namespace with ``to_addrs`` (list, at least one entry required),
        ``message`` ("-" means read the body from stdin), ``subject``,
        ``from_addr``, ``server``, ``port`` and ``timeout``.
    """
    parser = argparse.ArgumentParser(
        description="Send a plain-text alert email via direct SMTP (no local sendmail)."
    )

    # Recipients: --to may be given several times; each use appends one address.
    parser.add_argument(
        "--to",
        dest="to_addrs",
        action="append",
        required=True,
        help="Recipient address. Repeat --to for multiple recipients.",
    )

    # Optional positional body; the "-" default selects stdin.
    parser.add_argument(
        "message",
        nargs="?",
        default="-",
        help="Message body. Use '-' (default) to read the body from stdin.",
    )

    # Message headers.
    parser.add_argument("--subject", default=DEFAULT_SUBJECT, help="Email subject.")
    parser.add_argument(
        "--from-addr",
        default=DEFAULT_FROM_ADDR,
        help="Envelope/header From address.",
    )

    # Connection settings.
    parser.add_argument(
        "--server",
        default=DEFAULT_SMTP_SERVER,
        help="SMTP server hostname.",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=DEFAULT_SMTP_PORT,
        help="SMTP server port.",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=DEFAULT_TIMEOUT_S,
        help="SMTP timeout (seconds).",
    )

    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: List[str]) -> int:
    """Command-line entry point: parse arguments and send one message.

    Args:
        argv: Argument list without the program name.

    Returns:
        0 if the message was sent, 1 if sending failed with an ``OSError``
        or ``smtplib.SMTPException``, 2 if the message body is empty or
        whitespace-only.
    """
    opts = parse_args(argv)

    # "-" is the sentinel for "read the body from stdin".
    text = sys.stdin.read() if opts.message == "-" else opts.message

    # Make the "empty message" case explicit: it is almost always a bug in
    # alerting.
    if not text.strip():
        print("ERROR: message body is empty", file=sys.stderr)
        return 2

    try:
        send_email_via_smtp(
            to_addrs=opts.to_addrs,
            body=text,
            subject=opts.subject,
            from_addr=opts.from_addr,
            smtp_server=opts.server,
            smtp_port=opts.port,
            timeout_s=opts.timeout,
        )
    except (OSError, smtplib.SMTPException) as exc:
        print(
            f"ERROR: failed to send email via {opts.server}:{opts.port}: {exc}",
            file=sys.stderr,
        )
        return 1

    return 0
|
||||
|
||||
|
||||
# Script entry point: pass the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == "__main__":
|
||||
raise SystemExit(main(sys.argv[1:]))
|
||||
Reference in New Issue
Block a user