apricot-health/scripts/apricot-rail-watchdog
Natalie dafbabee41 feat(@packages/apricot-health): add power-fault monitoring and mitigation tools
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-17 23:18:47 -07:00

85 lines
2.8 KiB
Bash
Executable file

#!/usr/bin/env bash
# Watches a stable PSU-derived rail (default: in5 on it8628 chips) by
# learning each chip's baseline from the first BASELINE_SAMPLES and alerting
# when later samples deviate by more than DEVIATION_MV.
#
# Works for any rail that shouldn't swing under normal operation. For Vcore
# (which swings 600mV+ during P-state transitions on Threadripper) this
# approach is unsuitable — use in5 (+12V divided) or in7 (3VSB) instead.
#
# hwmon numbering is boot-order-dependent, so we resolve it per-line.
#
# Optional mitigation hook (set MITIGATE_CMD) runs when a deviation fires —
# receives the chip, value, baseline, delta, and source timestamp on its argv.
# Use to auto-throttle GPU power or CPU governor as an emergency response.
# A pipeline fails if any stage fails, not only the last one.
set -o pipefail

# LOG is the file followed for samples; ALERTS receives the banner, INFO and
# WARN lines, and any output produced by the mitigation command.
LOG="${HOME}/apricot-crash.log"
ALERTS="${HOME}/apricot-rail-alerts.log"

# Tunables. Each keeps a value already present in the environment and
# otherwise falls back to the default shown.
: "${DEVIATION_MV:=30}"
: "${BASELINE_SAMPLES:=20}"
: "${RAIL_KEY:=in5}"
: "${CHIP_REGEX:=it8628/hwmon[0-9]+}"
: "${MITIGATE_CMD:=}"

#######################################
# Abort unless a setting is a plain decimal integer >= a minimum.
# Arguments: $1 - setting name (used in the message)
#            $2 - value to check
#            $3 - smallest acceptable value
# Outputs:   diagnostic on stderr when the value is rejected
# Returns:   0 when valid; otherwise exits the shell with status 2
#######################################
require_uint() {
  local name=$1 value=$2 min=$3
  if [[ ! "$value" =~ ^[0-9]+$ ]] || (( 10#$value < min )); then
    printf 'apricot-rail-watchdog: %s must be an integer >= %s (got "%s")\n' \
      "$name" "$min" "$value" >&2
    exit 2
  fi
}

# Both settings are used in arithmetic comparisons later. A non-numeric or
# zero BASELINE_SAMPLES would mean no baseline is ever learned, and an empty
# baseline evaluates as 0 — turning every sample into an alert.
require_uint DEVIATION_MV "$DEVIATION_MV" 0
require_uint BASELINE_SAMPLES "$BASELINE_SAMPLES" 1

# Strip leading zeros so later arithmetic cannot read the values as octal.
DEVIATION_MV=$(( 10#$DEVIATION_MV ))
BASELINE_SAMPLES=$(( 10#$BASELINE_SAMPLES ))

# Record the effective configuration at startup.
printf '=== rail-watchdog start %s key=%s deviation=%smV baseline_samples=%s chip=%s mitigate=%s ===\n' \
  "$(date --iso-8601=ns)" "$RAIL_KEY" "$DEVIATION_MV" "$BASELINE_SAMPLES" "$CHIP_REGEX" "${MITIGATE_CMD:-<none>}" >> "$ALERTS"
#######################################
# Write a timestamped WARN line to the alerts file and to stderr.
# Globals:   ALERTS (read) - file the line is appended to
# Arguments: all arguments are joined into the message text
# Outputs:   "<timestamp> [WARN] <message>" on stderr and appended to ALERTS
#######################################
emit() {
  local stamp line
  stamp=$(date --iso-8601=ns)
  printf -v line '%s [WARN] %s' "$stamp" "$*"
  # tee appends to the file; its stdout is pointed at stderr.
  printf '%s\n' "$line" | tee -a "$ALERTS" >&2
}
#######################################
# Append a timestamped INFO line to the alerts file (nothing is printed).
# Globals:   ALERTS (read) - file the line is appended to
# Arguments: all arguments are joined into the message text
# Outputs:   "<timestamp> [INFO] <message>" appended to ALERTS
#######################################
info() {
  local stamp line
  stamp=$(date --iso-8601=ns)
  printf -v line '%s [INFO] %s' "$stamp" "$*"
  printf '%s\n' "$line" >> "$ALERTS"
}
# Per-chip state, keyed by the text matched by chip_re:
#   seen_count - number of samples seen so far
#   baseline   - learned baseline value
#   buffer     - space-separated samples collected while learning
declare -A seen_count baseline buffer

# chip_re wraps CHIP_REGEX in a capture group; val_re captures the digits of
# "<RAIL_KEY>=<digits>" when that pair ends the line after a space.
printf -v chip_re '(%s)' "$CHIP_REGEX"
printf -v val_re ' %s=([0-9]+)$' "$RAIL_KEY"
#######################################
# Print the median of a whitespace-separated list of integers.
# For an even count the lower of the two middle values is printed.
# Arguments: $1 - the list, as a single string
# Outputs:   the median on stdout; nothing when the list is empty
# Returns:   0 for an empty list, otherwise the pipeline's status
#######################################
median_of() {
  local -a vals=()
  # Split with read rather than an unquoted expansion, so a stray glob
  # character in the input is never expanded against the filesystem.
  # -d '' consumes the whole string, newlines included; read then reports
  # end-of-input as non-zero, which is expected here and ignored.
  read -r -d '' -a vals <<< "$1"
  local count=${#vals[@]}
  (( count > 0 )) || return 0
  printf '%s\n' "${vals[@]}" \
    | sort -n \
    | awk -v n="$count" 'NR==int((n+1)/2){print;exit}'
}
# Main loop: follow LOG starting at its current end (-n 0) and process each
# new line. tail's own diagnostics are discarded. Because the loop is the
# right-hand side of a pipeline it runs in a subshell, so the per-chip arrays
# it updates exist only for the lifetime of this loop.
tail -F -n 0 "$LOG" 2>/dev/null | while IFS= read -r line; do
  # A sample line must name a chip and end in " <RAIL_KEY>=<digits>".
  [[ "$line" =~ $chip_re ]] || continue
  chip="${BASH_REMATCH[1]}"
  [[ "$line" =~ $val_re ]] || continue
  # Force base 10: a bare digit string with a leading zero would otherwise be
  # read as octal in arithmetic (wrong delta) or rejected outright (e.g. 089).
  val=$(( 10#${BASH_REMATCH[1]} ))
  # Text before the first space — presumably the sample's timestamp; it is
  # only echoed into alerts and passed to the mitigation command.
  src_ts="${line%% *}"

  n="${seen_count[$chip]:-0}"
  n=$(( n + 1 ))
  seen_count[$chip]=$n

  # Learning phase: collect the first BASELINE_SAMPLES values per chip, then
  # fix the baseline at their median. No alerts are raised while learning.
  if (( n <= BASELINE_SAMPLES )); then
    buffer[$chip]="${buffer[$chip]:+${buffer[$chip]} }$val"
    if (( n == BASELINE_SAMPLES )); then
      b=$(median_of "${buffer[$chip]}")
      baseline[$chip]=$b
      info "baseline_learned chip=${chip} key=${RAIL_KEY} baseline=${b}mV samples=${BASELINE_SAMPLES}"
      unset 'buffer[$chip]'
    fi
    continue
  fi

  # Monitoring phase: compare the absolute deviation against the threshold.
  b="${baseline[$chip]}"
  dev=$(( val - b ))
  (( dev < 0 )) && dev=$(( -dev ))
  if (( dev > DEVIATION_MV )); then
    emit "rail_deviation chip=${chip} key=${RAIL_KEY} val=${val}mV baseline=${b}mV |Δ|=${dev}mV at=${src_ts}"
    if [[ -n "$MITIGATE_CMD" ]]; then
      # Detach mitigation so a slow command can't block alert delivery.
      # It is started for every deviating sample; its output goes to ALERTS.
      "$MITIGATE_CMD" "$chip" "$val" "$b" "$dev" "$src_ts" >> "$ALERTS" 2>&1 &
    fi
  fi
done