#!/usr/bin/env bash
# Emergency rail-deviation responder. Invoked by apricot-rail-watchdog when
# a rail excursion is detected. Goal: reduce power demand for HOLD_SECONDS
# seconds to let the rail recover, then restore.
#
# Argv (from watchdog): <chip> <val_mV> <baseline_mV> <delta_mV> <src_ts>
#
# Actions:
#   1. Drop the power cap of every GPU nvidia-smi lists to GPU_LIMIT_SAFE
#      (default 250W).
#   2. Pin the CPU governor to GOVERNOR_SAFE (default "powersave").
#   3. Hold for HOLD_SECONDS (default 60).
#   4. Restore prior values if we recorded them.
#
# Requires root (nvidia-smi -pl, writing to /sys/devices/system/cpu/...).
# Intended to run as a root-side systemd unit triggered via a fifo or via
# a sudoers allowlist for the lilith user — install.sh sets this up.

# --- configuration -----------------------------------------------------------
# A pipeline reports failure if any stage fails, not only the last one.
set -o pipefail

# Tunables. Each keeps a value already present in the environment and only
# falls back to the default when unset or empty.
: "${GPU_LIMIT_SAFE:=250}"                    # GPU power cap while mitigating, W
: "${HOLD_SECONDS:=60}"                       # how long the safe state is held
: "${STATE_DIR:=/run/apricot-rail-mitigate}"  # lock + recorded prior state
: "${GOVERNOR_SAFE:=powersave}"               # cpufreq governor while mitigating

# Failure here is deliberately non-fatal: mkdir reports the reason on stderr
# and the responder still attempts the mitigation.
mkdir -p -- "$STATE_DIR"

# Presence of this file means "a mitigation is in progress"; its content is
# the epoch second at which the hold ends.
LOCK="$STATE_DIR/active.lock"

# Print one line on stdout: "[<ISO-8601 timestamp, ns>] apricot-rail-mitigate: "
# followed by all arguments joined with single spaces.
log() {
  local ts
  ts=$(date --iso-8601=ns)
  printf '[%s] apricot-rail-mitigate: %s\n' "$ts" "$*"
}
# --- single-flight mitigation: engage -> hold -> restore ---------------------
#
# One invocation (the "holder") clamps the hardware, waits until the deadline
# stored in $LOCK, puts the recorded state back and removes $LOCK. Any
# invocation that finds $LOCK already present only pushes the deadline out
# and exits 0.
#
# Design points:
#   * Every check/update/removal of $LOCK happens under flock(1) on fd 9, so
#     two triggers cannot both become holder and an extension cannot land
#     after the holder has already decided to release.
#   * Once engaged, an EXIT trap (also reached via HUP/INT/TERM) restores the
#     hardware and removes $LOCK, so an interrupted run does not leave the
#     machine clamped behind a stale lock.
#   * A GPU whose current cap is already <= GPU_LIMIT_SAFE is left alone;
#     "mitigating" must never raise a power cap.
#
# NOTE(review): a holder killed with SIGKILL cannot run its trap; $LOCK then
# stays behind until it is removed (by hand, or when $STATE_DIR is wiped).

trigger="$*"          # captured here: inside functions "$*" is the function's argv
PRIOR_GPU=""          # "index,power.limit" lines as reported before clamping
PRIOR_GOV=""          # governor of the first readable cpufreq policy
capped_gpus=()        # "index,prior_watts" for every GPU we actually clamped
governor_changed=0    # 1 once at least one CPU accepted GOVERNOR_SAFE
engaged=0             # 1 while this process owns $LOCK
sleep_pid=""          # PID of the background sleep used by the hold loop
guard_ok=0            # 1 when the flock guard on fd 9 is usable

# Take / drop the guard. Both are no-ops when the guard is unavailable so the
# responder degrades to best-effort instead of refusing to mitigate.
guard_lock() {
  if (( guard_ok )); then flock 9; fi
  return 0
}

guard_unlock() {
  if (( guard_ok )); then flock -u 9; fi
  return 0
}

# Store deadline $1 (epoch seconds) in $LOCK. Written to a temp file and
# renamed so a concurrent reader never sees an empty or partial value.
write_deadline() {
  local tmp="$LOCK.$$"
  if { printf '%s\n' "$1" > "$tmp"; } 2>/dev/null && mv -f -- "$tmp" "$LOCK"; then
    return 0
  fi
  rm -f -- "$tmp"
  log "warning: cannot write deadline to $LOCK"
  return 1
}

# Print the deadline stored in $LOCK, or 0 when the file is missing or does
# not hold a plain decimal number (0 means "release now").
read_deadline() {
  local raw=""
  if [[ -r "$LOCK" ]]; then
    { IFS= read -r raw < "$LOCK"; } 2>/dev/null
  fi
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$(( 10#$raw ))"   # 10# so a leading zero is not read as octal
  else
    printf '0\n'
  fi
}

# Clamp every GPU listed in PRIOR_GPU to GPU_LIMIT_SAFE and remember which
# ones were changed so only those are restored later.
cap_gpus() {
  local idx prior_w safe_w="${GPU_LIMIT_SAFE%.*}"
  while IFS=, read -r idx prior_w; do
    [[ "$idx" =~ ^[0-9]+$ ]] || continue
    prior_w="${prior_w%.*}"          # nvidia-smi reports e.g. 350.00; keep whole watts
    if [[ "$prior_w" =~ ^[0-9]+$ && "$safe_w" =~ ^[0-9]+$ ]] \
      && (( 10#$prior_w <= 10#$safe_w )); then
      log "gpu $idx already at ${prior_w}W (<= ${GPU_LIMIT_SAFE}W), left unchanged"
      continue
    fi
    # </dev/null: keep the child from consuming the loop's here-string.
    if nvidia-smi -i "$idx" -pl "$GPU_LIMIT_SAFE" >/dev/null 2>&1 </dev/null; then
      log "gpu $idx -> ${GPU_LIMIT_SAFE}W"
      if [[ "$prior_w" =~ ^[0-9]+$ ]]; then
        capped_gpus+=("$idx,$prior_w")
      else
        log "warning: gpu $idx prior power limit unknown, cannot be restored"
      fi
    else
      log "warning: gpu $idx: setting power limit ${GPU_LIMIT_SAFE}W failed"
    fi
  done <<< "$PRIOR_GPU"
}

# Write governor $1 to every writable cpufreq policy.
# Returns 0 when at least one CPU accepted it.
set_governor() {
  local want="$1" g ok=0 bad=0
  for g in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    [[ -w "$g" ]] || continue
    if { printf '%s\n' "$want" > "$g"; } 2>/dev/null; then
      ok=$(( ok + 1 ))
    else
      bad=$(( bad + 1 ))
    fi
  done
  if (( bad > 0 )); then
    log "warning: governor $want rejected on $bad cpu(s)"
  fi
  (( ok > 0 ))
}

# Undo whatever cap_gpus / set_governor changed. Uses the in-memory record,
# so it works even if the state files could not be written. Safe to call
# more than once.
restore_state() {
  local entry idx prior_w
  for entry in "${capped_gpus[@]}"; do
    idx="${entry%%,*}"
    prior_w="${entry#*,}"
    if nvidia-smi -i "$idx" -pl "$prior_w" >/dev/null 2>&1 </dev/null; then
      log "gpu $idx -> ${prior_w}W (restored)"
    else
      log "warning: gpu $idx: restoring power limit ${prior_w}W failed"
    fi
  done
  capped_gpus=()

  # NOTE(review): only one governor is recorded and it is written back to
  # every CPU, as before; per-CPU governors are not preserved.
  if (( governor_changed )) && [[ -n "$PRIOR_GOV" ]]; then
    if set_governor "$PRIOR_GOV"; then
      log "cpu governor -> $PRIOR_GOV (restored)"
    fi
  fi
  governor_changed=0
}

# Block until the deadline in $LOCK has passed, re-reading it after every
# sleep so deadline bumps from later triggers are honoured.
hold_until_deadline() {
  local now target
  while true; do
    now=$(date +%s)
    target=$(read_deadline)
    (( now >= target )) && break
    # Background sleep + wait: bash defers traps while a foreground child
    # runs, but `wait` returns at once when a trapped signal arrives.
    sleep "$(( target - now ))" &
    sleep_pid=$!
    wait "$sleep_pid"
    sleep_pid=""
  done
}

# EXIT trap: if we still own the mitigation, undo it and drop the lock.
on_exit() {
  trap '' HUP INT TERM               # do not let a second signal cut the restore short
  if [[ -n "$sleep_pid" ]]; then
    kill "$sleep_pid" 2>/dev/null
    sleep_pid=""
  fi
  (( engaged )) || return 0
  log "exiting while engaged, restoring prior state"
  guard_lock
  restore_state
  rm -f -- "$LOCK"
  engaged=0
  guard_unlock
}

trap on_exit EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

if [[ -d "$STATE_DIR" && -w "$STATE_DIR" ]] \
  && command -v flock >/dev/null 2>&1 \
  && exec 9>"$STATE_DIR/active.guard"; then
  guard_ok=1
else
  log "warning: flock guard unavailable, single-flight is best-effort"
fi

# --- single-flight: extend a running mitigation, or become the holder ------
guard_lock
deadline=$(( $(date +%s) + HOLD_SECONDS ))
if [[ -f "$LOCK" ]]; then
  write_deadline "$deadline"
  guard_unlock
  log "already mitigating, extending deadline to $(date -d "@$deadline" --iso-8601=s) (trigger=$trigger)"
  exit 0
fi
write_deadline "$deadline"
engaged=1
guard_unlock
log "engage trigger=$trigger hold=${HOLD_SECONDS}s gpu_limit=${GPU_LIMIT_SAFE}W governor=${GOVERNOR_SAFE}"

# --- capture prior state ---------------------------------------------------
PRIOR_GPU=$(nvidia-smi --query-gpu=index,power.limit --format=csv,noheader,nounits 2>/dev/null)
PRIOR_GPU="${PRIOR_GPU// /}"
# The state files are a record only; restore works from the variables.
printf '%s\n' "$PRIOR_GPU" > "$STATE_DIR/prior_gpu"
if [[ -z "$PRIOR_GPU" ]]; then
  log "warning: nvidia-smi listed no GPUs, GPU power caps left untouched"
fi

for g in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
  if [[ -r "$g" ]] && read -r PRIOR_GOV < "$g"; then
    break
  fi
done
printf '%s\n' "$PRIOR_GOV" > "$STATE_DIR/prior_gov"

# --- apply safe state ------------------------------------------------------
cap_gpus

if set_governor "$GOVERNOR_SAFE"; then
  governor_changed=1
  log "cpu governor -> $GOVERNOR_SAFE (prior=$PRIOR_GOV)"
else
  log "warning: cpu governor unchanged, no cpu accepted $GOVERNOR_SAFE"
fi

# --- hold, honoring deadline bumps, then restore ---------------------------
while true; do
  hold_until_deadline
  # Re-check under the guard: a trigger may have bumped the deadline between
  # our last read and now. Restoring while the guard is held also means a
  # new holder can never record half-restored limits as its "prior" state.
  guard_lock
  if (( $(date +%s) >= $(read_deadline) )); then
    restore_state
    rm -f -- "$LOCK"
    engaged=0
    guard_unlock
    break
  fi
  guard_unlock
done

log "disengage"