apricot-health/scripts/apricot-crash-logger
Natalie dafbabee41 feat(@packages/apricot-health): add power-fault monitoring and mitigation tools
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-17 23:18:47 -07:00

83 lines
2.9 KiB
Bash
Executable file

#!/usr/bin/env bash
# Continuously appends power/thermal/voltage state to $LOG so that the last
# fractions of a second before a hard reset survive the crash.
#
# Env overrides:
#   LOG               output path (default ~/apricot-crash.log)
#   INTERVAL          sample period in seconds (default 0.1 = 10 Hz)
#   GPU_SAMPLE_EVERY  query nvidia-smi only on every Nth sample (default 10)
#   SENSOR_CHIPS      regex of hwmon name(s) to capture (default k10temp|nvme|it8628|nct6.*|w83.*)
set -o pipefail

# Tunables: a non-empty value inherited from the environment wins; otherwise
# the built-in default after ":=" is assigned.
: "${LOG:=${HOME}/apricot-crash.log}"
: "${INTERVAL:=0.1}"
# nvidia-smi is slow to start, so it is only invoked on every Nth iteration.
: "${GPU_SAMPLE_EVERY:=10}"
: "${SENSOR_CHIPS:=k10temp|nvme|it8628|nct6.*|w83.*}"

# Append a banner recording when this run began and the settings in effect.
{
  printf '=== session start %s (pid=%s interval=%ss gpu_every=%s chips=%s) ===\n' \
    "$(date --iso-8601=ns)" "$$" "$INTERVAL" "$GPU_SAMPLE_EVERY" "$SENSOR_CHIPS"
} >> "$LOG"
# Cache of hwmon directories whose chip name matches $SENSOR_CHIPS. The list
# is resolved up front and only re-resolved every few seconds by the sampling
# loop, which is cheaper than globbing and regex-matching on every sample.
declare -a HWMONS

#######################################
# Rebuild HWMONS from the hwmon directories currently present in sysfs.
# Globals:
#   SENSOR_CHIPS (read)    - regex alternation; it is anchored with ^( )$, so
#                            an alternative must match the whole chip name
#   HWMONS       (written) - emptied, then filled with matching hwmon dirs
# Arguments: none
#######################################
refresh_hwmons() {
  # Keep the scan variables local so a call never clobbers the caller's own
  # "h" / "name" (the sampling loop uses the same names).
  local h name
  HWMONS=()
  for h in /sys/class/hwmon/hwmon*; do
    # An unmatched glob is left as the literal pattern; -d discards it.
    [ -d "$h" ] || continue
    [ -r "$h/name" ] || continue
    name=$(<"$h/name") # $(<file) is read by bash itself, no external cat
    [[ "$name" =~ ^(${SENSOR_CHIPS})$ ]] || continue
    HWMONS+=("$h")
  done
}
refresh_hwmons
last_refresh=$SECONDS
last_sync=$SECONDS
iter=0

# Sampling loop. Everything printed inside it is appended to $LOG through the
# redirection on "done"; every record starts with the iteration's timestamp.
# Runs until the process is killed (or the machine resets).
while :; do
  ts=$(date --iso-8601=ns)

  # GPU telemetry — skip most iterations because nvidia-smi startup is
  # ~300-500ms, which would cap the loop at ~2 Hz otherwise.
  # The "> 0" guard short-circuits the modulo, so GPU_SAMPLE_EVERY=0 quietly
  # disables GPU sampling instead of raising a division-by-zero error on
  # every iteration.
  if (( GPU_SAMPLE_EVERY > 0 && iter % GPU_SAMPLE_EVERY == 0 )); then
    while IFS= read -r gpu_line; do
      printf '%s gpu %s\n' "$ts" "$gpu_line"
    done < <(nvidia-smi \
      --query-gpu=index,temperature.gpu,power.draw,clocks.gr,clocks.mem,pstate,utilization.gpu,memory.used \
      --format=csv,noheader,nounits 2>/dev/null)
  fi
  iter=$(( iter + 1 ))

  # Platform sensors — use $(<file) bash builtin everywhere to avoid
  # fork+exec per-read. With ~60 sensor files that's the difference
  # between ~600ms per iteration and <20ms.
  for h in "${HWMONS[@]}"; do
    # The chip may have disappeared since the last refresh; skip it if so.
    [ -r "$h/name" ] || continue
    name=$(<"$h/name")
    hb=${h##*/} # hwmonN directory name, distinguishes chips sharing a name
    for inp in "$h"/temp*_input "$h"/in*_input "$h"/fan*_input "$h"/curr*_input; do
      # Also filters out glob patterns that matched nothing.
      [ -r "$inp" ] || continue
      n=${inp##*/}
      n=${n%_input}
      # Prefer the driver-provided label; fall back to the channel name.
      label_file="$h/${n}_label"
      if [ -r "$label_file" ]; then
        label=$(<"$label_file")
      else
        label="$n"
      fi
      raw=$(<"$inp")
      printf '%s sensor %s/%s %s=%s\n' "$ts" "$name" "$hb" "$label" "$raw"
    done
  done

  # Refresh hwmon list every ~5s in case modules load/unload.
  if (( SECONDS - last_refresh >= 5 )); then
    refresh_hwmons
    last_refresh=$SECONDS
  fi

  # Flush the log to disk at most once per elapsed second. Keyed on SECONDS
  # rather than on a digit of the timestamp, so it fires at the same rate
  # whatever INTERVAL is and however much the loop period jitters.
  # Best-effort: a failing or missing sync must not stop the logger.
  if (( SECONDS != last_sync )); then
    sync "$LOG" 2>/dev/null || true
    last_sync=$SECONDS
  fi

  sleep "$INTERVAL"
done >> "$LOG"