session-tools/bin/bigdisk-mount-guard

173 lines
6.1 KiB
Text
Raw Permalink Normal View History

#!/bin/sh
# bigdisk-mount-guard — autorecover tool for the ~/_/bigdisk NFS mount
# (Apple Photos originals + other big storage on black).
#
# The mount is provided via both:
# - /etc/auto_nfs (autofs static): bigdisk -fstype=nfs,resvport black:/bigdisk
# - /etc/fstab (hard NFS): 10.0.0.11:/bigdisk /Users/natalie/_/bigdisk nfs resvport,rw,bg,hard,intr,tcp
#
# On sleep, network roam, WG flap or "no WiFi" location switches the handle often
# goes stale. autofs/fstab fails to re-trigger, Photos sees a dead symlink for
# originals and the library gets stuck (permission errors, "Repairing…", repeated
# password prompts for com.apple.library-repair).
#
# This tool autorecovers:
# 1. If storage IP unreachable, attempt to bring the WireGuard mesh (wg1) up.
# 2. If server reachable but mount wedged, force-umount + touch to re-trigger.
# 3. On success, run Photos-specific recovery (clean DB locks, restart daemons,
# restore canonical symlink if it had been pointed at a local stub).
#
# Designed to run as root LaunchDaemon (contrib plist). Also safe/useful by hand
# or from host-monitor/disk-guard style tools.
#
# Does not depend on GNU timeout: time limits use perl's alarm(), and perl is
# always present on macOS.
set -u
# Mode flags. Initialised explicitly so that a stray STATUS_ONLY / FORCE
# exported in the caller's environment cannot silently change what the guard
# does (e.g. FORCE=1 would force-unmount on every run).
STATUS_ONLY=0
FORCE=0
# Limited CLI for use as a tool: --status, --force, --help.
# Only the first argument is inspected. Unknown arguments exit with 64.
case "${1:-}" in
  --help|-h)
    # The delimiter is quoted, so the text is printed verbatim with no
    # expansion. The log path is therefore spelled out here; keep it in sync
    # with the LOG setting used by the rest of the script.
    cat <<'EOF'
bigdisk-mount-guard [options]
(no arg) normal check + autorecover (idempotent, for launchd or cron)
--status report only (0=healthy, 1=needs attention), no changes
--force force a full recovery cycle (useful after manual WG bring-up)
--help this message
Runs as root (needs umount/mount/wg-quick). Logs to /var/log/bigdisk-mount-guard.log.
EOF
    exit 0
    ;;
  --status)
    STATUS_ONLY=1
    ;;
  --force)
    FORCE=1
    ;;
  "")
    # No argument: normal check + autorecover.
    ;;
  *)
    echo "unknown arg: $1" >&2
    exit 64
    ;;
esac
# --- configuration ---
# Storage server and the WireGuard interface/config used to reach it off-LAN.
SERVER_IP="10.0.0.11" # black; 'black' resolves here too, routed via LAN or WG
WG_IF="wg1"
WG_CONF="/Users/natalie/.wireguard/wg1.conf"
# NFS mount point and a directory under it that only exists when the mount is live.
MP="/Users/natalie/_/bigdisk"
PROBE="$MP/Photos"
# Photos library whose originals live on the mount.
PHOTOS_LIB="/Users/natalie/Pictures/Photos Library.photoslibrary"
# Log file appended to by log() and by some recovery commands.
LOG="/var/log/bigdisk-mount-guard.log"
# log MESSAGE...
# Append one timestamped line (local time, YYYY-MM-DDTHH:MM:SS) to $LOG.
# Best effort: if $LOG cannot be opened (e.g. when run by hand without root)
# the message is dropped silently. The 2>/dev/null is attached to the brace
# group so that it is already in effect when the >> redirection is attempted;
# placed after >> on the same command it would not hide the open error.
# printf is used instead of echo so backslashes in the message are written
# literally regardless of the shell's echo flavour.
log() {
  { printf '%s %s\n' "$(/bin/date '+%Y-%m-%dT%H:%M:%S')" "$*" >> "$LOG"; } 2>/dev/null
}
# notify TITLE MESSAGE
# Post a macOS notification via osascript (the original author notes this
# works from a root daemon for the console user). Best effort: any osascript
# failure is ignored and the function always returns 0.
#
# TITLE and MESSAGE are handed to the script as arguments of its `run` handler
# instead of being spliced into the AppleScript source, so quotes or
# backslashes in either string cannot break (or alter) the script.
notify() {
  title="$1"
  msg="$2"
  /usr/bin/osascript \
    -e 'on run argv' \
    -e 'display notification (item 2 of argv) with title (item 1 of argv) sound name "Ping"' \
    -e 'end run' \
    "$title" "$msg" 2>/dev/null || true
}
# bounded SECONDS CMD [ARG...]
# Run CMD with a time limit. perl arms alarm(SECONDS) and then exec()s CMD, so
# the pending alarm carries over into CMD and SIGALRM terminates it when the
# limit expires (unless CMD itself catches or ignores SIGALRM).
# Returns:
#   CMD's own exit status when it finishes in time,
#   124 when CMD was killed by the alarm, or when CMD could not be exec'd.
# Uses perl's alarm — always present on macOS — rather than GNU timeout.
bounded() {
  _t=$1; shift
  # Indirect-object exec: CMD is always run directly, never re-parsed by a
  # shell, even when it is a single word.
  /usr/bin/perl -e 'alarm shift; exec { $ARGV[0] } @ARGV or exit 124' "$_t" "$@"
  _rc=$?
  # A process killed by SIGALRM (signal 14) is reported by the shell as
  # 128 + 14 = 142; fold that into the documented timeout status.
  if [ "$_rc" -eq 142 ]; then
    return 124
  fi
  return "$_rc"
}
# ensure_wg
# Make sure the WireGuard tunnel configured in $WG_CONF is up.
# Returns 0 if the tunnel is already up or `wg-quick up` succeeded, 1 if
# `wg-quick up` failed. wg-quick output is appended to $LOG.
ensure_wg() {
  # On macOS wg-quick creates a utunN device and records the real name in
  # /var/run/wireguard/<name>.name; wg(8) only answers for that real name.
  # Resolve it first so an already-running tunnel is detected instead of being
  # torn down and restarted. Falls back to $WG_IF when no name file exists.
  _wg_real="$WG_IF"
  _wg_namefile="/var/run/wireguard/${WG_IF}.name"
  if [ -r "$_wg_namefile" ]; then
    IFS= read -r _wg_real < "$_wg_namefile" || true
    [ -n "$_wg_real" ] || _wg_real="$WG_IF"
  fi
  if /opt/homebrew/bin/wg show "$_wg_real" >/dev/null 2>&1; then
    return 0
  fi
  log "WG $WG_IF down; attempting wg-quick up to reach $SERVER_IP"
  # Clear any half-configured leftovers first; failure here is expected when
  # the tunnel is simply not running.
  /opt/homebrew/bin/wg-quick down "$WG_CONF" >/dev/null 2>&1 || true
  if /opt/homebrew/bin/wg-quick up "$WG_CONF" >>"$LOG" 2>&1; then
    log "WG up succeeded; waiting for mesh routes"
    # Give the mesh a moment before the caller pings again.
    sleep 8
    return 0
  else
    log "wg-quick up failed"
    return 1
  fi
}
# recover_photos
# Photos-specific cleanup, run after the mount has been recovered.
# No-op (returns 0) when $PHOTOS_LIB is not a directory. Otherwise:
#   1. deletes regular files matching *lock* under $PHOTOS_LIB/database,
#   2. signals natalie's photoanalysisd / photolibraryd processes via pkill,
#   3. if the library's `originals` symlink points at a stub location,
#      re-points it at the copy under $MP,
#   4. logs and posts a notification.
# Every step is best effort; the function does not fail on individual errors.
recover_photos() {
  [ -d "$PHOTOS_LIB" ] || return 0
  log "Photos autorecover cleanup"
  # stale locks often block open/repair after volume events
  find "$PHOTOS_LIB/database" -name '*lock*' -type f -delete 2>/dev/null || true
  # restart the user's analysis daemons so they see the freshly mounted originals
  # (-n: never stop to ask for a password; this normally runs unattended)
  /usr/bin/sudo -n -u natalie /usr/bin/pkill -x photoanalysisd 2>/dev/null || true
  /usr/bin/sudo -n -u natalie /usr/bin/pkill -f photolibraryd 2>/dev/null || true
  # if a previous troubleshooting step pointed the symlink at a local stub, restore the real one
  cur=$(readlink "$PHOTOS_LIB/originals" 2>/dev/null || true)
  # Shell pattern match instead of grep 'a\|b': \| in a basic regex is a
  # non-POSIX extension and is not guaranteed to work with every grep.
  case "$cur" in
    *Photos-stub*|*stub/originals*)
      if ln -sfn "${MP}/Photos/Photos Library.photoslibrary/originals" "$PHOTOS_LIB/originals"; then
        log "restored canonical originals symlink"
      else
        log "failed to restore canonical originals symlink"
      fi
      ;;
  esac
  log "Photos cleanup done — relaunch Photos (hold Option+Command for Repair if it still whines)"
  notify "bigdisk-mount-guard" "Photos library autorecovered (mount + cleanup)"
}
# --- main ---
# --status: report only, change nothing. Exit 0 when the probe directory can
# be stat'ed within 5 seconds, otherwise print a diagnosis and exit 1.
case "${STATUS_ONLY:-0}" in
  1)
    if ! bounded 5 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
      echo "STALE: bigdisk probe $PROBE not reachable"
      # One quick ping tells a wedged mount apart from an unreachable server.
      if /sbin/ping -c1 -t2 "$SERVER_IP" >/dev/null 2>&1; then
        echo " server $SERVER_IP is up (mount wedged?)"
      else
        echo " server $SERVER_IP unreachable"
      fi
      exit 1
    fi
    echo "OK: bigdisk probe $PROBE reachable"
    exit 0
    ;;
esac
# Fast path: unless a recovery was forced, a probe that stats within 8 seconds
# means the mount is healthy and there is nothing to do.
if [ "${FORCE:-0}" != 1 ]; then
  if bounded 8 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
    exit 0
  fi
fi
# Probe failed (or recovery was forced). Before touching the mount, make sure
# the storage server answers a ping; if it does not, try the WireGuard mesh.
# Whenever the server stays unreachable the mount is left alone and we exit 0.
if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then
  log "probe failed; ${SERVER_IP} unreachable"
  if ! ensure_wg; then
    log "WG recovery not possible — offline, leaving mount alone"
    notify "bigdisk-mount-guard" "bigdisk offline (server unreachable) — no action taken"
    exit 0
  fi
  notify "bigdisk-mount-guard" "Brought up WG mesh to reach storage"
  if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then
    log "${SERVER_IP} still unreachable after WG attempt — leaving mount alone"
    notify "bigdisk-mount-guard" "bigdisk offline (server unreachable after WG) — no action"
    exit 0
  fi
  # Mesh is up and the server answers: continue with mount recovery below.
fi
# The server answers (possibly only after WG recovery) but the probe did not:
# treat the mount as wedged/stale, clear it and let it re-establish.
log "probe failed but ${SERVER_IP} reachable — force-clearing wedged mount at ${MP}"
notify "bigdisk-mount-guard" "bigdisk mount wedged — force recovering now"
# Each step is time-bounded and best effort; failures are ignored here and
# judged by the final probe below.
bounded 10 /sbin/umount -f "$MP" >/dev/null 2>&1 || :
# Ask automount to refresh its static maps (safe/no-op if already good).
bounded 5 /usr/sbin/automount -vc >>"$LOG" 2>&1 || :
# Access the path to (re)trigger autofs (or the fstab bg reconnect).
bounded 15 /bin/ls "$MP" >/dev/null 2>&1 || :
sleep 3
# Verdict: the probe directory must be reachable again.
if ! bounded 10 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
  log "remount still failing after force-clear"
  notify "bigdisk-mount-guard" "bigdisk remount still failing (check logs)"
  exit 1
fi
log "remount OK"
notify "bigdisk-mount-guard" "bigdisk mount recovered successfully"
recover_photos
exit 0