173 lines
6.1 KiB
Text
173 lines
6.1 KiB
Text
|
|
#!/bin/sh
|
||
|
|
# bigdisk-mount-guard — autorecover tool for the ~/_/bigdisk NFS mount
# (Apple Photos originals + other big storage on black).
#
# The mount is provided via both:
#   - /etc/auto_nfs (autofs static): bigdisk -fstype=nfs,resvport black:/bigdisk
#   - /etc/fstab (hard NFS): 10.0.0.11:/bigdisk /Users/natalie/_/bigdisk nfs resvport,rw,bg,hard,intr,tcp
#
# On sleep, network roam, WG flap or "no WiFi" location switches, the handle often
# goes stale. autofs/fstab then fails to re-trigger, Photos sees a dead symlink for
# originals, and the library gets stuck (permission errors, "Repairing…", repeated
# password prompts for com.apple.library-repair).
#
# This tool autorecovers:
#   1. If the storage IP is unreachable, attempt to bring the WireGuard mesh (wg1) up.
#   2. If the server is reachable but the mount is wedged, force-umount + touch to re-trigger.
#   3. On success, run Photos-specific recovery (clean DB locks, restart daemons,
#      restore the canonical symlink if it had been pointed at a local stub).
#
# Designed to run as a root LaunchDaemon (contrib plist). Also safe/useful by hand
# or from host-monitor/disk-guard style tools.
#
# Does not depend on GNU timeout: time limits use perl's alarm instead
# (perl is always present on macOS).
|
||
|
|
set -u

# --- configuration ---------------------------------------------------------
# Defined before CLI parsing so that --help can report the real log path.
MP="/Users/natalie/_/bigdisk"
PROBE="${MP}/Photos"        # a dir that only exists when the mount is live
SERVER_IP="10.0.0.11"       # black; 'black' resolves here too, routed via LAN or WG
LOG="/var/log/bigdisk-mount-guard.log"
WG_IF="wg1"
WG_CONF="/Users/natalie/.wireguard/wg1.conf"
PHOTOS_LIB="/Users/natalie/Pictures/Photos Library.photoslibrary"

# Print the help text on stdout. The heredoc delimiter is intentionally
# unquoted so that $LOG expands to the actual log file path.
usage() {
  cat <<EOF
bigdisk-mount-guard [options]

(no arg) normal check + autorecover (idempotent, for launchd or cron)
--status report only (0=healthy, 1=needs attention), no changes
--force force a full recovery cycle (useful after manual WG bring-up)
--help this message

Runs as root (needs umount/mount/wg-quick). Logs to $LOG.
EOF
}

# Support limited CLI for use as a tool: --status, --force, --help.
# Only the first argument is inspected. STATUS_ONLY / FORCE are left unset
# unless requested; later code reads them with a ":-0" default.
case "${1:-}" in
  --help|-h)
    usage
    exit 0
    ;;
  --status)
    STATUS_ONLY=1
    ;;
  --force)
    FORCE=1
    ;;
  "")
    ;;
  *)
    echo "unknown arg: $1" >&2
    exit 64
    ;;
esac
|
||
|
|
|
||
|
|
# Append one timestamped line (ISO-like local time, then all arguments joined
# by spaces) to $LOG. Best effort: if $LOG cannot be opened the message is
# dropped silently. stderr is redirected around the whole group because
# redirections are applied left to right — with ">> file 2>/dev/null" a failed
# open of the file is reported before stderr has been silenced.
log() {
  { printf '%s %s\n' "$(/bin/date '+%Y-%m-%dT%H:%M:%S')" "$*" >> "$LOG"; } 2>/dev/null
}
|
||
|
|
|
||
|
|
# Post a macOS user notification with the "Ping" sound.
#   $1 - notification title
#   $2 - notification message
# Title and message are handed to AppleScript as run-handler arguments rather
# than spliced into the script text, so quotes or backslashes in them cannot
# break (or inject into) the script. Always returns 0: a failed or missing
# osascript is ignored and its stderr is discarded.
# NOTE(review): whether a notification posted from a root LaunchDaemon reaches
# the console user's Notification Center should be confirmed on the target
# macOS version.
notify() {
  /usr/bin/osascript \
    -e 'on run argv' \
    -e 'display notification (item 2 of argv) with title (item 1 of argv) sound name "Ping"' \
    -e 'end run' \
    "$1" "$2" 2>/dev/null || true
}
|
||
|
|
|
||
|
|
# run "$@" but kill it after N seconds (first arg); returns the command's status,
|
||
|
|
# or 124 if it timed out / failed to exec. Uses perl's SIGALRM — always present on macOS.
|
||
|
|
bounded() {
|
||
|
|
_t=$1; shift
|
||
|
|
/usr/bin/perl -e 'alarm shift; exec @ARGV or exit 124' "$_t" "$@"
|
||
|
|
}
|
||
|
|
|
||
|
|
# Make sure the WireGuard interface $WG_IF is up.
# Returns 0 when `wg show` already succeeds for the interface, or when
# `wg-quick up` succeeds (after a fixed 8 s pause for routes to settle);
# returns 1 when `wg-quick up` fails. wg-quick output is appended to $LOG.
# NOTE(review): on macOS wg-quick backs the tunnel with a utunN device —
# confirm that `wg show "$WG_IF"` resolves the logical name; if it does not,
# this check always reports "down" and the tunnel is bounced on every call.
ensure_wg() {
  # Already up: nothing to do.
  /opt/homebrew/bin/wg show "$WG_IF" >/dev/null 2>&1 && return 0

  log "WG $WG_IF down; attempting wg-quick up to reach $SERVER_IP"

  # Tear down any leftover state first; failure here is expected and ignored.
  /opt/homebrew/bin/wg-quick down "$WG_CONF" >/dev/null 2>&1 || true

  if ! /opt/homebrew/bin/wg-quick up "$WG_CONF" >>"$LOG" 2>&1; then
    log "wg-quick up failed"
    return 1
  fi

  log "WG up succeeded; waiting for mesh routes"
  sleep 8
  return 0
}
|
||
|
|
|
||
|
|
# Photos-specific cleanup, run after the mount has been recovered.
# No-op (returns 0) when $PHOTOS_LIB is not a directory. Otherwise:
#   1. deletes regular files matching '*lock*' under $PHOTOS_LIB/database,
#   2. asks the user's photoanalysisd / photolibraryd to exit (exact process
#      name match),
#   3. if the library's "originals" symlink points at a stub location, points
#      it back at the copy on the mount,
#   4. logs the outcome and posts a notification.
# Every step is best effort; failures are ignored or logged, never fatal.
recover_photos() {
  [ -d "$PHOTOS_LIB" ] || return 0
  log "Photos autorecover cleanup"

  # Stale locks often block open/repair after volume events.
  find "$PHOTOS_LIB/database" -name '*lock*' -type f -delete 2>/dev/null || true

  # Restart the user's daemons so they see the freshly mounted originals.
  # -x matches the exact process name; -f would also hit any unrelated process
  # whose command line merely mentions the daemon. sudo -n never prompts, so a
  # manual non-root run cannot hang waiting for a password.
  /usr/bin/sudo -n -u natalie /usr/bin/pkill -x photoanalysisd 2>/dev/null || true
  /usr/bin/sudo -n -u natalie /usr/bin/pkill -x photolibraryd 2>/dev/null || true

  # If a previous troubleshooting step pointed the symlink at a local stub,
  # restore the real one. `case` globbing replaces grep's non-POSIX '\|'.
  _photos_link=$(readlink "$PHOTOS_LIB/originals" 2>/dev/null || true)
  case "$_photos_link" in
    *Photos-stub*|*stub/originals*)
      if ln -sfn "${MP}/Photos/Photos Library.photoslibrary/originals" "$PHOTOS_LIB/originals" 2>/dev/null; then
        log "restored canonical originals symlink"
      else
        log "failed to restore canonical originals symlink"
      fi
      ;;
  esac

  log "Photos cleanup done — relaunch Photos (hold Option+Command for Repair if it still whines)"
  notify "bigdisk-mount-guard" "Photos library autorecovered (mount + cleanup)"
}
|
||
|
|
|
||
|
|
# --- main ---

# --status: report on stdout and exit without changing anything.
# Exit 0 when the probe path can be stat'ed within 5 s, 1 otherwise.
if [ "${STATUS_ONLY:-0}" = 1 ]; then
  if ! bounded 5 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
    echo "STALE: bigdisk probe $PROBE not reachable"
    # Tell a wedged mount apart from an unreachable server.
    if ! /sbin/ping -c1 -t2 "$SERVER_IP" >/dev/null 2>&1; then
      echo " server $SERVER_IP unreachable"
    else
      echo " server $SERVER_IP is up (mount wedged?)"
    fi
    exit 1
  fi
  echo "OK: bigdisk probe $PROBE reachable"
  exit 0
fi
|
||
|
|
|
||
|
|
# Fast path: unless --force was given, a probe that stats within 8 s means
# the mount is healthy and there is nothing to do.
if [ "${FORCE:-0}" != 1 ]; then
  bounded 8 /usr/bin/stat "$PROBE" >/dev/null 2>&1 && exit 0
fi
|
||
|
|
|
||
|
|
# Path not reachable (or forced). Try to make the server reachable first.
# If a single ping fails, try to bring the WireGuard mesh up and ping again.
# When the server stays unreachable the mount is deliberately left alone and
# the script exits 0 (being offline is not treated as an error).
if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then
  log "probe failed; ${SERVER_IP} unreachable"

  if ! ensure_wg; then
    log "WG recovery not possible — offline, leaving mount alone"
    notify "bigdisk-mount-guard" "bigdisk offline (server unreachable) — no action taken"
    exit 0
  fi

  if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then
    log "${SERVER_IP} still unreachable after WG attempt — leaving mount alone"
    notify "bigdisk-mount-guard" "bigdisk offline (server unreachable after WG) — no action"
    exit 0
  fi

  # Announce success only once the server actually answers, so the user never
  # gets "brought up WG" immediately followed by "bigdisk offline".
  notify "bigdisk-mount-guard" "Brought up WG mesh to reach storage"
  # fall through to mount recovery now that the mesh is up
fi
|
||
|
|
|
||
|
|
# Server reachable (possibly after WG recovery) but the probe failed (or
# --force was given): treat the mount as wedged/stale and clear it.
log "probe failed but ${SERVER_IP} reachable — force-clearing wedged mount at ${MP}"
notify "bigdisk-mount-guard" "bigdisk mount wedged — force recovering now"

# Each step is time-bounded and best effort; the final probe decides the result.
# 1. Drop the stale mount.
bounded 10 /sbin/umount -f "$MP" >/dev/null 2>&1 || true
# 2. Kick automount to refresh static maps (safe/no-op if already good).
bounded 5 /usr/sbin/automount -vc >>"$LOG" 2>&1 || true
# 3. Touch the path to (re)trigger autofs (or fstab bg reconnect).
bounded 15 /bin/ls "$MP" >/dev/null 2>&1 || true
sleep 3

# Verify: the probe directory must stat within 10 s.
if ! bounded 10 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
  log "remount still failing after force-clear"
  notify "bigdisk-mount-guard" "bigdisk remount still failing (check logs)"
  exit 1
fi

log "remount OK"
notify "bigdisk-mount-guard" "bigdisk mount recovered successfully"
recover_photos
exit 0
|