session-tools/bin/bigdisk-mount-guard

#!/bin/sh
# bigdisk-mount-guard — autorecover tool for the ~/_/bigdisk NFS mount
# (Apple Photos originals + other big storage on black).
#
# The mount is provided via both:
#   - /etc/auto_nfs (autofs static): bigdisk -fstype=nfs,resvport black:/bigdisk
#   - /etc/fstab (hard NFS): 10.0.0.11:/bigdisk /Users/natalie/_/bigdisk nfs resvport,rw,bg,hard,intr,tcp
#
# On sleep, network roam, WG flap or "no WiFi" location switches the handle often
# goes stale. autofs/fstab fails to re-trigger, Photos sees a dead symlink for
# originals and the library gets stuck (permission errors, "Repairing…", repeated
# password prompts for com.apple.library-repair).
#
# This tool autorecovers:
#   1. If storage IP unreachable, attempt to bring the WireGuard mesh (wg1) up.
#   2. If server reachable but mount wedged, force-umount + touch to re-trigger.
#   3. On success, run Photos-specific recovery (clean DB locks, restart daemons,
#      restore canonical symlink if it had been pointed at a local stub).
#
# Designed to run as a root LaunchDaemon (see the contrib plist). Also safe and
# useful when run by hand or from host-monitor/disk-guard style tools.
#
# Does not depend on GNU timeout(1): commands are time-limited with perl's
# alarm instead (perl is assumed to be present, as on stock macOS).
set -u

# Limited CLI so the guard can also be used as a tool: --status, --force, --help.
# Only the first argument is inspected. Both mode flags are initialised here so
# that a STATUS_ONLY/FORCE value inherited from the environment cannot silently
# switch the mode (the main flow reads them with a :-0 default).
STATUS_ONLY=0
FORCE=0
case "${1:-}" in
--help|-h)
	# Quoted delimiter: the text is printed literally, so the log path is
	# spelled out instead of being referenced through a variable.
	cat <<'EOF'
bigdisk-mount-guard [options]

  (no arg)   normal check + autorecover (idempotent, for launchd or cron)
  --status   report only (0=healthy, 1=needs attention), no changes
  --force    force a full recovery cycle (useful after manual WG bring-up)
  --help     this message

Runs as root (needs umount/mount/wg-quick). Logs to /var/log/bigdisk-mount-guard.log.
EOF
	exit 0
	;;
--status)
	STATUS_ONLY=1
	;;
--force)
	FORCE=1
	;;
"") ;;
*)
	# 64 = EX_USAGE
	echo "unknown arg: $1" >&2
	exit 64
	;;
esac


# --- fixed configuration ---

# NFS side: mount point, liveness probe and storage host.
MP='/Users/natalie/_/bigdisk'
# A directory that only exists when the mount is live.
PROBE="$MP/Photos"
# black; 'black' resolves here too, routed via LAN or WG.
SERVER_IP='10.0.0.11'

# WireGuard interface name and the config file handed to wg-quick.
WG_IF='wg1'
WG_CONF='/Users/natalie/.wireguard/wg1.conf'

# Local side: Photos library cleaned up after a remount, and the guard's log file.
PHOTOS_LIB='/Users/natalie/Pictures/Photos Library.photoslibrary'
LOG='/var/log/bigdisk-mount-guard.log'

# log MESSAGE... — append one line "<local timestamp> <message>" to $LOG.
# Best effort: the 2>/dev/null wraps the whole append, so an unwritable $LOG
# (e.g. a run by hand without root) is ignored silently instead of printing a
# shell redirection error. printf keeps backslashes in the message literal.
log() {
	{ printf '%s %s\n' "$(/bin/date '+%Y-%m-%dT%H:%M:%S')" "$*" >> "$LOG"; } 2>/dev/null
}

# notify TITLE MESSAGE — post a macOS notification (sound "Ping") via osascript.
# Best effort: osascript's stderr is discarded and the function always returns 0.
# Both strings are passed to AppleScript as `on run` arguments instead of being
# pasted into the script source, so quotes or backslashes in them cannot break
# (or inject into) the script. No shell globals are written.
# NOTE(review): the previous comment claimed this reaches the console user even
# from a root daemon — that cannot be verified from this file; confirm on host.
notify() {
	/usr/bin/osascript \
		-e 'on run argv' \
		-e 'display notification (item 2 of argv) with title (item 1 of argv) sound name "Ping"' \
		-e 'end run' \
		"$1" "$2" 2>/dev/null || true
}

# run "$@" but kill it after N seconds (first arg); returns the command's status,
# or 124 if it timed out / failed to exec. Uses perl's SIGALRM — always present on macOS.
bounded() {
	_t=$1; shift
	/usr/bin/perl -e 'alarm shift; exec @ARGV or exit 124' "$_t" "$@"
}

# ensure_wg — make sure the WireGuard interface $WG_IF is up.
# Returns 0 when `wg show` already succeeds for it, or when `wg-quick up` on
# $WG_CONF succeeds (followed by a fixed 8 s pause); returns 1 when `wg-quick up`
# fails. wg-quick's output is appended to $LOG.
ensure_wg() {
	# Already up: nothing to do.
	/opt/homebrew/bin/wg show "$WG_IF" >/dev/null 2>&1 && return 0

	log "WG $WG_IF down; attempting wg-quick up to reach $SERVER_IP"

	# Best-effort teardown first; its result is deliberately ignored.
	/opt/homebrew/bin/wg-quick down "$WG_CONF" >/dev/null 2>&1 || true

	if ! /opt/homebrew/bin/wg-quick up "$WG_CONF" >>"$LOG" 2>&1; then
		log "wg-quick up failed"
		return 1
	fi

	log "WG up succeeded; waiting for mesh routes"
	sleep 8
	return 0
}

# recover_photos — Photos-specific cleanup, called after a successful remount.
# No-op (returns 0) when $PHOTOS_LIB is not a directory. Otherwise, best effort:
#   1. delete regular files matching *lock* under $PHOTOS_LIB/database,
#   2. signal user natalie's photoanalysisd / photolibraryd via pkill,
#   3. if $PHOTOS_LIB/originals points at a stub path, re-point it at the
#      originals directory on the mount ($MP),
#   4. log and post a notification.
recover_photos() {
	[ -d "$PHOTOS_LIB" ] || return 0
	log "Photos autorecover cleanup"
	# stale locks often block open/repair after volume events
	find "$PHOTOS_LIB/database" -name '*lock*' -type f -delete 2>/dev/null || true
	# restart the user's analysis daemons so they see the freshly mounted originals;
	# -n: never stop to prompt for a password when this is run by hand
	/usr/bin/sudo -n -u natalie /usr/bin/pkill -x photoanalysisd 2>/dev/null || true
	/usr/bin/sudo -n -u natalie /usr/bin/pkill -f photolibraryd 2>/dev/null || true
	# if a previous troubleshooting step pointed the symlink at a local stub,
	# restore the real one. A case pattern replaces grep 'a\|b', whose \| is not
	# part of POSIX basic regular expressions.
	case "$(readlink "$PHOTOS_LIB/originals" 2>/dev/null)" in
	*Photos-stub*|*stub/originals*)
		# only claim success when ln actually succeeded
		if ln -sfn "${MP}/Photos/Photos Library.photoslibrary/originals" "$PHOTOS_LIB/originals"; then
			log "restored canonical originals symlink"
		else
			log "failed to restore canonical originals symlink"
		fi
		;;
	esac
	log "Photos cleanup done — relaunch Photos (hold Option+Command for Repair if it still whines)"
	notify "bigdisk-mount-guard" "Photos library autorecovered (mount + cleanup)"
}

# --- main ---

# _server_reachable SECS — succeed when $SERVER_IP answers one ping, giving up
# after SECS seconds (ping -t).
_server_reachable() {
	/sbin/ping -c1 -t"$1" "$SERVER_IP" >/dev/null 2>&1
}

# --status: report only, change nothing. Exit 0 = probe reachable, 1 = stale.
if [ "${STATUS_ONLY:-0}" = 1 ]; then
	if bounded 5 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
		echo "OK: bigdisk probe $PROBE reachable"
		exit 0
	fi
	echo "STALE: bigdisk probe $PROBE not reachable"
	if _server_reachable 2; then
		echo "  server $SERVER_IP is up (mount wedged?)"
	else
		echo "  server $SERVER_IP unreachable"
	fi
	exit 1
fi

# Healthy path: probe stats quickly, nothing (more) to do. With --force the
# probe is skipped, so remember why recovery runs and keep the log and the
# notifications truthful (a forced run did not observe a failed probe).
if [ "${FORCE:-0}" = 1 ]; then
	_forced=1
	_why="forced recovery"
else
	_forced=0
	_why="probe failed"
	if bounded 8 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
		exit 0
	fi
fi

# Path not reachable (or forced). Try to make the server reachable first.
# When it stays unreachable the mount is left alone and the run still exits 0.
if ! _server_reachable 3; then
	log "${_why}; ${SERVER_IP} unreachable"
	if ! ensure_wg; then
		log "WG recovery not possible — offline, leaving mount alone"
		notify "bigdisk-mount-guard" "bigdisk offline (server unreachable) — no action taken"
		exit 0
	fi
	# NOTE(review): ensure_wg also returns 0 when the interface was already up,
	# so this notification can fire without anything having been brought up.
	notify "bigdisk-mount-guard" "Brought up WG mesh to reach storage"
	if ! _server_reachable 3; then
		log "${SERVER_IP} still unreachable after WG attempt — leaving mount alone"
		notify "bigdisk-mount-guard" "bigdisk offline (server unreachable after WG) — no action"
		exit 0
	fi
	# fall through to mount recovery now that the mesh is up
fi

# Server reachable (possibly after WG recovery): clear the mount and re-trigger it.
if [ "$_forced" = 1 ]; then
	log "forced recovery — ${SERVER_IP} reachable, force-clearing mount at ${MP}"
	notify "bigdisk-mount-guard" "bigdisk forced recovery — remounting now"
else
	log "probe failed but ${SERVER_IP} reachable — force-clearing wedged mount at ${MP}"
	notify "bigdisk-mount-guard" "bigdisk mount wedged — force recovering now"
fi
bounded 10 /sbin/umount -f "$MP" >/dev/null 2>&1 || true
# Kick automount to refresh static maps (safe/no-op if already good).
bounded 5 /usr/sbin/automount -vc >>"$LOG" 2>&1 || true
# Touch the path to (re)trigger autofs (or fstab bg reconnect).
bounded 15 /bin/ls "$MP" >/dev/null 2>&1 || true
sleep 3

if bounded 10 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
	log "remount OK"
	notify "bigdisk-mount-guard" "bigdisk mount recovered successfully"
	recover_photos
	exit 0
fi

log "remount still failing after force-clear"
notify "bigdisk-mount-guard" "bigdisk remount still failing (check logs)"
exit 1
feat(session-tools): disk-guard/bigdisk-mount-guard tools + wa console/lookup Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> 2026-06-29 14:35:20 -04:00			`#!/bin/sh`
			`# bigdisk-mount-guard — autorecover tool for the ~/_/bigdisk NFS mount`
			`# (Apple Photos originals + other big storage on black).`
			`#`
			`# The mount is provided both via:`
			`# - /etc/auto_nfs (autofs static): bigdisk -fstype=nfs,resvport black:/bigdisk`
			`# - /etc/fstab (hard NFS): 10.0.0.11:/bigdisk /Users/natalie/_/bigdisk nfs resvport,rw,bg,hard,intr,tcp`
			`#`
			`# On sleep, network roam, WG flap or "no WiFi" location switches the handle often`
			`# goes stale. autofs/fstab fails to re-trigger, Photos sees a dead symlink for`
			`# originals and the library gets stuck (permission errors, "Repairing…", repeated`
			`# password prompts for com.apple.library-repair).`
			`#`
			`# This tool autorecovers:`
			`# 1. If storage IP unreachable, attempt to bring the WireGuard mesh (wg1) up.`
			`# 2. If server reachable but mount wedged, force-umount + touch to re-trigger.`
			`# 3. On success, run Photos-specific recovery (clean DB locks, restart daemons,`
			`# restore canonical symlink if it had been pointed at a local stub).`
			`#`
			`# Designed to run as root LaunchDaemon (contrib plist). Also safe/useful by hand`
			`# or from host-monitor/disk-guard style tools.`
			`#`
			`# No GNU timeout (uses perl alarm, always on macOS).`
			`set -u`

			`# Support limited CLI for use as a tool: --status, --force, --help`
			`case "${1:-}" in`
			`--help\|-h)`
			`cat <<'EOF'`
			`bigdisk-mount-guard [options]`

			`(no arg) normal check + autorecover (idempotent, for launchd or cron)`
			`--status report only (0=healthy, 1=needs attention), no changes`
			`--force force a full recovery cycle (useful after manual WG bring-up)`
			`--help this message`

			`Runs as root (needs umount/mount/wg-quick). Logs to $LOG.`
			`EOF`
			`exit 0`
			`;;`
			`--status)`
			`STATUS_ONLY=1`
			`;;`
			`--force)`
			`FORCE=1`
			`;;`
			`"") ;;`
			`*)`
			`echo "unknown arg: $1" >&2`
			`exit 64`
			`;;`
			`esac`


			`MP="/Users/natalie/_/bigdisk"`
			`PROBE="${MP}/Photos" # a dir that only exists when the mount is live`
			`SERVER_IP="10.0.0.11" # black; 'black' resolves here too, routed via LAN or WG`
			`LOG="/var/log/bigdisk-mount-guard.log"`
			`WG_IF="wg1"`
			`WG_CONF="/Users/natalie/.wireguard/wg1.conf"`
			`PHOTOS_LIB="/Users/natalie/Pictures/Photos Library.photoslibrary"`

			`log() { echo "$(/bin/date '+%Y-%m-%dT%H:%M:%S') $*" >> "$LOG" 2>/dev/null; }`

			`# macOS notification (works from root daemon for the console user)`
			`notify() {`
			`title="$1"`
			`msg="$2"`
			`/usr/bin/osascript -e "display notification \"$msg\" with title \"$title\" sound name \"Ping\"" 2>/dev/null \|\| true`
			`}`

			`# run "$@" but kill it after N seconds (first arg); returns the command's status,`
			`# or 124 if it timed out / failed to exec. Uses perl's SIGALRM — always present on macOS.`
			`bounded() {`
			`_t=$1; shift`
			`/usr/bin/perl -e 'alarm shift; exec @ARGV or exit 124' "$_t" "$@"`
			`}`

			`ensure_wg() {`
			`if /opt/homebrew/bin/wg show "$WG_IF" >/dev/null 2>&1; then`
			`return 0`
			`fi`
			`log "WG $WG_IF down; attempting wg-quick up to reach $SERVER_IP"`
			`/opt/homebrew/bin/wg-quick down "$WG_CONF" >/dev/null 2>&1 \|\| true`
			`if /opt/homebrew/bin/wg-quick up "$WG_CONF" >>"$LOG" 2>&1; then`
			`log "WG up succeeded; waiting for mesh routes"`
			`sleep 8`
			`return 0`
			`else`
			`log "wg-quick up failed"`
			`return 1`
			`fi`
			`}`

			`recover_photos() {`
			`[ -d "$PHOTOS_LIB" ] \|\| return 0`
			`log "Photos autorecover cleanup"`
			`# stale locks often block open/repair after volume events`
			`find "$PHOTOS_LIB/database" -name 'lock' -type f -delete 2>/dev/null \|\| true`
			`# restart the user's analysis daemons so they see the freshly mounted originals`
			`/usr/bin/sudo -u natalie /usr/bin/pkill -x photoanalysisd 2>/dev/null \|\| true`
			`/usr/bin/sudo -u natalie /usr/bin/pkill -f photolibraryd 2>/dev/null \|\| true`
			`# if a previous troubleshooting step pointed the symlink at a local stub, restore the real one`
			`cur=$(readlink "$PHOTOS_LIB/originals" 2>/dev/null \|\| true)`
			`if echo "$cur" \| grep -q 'Photos-stub\\|stub/originals'; then`
			`ln -sfn "${MP}/Photos/Photos Library.photoslibrary/originals" "$PHOTOS_LIB/originals"`
			`log "restored canonical originals symlink"`
			`fi`
			`log "Photos cleanup done — relaunch Photos (hold Option+Command for Repair if it still whines)"`
			`notify "bigdisk-mount-guard" "Photos library autorecovered (mount + cleanup)"`
			`}`

			`# --- main ---`

			`if [ "${STATUS_ONLY:-0}" = 1 ]; then`
			`if bounded 5 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then`
			`echo "OK: bigdisk probe $PROBE reachable"`
			`exit 0`
			`else`
			`echo "STALE: bigdisk probe $PROBE not reachable"`
			`if /sbin/ping -c1 -t2 "$SERVER_IP" >/dev/null 2>&1; then`
			`echo " server $SERVER_IP is up (mount wedged?)"`
			`else`
			`echo " server $SERVER_IP unreachable"`
			`fi`
			`exit 1`
			`fi`
			`fi`

			`# Healthy path: probe stats quickly. Nothing (more) to do.`
			`if [ "${FORCE:-0}" != 1 ] && bounded 8 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then`
			`exit 0`
			`fi`

			`# Path not reachable (or forced). Try to make the server reachable first.`
			`if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then`
			`log "probe failed; ${SERVER_IP} unreachable"`
			`if ensure_wg; then`
			`notify "bigdisk-mount-guard" "Brought up WG mesh to reach storage"`
			`if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then`
			`log "${SERVER_IP} still unreachable after WG attempt — leaving mount alone"`
			`notify "bigdisk-mount-guard" "bigdisk offline (server unreachable after WG) — no action"`
			`exit 0`
			`fi`
			`# fallthrough to mount recovery now that mesh is up`
			`else`
			`log "WG recovery not possible — offline, leaving mount alone"`
			`notify "bigdisk-mount-guard" "bigdisk offline (server unreachable) — no action taken"`
			`exit 0`
			`fi`
			`fi`

			`# Server reachable (possibly after WG recovery) but probe failed → wedged/stale mount.`
			`log "probe failed but ${SERVER_IP} reachable — force-clearing wedged mount at ${MP}"`
			`notify "bigdisk-mount-guard" "bigdisk mount wedged — force recovering now"`
			`bounded 10 /sbin/umount -f "$MP" >/dev/null 2>&1 \|\| true`
			`# Kick automount to refresh static maps (safe/no-op if already good).`
			`bounded 5 /usr/sbin/automount -vc >>"$LOG" 2>&1 \|\| true`
			`# Touch the path to (re)trigger autofs (or fstab bg reconnect).`
			`bounded 15 /bin/ls "$MP" >/dev/null 2>&1 \|\| true`
			`sleep 3`

			`if bounded 10 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then`
			`log "remount OK"`
			`notify "bigdisk-mount-guard" "bigdisk mount recovered successfully"`
			`recover_photos`
			`exit 0`
			`fi`

			`log "remount still failing after force-clear"`
			`notify "bigdisk-mount-guard" "bigdisk remount still failing (check logs)"`
			`exit 1`