diff --git a/scripts/deploy-agent.sh b/scripts/deploy-agent.sh
index 9b4b991..f955e57 100755
--- a/scripts/deploy-agent.sh
+++ b/scripts/deploy-agent.sh
@@ -32,31 +32,54 @@ PY
 )"
 say "plum peer URL = $PLUM_URL"
 
-say "[$HOST] reachability + clock"
-ssh -o ConnectTimeout=8 -o BatchMode=yes "$HOST" 'true' \
-  || { echo "ERROR: cannot ssh $HOST" >&2; exit 1; }
-ssh "$HOST" 'timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown'
+# Resolve a reachable SSH transport. The host LABEL stays $HOST (claire
+# identity / sessions.host / per_host config), but the plum↔host route flaps:
+# `.lan` is unreachable off-site and the direct WG relay can drop, so fall back
+# to the `-wg` (direct WireGuard) then `-j` (black jump-host) aliases defined in
+# ~/.ssh/config. Only the bare ssh/rsync legs need this — `remote-run` does its
+# own routing. Override with CLAIRE_SSH_ALIAS= to force one.
+say "[$HOST] resolve ssh transport + clock"
+SSH=""
+ssh_probe_errs=""
+# The expansion below is unquoted on purpose so a multi-word CLAIRE_SSH_ALIAS
+# splits into candidates; the quoted defaults inside it stay one word each.
+for cand in ${CLAIRE_SSH_ALIAS:-"$HOST" "${HOST}-wg" "${HOST}-j"}; do
+  # -n keeps the probe off the script's stdin. stderr is captured rather than
+  # discarded so a non-network failure (rejected key, host-key mismatch) is
+  # still visible if every candidate fails.
+  if probe_err="$(ssh -n -o ConnectTimeout=8 -o BatchMode=yes "$cand" 'true' 2>&1 >/dev/null)"; then
+    SSH="$cand"; break
+  fi
+  ssh_probe_errs="$(printf '%s\n  %s: %s' "$ssh_probe_errs" "$cand" "${probe_err:-no output from ssh}")"
+done
+if [ -z "$SSH" ]; then
+  echo "ERROR: no reachable ssh transport for $HOST (tried ${CLAIRE_SSH_ALIAS:-$HOST $HOST-wg $HOST-j})" >&2
+  printf '%s\n' "ssh probe failures:${ssh_probe_errs}" >&2
+  exit 1
+fi
+[ "$SSH" = "$HOST" ] || say "[$HOST] direct route down — using ssh transport '$SSH'"
+ssh "$SSH" 'timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown'
 
 say "[$HOST] rsync source"
-ssh "$HOST" "mkdir -p ~/$REMOTE_DIR"
-rsync -az --delete \
+ssh "$SSH" "mkdir -p ~/$REMOTE_DIR"
+rsync -az --delete -e ssh \
   --exclude='.venv/' --exclude='.git/' --exclude='__pycache__/' \
   --exclude='*.pyc' --exclude='.pytest_cache/' --exclude='.ruff_cache/' \
   --exclude='claire.toml' \
   --exclude='src/claire/web/app/node_modules/' \
   --exclude='src/claire/web/app/dist/' \
-  "$SRC/" "${HOST}:${REMOTE_DIR}/"
+  "$SRC/" "${SSH}:${REMOTE_DIR}/"
 
 say "[$HOST] install (uv if present, else python venv+pip) + init"
 remote-run "$HOST" "export PATH=\"\$HOME/.local/bin:\$PATH\"; cd ~/$REMOTE_DIR && if command -v uv >/dev/null 2>&1; then { [ -d .venv ] || uv venv; }; uv pip install -e .; else { [ -d .venv ] || python3 -m venv .venv; }; .venv/bin/pip install -q -e .; fi && .venv/bin/claire init"
 
 say "[$HOST] seed vault (BEFORE agent starts — it reads the HMAC secret from here)"
-ssh "$HOST" 'mkdir -p ~/.vault && chmod 700 ~/.vault'
-rsync -az --no-owner --no-group --chmod=D700,F600 \
+ssh "$SSH" 'mkdir -p ~/.vault && chmod 700 ~/.vault'
+rsync -az --no-owner --no-group --chmod=D700,F600 -e ssh \
   --exclude='.vault-backups/' --exclude='*.prev.txt' \
-  "$HOME/.vault/" "${HOST}:.vault/"
+  "$HOME/.vault/" "${SSH}:.vault/"
 # Gate: the agent will 401 forever without the shared secret present.
-ssh "$HOST" '[ -s ~/.vault/claire-sync-secret.txt ]' \
+ssh "$SSH" '[ -s ~/.vault/claire-sync-secret.txt ]' \
   || { echo "ERROR: ~/.vault/claire-sync-secret.txt missing on $HOST after seed" >&2; exit 1; }
 
 say "[$HOST] configure peer (url only — secret is vault-sourced)"
diff --git a/src/claire/db.py b/src/claire/db.py
index d42b55b..1e8fda5 100644
--- a/src/claire/db.py
+++ b/src/claire/db.py
@@ -436,6 +436,11 @@ def open_db(path: Path | str | None = None) -> sqlite3.Connection:
     conn = sqlite3.connect(str(path), isolation_level=None, check_same_thread=False)
     conn.row_factory = sqlite3.Row
     conn.execute("PRAGMA foreign_keys = ON")
+    # Wait up to 5s for a held write lock rather than erroring right away with
+    # "database is locked" when peer-sync ingest and API writes contend.
+    # NOTE(review): sqlite3.connect() already defaults to timeout=5.0, so this
+    # PRAGMA may only restate the existing wait — confirm it changes behavior.
+    conn.execute("PRAGMA busy_timeout = 5000")
     # WAL doesn't apply to :memory: and isn't strictly needed; harmless to try.
     if str(path) != ":memory:":
         conn.execute("PRAGMA journal_mode = WAL")