claire/scripts/deploy-agent.sh

97 lines
4.7 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
#
# Deploy the headless `claire agent` peer node to a Linux host (apricot|black).
# Runs FROM plum. Idempotent. Code + systemd unit + peer config (peer URL only:
# the HMAC sync secret is seeded into the host's ~/.vault, not the config).
#
# scripts/deploy-agent.sh apricot
#
# Requires: `remote-run` on PATH (~/Code/@scripts/session-tools), ssh + rsync
# access, python3.12+ on the remote (uv is used when present, otherwise
# python3 -m venv + pip), and NTP-synced clocks (HMAC skew window 300s).
# Strict mode: abort on command failure, unset variables, and failed pipelines.
set -o errexit -o nounset -o pipefail

# Target host label; the first positional argument is mandatory.
HOST="${1:?usage: deploy-agent.sh <host>}"

# Repository root: the parent of the directory that holds this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
SRC="$(cd "$script_dir/.." && pwd)"

# Checkout location on the remote, relative to the remote $HOME.
REMOTE_DIR='Code/@projects/@claire'

# Local claire config file; CLAIRE_TOML overrides the default location.
PLUM_TOML=${CLAIRE_TOML:-"$HOME/.config/claire/claire.toml"}
# Print one progress line to stdout: a bold-magenta "▸" marker followed by all
# arguments joined into a single string (via "$*").
say() {
  local message="$*"
  printf '\033[1;35m▸\033[0m %s\n' "$message"
}
# Plum's bind URL — the peer's event-sync target. The HMAC secret is NO LONGER
# injected here: it lives in the vault (~/.vault/claire-sync-secret.txt), seeded
# below and read at runtime, so rotation is just a vault push.
#
# The URL is built from the [web] table of $PLUM_TOML (host defaults to
# 127.0.0.1, port to 8765). A wildcard or empty bind host is rewritten to
# 127.0.0.1, and an IPv6 literal is bracketed so the result is a valid URL.
# NOTE(review): 127.0.0.1 is only useful to a remote peer if something routes
# it back to plum — confirm that is intended for wildcard binds.
PLUM_URL="$("$SRC/.venv/bin/python" - "$PLUM_TOML" <<'PY'
import sys, tomllib, pathlib

c = tomllib.loads(pathlib.Path(sys.argv[1]).read_text())
web = c.get("web", {})
host = str(web.get("host", "127.0.0.1"))
if host in ("0.0.0.0", "::", ""):
    host = "127.0.0.1"
# RFC 3986: an IPv6 literal must be wrapped in brackets inside a URL authority.
if ":" in host and not host.startswith("["):
    host = f"[{host}]"
print(f"http://{host}:{web.get('port', 8765)}")
PY
)"
say "plum peer URL = $PLUM_URL"
# Resolve a reachable SSH transport. The host LABEL stays $HOST (claire
# identity / sessions.host / per_host config), but the plum↔host route flaps:
# `.lan` is unreachable off-site and the direct WG relay can drop, so fall back
# to the `-wg` (direct WireGuard) then `-j` (black jump-host) aliases defined in
# ~/.ssh/config. Only the bare ssh/rsync legs need this — `remote-run` does its
# own routing. Override with CLAIRE_SSH_ALIAS=<alias> to force one.
say "[$HOST] resolve ssh transport + clock"

# Candidate aliases in preference order. CLAIRE_SSH_ALIAS is split on
# whitespace (as before) but is no longer subject to glob expansion; a blank or
# unset override falls back to the default route list.
ssh_candidates=()
read -r -a ssh_candidates <<<"${CLAIRE_SSH_ALIAS:-}"
(( ${#ssh_candidates[@]} > 0 )) || ssh_candidates=("$HOST" "${HOST}-wg" "${HOST}-j")

# Probe each alias non-interactively; the first one that answers wins.
SSH=""
for cand in "${ssh_candidates[@]}"; do
  if ssh -o ConnectTimeout=8 -o BatchMode=yes "$cand" 'true' 2>/dev/null; then
    SSH="$cand"
    break
  fi
done
if [[ -z "$SSH" ]]; then
  echo "ERROR: no reachable ssh transport for $HOST (tried ${ssh_candidates[*]})" >&2
  exit 1
fi
[[ "$SSH" == "$HOST" ]] || say "[$HOST] direct route down — using ssh transport '$SSH'"

# Report the remote NTP state (yes / no / unknown when timedatectl cannot say)
# and warn when it is not confirmed, since the HMAC skew window is only 300s.
# The warning is advisory: it goes to stderr and does not abort the deploy.
ntp_synced="$(ssh "$SSH" 'timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown')"
printf '%s\n' "$ntp_synced"
[[ "$ntp_synced" == "yes" ]] \
  || echo "WARNING: $HOST reports NTPSynchronized=$ntp_synced — HMAC sync tolerates only 300s of clock skew" >&2
say "[$HOST] rsync source"
ssh "$SSH" "mkdir -p ~/$REMOTE_DIR"

# Paths that must never be mirrored to the remote: local virtualenv and VCS
# data, Python/tool caches, the host-specific claire.toml, and web build output.
sync_excludes=(
  '.venv/' '.git/' '__pycache__/'
  '*.pyc' '.pytest_cache/' '.ruff_cache/'
  'claire.toml'
  'src/claire/web/app/node_modules/'
  'src/claire/web/app/dist/'
)

# Mirror the checkout ($SRC) into ~/$REMOTE_DIR; --delete removes remote files
# that no longer exist locally (excluded paths are left alone).
rsync_args=(-az --delete -e ssh)
for pattern in "${sync_excludes[@]}"; do
  rsync_args+=("--exclude=$pattern")
done
rsync "${rsync_args[@]}" "$SRC/" "${SSH}:${REMOTE_DIR}/"
say "[$HOST] install (uv if present, else python venv+pip) + init"
# Remote command, assembled piece by piece (the resulting string is unchanged):
#   1. put ~/.local/bin on PATH and enter the checkout;
#   2. with uv available: create .venv if missing, then editable-install via uv;
#   3. otherwise: create .venv with python3 -m venv and editable-install via pip;
#   4. run `claire init` from the venv once the install branch succeeded.
install_cmd='export PATH="$HOME/.local/bin:$PATH"; cd ~/'"$REMOTE_DIR"
install_cmd+=' && if command -v uv >/dev/null 2>&1; then { [ -d .venv ] || uv venv; }; uv pip install -e .;'
install_cmd+=' else { [ -d .venv ] || python3 -m venv .venv; }; .venv/bin/pip install -q -e .; fi'
install_cmd+=' && .venv/bin/claire init'
remote-run "$HOST" "$install_cmd"
say "[$HOST] seed vault (BEFORE agent starts — it reads the HMAC secret from here)"
ssh "$SSH" 'mkdir -p ~/.vault && chmod 700 ~/.vault'

# Copy the local vault to the remote with owner-only permissions (dirs 700,
# files 600), skipping backups and previous-version files.
vault_sync=(
  rsync -az --no-owner --no-group --chmod=D700,F600 -e ssh
  --exclude='.vault-backups/' --exclude='*.prev.txt'
  "$HOME/.vault/" "${SSH}:.vault/"
)
"${vault_sync[@]}"

# Gate: the agent will 401 forever without the shared secret present.
if ! ssh "$SSH" '[ -s ~/.vault/claire-sync-secret.txt ]'; then
  echo "ERROR: ~/.vault/claire-sync-secret.txt missing on $HOST after seed" >&2
  exit 1
fi
# Shared fragments of the two remote commands below: run the claire agent CLI
# from the checkout's venv, then restrict the remote config to owner-only.
agent_cli="cd ~/$REMOTE_DIR && .venv/bin/claire agent"
lock_toml="chmod 600 ~/.config/claire/claire.toml"

say "[$HOST] configure peer (url only — secret is vault-sourced)"
remote-run "$HOST" "$agent_cli add-peer --url '$PLUM_URL' && $lock_toml"

say "[$HOST] enable local orchestrator ([<host>] claire) — MCP → plum's central endpoint"
remote-run "$HOST" "$agent_cli enable-orchestrator --mcp-url '$PLUM_URL/mcp/' && $lock_toml"
say "[$HOST] install + enable systemd --user unit"
# Remote script: install the user unit from the checkout, reload systemd,
# enable and restart the agent, enable lingering for the remote user
# (best-effort), then report status and gate on the unit being active.
# The gate is the final command on purpose, so the exit status of the remote
# script is the gate result whether or not remote-run uses errexit.
remote-run "$HOST" "
mkdir -p ~/.config/systemd/user
cp ~/$REMOTE_DIR/deployments/systemd/claire-agent.service ~/.config/systemd/user/
systemctl --user daemon-reload
systemctl --user enable claire-agent.service
# restart (not just enable --now) so a redeploy actually loads the new code.
systemctl --user restart claire-agent.service
loginctl enable-linger \$(whoami) 2>/dev/null || true
sleep 2
# The status dump is cosmetic — piping to head closes the pipe early (SIGPIPE),
# so keep it non-fatal or it false-aborts an otherwise-healthy deploy. It runs
# BEFORE the gate so a unit that failed to start still gets its status printed.
systemctl --user --no-pager status claire-agent.service 2>&1 | head -5 || true
# Real gate, kept LAST: is-active is non-zero iff the unit failed to come up,
# and as the final command its status is not masked by the best-effort lines.
systemctl --user is-active claire-agent.service
"
say "[$HOST] done."