diff --git a/scripts/deploy-agent.sh b/scripts/deploy-agent.sh index eb77517..9b4b991 100755 --- a/scripts/deploy-agent.sh +++ b/scripts/deploy-agent.sh @@ -75,6 +75,10 @@ remote-run "$HOST" " systemctl --user restart claire-agent.service loginctl enable-linger \$(whoami) 2>/dev/null || true sleep 2 - systemctl --user --no-pager status claire-agent.service | head -5 + # Real gate: is-active is non-zero iff the unit failed to come up. The status + # dump below is cosmetic — piping to head closes the pipe early (SIGPIPE), so + # keep it non-fatal or it false-aborts an otherwise-healthy deploy. + systemctl --user is-active claire-agent.service + systemctl --user --no-pager status claire-agent.service 2>&1 | head -5 || true " say "[$HOST] done." diff --git a/scripts/release-fleet.sh b/scripts/release-fleet.sh new file mode 100755 index 0000000..0099133 --- /dev/null +++ b/scripts/release-fleet.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# +# release-fleet.sh — one command to release the current claire working tree to +# the WHOLE fleet and restart every service. Runs FROM plum (the source of +# truth + the only host with the launchd services). +# +# scripts/release-fleet.sh # test → deploy apricot+black → restart plum +# scripts/release-fleet.sh --no-test # skip the pre-deploy pytest gate +# scripts/release-fleet.sh --no-plum # leave plum's services running (only push workers) +# scripts/release-fleet.sh --hosts apricot # restrict the worker host list +# scripts/release-fleet.sh --dry-run # print the plan, change nothing +# +# What it does, in order: +# 1. (gate) run the test suite — abort the whole release if it fails. +# 2. workers (apricot, black): scripts/deploy-agent.sh +# → rsync working tree + `uv pip install -e .` + restart claire-agent.service. +# 3. plum: `launchctl kickstart -k` claire-serve + claire-tray +# → editable install, so a restart is all that's needed to load new code. 
+# +# ⚠ Restarting plum's `com.lilith.claire-serve` briefly drops the web / API / +# MCP endpoint (~a few seconds). Anything mid-call against claire's MCP tools +# — INCLUDING the orchestrator itself — will blip until it comes back. Run it +# when you can tolerate that, or pass --no-plum and restart plum by hand. +# +# Requires (same as deploy-agent.sh): `remote-run` on PATH, ssh to the worker +# hosts, uv/python on the remotes, NTP-synced clocks. +set -euo pipefail + +SRC="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +HOSTS=(apricot black) +RUN_TESTS=1 +RESTART_PLUM=1 +DRY_RUN=0 +PLUM_SERVICES=(com.lilith.claire-serve com.lilith.claire-tray) + +say() { printf '\033[1;35m▸\033[0m %s\n' "$*"; } +warn() { printf '\033[1;33m⚠\033[0m %s\n' "$*" >&2; } +die() { printf '\033[1;31m✗\033[0m %s\n' "$*" >&2; exit 1; } +run() { if [ "$DRY_RUN" = 1 ]; then printf ' [dry-run] %s\n' "$*"; else eval "$@"; fi; } + +while [ $# -gt 0 ]; do + case "$1" in + --no-test) RUN_TESTS=0; shift ;; + --no-plum) RESTART_PLUM=0; shift ;; + --dry-run) DRY_RUN=1; shift ;; + --hosts) shift; IFS=' ' read -r -a HOSTS <<< "${1:?--hosts needs a value}"; shift ;; + -h|--help) sed -n '2,/^[^#]/{/^#/p;}' "$0"; exit 0 ;; # header comment only: stop at the first code line instead of a fixed line count + *) die "unknown arg: $1 (try --help)" ;; + esac +done + +# --- 1. test gate ---------------------------------------------------------- +if [ "$RUN_TESTS" = 1 ]; then + say "test gate: pytest (use --no-test to skip)" + # Run via the project venv (where pytest + dev deps live); fall back to uv's + # managed env. `uv run pytest` is deliberately NOT used — pytest is a dev-extra + # in .venv, not a uv tool, so that spawn fails on this repo. 
+ if [ -x "$SRC/.venv/bin/python" ]; then + run "(cd '$SRC' && .venv/bin/python -m pytest -q)" || die "tests failed — release aborted" + elif command -v uv >/dev/null 2>&1; then + run "(cd '$SRC' && uv run python -m pytest -q)" || die "tests failed — release aborted" + else + die "no .venv/bin/python and no uv — cannot run the test gate (use --no-test to override)" + fi +else + warn "skipping test gate (--no-test)" +fi + +# --- 2. worker hosts ------------------------------------------------------- +for h in "${HOSTS[@]}"; do + say "deploy worker → $h" + run "'$SRC/scripts/deploy-agent.sh' '$h'" || die "deploy-agent.sh $h failed" +done + +# --- 3. plum services ------------------------------------------------------ +if [ "$RESTART_PLUM" = 1 ]; then + warn "restarting plum services ${PLUM_SERVICES[*]} — claire-serve restart blips the MCP/web endpoint" + uid="$(id -u)" + for svc in "${PLUM_SERVICES[@]}"; do + if launchctl print "gui/$uid/$svc" >/dev/null 2>&1; then + say "kickstart $svc" + run "launchctl kickstart -k 'gui/$uid/$svc'" || warn "kickstart $svc returned non-zero" + else + warn "$svc not loaded in gui/$uid — skipping (load its LaunchAgent first)" + fi + done +else + warn "leaving plum services running (--no-plum) — restart them by hand to load new code" +fi + +if [ "$DRY_RUN" = 1 ]; then say "release plan printed (dry-run — nothing changed)."; else say "release complete."; fi diff --git a/src/claire/web/app.py b/src/claire/web/app.py index af31656..0228f76 100644 --- a/src/claire/web/app.py +++ b/src/claire/web/app.py @@ -170,7 +170,11 @@ def create_app( from ..config import load_or_init from ..db import migrate, open_db from ..hlc import HLCGenerator - from .rounds import fleet_fingerprint, post_rounds_turn + from .rounds import ( + fleet_fingerprint, + post_rounds_turn, + should_skip_round, + ) interval = load_or_init(config_path).orchestrator.rounds_interval_s if interval <= 0: @@ -193,11 +197,13 @@ def create_app( fp = fleet_fingerprint( conn, 
cfg.orchestrator.session_uuid ) - gate = cfg.orchestrator.rounds_skip_unchanged - floor = cfg.orchestrator.rounds_heartbeat_every - # Skip (don't wake the model) only while unchanged - # AND under the max-staleness floor. - if gate and fp == prev_fp and skips + 1 < floor: + # Skip (don't wake the model) only while the worker + # fleet is unchanged AND under the staleness floor. + if should_skip_round( + prev_fp, fp, skips, + enabled=cfg.orchestrator.rounds_skip_unchanged, + heartbeat_every=cfg.orchestrator.rounds_heartbeat_every, + ): return (False, fp) gen = HLCGenerator(cfg.machine_id) post_rounds_turn(conn, gen, cfg) diff --git a/src/claire/web/rounds.py b/src/claire/web/rounds.py index de9c3c1..5f906ed 100644 --- a/src/claire/web/rounds.py +++ b/src/claire/web/rounds.py @@ -65,6 +65,27 @@ def fleet_fingerprint( return h.hexdigest() +def should_skip_round( + prev_fp: str | None, + fp: str, + consecutive_skips: int, + *, + enabled: bool, + heartbeat_every: int, +) -> bool: + """Decide whether the rounds timer should SKIP waking the orchestrator. + + Skip only when the gate is enabled, the worker fleet fingerprint is + unchanged since the last *posted* round, and we are still under the + max-staleness floor (`heartbeat_every` consecutive skips force a round). + """ + if not enabled: + return False + if fp != prev_fp: + return False + return consecutive_skips + 1 < heartbeat_every + + def build_rounds_prompt(cfg: ClaireConfig) -> str: """The structured DO-ROUNDS prompt posted to the orchestrator. diff --git a/tests/test_health_rounds.py b/tests/test_health_rounds.py index 99a06a6..3a75c72 100644 --- a/tests/test_health_rounds.py +++ b/tests/test_health_rounds.py @@ -84,3 +84,89 @@ def test_rounds_tick_schedules_turn(tmp_path: Path, monkeypatch) -> None: assert r.json()["status"] == "scheduled" # TestClient runs background tasks before returning, so the turn posted. 
assert len(calls) == 1 + + +# --- rounds skip gate (fingerprint) ---------------------------------------- + +import uuid as _uuid # noqa: E402 + +from claire.db import migrate # noqa: E402 +from claire.hlc import HLCGenerator # noqa: E402 +from claire.orchestrator.tools import report_status # noqa: E402 +from claire.web.rounds import fleet_fingerprint, should_skip_round # noqa: E402 + +_MACHINE = "00000000-0000-0000-0000-0000000000aa" + + +def _seed_db(tmp_path: Path): + conn = open_db(tmp_path / "claire.db") + migrate(conn) + gen = HLCGenerator(_MACHINE) + hlc = lambda: str(gen.tick()) # noqa: E731 + conn.execute( + "INSERT INTO projects(id,name,created_hlc,updated_hlc) VALUES(?,?,?,?)", + ("p1", "proj", hlc(), hlc()), + ) + conn.execute( + "INSERT INTO tasks(id,project_id,title,status,priority,created_hlc," + "updated_hlc) VALUES(?,?,?,?,?,?,?)", + ("t1", "p1", "work", "in_progress", 2, hlc(), hlc()), + ) + worker = str(_uuid.uuid4()) + orch = str(_uuid.uuid4()) + for sid, host in ((worker, "apricot"), (orch, "local")): + conn.execute( + "INSERT INTO sessions(uuid,host,cwd,tmux_name,updated_hlc,liveness)" + " VALUES(?,?,?,?,?,?)", + (sid, host, f"/cwd/{host}", host, hlc(), "alive"), + ) + report_status(conn, gen, session_uuid=worker, summary="worker booting") + conn.commit() + return conn, gen, worker, orch + + +def test_fingerprint_ignores_orchestrators_own_round_side_effects( + tmp_path: Path, +) -> None: + """A round mutates the orchestrator's OWN session/status every tick. If the + fingerprint folded that in, the skip gate would never engage. It must not.""" + conn, gen, _worker, orch = _seed_db(tmp_path) + fp1 = fleet_fingerprint(conn, orch) + + # Simulate the round's own side effects: the orchestrator reports its own + # status (as it does every single round, with a fresh summary each time). 
+ report_status(conn, gen, session_uuid=orch, summary="Rounds tick (1st)") + report_status(conn, gen, session_uuid=orch, summary="Rounds tick (2nd)…") + fp2 = fleet_fingerprint(conn, orch) + + assert fp2 == fp1, "orchestrator self-status must not change the fingerprint" + # And the gate would therefore skip the next tick (under the floor). + assert should_skip_round(fp1, fp2, 0, enabled=True, heartbeat_every=6) + + +def test_fingerprint_changes_on_real_worker_movement(tmp_path: Path) -> None: + conn, gen, worker, orch = _seed_db(tmp_path) + fp1 = fleet_fingerprint(conn, orch) + + # A worker pushing new status is real fleet movement → fingerprint changes. + report_status(conn, gen, session_uuid=worker, summary="now compiling") + fp_worker = fleet_fingerprint(conn, orch) + assert fp_worker != fp1 + assert not should_skip_round(fp1, fp_worker, 0, enabled=True, heartbeat_every=6) + + # A task leaving the open set (→ done) is also real movement. + conn.execute("UPDATE tasks SET status='done' WHERE id='t1'") + conn.commit() + assert fleet_fingerprint(conn, orch) != fp_worker + + +def test_should_skip_round_floor_and_toggle() -> None: + # Unchanged + enabled + under floor → skip. + assert should_skip_round("a", "a", 0, enabled=True, heartbeat_every=3) + assert should_skip_round("a", "a", 1, enabled=True, heartbeat_every=3) + # Hitting the floor forces a round (no skip), so the HUD never goes silent. + assert not should_skip_round("a", "a", 2, enabled=True, heartbeat_every=3) + # Any change forces a round. + assert not should_skip_round("a", "b", 0, enabled=True, heartbeat_every=3) + # Gate off → always post. + assert not should_skip_round("a", "a", 0, enabled=False, heartbeat_every=99)