feat(@projects/@claire): ✨ rounds skip-gate loop + fleet release tool
Wire the rounds timer to a pure-Python skip gate so claire-serve only wakes the orchestrator model when worker fleet state changed (not every tick): - web/rounds.py: fleet_fingerprint() over worker sessions (minus the orchestrator's own) + open tasks; should_skip_round() with heartbeat floor. - web/app.py: _rounds_loop tracks last fingerprint + consecutive skips. - excludes the orchestrator's own session/chat so a round's self-side-effects can't defeat the gate. Add scripts/release-fleet.sh (test -> deploy apricot+black -> restart plum services) and harden deploy-agent.sh's cosmetic status check against a SIGPIPE false-abort. 3 new discriminating tests; 349 pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8d82bb0abc
commit
ce6948d6e9
5 changed files with 215 additions and 7 deletions
|
|
@ -75,6 +75,10 @@ remote-run "$HOST" "
|
|||
systemctl --user restart claire-agent.service
|
||||
loginctl enable-linger \$(whoami) 2>/dev/null || true
|
||||
sleep 2
|
||||
systemctl --user --no-pager status claire-agent.service | head -5
|
||||
# Real gate: is-active is non-zero iff the unit failed to come up. The status
|
||||
# dump below is cosmetic — piping to head closes the pipe early (SIGPIPE), so
|
||||
# keep it non-fatal or it false-aborts an otherwise-healthy deploy.
|
||||
systemctl --user is-active claire-agent.service
|
||||
systemctl --user --no-pager status claire-agent.service 2>&1 | head -5 || true
|
||||
"
|
||||
say "[$HOST] done."
|
||||
|
|
|
|||
91
scripts/release-fleet.sh
Executable file
91
scripts/release-fleet.sh
Executable file
|
|
@ -0,0 +1,91 @@
|
|||
#!/usr/bin/env bash
#
# release-fleet.sh — one command to release the current claire working tree to
# the WHOLE fleet and restart every service. Runs FROM plum (the source of
# truth + the only host with the launchd services).
#
#   scripts/release-fleet.sh                  # test → deploy apricot+black → restart plum
#   scripts/release-fleet.sh --no-test        # skip the pre-deploy pytest gate
#   scripts/release-fleet.sh --no-plum        # leave plum's services running (only push workers)
#   scripts/release-fleet.sh --hosts apricot  # restrict the worker host list
#   scripts/release-fleet.sh --dry-run        # print the plan, change nothing
#
# What it does, in order:
#   1. (gate) run the test suite — abort the whole release if it fails.
#   2. workers (apricot, black): scripts/deploy-agent.sh <host>
#        → rsync working tree + `uv pip install -e .` + restart claire-agent.service.
#   3. plum: `launchctl kickstart -k` claire-serve + claire-tray
#        → editable install, so a restart is all that's needed to load new code.
#
# ⚠ Restarting plum's `com.lilith.claire-serve` briefly drops the web / API /
#   MCP endpoint (~a few seconds). Anything mid-call against claire's MCP tools
#   — INCLUDING the orchestrator itself — will blip until it comes back. Run it
#   when you can tolerate that, or pass --no-plum and restart plum by hand.
#
# Requires (same as deploy-agent.sh): `remote-run` on PATH, ssh to the worker
# hosts, uv/python on the remotes, NTP-synced clocks.
set -euo pipefail

# Repo root = parent of the directory holding this script.
SRC="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
HOSTS=(apricot black)   # worker hosts; override with --hosts "a b"
RUN_TESTS=1             # --no-test clears this
RESTART_PLUM=1          # --no-plum clears this
DRY_RUN=0               # --dry-run sets this
PLUM_SERVICES=(com.lilith.claire-serve com.lilith.claire-tray)

say()  { printf '\033[1;35m▸\033[0m %s\n' "$*"; }
warn() { printf '\033[1;33m⚠\033[0m %s\n' "$*" >&2; }
die()  { printf '\033[1;31m✗\033[0m %s\n' "$*" >&2; exit 1; }

# quoted ARG... — join argv into one shell-quoted, copy-pasteable string.
quoted() { local out; printf -v out '%q ' "$@"; printf '%s' "${out% }"; }

# run CMD [ARG...] — execute argv exactly as given (no eval, so paths/hosts
# containing quotes or spaces are never re-parsed by the shell); under
# --dry-run just print the quoted command.
run() {
  if [ "$DRY_RUN" = 1 ]; then
    printf ' [dry-run] %s\n' "$(quoted "$@")"
  else
    "$@"
  fi
}

# run_in DIR CMD [ARG...] — like run, but with DIR as the working directory
# (in a subshell, so the caller's cwd is untouched).
run_in() {
  local dir="$1"; shift
  if [ "$DRY_RUN" = 1 ]; then
    printf ' [dry-run] (cd %q && %s)\n' "$dir" "$(quoted "$@")"
  else
    ( cd "$dir" && "$@" )
  fi
}

while [ $# -gt 0 ]; do
  case "$1" in
    --no-test) RUN_TESTS=0; shift ;;
    --no-plum) RESTART_PLUM=0; shift ;;
    --dry-run) DRY_RUN=1; shift ;;
    --hosts)   shift; IFS=' ' read -r -a HOSTS <<< "${1:?--hosts needs a value}"; shift ;;
    -h|--help)
      # Print the leading comment header: skip the shebang, emit lines while
      # they start with '#', stop at the first code line. A fixed line range
      # here drifts as the header changes and ends up printing code.
      awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
      exit 0 ;;
    *) die "unknown arg: $1 (try --help)" ;;
  esac
done

# --- 1. test gate ----------------------------------------------------------
if [ "$RUN_TESTS" = 1 ]; then
  say "test gate: pytest (use --no-test to skip)"
  # Run via the project venv (where pytest + dev deps live); fall back to uv's
  # managed env. `uv run pytest` is deliberately NOT used — pytest is a dev-extra
  # in .venv, not a uv tool, so that spawn fails on this repo.
  if [ -x "$SRC/.venv/bin/python" ]; then
    run_in "$SRC" .venv/bin/python -m pytest -q || die "tests failed — release aborted"
  elif command -v uv >/dev/null 2>&1; then
    run_in "$SRC" uv run python -m pytest -q || die "tests failed — release aborted"
  else
    die "no .venv/bin/python and no uv — cannot run the test gate (use --no-test to override)"
  fi
else
  warn "skipping test gate (--no-test)"
fi

# --- 2. worker hosts -------------------------------------------------------
for h in "${HOSTS[@]}"; do
  say "deploy worker → $h"
  run "$SRC/scripts/deploy-agent.sh" "$h" || die "deploy-agent.sh $h failed"
done

# --- 3. plum services ------------------------------------------------------
if [ "$RESTART_PLUM" = 1 ]; then
  warn "restarting plum services ${PLUM_SERVICES[*]} — claire-serve restart blips the MCP/web endpoint"
  uid="$(id -u)"
  for svc in "${PLUM_SERVICES[@]}"; do
    # Only kickstart services that are actually loaded in this GUI domain;
    # a failed kickstart is reported but does not abort the release.
    if launchctl print "gui/$uid/$svc" >/dev/null 2>&1; then
      say "kickstart $svc"
      run launchctl kickstart -k "gui/$uid/$svc" || warn "kickstart $svc returned non-zero"
    else
      warn "$svc not loaded in gui/$uid — skipping (load its LaunchAgent first)"
    fi
  done
else
  warn "leaving plum services running (--no-plum) — restart them by hand to load new code"
fi

if [ "$DRY_RUN" = 1 ]; then
  say "release plan printed (dry-run — nothing changed)."
else
  say "release complete."
fi
|
||||
|
|
@ -170,7 +170,11 @@ def create_app(
|
|||
from ..config import load_or_init
|
||||
from ..db import migrate, open_db
|
||||
from ..hlc import HLCGenerator
|
||||
from .rounds import fleet_fingerprint, post_rounds_turn
|
||||
from .rounds import (
|
||||
fleet_fingerprint,
|
||||
post_rounds_turn,
|
||||
should_skip_round,
|
||||
)
|
||||
|
||||
interval = load_or_init(config_path).orchestrator.rounds_interval_s
|
||||
if interval <= 0:
|
||||
|
|
@ -193,11 +197,13 @@ def create_app(
|
|||
fp = fleet_fingerprint(
|
||||
conn, cfg.orchestrator.session_uuid
|
||||
)
|
||||
gate = cfg.orchestrator.rounds_skip_unchanged
|
||||
floor = cfg.orchestrator.rounds_heartbeat_every
|
||||
# Skip (don't wake the model) only while unchanged
|
||||
# AND under the max-staleness floor.
|
||||
if gate and fp == prev_fp and skips + 1 < floor:
|
||||
# Skip (don't wake the model) only while the worker
|
||||
# fleet is unchanged AND under the staleness floor.
|
||||
if should_skip_round(
|
||||
prev_fp, fp, skips,
|
||||
enabled=cfg.orchestrator.rounds_skip_unchanged,
|
||||
heartbeat_every=cfg.orchestrator.rounds_heartbeat_every,
|
||||
):
|
||||
return (False, fp)
|
||||
gen = HLCGenerator(cfg.machine_id)
|
||||
post_rounds_turn(conn, gen, cfg)
|
||||
|
|
|
|||
|
|
@ -65,6 +65,27 @@ def fleet_fingerprint(
|
|||
return h.hexdigest()
|
||||
|
||||
|
||||
def should_skip_round(
    prev_fp: str | None,
    fp: str,
    consecutive_skips: int,
    *,
    enabled: bool,
    heartbeat_every: int,
) -> bool:
    """Return True when this rounds tick should NOT wake the orchestrator.

    A tick is skipped only if all three hold:

    * the gate is switched on (``enabled``);
    * ``fp`` equals ``prev_fp`` (a ``prev_fp`` of ``None`` never matches, so
      the tick is not skipped);
    * counting this tick, fewer than ``heartbeat_every`` ticks in a row would
      have been skipped — i.e. once ``heartbeat_every - 1`` consecutive skips
      have accumulated, the next tick posts a round regardless.
    """
    # Short-circuit order mirrors the guard order: gate, then change, then floor.
    return (
        bool(enabled)
        and fp == prev_fp
        and consecutive_skips + 1 < heartbeat_every
    )
|
||||
|
||||
|
||||
def build_rounds_prompt(cfg: ClaireConfig) -> str:
|
||||
"""The structured DO-ROUNDS prompt posted to the orchestrator.
|
||||
|
||||
|
|
|
|||
|
|
@ -84,3 +84,89 @@ def test_rounds_tick_schedules_turn(tmp_path: Path, monkeypatch) -> None:
|
|||
assert r.json()["status"] == "scheduled"
|
||||
# TestClient runs background tasks before returning, so the turn posted.
|
||||
assert len(calls) == 1
|
||||
|
||||
|
||||
# --- rounds skip gate (fingerprint) ----------------------------------------
|
||||
|
||||
import uuid as _uuid # noqa: E402
|
||||
|
||||
from claire.db import migrate # noqa: E402
|
||||
from claire.hlc import HLCGenerator # noqa: E402
|
||||
from claire.orchestrator.tools import report_status # noqa: E402
|
||||
from claire.web.rounds import fleet_fingerprint, should_skip_round # noqa: E402
|
||||
|
||||
_MACHINE = "00000000-0000-0000-0000-0000000000aa"
|
||||
|
||||
|
||||
def _seed_db(tmp_path: Path):
    """Create a migrated scratch DB under ``tmp_path`` and seed it.

    Inserts one project, one ``in_progress`` task, and two ``alive`` sessions
    (a worker on "apricot" and an orchestrator on "local"), then has the worker
    report an initial status. Returns ``(conn, gen, worker_uuid, orch_uuid)``.
    """
    db = open_db(tmp_path / "claire.db")
    migrate(db)
    clock = HLCGenerator(_MACHINE)

    def stamp() -> str:
        # Fresh HLC timestamp, stringified for the *_hlc columns.
        return str(clock.tick())

    db.execute(
        "INSERT INTO projects(id,name,created_hlc,updated_hlc) VALUES(?,?,?,?)",
        ("p1", "proj", stamp(), stamp()),
    )
    db.execute(
        "INSERT INTO tasks(id,project_id,title,status,priority,created_hlc,"
        "updated_hlc) VALUES(?,?,?,?,?,?,?)",
        ("t1", "p1", "work", "in_progress", 2, stamp(), stamp()),
    )

    worker_id = str(_uuid.uuid4())
    orch_id = str(_uuid.uuid4())
    insert_session = (
        "INSERT INTO sessions(uuid,host,cwd,tmux_name,updated_hlc,liveness)"
        " VALUES(?,?,?,?,?,?)"
    )
    for session_id, hostname in ((worker_id, "apricot"), (orch_id, "local")):
        row = (session_id, hostname, f"/cwd/{hostname}", hostname, stamp(), "alive")
        db.execute(insert_session, row)

    # Only the worker gets a status row up front; tests add orchestrator ones.
    report_status(db, clock, session_uuid=worker_id, summary="worker booting")
    db.commit()
    return db, clock, worker_id, orch_id
|
||||
|
||||
|
||||
def test_fingerprint_ignores_orchestrators_own_round_side_effects(
    tmp_path: Path,
) -> None:
    """Status reports from the orchestrator's own session must leave the fleet
    fingerprint untouched — otherwise every round would invalidate the gate
    and no tick could ever be skipped."""
    conn, gen, _worker, orch = _seed_db(tmp_path)
    before = fleet_fingerprint(conn, orch)

    # Mimic what a round does to itself: the orchestrator reports its own
    # status, with a different summary each time.
    for summary in ("Rounds tick (1st)", "Rounds tick (2nd)…"):
        report_status(conn, gen, session_uuid=orch, summary=summary)
    after = fleet_fingerprint(conn, orch)

    assert after == before, "orchestrator self-status must not change the fingerprint"
    # With an unchanged fingerprint and no prior skips, the gate skips the tick.
    assert should_skip_round(before, after, 0, enabled=True, heartbeat_every=6)
|
||||
|
||||
|
||||
def test_fingerprint_changes_on_real_worker_movement(tmp_path: Path) -> None:
    """A new worker status, or a task moving to ``done``, must each produce a
    different fingerprint (and a changed fingerprint must not be skipped)."""
    conn, gen, worker, orch = _seed_db(tmp_path)
    baseline = fleet_fingerprint(conn, orch)

    # Worker reports fresh status → fingerprint differs and the gate posts.
    report_status(conn, gen, session_uuid=worker, summary="now compiling")
    after_status = fleet_fingerprint(conn, orch)
    assert after_status != baseline
    assert not should_skip_round(
        baseline, after_status, 0, enabled=True, heartbeat_every=6
    )

    # The seeded task is marked done → fingerprint differs again.
    conn.execute("UPDATE tasks SET status='done' WHERE id='t1'")
    conn.commit()
    after_task_done = fleet_fingerprint(conn, orch)
    assert after_task_done != after_status
|
||||
|
||||
|
||||
def test_should_skip_round_floor_and_toggle() -> None:
    """Table-driven check of the gate: unchanged + enabled + under the floor
    skips; reaching the floor, any fingerprint change, or a disabled gate
    all force a round."""
    cases = (
        # prev_fp, fp, skips, enabled, heartbeat_every, expect_skip
        ("a", "a", 0, True, 3, True),    # unchanged, under floor
        ("a", "a", 1, True, 3, True),    # still under floor
        ("a", "a", 2, True, 3, False),   # floor hit → forced round
        ("a", "b", 0, True, 3, False),   # fingerprint changed
        ("a", "a", 0, False, 99, False), # gate off → always post
    )
    for prev_fp, fp, skips, enabled, floor, expect_skip in cases:
        decision = should_skip_round(
            prev_fp, fp, skips, enabled=enabled, heartbeat_every=floor
        )
        assert bool(decision) is expect_skip, (prev_fp, fp, skips, enabled, floor)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue