diff --git a/src/clare/config.py b/src/clare/config.py
index 60df837..a71e26e 100644
--- a/src/clare/config.py
+++ b/src/clare/config.py
@@ -59,6 +59,12 @@ class OrchestratorConfig(_Strict):
     # user. Set True to let Clare dispatch eligible work autonomously
     # (still gated by the budget envelope + per-host caps).
     autonomous_dispatch: bool = False
+    # Maximum age, in seconds, of the orchestrator session before the
+    # supervisor kills and respawns it. The orchestrator is a single
+    # long-lived Claude session that gains a turn every round, so with no
+    # recycling its context grows without bound (on 2026-05-20 a
+    # 1100-message session wedged). 0 turns recycling off; default is 6h.
+    max_session_age_s: int = Field(default=6 * 3600, ge=0, le=7 * 86400)
 
 
 class BudgetConfig(_Strict):
diff --git a/src/clare/orchestrator/bootstrap.py b/src/clare/orchestrator/bootstrap.py
index 8c3ee6f..dbeedaa 100644
--- a/src/clare/orchestrator/bootstrap.py
+++ b/src/clare/orchestrator/bootstrap.py
@@ -274,6 +274,18 @@ def _cwd_slug(cwd: str) -> str:
     return _re.sub(r"[^A-Za-z0-9]", "-", _re.sub(r"^[~/]+", "", cwd))
 
 
+def _tmux_epoch(session_name: str) -> int | None:
+    """Return the spawn time encoded at the end of a tmux session name.
+
+    Names look like `claude-USER-CWDSLUG-EPOCH`: rclaude appends
+    `-$(date +%s)` on creation, so the final all-digit segment is the
+    spawn epoch in seconds. Gives None when the name has no such suffix.
+    """
+    import re as _re
+    suffix = _re.search(r"-(\d+)$", session_name)
+    return None if suffix is None else int(suffix.group(1))
+
+
 def _send_kick(*, rcl: Rclaude, cwd: str) -> None:
     """Send a bootstrap message so Claude flushes its session JSONL to disk.
 
@@ -386,12 +398,30 @@ def ensure_running(
         tmux_rows = rcl.list_tmux()
     except RclaudeError:
         tmux_rows = []
-    alive = any(
-        r.host == cfg.orchestrator.host and slug in r.session_name
-        for r in tmux_rows
-    )
-    if alive:
-        return cfg.orchestrator.session_uuid
+    live = [
+        r for r in tmux_rows
+        if r.host == cfg.orchestrator.host and slug in r.session_name
+    ]
+    if live:
+        # Alive — but recycle if it's older than max_session_age_s.
+        # The orchestrator accumulates a turn per round; left running
+        # for many hours its context bloats until it stops replying.
+        max_age = cfg.orchestrator.max_session_age_s
+        stale = False
+        if max_age > 0:
+            # Names without a numeric suffix have no known age; drop
+            # them so min() never compares None with an int (TypeError).
+            epochs = [_tmux_epoch(r.session_name) for r in live]
+            known = [e for e in epochs if e is not None]
+            if known and (time.time() - min(known)) > max_age:
+                stale = True
+        if not stale:
+            return cfg.orchestrator.session_uuid
+        # Stale — kill the live session(s); fall through to spawn fresh.
+        try:
+            rcl.kill(match=slug, yes=True)
+        except RclaudeError:
+            pass  # best-effort; spawn proceeds either way
 
     # Snapshot pre-spawn session uuids at this cwd so discovery can ignore
     # stale disk JSONLs from previous Claude runs (Claude doesn't delete
diff --git a/src/clare/rclaude.py b/src/clare/rclaude.py
index 1b2e001..f8b822b 100644
--- a/src/clare/rclaude.py
+++ b/src/clare/rclaude.py
@@ -265,3 +265,16 @@ class Rclaude:
         # back into structured form (rclaude prints stderr-aligned text).
         # Push A surfaces the raw output; consumers can show it to the user.
         return SendResult(targets=(), delivered=yes and not dry_run, raw_output=raw)
+
+    def kill(self, *, match: str, yes: bool = True) -> str:
+        """Invoke `rclaude kill --match PATTERN [--yes]` — ends matching tmux sessions.
+
+        Used by the supervisor to recycle a stale orchestrator session.
+        Returns rclaude's raw output. `RclaudeError` propagates on a
+        non-zero exit (e.g. no matching sessions) — callers that treat a
+        no-match as benign should catch it.
+        """
+        args = ["kill", "--match", match]
+        if yes:
+            args.append("--yes")
+        return self._run(args)
diff --git a/tests/test_orchestrator_supervisor.py b/tests/test_orchestrator_supervisor.py
index 1107803..428069e 100644
--- a/tests/test_orchestrator_supervisor.py
+++ b/tests/test_orchestrator_supervisor.py
@@ -59,6 +59,13 @@ class _FakeRclaude:
     def send(self, *, text: str, match: str, yes: bool = False, dry_run: bool = False):  # noqa: ARG002
         return None
 
+    # `.kill()` is used to recycle a stale orchestrator session.
+    def kill(self, *, match: str, yes: bool = True) -> str:  # noqa: ARG002
+        self.kill_calls = getattr(self, "kill_calls", [])
+        self.kill_calls.append(match)
+        self._tmux_rows = []  # killed sessions vanish from the tmux roster
+        return "killed"
+
     def with_initial_rows(self, rows: list[_FakeSessionRow]) -> "_FakeRclaude":
         self._initial_rows = rows
         return self
@@ -96,18 +103,22 @@ def test_ensure_running_noop_when_session_already_alive(isolated_cfg: Path) -> N
     leaving the JSONL behind, and the supervisor must detect that and respawn.
     Liveness is checked against `list_tmux()`.
     """
+    import time as _time
     existing = _uuid.uuid4()
     bootstrap.write_session_uuid(str(existing))
     # The cwd-slug `Users-natalie--local-share-clare-orchestrator` is the
-    # substring `_cwd_slug(_DEFAULT_CWD)` looks for in the tmux name.
+    # substring `_cwd_slug(_DEFAULT_CWD)` looks for in the tmux name. The
+    # trailing epoch must be RECENT so the age-recycle check treats it as
+    # fresh (an epoch of `1` would look ancient and trigger a recycle).
     from clare.orchestrator.bootstrap import _DEFAULT_CWD, _cwd_slug
     slug = _cwd_slug(str(_DEFAULT_CWD.expanduser().resolve()))
+    now_epoch = int(_time.time())
     rcl = _FakeRclaude().with_tmux_rows([
-        _FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-1"),
+        _FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-{now_epoch}"),
     ])
     result = bootstrap.ensure_running(rclaude=rcl, discover_timeout_s=1)
     assert result == str(existing)
-    assert rcl.spawn_calls == []  # nothing spawned — already alive
+    assert rcl.spawn_calls == []  # nothing spawned — already alive + fresh
 
 
 def test_ensure_running_respawns_when_session_died(isolated_cfg: Path) -> None:
@@ -130,6 +141,55 @@ def test_ensure_running_respawns_when_session_died(isolated_cfg: Path) -> None:
     assert load_or_init().orchestrator.session_uuid == str(new_uuid)
 
 
+def test_ensure_running_recycles_stale_session(isolated_cfg: Path) -> None:
+    """An alive-but-too-old orchestrator session is killed + respawned.
+
+    Guards the 2026-05-20 regression: the orchestrator ran one session for
+    ~12h, the rounds loop appended 1100+ turns, context bloated, and it
+    stopped replying. The supervisor must recycle past max_session_age_s.
+    """
+    from clare.orchestrator.bootstrap import _DEFAULT_CWD, _cwd_slug
+    existing = _uuid.uuid4()
+    bootstrap.write_session_uuid(str(existing))
+    slug = _cwd_slug(str(_DEFAULT_CWD.expanduser().resolve()))
+    # tmux name with an ANCIENT epoch (year-2001-ish) → far past 6h cap.
+    new_uuid = _uuid.uuid4()
+    rcl = _FakeRclaude(rows_after_spawn=[
+        _FakeSessionRow(
+            host="local", uuid=new_uuid,
+            cwd=str(_DEFAULT_CWD.expanduser().resolve()), mtime_epoch=999,
+        ),
+    ]).with_tmux_rows([
+        _FakeTmuxRow(host="local", session_name=f"claude-natalie-{slug}-1000000000"),
+    ])
+    result = bootstrap.ensure_running(rclaude=rcl, discover_timeout_s=2)
+    # The stale session was killed and a fresh one spawned + persisted.
+    assert getattr(rcl, "kill_calls", []) == [slug]
+    assert len(rcl.spawn_calls) == 1
+    assert result == str(new_uuid)
+
+
+def test_ensure_running_tolerates_unparseable_tmux_names(isolated_cfg: Path) -> None:
+    """A live session whose name lacks an epoch suffix must not crash the check.
+
+    `_tmux_epoch` returns None for such a name; the age check has to skip
+    it rather than feed None into `min()` next to a real epoch.
+    """
+    import time as _time
+    from clare.orchestrator.bootstrap import _DEFAULT_CWD, _cwd_slug
+    existing = _uuid.uuid4()
+    bootstrap.write_session_uuid(str(existing))
+    slug = _cwd_slug(str(_DEFAULT_CWD.expanduser().resolve()))
+    now_epoch = int(_time.time())
+    rcl = _FakeRclaude().with_tmux_rows([
+        _FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-noepoch"),
+        _FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-{now_epoch}"),
+    ])
+    result = bootstrap.ensure_running(rclaude=rcl, discover_timeout_s=1)
+    assert result == str(existing)
+    assert rcl.spawn_calls == []  # the only dated session is fresh
+
+
 def test_ensure_running_returns_none_when_discovery_times_out(
     isolated_cfg: Path,
 ) -> None: