feat(@projects/@clare): add session age recycling logic

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-05-20 17:29:42 -07:00
parent d821a2e08c
commit 2d355bcab5
4 changed files with 97 additions and 9 deletions

View file

@ -59,6 +59,12 @@ class OrchestratorConfig(_Strict):
# user. Set True to let Clare dispatch eligible work autonomously
# (still gated by the budget envelope + per-host caps).
autonomous_dispatch: bool = False
# Recycle the orchestrator session once it's older than this many
# seconds. The orchestrator is one long-lived Claude session that the
# rounds loop appends turns to; without recycling its context grows
# unbounded (a 1100-message session wedged on 2026-05-20). The
# supervisor kills + respawns past this age. 0 disables. Default 6h.
max_session_age_s: int = Field(default=21600, ge=0, le=604800)
class BudgetConfig(_Strict):

View file

@ -274,6 +274,18 @@ def _cwd_slug(cwd: str) -> str:
return _re.sub(r"[^A-Za-z0-9]", "-", _re.sub(r"^[~/]+", "", cwd))
def _tmux_epoch(session_name: str) -> int | None:
"""Spawn epoch from a tmux name `claude-<user>-<slug>-<epoch>`.
rclaude appends `-$(date +%s)` when it creates the session, so the
trailing all-digit segment is the spawn time. Returns None if the
name doesn't end in digits.
"""
import re as _re
m = _re.search(r"-(\d+)$", session_name)
return int(m.group(1)) if m else None
def _send_kick(*, rcl: Rclaude, cwd: str) -> None:
"""Send a bootstrap message so Claude flushes its session JSONL to disk.
@ -386,12 +398,30 @@ def ensure_running(
tmux_rows = rcl.list_tmux()
except RclaudeError:
tmux_rows = []
alive = any(
r.host == cfg.orchestrator.host and slug in r.session_name
for r in tmux_rows
)
if alive:
return cfg.orchestrator.session_uuid
live = [
r for r in tmux_rows
if r.host == cfg.orchestrator.host and slug in r.session_name
]
if live:
# Alive — but recycle if it's older than max_session_age_s.
# The orchestrator accumulates a turn per round; left running
# for many hours its context bloats until it stops replying.
max_age = cfg.orchestrator.max_session_age_s
stale = False
if max_age > 0:
oldest_epoch = min(
(_tmux_epoch(r.session_name) for r in live),
default=None,
)
if oldest_epoch is not None and (time.time() - oldest_epoch) > max_age:
stale = True
if not stale:
return cfg.orchestrator.session_uuid
# Stale — kill the live session(s); fall through to spawn fresh.
try:
rcl.kill(match=slug, yes=True)
except RclaudeError:
pass # best-effort; spawn proceeds either way
# Snapshot pre-spawn session uuids at this cwd so discovery can ignore
# stale disk JSONLs from previous Claude runs (Claude doesn't delete

View file

@ -265,3 +265,16 @@ class Rclaude:
# back into structured form (rclaude prints stderr-aligned text).
# Push A surfaces the raw output; consumers can show it to the user.
return SendResult(targets=(), delivered=yes and not dry_run, raw_output=raw)
def kill(self, *, match: str, yes: bool = True) -> str:
    """Run `rclaude kill --match <pat>` to end the matching tmux sessions.

    The supervisor uses this to recycle a stale orchestrator session.

    Args:
        match: Pattern forwarded to `--match`.
        yes: When true, `--yes` is appended to the command line.

    Returns:
        rclaude's raw output, as returned by `self._run`.

    Raises:
        RclaudeError: Propagated on a non-zero exit (e.g. no matching
            sessions); callers that treat a no-match as benign should
            catch it.
    """
    confirm = ["--yes"] if yes else []
    return self._run(["kill", "--match", match, *confirm])

View file

@ -59,6 +59,13 @@ class _FakeRclaude:
def send(self, *, text: str, match: str, yes: bool = False, dry_run: bool = False):  # noqa: ARG002
    """No-op fake of `send`: accept the keyword arguments and return None."""
    return None
# `.kill()` is used to recycle a stale orchestrator session.
def kill(self, *, match: str, yes: bool = True) -> str:  # noqa: ARG002
    """Fake of `kill`: record the requested pattern and empty the tmux roster.

    Each `match` is appended to `self.kill_calls` (created on first use)
    so tests can assert on it. `self._tmux_rows` is reset to an empty
    list, i.e. every row is dropped regardless of `match`. Always returns
    the string "killed".
    """
    try:
        recorded = self.kill_calls
    except AttributeError:
        # First call on this fake: start a fresh call log.
        recorded = []
    self.kill_calls = recorded
    recorded.append(match)
    self._tmux_rows = []  # killed sessions vanish from the tmux roster
    return "killed"
def with_initial_rows(self, rows: list[_FakeSessionRow]) -> "_FakeRclaude":
    """Store `rows` as `self._initial_rows` and return `self` for chaining."""
    self._initial_rows = rows
    return self
@ -96,18 +103,22 @@ def test_ensure_running_noop_when_session_already_alive(isolated_cfg: Path) -> N
leaving the JSONL behind, and the supervisor must detect that and
respawn. Liveness is checked against `list_tmux()`.
"""
import time as _time
existing = _uuid.uuid4()
bootstrap.write_session_uuid(str(existing))
# The cwd-slug `Users-natalie--local-share-clare-orchestrator` is the
# substring `_cwd_slug(_DEFAULT_CWD)` looks for in the tmux name.
# substring `_cwd_slug(_DEFAULT_CWD)` looks for in the tmux name. The
# trailing epoch must be RECENT so the age-recycle check treats it as
# fresh (an epoch of `1` would look ancient and trigger a recycle).
from clare.orchestrator.bootstrap import _DEFAULT_CWD, _cwd_slug
slug = _cwd_slug(str(_DEFAULT_CWD.expanduser().resolve()))
now_epoch = int(_time.time())
rcl = _FakeRclaude().with_tmux_rows([
_FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-1"),
_FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-{now_epoch}"),
])
result = bootstrap.ensure_running(rclaude=rcl, discover_timeout_s=1)
assert result == str(existing)
assert rcl.spawn_calls == [] # nothing spawned — already alive
assert rcl.spawn_calls == [] # nothing spawned — already alive + fresh
def test_ensure_running_respawns_when_session_died(isolated_cfg: Path) -> None:
@ -130,6 +141,34 @@ def test_ensure_running_respawns_when_session_died(isolated_cfg: Path) -> None:
assert load_or_init().orchestrator.session_uuid == str(new_uuid)
def test_ensure_running_recycles_stale_session(isolated_cfg: Path) -> None:
    """A live orchestrator session past its age cap is killed and respawned.

    Guards the 2026-05-20 regression: the orchestrator ran one session for
    ~12h, the rounds loop appended 1100+ turns, context bloated, and it
    stopped replying. The supervisor must recycle past max_session_age_s.
    """
    from clare.orchestrator.bootstrap import _DEFAULT_CWD, _cwd_slug

    # Persist a pre-existing session uuid, as if the orchestrator had been
    # running before this call.
    existing = _uuid.uuid4()
    bootstrap.write_session_uuid(str(existing))

    cwd = str(_DEFAULT_CWD.expanduser().resolve())
    slug = _cwd_slug(cwd)

    # Row the fake is constructed with via `rows_after_spawn`.
    new_uuid = _uuid.uuid4()
    respawned = _FakeSessionRow(host="local", uuid=new_uuid, cwd=cwd, mtime_epoch=999)
    fake = _FakeRclaude(rows_after_spawn=[respawned])

    # tmux name with an ANCIENT epoch (year-2001-ish) → far past 6h cap.
    rcl = fake.with_tmux_rows([
        _FakeTmuxRow(host="local", session_name=f"claude-natalie-{slug}-1000000000"),
    ])

    result = bootstrap.ensure_running(rclaude=rcl, discover_timeout_s=2)

    # The stale session was killed and a fresh one spawned + persisted.
    assert getattr(rcl, "kill_calls", []) == [slug]
    assert len(rcl.spawn_calls) == 1
    assert result == str(new_uuid)
def test_ensure_running_returns_none_when_discovery_times_out(
isolated_cfg: Path,
) -> None: