feat(@projects/@clare): ✨ add session age recycling logic
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
d821a2e08c
commit
2d355bcab5
4 changed files with 97 additions and 9 deletions
|
|
@ -59,6 +59,12 @@ class OrchestratorConfig(_Strict):
|
|||
# user. Set True to let Clare dispatch eligible work autonomously
|
||||
# (still gated by the budget envelope + per-host caps).
|
||||
autonomous_dispatch: bool = False
|
||||
# Recycle the orchestrator session once it's older than this many
|
||||
# seconds. The orchestrator is one long-lived Claude session that the
|
||||
# rounds loop appends turns to; without recycling its context grows
|
||||
# unbounded (a 1100-message session wedged on 2026-05-20). The
|
||||
# supervisor kills + respawns past this age. 0 disables. Default 6h.
|
||||
max_session_age_s: int = Field(default=21600, ge=0, le=604800)
|
||||
|
||||
|
||||
class BudgetConfig(_Strict):
|
||||
|
|
|
|||
|
|
@ -274,6 +274,18 @@ def _cwd_slug(cwd: str) -> str:
|
|||
return _re.sub(r"[^A-Za-z0-9]", "-", _re.sub(r"^[~/]+", "", cwd))
|
||||
|
||||
|
||||
def _tmux_epoch(session_name: str) -> int | None:
|
||||
"""Spawn epoch from a tmux name `claude-<user>-<slug>-<epoch>`.
|
||||
|
||||
rclaude appends `-$(date +%s)` when it creates the session, so the
|
||||
trailing all-digit segment is the spawn time. Returns None if the
|
||||
name doesn't end in digits.
|
||||
"""
|
||||
import re as _re
|
||||
m = _re.search(r"-(\d+)$", session_name)
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
def _send_kick(*, rcl: Rclaude, cwd: str) -> None:
|
||||
"""Send a bootstrap message so Claude flushes its session JSONL to disk.
|
||||
|
||||
|
|
@ -386,12 +398,30 @@ def ensure_running(
|
|||
tmux_rows = rcl.list_tmux()
|
||||
except RclaudeError:
|
||||
tmux_rows = []
|
||||
alive = any(
|
||||
r.host == cfg.orchestrator.host and slug in r.session_name
|
||||
for r in tmux_rows
|
||||
)
|
||||
if alive:
|
||||
return cfg.orchestrator.session_uuid
|
||||
live = [
|
||||
r for r in tmux_rows
|
||||
if r.host == cfg.orchestrator.host and slug in r.session_name
|
||||
]
|
||||
if live:
|
||||
# Alive — but recycle if it's older than max_session_age_s.
|
||||
# The orchestrator accumulates a turn per round; left running
|
||||
# for many hours its context bloats until it stops replying.
|
||||
max_age = cfg.orchestrator.max_session_age_s
|
||||
stale = False
|
||||
if max_age > 0:
|
||||
oldest_epoch = min(
|
||||
(_tmux_epoch(r.session_name) for r in live),
|
||||
default=None,
|
||||
)
|
||||
if oldest_epoch is not None and (time.time() - oldest_epoch) > max_age:
|
||||
stale = True
|
||||
if not stale:
|
||||
return cfg.orchestrator.session_uuid
|
||||
# Stale — kill the live session(s); fall through to spawn fresh.
|
||||
try:
|
||||
rcl.kill(match=slug, yes=True)
|
||||
except RclaudeError:
|
||||
pass # best-effort; spawn proceeds either way
|
||||
|
||||
# Snapshot pre-spawn session uuids at this cwd so discovery can ignore
|
||||
# stale disk JSONLs from previous Claude runs (Claude doesn't delete
|
||||
|
|
|
|||
|
|
@ -265,3 +265,16 @@ class Rclaude:
|
|||
# back into structured form (rclaude prints stderr-aligned text).
|
||||
# Push A surfaces the raw output; consumers can show it to the user.
|
||||
return SendResult(targets=(), delivered=yes and not dry_run, raw_output=raw)
|
||||
|
||||
def kill(self, *, match: str, yes: bool = True) -> str:
    """Run `rclaude kill --match <pat>` to end the matching tmux sessions.

    The supervisor uses this to recycle a stale orchestrator session.
    `--yes` is appended to the command only when `yes` is true. The raw
    rclaude output is returned. `RclaudeError` propagates on a non-zero
    exit (e.g. no matching sessions), so callers that consider a no-match
    benign should catch it.
    """
    confirm = ["--yes"] if yes else []
    return self._run(["kill", "--match", match, *confirm])
|
||||
|
|
|
|||
|
|
@ -59,6 +59,13 @@ class _FakeRclaude:
|
|||
def send(self, *, text: str, match: str, yes: bool = False, dry_run: bool = False): # noqa: ARG002
|
||||
return None
|
||||
|
||||
# `.kill()` is used to recycle a stale orchestrator session.
|
||||
def kill(self, *, match: str, yes: bool = True) -> str:  # noqa: ARG002
    """Record the kill request and empty the fake tmux roster.

    Every `match` pattern is appended to `kill_calls`, which is created
    on first use. Always returns the fixed string "killed".
    """
    try:
        recorded = self.kill_calls
    except AttributeError:
        recorded = []
    recorded.append(match)
    self.kill_calls = recorded
    self._tmux_rows = []  # killed sessions vanish from the tmux roster
    return "killed"
|
||||
|
||||
def with_initial_rows(self, rows: list[_FakeSessionRow]) -> "_FakeRclaude":
    """Store `rows` as this fake's initial session rows and return self.

    Returning the instance lets the call be chained onto the constructor.
    """
    self._initial_rows = rows
    return self
|
||||
|
|
@ -96,18 +103,22 @@ def test_ensure_running_noop_when_session_already_alive(isolated_cfg: Path) -> N
|
|||
leaving the JSONL behind, and the supervisor must detect that and
|
||||
respawn. Liveness is checked against `list_tmux()`.
|
||||
"""
|
||||
import time as _time
|
||||
existing = _uuid.uuid4()
|
||||
bootstrap.write_session_uuid(str(existing))
|
||||
# The cwd-slug `Users-natalie--local-share-clare-orchestrator` is the
|
||||
# substring `_cwd_slug(_DEFAULT_CWD)` looks for in the tmux name.
|
||||
# substring `_cwd_slug(_DEFAULT_CWD)` looks for in the tmux name. The
|
||||
# trailing epoch must be RECENT so the age-recycle check treats it as
|
||||
# fresh (an epoch of `1` would look ancient and trigger a recycle).
|
||||
from clare.orchestrator.bootstrap import _DEFAULT_CWD, _cwd_slug
|
||||
slug = _cwd_slug(str(_DEFAULT_CWD.expanduser().resolve()))
|
||||
now_epoch = int(_time.time())
|
||||
rcl = _FakeRclaude().with_tmux_rows([
|
||||
_FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-1"),
|
||||
_FakeTmuxRow(host="local", session_name=f"claude-tester-{slug}-{now_epoch}"),
|
||||
])
|
||||
result = bootstrap.ensure_running(rclaude=rcl, discover_timeout_s=1)
|
||||
assert result == str(existing)
|
||||
assert rcl.spawn_calls == [] # nothing spawned — already alive
|
||||
assert rcl.spawn_calls == [] # nothing spawned — already alive + fresh
|
||||
|
||||
|
||||
def test_ensure_running_respawns_when_session_died(isolated_cfg: Path) -> None:
|
||||
|
|
@ -130,6 +141,34 @@ def test_ensure_running_respawns_when_session_died(isolated_cfg: Path) -> None:
|
|||
assert load_or_init().orchestrator.session_uuid == str(new_uuid)
|
||||
|
||||
|
||||
def test_ensure_running_recycles_stale_session(isolated_cfg: Path) -> None:
    """A live orchestrator session past its age cap is killed and respawned.

    Guards the 2026-05-20 regression: the orchestrator ran one session for
    ~12h, the rounds loop appended 1100+ turns, context bloated, and it
    stopped replying. The supervisor must recycle past max_session_age_s.
    """
    from clare.orchestrator.bootstrap import _DEFAULT_CWD, _cwd_slug

    existing = _uuid.uuid4()
    bootstrap.write_session_uuid(str(existing))

    orchestrator_cwd = str(_DEFAULT_CWD.expanduser().resolve())
    slug = _cwd_slug(orchestrator_cwd)

    # Session row that discovery should find once the fresh spawn happens.
    new_uuid = _uuid.uuid4()
    spawned_row = _FakeSessionRow(
        host="local",
        uuid=new_uuid,
        cwd=orchestrator_cwd,
        mtime_epoch=999,
    )
    # tmux name with an ANCIENT epoch (year-2001-ish) → far past 6h cap.
    stale_tmux_row = _FakeTmuxRow(
        host="local",
        session_name=f"claude-natalie-{slug}-1000000000",
    )
    rcl = _FakeRclaude(rows_after_spawn=[spawned_row]).with_tmux_rows(
        [stale_tmux_row]
    )

    result = bootstrap.ensure_running(rclaude=rcl, discover_timeout_s=2)

    # The stale session was killed and a fresh one spawned + persisted.
    assert getattr(rcl, "kill_calls", []) == [slug]
    assert len(rcl.spawn_calls) == 1
    assert result == str(new_uuid)
|
||||
|
||||
|
||||
def test_ensure_running_returns_none_when_discovery_times_out(
|
||||
isolated_cfg: Path,
|
||||
) -> None:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue