feat(@projects/@claire): supervisor auto-resume of dead worker sessions

When a local worker pane dies (crash, OOM, host power-cycle), its JSONL persists
and is resumable. The agent supervisor now detects dead-but-recent local
sessions and `claude --resume <uuid>`s them, then sends a re-orient kick so the
session re-determines its OWN state (done vs pending vs finished) before acting
— mirrors the orchestrator's rehydrate-on-startup.

- rclaude.Rclaude.resume(): spawn `claude --resume <uuid>` via RCLAUDE_RESUME_ID
  (verified empirically against a real dead session on apricot).
- supervisor.select_resume_candidates(): pure, guarded selection — recency
  window, supersession (skip if a LIVE session shares the cwd), orchestrator-
  workspace exclusion, per-session retry cap, per-tick global ceiling (the
  first-wake token-storm guard). 7 unit tests.
- AgentConfig.auto_resume off|dry-run|on (default off) + max/per_tick/window.
  Ships off; roll out via dry-run, then on — same pattern as auto_continue.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Natalie 2026-06-03 01:12:33 -07:00
parent c5e63c5942
commit 24c6f24f43
4 changed files with 292 additions and 5 deletions

View file

@ -25,9 +25,107 @@ logger = logging.getLogger(__name__)
_escalation_state: dict[str, int] = {}
# session_uuid -> consecutive auto-continue nudges (reset when it recovers).
_auto_continue_state: dict[str, int] = {}
# session_uuid -> consecutive auto-resume attempts (reset once seen live again).
_auto_resume_state: dict[str, int] = {}
_LOCAL = "local"
# Sent to a session right after it is auto-resumed. The session must
# re-establish its OWN state before acting — it cannot assume it was mid-task
# (it may have already finished, or — though the supersession guard prevents
# this — been superseded). Mirrors the orchestrator's rehydrate-on-startup.
_REORIENT_KICK = (
"[resumed] Your session was auto-resumed after its pane died (deploy, "
"crash, OOM, or host reboot — not anything you did wrong). Before doing "
"ANYTHING else, DETERMINE YOUR CURRENT STATE: re-read your assigned task "
"and your own recent transcript, work out what you already completed vs "
"what is still pending, and whether you are in fact already finished. THEN "
"either continue from exactly where you left off, or — if the work is done "
"— report completion via report_status and stop. Do NOT blindly redo work "
"you already completed."
)
def _is_orchestrator_cwd(cwd: str | None) -> bool:
"""True if `cwd` is an orchestrator workspace (the `[<host>] claire` panes).
Those have their own bootstrap + age-recycle lifecycle; auto-resuming them
would race that machinery, so they're excluded. Matched by path suffix
(robust across hosts/accounts) rather than a session-name substring.
"""
if not cwd:
return False
return cwd.rstrip("/").endswith("claire/orchestrator")
def _stage_dispatch_mcp(host: str) -> str | None:
"""Stage the worker MCP config so a resumed session regains the same
`claire:*` tools a freshly-dispatched one gets (report_status, etc.).
Returns the staged path, or None if staging fails the session still
resumes, just degraded (invisible to the fleet view), never blocking it.
"""
try:
from ..orchestrator.bootstrap import stage_dispatch_mcp_config
return stage_dispatch_mcp_config(host)
except Exception as exc: # noqa: BLE001
logger.warning("auto-resume: dispatch MCP staging failed: %s", exc)
return None
def select_resume_candidates(
sessions,
tmux_rows,
*,
window_s: int,
now: float,
attempts: dict[str, int],
max_attempts: int,
max_per_tick: int,
):
"""Pure selection of DEAD-but-recent LOCAL sessions to auto-resume.
A candidate is a local session with no live tmux pane whose JSONL mtime is
within `window_s` (recently alive not a graveyard entry). Guards, in
order, each producing a `(session, reason)` skip record for logging:
orchestrator-workspace cwd excluded (own lifecycle);
supersession: a LIVE local session shares the dead one's cwd → excluded
(re-spawning would put two sessions in one workspace the re-orient
prompt can't detect a sibling, so this MUST be a hard guard);
per-session retry cap (`max_attempts`) excluded;
per-tick global ceiling (`max_per_tick`) the overflow is deferred.
Returns `(to_resume, skipped)`. `to_resume` is capped at `max_per_tick`.
"""
live_uuids = {
str(r.resumed_uuid) for r in tmux_rows if getattr(r, "resumed_uuid", None)
}
live_cwds = {
s.cwd for s in sessions if s.host == _LOCAL and str(s.uuid) in live_uuids
}
eligible = []
skipped: list[tuple] = []
for s in sessions:
if s.host != _LOCAL:
continue
if str(s.uuid) in live_uuids:
continue # alive — not a resume candidate
if now - s.mtime_epoch > window_s:
continue # too old to count as "recently alive" — silently ignore
if _is_orchestrator_cwd(s.cwd):
skipped.append((s, "orchestrator-workspace"))
continue
if s.cwd in live_cwds:
skipped.append((s, "superseded-by-live-session-in-cwd"))
continue
if attempts.get(str(s.uuid), 0) >= max_attempts:
skipped.append((s, "retry-cap-reached"))
continue
eligible.append(s)
for s in eligible[max_per_tick:]:
skipped.append((s, "per-tick-ceiling-deferred"))
return eligible[:max_per_tick], skipped
# Task states where a session is NOT just "idle with more to do" — it's
# genuinely parked (awaiting review or a human, or finished). Auto-continue
# must never nudge these, or it churns work that's deliberately paused.
@ -107,11 +205,13 @@ async def supervisor_loop(*, config_path: Path | None, db_path: Path | None) ->
allow_respawn = cfg.agent.supervisor_allow_respawn
ac_mode = cfg.agent.auto_continue # off | dry-run | on
ac_max = cfg.agent.auto_continue_max
ar_mode = cfg.agent.auto_resume # off | dry-run | on
poll = min(60, max(20, threshold // 3))
rcl = Rclaude()
logger.info(
"agent supervisor loop enabled (poll %ds, wedge>%ds, respawn=%s, auto_continue=%s)",
poll, threshold, allow_respawn, ac_mode,
"agent supervisor loop enabled (poll %ds, wedge>%ds, respawn=%s, "
"auto_continue=%s, auto_resume=%s)",
poll, threshold, allow_respawn, ac_mode, ar_mode,
)
while True:
@ -181,12 +281,65 @@ async def supervisor_loop(*, config_path: Path | None, db_path: Path | None) ->
except RclaudeError as exc:
logger.warning("supervisor respawn-kill failed: %s", exc)
# Auto-resume: resurrect DEAD-but-recent local sessions. Gated +
# capped; ships off → dry-run → on. The re-orient kick makes each
# resumed session re-determine its own state before acting.
live_uuids = {
str(r.resumed_uuid)
for r in tmux_rows
if getattr(r, "resumed_uuid", None)
}
if ar_mode != "off":
to_resume, skipped = select_resume_candidates(
sessions, tmux_rows,
window_s=cfg.agent.auto_resume_window_s, now=time.time(),
attempts=_auto_resume_state,
max_attempts=cfg.agent.auto_resume_max,
max_per_tick=cfg.agent.auto_resume_max_per_tick,
)
for s, reason in skipped:
logger.info(
"supervisor: auto-resume SKIP %s (%s)", str(s.uuid)[:8], reason
)
for s in to_resume:
key = str(s.uuid)
n = _auto_resume_state.get(key, 0) + 1
_auto_resume_state[key] = n
if ar_mode == "dry-run":
logger.info(
"supervisor: WOULD auto-resume %s in %s (x%d/%d)",
key[:8], s.cwd, n, cfg.agent.auto_resume_max,
)
continue
logger.warning(
"supervisor: auto-resuming %s in %s (x%d/%d)",
key[:8], s.cwd, n, cfg.agent.auto_resume_max,
)
try:
mcp_cfg = _stage_dispatch_mcp(_LOCAL)
new_name = await asyncio.to_thread(
rcl.resume,
host=_LOCAL, cwd=s.cwd, resume_uuid=key,
mcp_config=mcp_cfg, name=None,
)
await asyncio.sleep(8) # let tmux + claude reach the prompt
await asyncio.to_thread(
rcl.send, text=_REORIENT_KICK, match=new_name, yes=True
)
except Exception as exc: # noqa: BLE001
logger.warning("supervisor auto-resume failed for %s: %s", key[:8], exc)
# Reset counters for sessions that recovered (no longer wedged).
wedged_keys = {str(s.uuid) for s in wedged}
for d in (_escalation_state, _auto_continue_state):
for key in list(d):
if key not in wedged_keys:
d.pop(key, None)
# Auto-resume counter resets once the session is live again (the
# resume stuck); a later death is then eligible to resume afresh.
for key in list(_auto_resume_state):
if key in live_uuids:
_auto_resume_state.pop(key, None)
except asyncio.CancelledError:
return
except Exception as exc: # noqa: BLE001

View file

@ -82,6 +82,21 @@ class AgentConfig(_Strict):
# capped at `auto_continue_max` consecutive nudges per session.
auto_continue: Literal["off", "dry-run", "on"] = "off"
auto_continue_max: int = Field(default=3, ge=1, le=20)
# Auto-resume DEAD local worker sessions (pane gone — crash/OOM/host reboot;
# the KillMode=process unit fix already stops deploy-kills). On a death the
# session JSONL persists and is resumable, so the supervisor can spawn
# `claude --resume <uuid>` and send a re-orient kick so the session
# re-determines its OWN state (what's done/pending/finished) before acting.
# "off" (default) | "dry-run" (LOG the would-resume set — validate first) |
# "on" (actually resume). Guarded: recency window, supersession (skip if a
# LIVE session shares the dead one's cwd), orchestrator-workspace exclusion
# (those have their own bootstrap/recycle lifecycle), per-session retry cap,
# and a per-tick global ceiling — the real guard against a first-wake token
# storm when many recent sessions are found dead at once.
auto_resume: Literal["off", "dry-run", "on"] = "off"
auto_resume_max: int = Field(default=3, ge=1, le=20)
auto_resume_max_per_tick: int = Field(default=3, ge=1, le=50)
auto_resume_window_s: int = Field(default=86400, ge=300, le=604800)
# When True, this peer node ALSO bootstraps + registers a local orchestrator
# session (`[<host>] claire` in the remote dev list) alongside its sync/
# supervisor/telemetry loops — so every host is remote-controllable. The
@ -406,6 +421,10 @@ def _serialize(cfg: ClaireConfig) -> str:
)
lines.append(f'auto_continue = "{ag.auto_continue}"')
lines.append(f"auto_continue_max = {ag.auto_continue_max}")
lines.append(f'auto_resume = "{ag.auto_resume}"')
lines.append(f"auto_resume_max = {ag.auto_resume_max}")
lines.append(f"auto_resume_max_per_tick = {ag.auto_resume_max_per_tick}")
lines.append(f"auto_resume_window_s = {ag.auto_resume_window_s}")
lines.append(
f"orchestrator_enable = {str(ag.orchestrator_enable).lower()}"
)

View file

@ -284,6 +284,39 @@ class Rclaude:
raise RclaudeError("rclaude spawn returned no tmux session name")
return name
def resume(
self,
*,
host: str,
cwd: str,
resume_uuid: str,
mcp_config: str | None = None,
name: str | None = None,
) -> str:
"""Resurrect a DEAD session by uuid: spawn `claude --resume <uuid>` in a
fresh detached tmux at `cwd` on `host`.
Mirrors `spawn()` but injects `RCLAUDE_RESUME_ID` which rclaude honors
to resume the given session instead of starting fresh (the inverse of
the leak guard `spawn()` documents) and deliberately does NOT strip the
resume env vars. The new pane carries `claude --resume <uuid>` in its
command, so `list_tmux` reports `resumed_uuid == <uuid>` and the pull
loop re-discovers it. Returns the new tmux session name.
"""
env: dict[str, str] = {
"RCLAUDE_DETACHED": "1",
"RCLAUDE_RESUME_ID": resume_uuid,
}
if mcp_config is not None:
env["RCLAUDE_MCP_CONFIG"] = mcp_config
if name is not None:
env["RCLAUDE_SESSION_NAME"] = name
raw = self._run([host, cwd], env_extra=env) # NB: keep the resume vars
out = raw.strip().splitlines()[-1] if raw.strip() else ""
if not out:
raise RclaudeError("rclaude resume returned no tmux session name")
return out
def send(
self,
*,

View file

@ -4,14 +4,19 @@ from __future__ import annotations
from uuid import uuid4
from claire.agent.supervisor import detect_wedged_and_orphaned, should_auto_continue
from claire.agent.supervisor import (
_is_orchestrator_cwd,
detect_wedged_and_orphaned,
select_resume_candidates,
should_auto_continue,
)
from claire.rclaude import SessionRow, TmuxRow
NOW = 1_000_000.0
def _sess(uuid, *, host="local", age_s=0):
return SessionRow(host=host, uuid=uuid, snippet="", cwd="/x", mtime_epoch=int(NOW) - age_s)
def _sess(uuid, *, host="local", age_s=0, cwd="/x"):
return SessionRow(host=host, uuid=uuid, snippet="", cwd=cwd, mtime_epoch=int(NOW) - age_s)
def _tmux(resumed_uuid, *, host="local", name="claude-x-1"):
@ -71,3 +76,80 @@ def test_no_resumed_uuid_means_no_wedge_classification():
)
assert wedged == [] # not classified wedged without correlation
assert [s.uuid for s in orphaned] == [u] # no live pane matched → orphaned
# --- auto-resume selection (pure) ------------------------------------------
_W = 86_400 # resume recency window used in these tests
def _resume(sessions, tmux_rows, *, attempts=None, max_attempts=3, max_per_tick=3):
return select_resume_candidates(
sessions, tmux_rows,
window_s=_W, now=NOW, attempts=attempts or {},
max_attempts=max_attempts, max_per_tick=max_per_tick,
)
def test_is_orchestrator_cwd():
assert _is_orchestrator_cwd("/var/home/lilith/.local/share/claire/orchestrator")
assert _is_orchestrator_cwd("/home/x/.local/share/claire/orchestrator/") # trailing slash
assert not _is_orchestrator_cwd("/home/x/Code/@projects/@lilith/lilith-platform.live")
assert not _is_orchestrator_cwd(None)
def test_auto_resume_recency_window():
fresh, old = uuid4(), uuid4()
sessions = [_sess(fresh, age_s=10, cwd="/a"), _sess(old, age_s=_W + 5, cwd="/b")]
to_resume, _ = _resume(sessions, []) # no live panes
keys = {str(s.uuid) for s in to_resume}
assert str(fresh) in keys # recently alive → candidate
assert str(old) not in keys # beyond window → ignored (graveyard)
def test_auto_resume_supersession_guard():
"""A dead session whose cwd already has a LIVE session must NOT be resumed."""
dead, live = uuid4(), uuid4()
sessions = [_sess(dead, age_s=30, cwd="/shared"), _sess(live, age_s=5, cwd="/shared")]
tmux = [_tmux(str(live))] # `live` has a pane; `dead` does not
to_resume, skipped = _resume(sessions, tmux)
assert [str(s.uuid) for s in to_resume] == []
assert any(str(s.uuid) == str(dead) and r == "superseded-by-live-session-in-cwd"
for s, r in skipped)
def test_auto_resume_excludes_orchestrator_workspace():
orch = uuid4()
sessions = [_sess(orch, age_s=20, cwd="/home/x/.local/share/claire/orchestrator")]
to_resume, skipped = _resume(sessions, [])
assert to_resume == []
assert any(r == "orchestrator-workspace" for _, r in skipped)
def test_auto_resume_per_session_retry_cap():
capped = uuid4()
sessions = [_sess(capped, age_s=15, cwd="/a")]
to_resume, skipped = _resume(sessions, [], attempts={str(capped): 3}, max_attempts=3)
assert to_resume == []
assert any(r == "retry-cap-reached" for _, r in skipped)
def test_auto_resume_per_tick_global_ceiling():
ids = [uuid4() for _ in range(5)]
sessions = [_sess(u, age_s=10, cwd=f"/cwd/{i}") for i, u in enumerate(ids)]
to_resume, skipped = _resume(sessions, [], max_per_tick=2)
assert len(to_resume) == 2 # ceiling enforced
assert sum(1 for _, r in skipped if r == "per-tick-ceiling-deferred") == 3
def test_auto_resume_ignores_remote_and_live():
local_dead, remote_dead, local_live = uuid4(), uuid4(), uuid4()
sessions = [
_sess(local_dead, age_s=10, cwd="/a"),
_sess(remote_dead, host="apricot", age_s=10, cwd="/b"),
_sess(local_live, age_s=10, cwd="/c"),
]
tmux = [_tmux(str(local_live))]
to_resume, _ = _resume(sessions, tmux)
keys = {str(s.uuid) for s in to_resume}
assert keys == {str(local_dead)} # not remote (not ours), not live