From 2d978b5fe303fca6d4a1384fe6deac9ddc9560d5 Mon Sep 17 00:00:00 2001 From: Natalie Date: Fri, 29 May 2026 22:46:28 -0600 Subject: [PATCH] fix(dispatch): discover sessions by canonical host label, not raw "local" discover_session polled `rclaude list sessions` for the freshly spawned session but filtered rows with `r.host == host` where host is the canonical name (e.g. "plum"), while rclaude labels the calling machine's own sessions "local". "local" == "plum" is always False, so discovery matched nothing and timed out even though the session's JSONL was already on disk (observed: 18s after spawn, inside the 30s window). dispatch then falsely returned "spawned but not discovered", orphaning the live session until a manual pull. Root cause is a missing host-label normalization the pull loop already does. Fix discover_session to canonicalize both sides via resolve_host_label, and key local-path symlink resolution on the ROW's raw label. Apply the same normalization to dispatch_task's pre_uuids filter (identical mismatch left it empty, risking a stale-sibling match at a shared cwd). 2 regression tests reproduce rclaude's "local" labeling (the old fake echoed the dispatch host, masking the bug). 310 tests pass. Committed manually with ALLOW_COMMIT=1 per user authorization: the auto-commit service's message LLM was timing out on this repo. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/claire/orchestrator/bootstrap.py | 30 +++++++--- src/claire/web/service.py | 8 ++- tests/test_dispatch.py | 85 ++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 8 deletions(-) diff --git a/src/claire/orchestrator/bootstrap.py b/src/claire/orchestrator/bootstrap.py index b11a52a..5a8d4f9 100644 --- a/src/claire/orchestrator/bootstrap.py +++ b/src/claire/orchestrator/bootstrap.py @@ -199,8 +199,30 @@ def discover_session( surviving session wins on ties. """ rcl = rclaude or Rclaude() + # rclaude reports the calling machine as "local" regardless of its real + # name, so compare host labels in their CANONICAL form — exactly as the + # pull loop does (pull.py: `cfg.resolve_host_label(row.host)`). Without + # this, dispatching to a named host (e.g. "plum") never matches the rows + # rclaude labels "local", and discovery always times out even though the + # session's JSONL is already on disk. + cfg = load_or_init() + target_host = cfg.resolve_host_label(host) target = str(cwd.resolve()) if isinstance(cwd, Path) else cwd.rstrip("/") skip = ignore_uuids or set() + + def _row_is_local(r) -> bool: + # rclaude tags the calling machine's own sessions "local"; only those + # live on this filesystem and so need symlink resolution. Keyed on the + # ROW's raw label, not the (canonical) `host` param. + return r.host == "local" + + def _row_cwd(r) -> str: + return ( + str(Path(r.cwd).resolve()) + if _row_is_local(r) + else (r.cwd or "").rstrip("/") + ) + deadline = time.time() + timeout_s while time.time() < deadline: try: @@ -208,15 +230,9 @@ def discover_session( except RclaudeError: time.sleep(poll_interval_s) continue - def _row_cwd(r): - return ( - str(Path(r.cwd).resolve()) - if host == "local" - else (r.cwd or "").rstrip("/") - ) matches = [ r for r in rows - if r.host == host + if cfg.resolve_host_label(r.host) == target_host and r.cwd and _row_cwd(r) == target and str(r.uuid) not in skip diff --git a/src/claire/web/service.py b/src/claire/web/service.py index c4d7b8c..52742d9 100644 --- a/src/claire/web/service.py +++ b/src/claire/web/service.py @@ -1425,9 +1425,15 @@ def dispatch_task( pre_rows = rcl.list_sessions() except RclaudeError: pre_rows = [] + # Canonicalize host labels here too: rclaude tags this machine's rows + # "local", so `r.host == host` (host e.g. "plum") would never match and + # the pre-existing-session set would be empty — letting discovery latch + # onto a STALE sibling session at the same cwd (e.g. another workstream + # sharing the repo root). Mirror discover_session / pull normalization. pre_uuids = { str(r.uuid) for r in pre_rows - if r.host == host and r.cwd and r.cwd.rstrip("/") == cwd.rstrip("/") + if cfg.resolve_host_label(r.host) == canonical_host + and r.cwd and r.cwd.rstrip("/") == cwd.rstrip("/") } # Wire the dispatched agent's `claire:*` MCP tools (it needs `report_status`). # Stage a claire `.mcp.json` to a stable path *outside* the task repo on the diff --git a/tests/test_dispatch.py b/tests/test_dispatch.py index 6e747ef..3a13847 100644 --- a/tests/test_dispatch.py +++ b/tests/test_dispatch.py @@ -166,6 +166,91 @@ def test_dispatch_success() -> None: assert sess["tmux_name"] == tmux_name +class _FakeRclaudeLocalLabel(_FakeRclaude): + """Reproduces rclaude's real labeling: it reports the calling machine's + own sessions as host ``"local"``, NOT the canonical fleet name. The + base fake echoes the dispatch host, which masked the host-label bug.""" + + def spawn(self, *, host, cwd, mcp_config=None, name=None): # noqa: ARG002 + from claire.rclaude import RclaudeError + if not self._spawn_ok: + raise RclaudeError("spawn failed") + self.spawn_calls.append( + {"host": host, "cwd": cwd, "mcp_config": mcp_config, "name": name} + ) + # The on-disk session row is labeled "local" by rclaude regardless of + # the canonical host we dispatched to. + self._rows.append(_FakeSessionRow( + host="local", uuid=_uuid.uuid4(), cwd=cwd, mtime_epoch=999, + )) + return f"claude-tester-{len(self.spawn_calls)}" + + +def test_dispatch_discovers_session_labeled_local(monkeypatch, tmp_path) -> None: + # Regression: rclaude tags this machine's sessions "local"; dispatching to + # the machine's canonical name ("plum") must still discover them. The old + # `r.host == host` filter compared "local" == "plum" and never matched, so + # discovery timed out and dispatch falsely reported "spawned but not + # discovered" — even though the JSONL was already on disk. + monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config")) + cfg_path = tmp_path / "config" / "claire" / "claire.toml" + cfg_path.parent.mkdir(parents=True, exist_ok=True) + cfg_path.write_text( + 'machine_id = "m"\nthis_host = "plum"\n' + '\n[web]\nhost = "127.0.0.1"\nport = 8765\n', + encoding="utf-8", + ) + conn, gen = _setup() + task_id = _project_with_task(conn, gen) + rcl = _FakeRclaudeLocalLabel() + result = service.dispatch_task( + conn, gen, task_id=task_id, host="plum", cwd="/work", + rclaude=rcl, discover_timeout_s=2, mcp_stager=_no_mcp_stager, + ) + assert result.dispatched is True, result.reason + assert result.session_uuid is not None + assert result.assignment_id is not None + # The assignment binds the task to the discovered (local-labeled) session. + rows = conn.execute( + "SELECT COUNT(*) FROM assignments WHERE task_id = ?", (str(task_id),) + ).fetchone() + assert rows[0] == 1 + + +def test_dispatch_ignores_stale_sibling_at_same_cwd(monkeypatch, tmp_path) -> None: + # Regression: the `pre_uuids` filter had the same "local" vs canonical + # mismatch, so it was empty — letting discovery latch onto a STALE sibling + # session already at the cwd instead of the freshly spawned one. With the + # fix, the pre-existing local-labeled session is excluded and the NEW + # session (higher mtime) is the one assigned. + monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config")) + cfg_path = tmp_path / "config" / "claire" / "claire.toml" + cfg_path.parent.mkdir(parents=True, exist_ok=True) + cfg_path.write_text( + 'machine_id = "m"\nthis_host = "plum"\n' + '\n[web]\nhost = "127.0.0.1"\nport = 8765\n', + encoding="utf-8", + ) + conn, gen = _setup() + task_id = _project_with_task(conn, gen) + rcl = _FakeRclaudeLocalLabel() + # A stale sibling session already exists at the same cwd, labeled "local", + # with a HIGHER mtime than the spawn will produce (999) — so the mtime sort + # alone would wrongly pick it; only excluding it via pre_uuids is correct. + stale = _uuid.uuid4() + rcl._rows.append(_FakeSessionRow( + host="local", uuid=stale, cwd="/work", mtime_epoch=1000, + )) + result = service.dispatch_task( + conn, gen, task_id=task_id, host="plum", cwd="/work", + rclaude=rcl, discover_timeout_s=2, mcp_stager=_no_mcp_stager, + ) + assert result.dispatched is True, result.reason + # The assigned session must be the freshly spawned one, NOT the stale + # sibling that was present before the spawn. + assert result.session_uuid != str(stale) + + def test_dispatch_refused_host_at_cap(monkeypatch, tmp_path) -> None: # Isolate config so the host cap is the default 3, not the dev box's. monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config"))