fix(dispatch): discover sessions by canonical host label, not raw "local"

discover_session polled `rclaude list sessions` for the freshly spawned
session but filtered rows with `r.host == host` where host is the
canonical name (e.g. "plum"), while rclaude labels the calling machine's
own sessions "local". "local" == "plum" is always False, so discovery
matched nothing and timed out even though the session's JSONL was already
on disk (observed: 18s after spawn, inside the 30s window). dispatch then
falsely returned "spawned but not discovered", orphaning the live session
until a manual pull.

Root cause is a missing host-label normalization the pull loop already
does. Fix discover_session to canonicalize both sides via
resolve_host_label, and key local-path symlink resolution on the ROW's
raw label. Apply the same normalization to dispatch_task's pre_uuids
filter (identical mismatch left it empty, risking a stale-sibling match
at a shared cwd). 2 regression tests reproduce rclaude's "local" labeling
(the old fake echoed the dispatch host, masking the bug). 310 tests pass.

Committed manually with ALLOW_COMMIT=1 per user authorization: the
auto-commit service's message LLM was timing out on this repo.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Natalie 2026-05-29 22:46:28 -06:00
parent 770db3ab8e
commit 2d978b5fe3
3 changed files with 115 additions and 8 deletions

View file

@ -199,8 +199,30 @@ def discover_session(
surviving session wins on ties.
"""
rcl = rclaude or Rclaude()
# rclaude reports the calling machine as "local" regardless of its real
# name, so compare host labels in their CANONICAL form — exactly as the
# pull loop does (pull.py: `cfg.resolve_host_label(row.host)`). Without
# this, dispatching to a named host (e.g. "plum") never matches the rows
# rclaude labels "local", and discovery always times out even though the
# session's JSONL is already on disk.
cfg = load_or_init()
target_host = cfg.resolve_host_label(host)
target = str(cwd.resolve()) if isinstance(cwd, Path) else cwd.rstrip("/")
skip = ignore_uuids or set()
def _row_is_local(r) -> bool:
# rclaude tags the calling machine's own sessions "local"; only those
# live on this filesystem and so need symlink resolution. Keyed on the
# ROW's raw label, not the (canonical) `host` param.
return r.host == "local"
def _row_cwd(r) -> str:
return (
str(Path(r.cwd).resolve())
if _row_is_local(r)
else (r.cwd or "").rstrip("/")
)
deadline = time.time() + timeout_s
while time.time() < deadline:
try:
@ -208,15 +230,9 @@ def discover_session(
except RclaudeError:
time.sleep(poll_interval_s)
continue
def _row_cwd(r):
return (
str(Path(r.cwd).resolve())
if host == "local"
else (r.cwd or "").rstrip("/")
)
matches = [
r for r in rows
if r.host == host
if cfg.resolve_host_label(r.host) == target_host
and r.cwd
and _row_cwd(r) == target
and str(r.uuid) not in skip

View file

@ -1425,9 +1425,15 @@ def dispatch_task(
pre_rows = rcl.list_sessions()
except RclaudeError:
pre_rows = []
# Canonicalize host labels here too: rclaude tags this machine's rows
# "local", so `r.host == host` (host e.g. "plum") would never match and
# the pre-existing-session set would be empty — letting discovery latch
# onto a STALE sibling session at the same cwd (e.g. another workstream
# sharing the repo root). Mirror discover_session / pull normalization.
pre_uuids = {
str(r.uuid) for r in pre_rows
if r.host == host and r.cwd and r.cwd.rstrip("/") == cwd.rstrip("/")
if cfg.resolve_host_label(r.host) == canonical_host
and r.cwd and r.cwd.rstrip("/") == cwd.rstrip("/")
}
# Wire the dispatched agent's `claire:*` MCP tools (it needs `report_status`).
# Stage a claire `.mcp.json` to a stable path *outside* the task repo on the

View file

@ -166,6 +166,91 @@ def test_dispatch_success() -> None:
assert sess["tmux_name"] == tmux_name
class _FakeRclaudeLocalLabel(_FakeRclaude):
"""Reproduces rclaude's real labeling: it reports the calling machine's
own sessions as host ``"local"``, NOT the canonical fleet name. The
base fake echoes the dispatch host, which masked the host-label bug."""
def spawn(self, *, host, cwd, mcp_config=None, name=None): # noqa: ARG002
from claire.rclaude import RclaudeError
if not self._spawn_ok:
raise RclaudeError("spawn failed")
self.spawn_calls.append(
{"host": host, "cwd": cwd, "mcp_config": mcp_config, "name": name}
)
# The on-disk session row is labeled "local" by rclaude regardless of
# the canonical host we dispatched to.
self._rows.append(_FakeSessionRow(
host="local", uuid=_uuid.uuid4(), cwd=cwd, mtime_epoch=999,
))
return f"claude-tester-{len(self.spawn_calls)}"
def test_dispatch_discovers_session_labeled_local(monkeypatch, tmp_path) -> None:
# Regression: rclaude tags this machine's sessions "local"; dispatching to
# the machine's canonical name ("plum") must still discover them. The old
# `r.host == host` filter compared "local" == "plum" and never matched, so
# discovery timed out and dispatch falsely reported "spawned but not
# discovered" — even though the JSONL was already on disk.
monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config"))
cfg_path = tmp_path / "config" / "claire" / "claire.toml"
cfg_path.parent.mkdir(parents=True, exist_ok=True)
cfg_path.write_text(
'machine_id = "m"\nthis_host = "plum"\n'
'\n[web]\nhost = "127.0.0.1"\nport = 8765\n',
encoding="utf-8",
)
conn, gen = _setup()
task_id = _project_with_task(conn, gen)
rcl = _FakeRclaudeLocalLabel()
result = service.dispatch_task(
conn, gen, task_id=task_id, host="plum", cwd="/work",
rclaude=rcl, discover_timeout_s=2, mcp_stager=_no_mcp_stager,
)
assert result.dispatched is True, result.reason
assert result.session_uuid is not None
assert result.assignment_id is not None
# The assignment binds the task to the discovered (local-labeled) session.
rows = conn.execute(
"SELECT COUNT(*) FROM assignments WHERE task_id = ?", (str(task_id),)
).fetchone()
assert rows[0] == 1
def test_dispatch_ignores_stale_sibling_at_same_cwd(monkeypatch, tmp_path) -> None:
# Regression: the `pre_uuids` filter had the same "local" vs canonical
# mismatch, so it was empty — letting discovery latch onto a STALE sibling
# session already at the cwd instead of the freshly spawned one. With the
# fix, the pre-existing local-labeled session is excluded and the NEW
# session (higher mtime) is the one assigned.
monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config"))
cfg_path = tmp_path / "config" / "claire" / "claire.toml"
cfg_path.parent.mkdir(parents=True, exist_ok=True)
cfg_path.write_text(
'machine_id = "m"\nthis_host = "plum"\n'
'\n[web]\nhost = "127.0.0.1"\nport = 8765\n',
encoding="utf-8",
)
conn, gen = _setup()
task_id = _project_with_task(conn, gen)
rcl = _FakeRclaudeLocalLabel()
# A stale sibling session already exists at the same cwd, labeled "local",
# with a HIGHER mtime than the spawn will produce (999) — so the mtime sort
# alone would wrongly pick it; only excluding it via pre_uuids is correct.
stale = _uuid.uuid4()
rcl._rows.append(_FakeSessionRow(
host="local", uuid=stale, cwd="/work", mtime_epoch=1000,
))
result = service.dispatch_task(
conn, gen, task_id=task_id, host="plum", cwd="/work",
rclaude=rcl, discover_timeout_s=2, mcp_stager=_no_mcp_stager,
)
assert result.dispatched is True, result.reason
# The assigned session must be the freshly spawned one, NOT the stale
# sibling that was present before the spawn.
assert result.session_uuid != str(stale)
def test_dispatch_refused_host_at_cap(monkeypatch, tmp_path) -> None:
# Isolate config so the host cap is the default 3, not the dev box's.
monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config"))