fix(dispatch): discover sessions by canonical host label, not raw "local"
discover_session polled `rclaude list sessions` for the freshly spawned session but filtered rows with `r.host == host` where host is the canonical name (e.g. "plum"), while rclaude labels the calling machine's own sessions "local". "local" == "plum" is always False, so discovery matched nothing and timed out even though the session's JSONL was already on disk (observed: 18s after spawn, inside the 30s window). dispatch then falsely returned "spawned but not discovered", orphaning the live session until a manual pull. Root cause is a missing host-label normalization the pull loop already does. Fix discover_session to canonicalize both sides via resolve_host_label, and key local-path symlink resolution on the ROW's raw label. Apply the same normalization to dispatch_task's pre_uuids filter (identical mismatch left it empty, risking a stale-sibling match at a shared cwd). 2 regression tests reproduce rclaude's "local" labeling (the old fake echoed the dispatch host, masking the bug). 310 tests pass. Committed manually with ALLOW_COMMIT=1 per user authorization: the auto-commit service's message LLM was timing out on this repo. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
770db3ab8e
commit
2d978b5fe3
3 changed files with 115 additions and 8 deletions
|
|
@ -199,8 +199,30 @@ def discover_session(
|
|||
surviving session wins on ties.
|
||||
"""
|
||||
rcl = rclaude or Rclaude()
|
||||
# rclaude reports the calling machine as "local" regardless of its real
|
||||
# name, so compare host labels in their CANONICAL form — exactly as the
|
||||
# pull loop does (pull.py: `cfg.resolve_host_label(row.host)`). Without
|
||||
# this, dispatching to a named host (e.g. "plum") never matches the rows
|
||||
# rclaude labels "local", and discovery always times out even though the
|
||||
# session's JSONL is already on disk.
|
||||
cfg = load_or_init()
|
||||
target_host = cfg.resolve_host_label(host)
|
||||
target = str(cwd.resolve()) if isinstance(cwd, Path) else cwd.rstrip("/")
|
||||
skip = ignore_uuids or set()
|
||||
|
||||
def _row_is_local(r) -> bool:
|
||||
# rclaude tags the calling machine's own sessions "local"; only those
|
||||
# live on this filesystem and so need symlink resolution. Keyed on the
|
||||
# ROW's raw label, not the (canonical) `host` param.
|
||||
return r.host == "local"
|
||||
|
||||
def _row_cwd(r) -> str:
|
||||
return (
|
||||
str(Path(r.cwd).resolve())
|
||||
if _row_is_local(r)
|
||||
else (r.cwd or "").rstrip("/")
|
||||
)
|
||||
|
||||
deadline = time.time() + timeout_s
|
||||
while time.time() < deadline:
|
||||
try:
|
||||
|
|
@ -208,15 +230,9 @@ def discover_session(
|
|||
except RclaudeError:
|
||||
time.sleep(poll_interval_s)
|
||||
continue
|
||||
def _row_cwd(r):
|
||||
return (
|
||||
str(Path(r.cwd).resolve())
|
||||
if host == "local"
|
||||
else (r.cwd or "").rstrip("/")
|
||||
)
|
||||
matches = [
|
||||
r for r in rows
|
||||
if r.host == host
|
||||
if cfg.resolve_host_label(r.host) == target_host
|
||||
and r.cwd
|
||||
and _row_cwd(r) == target
|
||||
and str(r.uuid) not in skip
|
||||
|
|
|
|||
|
|
@ -1425,9 +1425,15 @@ def dispatch_task(
|
|||
pre_rows = rcl.list_sessions()
|
||||
except RclaudeError:
|
||||
pre_rows = []
|
||||
# Canonicalize host labels here too: rclaude tags this machine's rows
|
||||
# "local", so `r.host == host` (host e.g. "plum") would never match and
|
||||
# the pre-existing-session set would be empty — letting discovery latch
|
||||
# onto a STALE sibling session at the same cwd (e.g. another workstream
|
||||
# sharing the repo root). Mirror discover_session / pull normalization.
|
||||
pre_uuids = {
|
||||
str(r.uuid) for r in pre_rows
|
||||
if r.host == host and r.cwd and r.cwd.rstrip("/") == cwd.rstrip("/")
|
||||
if cfg.resolve_host_label(r.host) == canonical_host
|
||||
and r.cwd and r.cwd.rstrip("/") == cwd.rstrip("/")
|
||||
}
|
||||
# Wire the dispatched agent's `claire:*` MCP tools (it needs `report_status`).
|
||||
# Stage a claire `.mcp.json` to a stable path *outside* the task repo on the
|
||||
|
|
|
|||
|
|
@ -166,6 +166,91 @@ def test_dispatch_success() -> None:
|
|||
assert sess["tmux_name"] == tmux_name
|
||||
|
||||
|
||||
class _FakeRclaudeLocalLabel(_FakeRclaude):
|
||||
"""Reproduces rclaude's real labeling: it reports the calling machine's
|
||||
own sessions as host ``"local"``, NOT the canonical fleet name. The
|
||||
base fake echoes the dispatch host, which masked the host-label bug."""
|
||||
|
||||
def spawn(self, *, host, cwd, mcp_config=None, name=None): # noqa: ARG002
|
||||
from claire.rclaude import RclaudeError
|
||||
if not self._spawn_ok:
|
||||
raise RclaudeError("spawn failed")
|
||||
self.spawn_calls.append(
|
||||
{"host": host, "cwd": cwd, "mcp_config": mcp_config, "name": name}
|
||||
)
|
||||
# The on-disk session row is labeled "local" by rclaude regardless of
|
||||
# the canonical host we dispatched to.
|
||||
self._rows.append(_FakeSessionRow(
|
||||
host="local", uuid=_uuid.uuid4(), cwd=cwd, mtime_epoch=999,
|
||||
))
|
||||
return f"claude-tester-{len(self.spawn_calls)}"
|
||||
|
||||
|
||||
def test_dispatch_discovers_session_labeled_local(monkeypatch, tmp_path) -> None:
|
||||
# Regression: rclaude tags this machine's sessions "local"; dispatching to
|
||||
# the machine's canonical name ("plum") must still discover them. The old
|
||||
# `r.host == host` filter compared "local" == "plum" and never matched, so
|
||||
# discovery timed out and dispatch falsely reported "spawned but not
|
||||
# discovered" — even though the JSONL was already on disk.
|
||||
monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config"))
|
||||
cfg_path = tmp_path / "config" / "claire" / "claire.toml"
|
||||
cfg_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
cfg_path.write_text(
|
||||
'machine_id = "m"\nthis_host = "plum"\n'
|
||||
'\n[web]\nhost = "127.0.0.1"\nport = 8765\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
conn, gen = _setup()
|
||||
task_id = _project_with_task(conn, gen)
|
||||
rcl = _FakeRclaudeLocalLabel()
|
||||
result = service.dispatch_task(
|
||||
conn, gen, task_id=task_id, host="plum", cwd="/work",
|
||||
rclaude=rcl, discover_timeout_s=2, mcp_stager=_no_mcp_stager,
|
||||
)
|
||||
assert result.dispatched is True, result.reason
|
||||
assert result.session_uuid is not None
|
||||
assert result.assignment_id is not None
|
||||
# The assignment binds the task to the discovered (local-labeled) session.
|
||||
rows = conn.execute(
|
||||
"SELECT COUNT(*) FROM assignments WHERE task_id = ?", (str(task_id),)
|
||||
).fetchone()
|
||||
assert rows[0] == 1
|
||||
|
||||
|
||||
def test_dispatch_ignores_stale_sibling_at_same_cwd(monkeypatch, tmp_path) -> None:
|
||||
# Regression: the `pre_uuids` filter had the same "local" vs canonical
|
||||
# mismatch, so it was empty — letting discovery latch onto a STALE sibling
|
||||
# session already at the cwd instead of the freshly spawned one. With the
|
||||
# fix, the pre-existing local-labeled session is excluded and the NEW
|
||||
# session (higher mtime) is the one assigned.
|
||||
monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config"))
|
||||
cfg_path = tmp_path / "config" / "claire" / "claire.toml"
|
||||
cfg_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
cfg_path.write_text(
|
||||
'machine_id = "m"\nthis_host = "plum"\n'
|
||||
'\n[web]\nhost = "127.0.0.1"\nport = 8765\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
conn, gen = _setup()
|
||||
task_id = _project_with_task(conn, gen)
|
||||
rcl = _FakeRclaudeLocalLabel()
|
||||
# A stale sibling session already exists at the same cwd, labeled "local",
|
||||
# with a HIGHER mtime than the spawn will produce (999) — so the mtime sort
|
||||
# alone would wrongly pick it; only excluding it via pre_uuids is correct.
|
||||
stale = _uuid.uuid4()
|
||||
rcl._rows.append(_FakeSessionRow(
|
||||
host="local", uuid=stale, cwd="/work", mtime_epoch=1000,
|
||||
))
|
||||
result = service.dispatch_task(
|
||||
conn, gen, task_id=task_id, host="plum", cwd="/work",
|
||||
rclaude=rcl, discover_timeout_s=2, mcp_stager=_no_mcp_stager,
|
||||
)
|
||||
assert result.dispatched is True, result.reason
|
||||
# The assigned session must be the freshly spawned one, NOT the stale
|
||||
# sibling that was present before the spawn.
|
||||
assert result.session_uuid != str(stale)
|
||||
|
||||
|
||||
def test_dispatch_refused_host_at_cap(monkeypatch, tmp_path) -> None:
|
||||
# Isolate config so the host cap is the default 3, not the dev box's.
|
||||
monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config"))
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue