diff --git a/.claude/skills/multi-agent-orchestration/SKILL.md b/.claude/skills/multi-agent-orchestration/SKILL.md index 1661f2dea9..f8c6baaab3 100644 --- a/.claude/skills/multi-agent-orchestration/SKILL.md +++ b/.claude/skills/multi-agent-orchestration/SKILL.md @@ -338,6 +338,7 @@ When modifying orchestration, verify: - [ ] **INV-O7**: Worker timeouts use 10-minute default (600s for resumed sessions) - [ ] **INV-O8**: Cancellation tokens propagated to all async operations - [ ] **INV-O15**: IDLE-DEFER flushes CurrentResponse before breaking (content preservation) +- [ ] **INV-O17**: IDLE-DEFER-REARM re-arms IsProcessing when idle arrives with backgroundTasks but IsProcessing is already false (PR #472) --- @@ -763,6 +764,59 @@ These layers remain as fallbacks for edge cases where IDLE-DEFER doesn't fire: | `RecoverFromPrematureIdleIfNeededAsync` | Poll signal + events.jsonl freshness | Collect full response after partial TCS | | `IsEventsFileActive` | events.jsonl mtime < 15s | Detect ongoing CLI activity | +### IDLE-DEFER-REARM (PR #472) + +When multiple `session.idle` events arrive for a session with active `backgroundTasks`, +the first IDLE-DEFER correctly defers completion. But if `CompleteResponse` runs between +the first and second idle (e.g., triggered by a watchdog or another code path), then +`IsProcessing` is already `false` when the second idle arrives. Without re-arming, +background agents would finish their work but no completion event would fire — the +classic "zero-idle" symptom where the session appears stuck. + +**IDLE-DEFER-REARM** detects this scenario and re-arms processing: + +```csharp +case SessionIdleEvent idle: + CancelTurnEndFallback(state); + + if (HasActiveBackgroundTasks(idle)) + { + // ... existing IDLE-DEFER logic (flush, break) ... + + // REARM: If IsProcessing is already false, re-arm it so the + // next idle (without background tasks) triggers CompleteResponse. + if (!state.Info.IsProcessing) + { + state.Info.IsProcessing = true; + state.Info.HasUsedToolsThisTurn = true; // 600s tool timeout + RestartProcessingWatchdog(state); + Debug($"[IDLE-DEFER-REARM] ..."); + } + break; + } + + CompleteResponse(state, idleGeneration); +``` + +**What the re-arm sets:** +- `IsProcessing = true` — so the next non-deferred idle fires `CompleteResponse` +- `HasUsedToolsThisTurn = true` — enables the 600s tool timeout instead of 120s, + since background agents may run for several minutes +- Restarts the processing watchdog — provides a safety net if no further idle arrives +- Logs `[IDLE-DEFER-REARM]` to `event-diagnostics.log` + +### `session.idle` Is Ephemeral — Never Persisted + +> **⚠️ `session.idle` is NEVER written to `events.jsonl` by design.** It is a +> transient signal from the CLI, not a persisted event. Any restore or recovery +> logic that depends on reading `session.idle` from disk is fundamentally flawed. +> The poller in `PollEventsAndResumeWhenIdleAsync` watches for `session.shutdown` +> instead, which IS persisted. + +This matters for IDLE-DEFER because after an app restart, there will be no +`session.idle` replay — the watchdog and resume logic handle completion detection +for interrupted sessions, not idle event replay. + --- ## "Fix with Copilot" — Multi-Agent Awareness @@ -959,6 +1013,7 @@ Reflect mode runs multiple iterations. Expect this pattern: | Orchestration hangs on reconnect | Check for missing TrySetCanceled | TCS not canceled; see INV-O9 | | Many IDLE-DEFER entries | `grep "IDLE-DEFER" diagnostics.log` | Normal — worker has active sub-agents; wait for completion | | IDLE-DEFER but worker never completes | Check if background tasks are leaking | Sub-agent/shell not terminating; check CLI logs | +| IDLE-DEFER-REARM entries | `grep "IDLE-DEFER-REARM" diagnostics.log` | Normal — IsProcessing was false when deferred idle arrived; re-armed for next completion | | Worker stuck 30+ min, events.jsonl fresh | Check file size across watchdog cycles | Dead connection — file-size-growth check should catch in ~360s (INV-O16) | --- diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2009c2c21d..8116600a6b 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -254,6 +254,8 @@ For multi-agent sessions, Case B also checks **file-size-growth**: if events.jso Note: `session.idle` is an ephemeral event (`ephemeral: true` in the SDK schema) — it is delivered over the live event stream but intentionally NOT written to `events.jsonl`. When `session.idle` includes active `backgroundTasks` (sub-agents, shells), the IDLE-DEFER logic defers completion until a subsequent idle arrives with empty/null backgroundTasks. In rare cases where `IsProcessing` was already cleared (by watchdog timeout or reconnect) before the deferred idle arrives, the session may appear stuck until the watchdog fires again — see issue #403. +When `session.idle` with active `backgroundTasks` arrives but `IsProcessing` is already `false`, the handler re-arms: sets `IsProcessing = true`, `HasUsedToolsThisTurn = true` (600s timeout), restarts watchdog, logs `[IDLE-DEFER-REARM]`. This prevents zero-idle symptoms where IDLE-DEFER fires once but subsequent idles are no-ops. + When the watchdog fires, it marshals state mutations to the UI thread via `InvokeOnUI()` and adds a system warning message. ### Diagnostic Log Tags @@ -262,6 +264,7 @@ The event diagnostics log (`~/.polypilot/event-diagnostics.log`) uses these tags - `[EVT]` — SDK event received (only SessionIdleEvent, AssistantTurnEndEvent, SessionErrorEvent) - `[IDLE]` — SessionIdleEvent dispatched to CompleteResponse - `[IDLE-DEFER]` — SessionIdleEvent deferred due to active background tasks (agents/shells) +- `[IDLE-DEFER-REARM]` — SessionIdleEvent re-armed IsProcessing after it was already cleared - `[COMPLETE]` — CompleteResponse executed or skipped - `[RECONNECT]` — session replaced after disconnect - `[ERROR]` — SessionErrorEvent or SendAsync/reconnect failure cleared IsProcessing @@ -288,6 +291,8 @@ When a user changes the model via the UI dropdown: - The SDK continues using the original model from session creation - To truly switch models, the session must be destroyed and recreated +`GetSessionModel` prioritizes: (1) user's explicit choice (`session.Model`), (2) backend-reported model from usage info, (3) `DefaultModel` fallback. `ShouldAcceptObservedModel()` in `ModelHelper.cs` prevents `SessionUsageInfoEvent` and `AssistantUsageEvent` from overwriting an explicit user model selection — the observed model is only accepted if no explicit choice was made or if the observed model matches the explicit choice. + ### SDK Data Types - `AssistantUsageData` properties (`InputTokens`, `OutputTokens`, etc.) are `Double?` not `int?` - Use `Convert.ToInt32(value)` for conversion, not `value as int?` @@ -302,6 +307,9 @@ Avoid `@bind:event="oninput"` — causes round-trip lag per keystroke. Use plain For detailed stuck-session debugging knowledge (8 invariants from 7 PRs of fix cycles), see `.claude/skills/processing-state-safety/SKILL.md`. +### Smart Completion Scan +**Smart completion scan:** `assistant.turn_end` and `assistant.message` are not unconditionally terminal — they appear between every tool round. `IsSessionStillProcessing` uses `IsCleanNoToolSubturn()` to scan backward from the last event within the current sub-turn (bounded by `assistant.turn_start`/`session.resume`/`session.start`). If any `tool.execution_*` event is found in the current sub-turn, the session is considered still processing. Clean no-tool turns are detected immediately, eliminating the 600s watchdog delay for simple conversations. + ### Session Persistence - Active sessions: `~/.polypilot/active-sessions.json` (includes `LastPrompt` — last user message if session was processing during save) - Session state: `~/.copilot/session-state//events.jsonl` (SDK-managed, stays in ~/.copilot) diff --git a/PolyPilot.Tests/StuckSessionRecoveryTests.cs b/PolyPilot.Tests/StuckSessionRecoveryTests.cs index 805f62d011..c3508d1bcb 100644 --- a/PolyPilot.Tests/StuckSessionRecoveryTests.cs +++ b/PolyPilot.Tests/StuckSessionRecoveryTests.cs @@ -303,8 +303,6 @@ public void StalenessThreshold_UsesWatchdogToolExecutionTimeout() [InlineData("assistant.reasoning")] [InlineData("assistant.reasoning_delta")] [InlineData("assistant.intent")] - [InlineData("assistant.turn_end")] // between tool rounds — still processing - [InlineData("assistant.message")] // mid-turn text — still processing [InlineData("tool.execution_end")] // tool just completed — still processing public void IsSessionStillProcessing_AllActiveEventTypes_ReturnTrue(string eventType) { @@ -331,7 +329,7 @@ public void IsSessionStillProcessing_AllActiveEventTypes_ReturnTrue(string event [InlineData("session.idle")] [InlineData("session.error")] [InlineData("session.shutdown")] - [InlineData("session.start")] // created but never used — not actively processing + [InlineData("session.start")] // created but never used — not actively processing public void IsSessionStillProcessing_TerminalEventTypes_ReturnFalse(string eventType) { var svc = CreateService(); @@ -353,6 +351,133 @@ public void IsSessionStillProcessing_TerminalEventTypes_ReturnFalse(string event } } + // --- Smart completion: assistant.turn_end/message without pending tools --- + + [Fact] + public void IsSessionStillProcessing_TurnEndWithoutPendingTools_ReturnsFalse() + { + // Turn completed cleanly — no tool.execution_start after the last turn_end. + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, + """{"type":"assistant.turn_start","data":{}}""" + "\n" + + """{"type":"assistant.message","data":{"content":"hello"}}""" + "\n" + + """{"type":"assistant.turn_end","data":{}}"""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.False(result, "Clean turn_end without pending tools should not report still processing"); + } + finally { Directory.Delete(tmpDir, true); } + } + + [Fact] + public void IsSessionStillProcessing_TurnEndWithPendingTools_ReturnsTrue() + { + // Turn ended after a tool round. This can still be between rounds, so restore + // must stay conservative and keep the session active until the next events arrive. + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, + """{"type":"assistant.turn_start","data":{}}""" + "\n" + + """{"type":"assistant.message","data":{}}""" + "\n" + + """{"type":"tool.execution_start","data":{}}""" + "\n" + + """{"type":"tool.execution_complete","data":{}}""" + "\n" + + """{"type":"assistant.turn_end","data":{}}"""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.True(result, "turn_end after tool activity in the same sub-turn should report still processing"); + } + finally { Directory.Delete(tmpDir, true); } + } + + [Fact] + public void IsSessionStillProcessing_TurnEndNoCurrentToolsAfterEarlierToolRound_ReturnsFalse() + { + // Older tool rounds must not leak across assistant.turn_start boundaries and keep a + // later clean no-tool turn alive. + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, + """{"type":"assistant.turn_start","data":{}}""" + "\n" + + """{"type":"assistant.message","data":{}}""" + "\n" + + """{"type":"tool.execution_start","data":{}}""" + "\n" + + """{"type":"tool.execution_complete","data":{}}""" + "\n" + + """{"type":"assistant.turn_end","data":{}}""" + "\n" + + """{"type":"assistant.turn_start","data":{}}""" + "\n" + + """{"type":"assistant.message","data":{"content":"final answer"}}""" + "\n" + + """{"type":"assistant.turn_end","data":{}}"""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.False(result, "A clean no-tool turn_end should not be kept active by tools from an earlier round"); + } + finally { Directory.Delete(tmpDir, true); } + } + + [Fact] + public void IsSessionStillProcessing_MessageWithContentNoTools_ReturnsFalse() + { + // Final assistant.message with content, no pending tools — session done. + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, + """{"type":"assistant.turn_start","data":{}}""" + "\n" + + """{"type":"assistant.message","data":{"content":"Here is your answer."}}"""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.False(result, "assistant.message without pending tools should not report still processing"); + } + finally { Directory.Delete(tmpDir, true); } + } + + [Fact] + public void IsSessionStillProcessing_MessageWithPendingToolInSameTurn_ReturnsTrue() + { + // assistant.message as last event, but tool.execution_start earlier in same turn + // → smart scan should detect the pending tool and report still processing. + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, + """{"type":"assistant.turn_start","data":{}}""" + "\n" + + """{"type":"assistant.message","data":{}}""" + "\n" + + """{"type":"tool.execution_start","data":{}}""" + "\n" + + """{"type":"assistant.message","data":{"content":"Running the tool now..."}}"""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.True(result, "assistant.message with pending tool.execution_start should report still processing"); + } + finally { Directory.Delete(tmpDir, true); } + } + // --- Resume branch selection: RESUME-ACTIVE vs RESUME-QUIESCE --- [Fact] diff --git a/PolyPilot/Services/CopilotService.Utilities.cs b/PolyPilot/Services/CopilotService.Utilities.cs index 74c84f06fa..5db6491cf3 100644 --- a/PolyPilot/Services/CopilotService.Utilities.cs +++ b/PolyPilot/Services/CopilotService.Utilities.cs @@ -136,15 +136,27 @@ internal bool IsSessionStillProcessing(string sessionId, string basePath) var type = doc.RootElement.GetProperty("type").GetString(); if (type == null) return false; // Corrupt/partial event — treat as terminal - // Use a blacklist of terminal events rather than a whitelist of active ones. - // Any event that is NOT terminal means the session is still processing. - // The old whitelist missed intermediate states like assistant.turn_end (between - // tool rounds), assistant.message, and tool.execution_complete, causing - // actively-processing sessions to be incorrectly detected as idle on restore. - // session.idle is ephemeral (never on disk). session.start means session was - // created but never used — not actively processing. All are non-active states. + // session.idle is ephemeral (never on disk). session.start can also be the only + // on-disk event for a never-used session, so treat it as terminal on restore. var terminalEvents = new[] { "session.idle", "session.error", "session.shutdown", "session.start" }; - return !terminalEvents.Contains(type); + if (terminalEvents.Contains(type)) return false; + + // Smart completion for assistant.message / assistant.turn_end: session.idle is not + // written to disk, so a clean no-tool turn can appear to end on one of these events. + // Only treat it as idle if the current sub-turn shows no tool activity at all. + // If tools were involved, stay conservative and keep waiting for resumed events or + // the watchdog timeout — this avoids losing between-round tool output after relaunch. + if (type is "assistant.message" or "assistant.turn_end") + { + var tailTypes = GetTailEventTypes(eventsFile, 30); + if (tailTypes != null && tailTypes.Count > 1 && IsCleanNoToolSubturn(tailTypes)) + { + Debug($"[RESTORE] events.jsonl for '{sessionId}' ends with {type} and the current sub-turn has no tool activity — treating session as idle"); + return false; + } + } + + return true; } catch { return false; } } @@ -186,6 +198,68 @@ internal bool IsSessionStillProcessing(string sessionId, string basePath) } } + /// + /// Reads the last N event types from events.jsonl in reverse order (most recent first). + /// Uses a tail-read (last 8KB) to avoid full-file scan on large sessions. + /// Returns null on error. + /// + internal static List? GetTailEventTypes(string eventsFilePath, int count) + { + try + { + if (!File.Exists(eventsFilePath)) return null; + + const int tailBytes = 8192; + using var fs = new FileStream(eventsFilePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite); + if (fs.Length == 0) return null; + + var offset = Math.Max(0, fs.Length - tailBytes); + fs.Seek(offset, SeekOrigin.Begin); + using var reader = new StreamReader(fs); + var lines = new List(); + while (reader.ReadLine() is { } line) + { + if (!string.IsNullOrWhiteSpace(line)) + lines.Add(line); + } + + // Parse types in reverse order (most recent first) + var result = new List(); + for (int i = lines.Count - 1; i >= 0 && result.Count < count; i--) + { + try + { + using var doc = JsonDocument.Parse(lines[i]); + var t = doc.RootElement.GetProperty("type").GetString(); + if (t != null) result.Add(t); + } + catch { } + } + return result; + } + catch { return null; } + } + + /// + /// Returns true when the current sub-turn contains no tool activity before the latest + /// assistant.message / assistant.turn_end. The scan stops at the start of the current + /// sub-turn so older tool rounds don't keep a clean completion alive. + /// + internal static bool IsCleanNoToolSubturn(List tailTypes) + { + for (int i = 1; i < tailTypes.Count; i++) + { + var evt = tailTypes[i]; + if (evt.StartsWith("tool.execution_", StringComparison.Ordinal)) + return false; + + if (evt is "assistant.turn_start" or "assistant.turn_end" or "session.resume" or "session.start") + break; + } + + return true; + } + /// /// Checks whether events.jsonl shows a session that was interrupted mid-tool-execution. /// Returns true if the last non-control events are tool.execution_start without matching