From c7cca5c2efd9a9622b3ebaeb2967d02cd43fcf7a Mon Sep 17 00:00:00 2001
From: Brian Love <brian@liveloveapp.com>
Date: Sat, 16 May 2026 10:50:23 -0700
Subject: [PATCH] fix(c-generative-ui): use gpt-5 + minimal reasoning for
 planner LLM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gpt-5-mini ignored the "EXACTLY ONE tool" directive added in PR #363
and kept calling all four data tools on every filter follow-up.
Verified live: the rewritten prompt was strict and explicit ("call
EXACTLY ONE tool", "Do NOT call the other tools"), but gpt-5-mini still
fanned out — the model's default reasoning prefers thoroughness over
literal directive-following.

Split the LLMs:
- `_llm` (gpt-5-mini) — unchanged for shell-gen + respond. Cheap and
  good enough for prose + JSON-spec emission.
- `_planner_llm` (gpt-5, reasoning_effort='minimal') — bound to tools,
  used in plan_tools. gpt-5 follows directives more precisely;
  reasoning_effort='minimal' suppresses the "let me be thorough"
  deliberation that drives the fan-out.

Standalone smoke test (separate from Chrome):
  prompt: "Filter to cancelled flights only"
  result: ['query_recent_disruptions']  ← exactly one

Chrome MCP end-to-end:
  backend log confirms AI tool_calls: ['query_recent_disruptions'],
  data grid updates to 3 cancelled rows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cockpit/chat/generative-ui/python/src/graph.py   | 16 +++++++++++++++-
 .../streaming/python/src/dashboard_graph.py      | 16 +++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/cockpit/chat/generative-ui/python/src/graph.py b/cockpit/chat/generative-ui/python/src/graph.py
index 850bf8b35..b64d8b5cf 100644
--- a/cockpit/chat/generative-ui/python/src/graph.py
+++ b/cockpit/chat/generative-ui/python/src/graph.py
@@ -20,7 +20,21 @@
 _PROMPT = (Path(__file__).parent.parent / "prompts" / "dashboard.md").read_text()
 
 _llm = ChatOpenAI(model="gpt-5-mini", temperature=0, streaming=True)
-_llm_with_tools = _llm.bind_tools(ALL_TOOLS)
+
+# Dedicated planner: full gpt-5 with minimal reasoning effort.
+# gpt-5-mini at default reasoning effort ignores the "EXACTLY ONE tool"
+# directive in plan_tools and reflexively calls all four data tools on
+# every follow-up. Verified in Chrome MCP: PR #363 tightened the prompt,
+# yet the model still over-called. Moving the planner to gpt-5 sharpens
+# instruction-following, and reasoning_effort='minimal' suppresses the
+# "let me be thorough" deliberation that drives the fan-out.
+_planner_llm = ChatOpenAI(
+    model="gpt-5",
+    temperature=0,
+    streaming=True,
+    reasoning_effort="minimal",
+)
+_llm_with_tools = _planner_llm.bind_tools(ALL_TOOLS)
 
 
 class DashboardState(MessagesState):
diff --git a/cockpit/langgraph/streaming/python/src/dashboard_graph.py b/cockpit/langgraph/streaming/python/src/dashboard_graph.py
index 56e18e715..652690b2d 100644
--- a/cockpit/langgraph/streaming/python/src/dashboard_graph.py
+++ b/cockpit/langgraph/streaming/python/src/dashboard_graph.py
@@ -20,7 +20,21 @@
 _PROMPT = (Path(__file__).parent.parent / "prompts" / "dashboard.md").read_text()
 
 _llm = ChatOpenAI(model="gpt-5-mini", temperature=0, streaming=True)
-_llm_with_tools = _llm.bind_tools(ALL_TOOLS)
+
+# Dedicated planner: full gpt-5 with minimal reasoning effort.
+# gpt-5-mini at default reasoning effort ignores the "EXACTLY ONE tool"
+# directive in plan_tools and reflexively calls all four data tools on
+# every follow-up. Verified in Chrome MCP: PR #363 tightened the prompt,
+# yet the model still over-called. Moving the planner to gpt-5 sharpens
+# instruction-following, and reasoning_effort='minimal' suppresses the
+# "let me be thorough" deliberation that drives the fan-out.
+_planner_llm = ChatOpenAI(
+    model="gpt-5",
+    temperature=0,
+    streaming=True,
+    reasoning_effort="minimal",
+)
+_llm_with_tools = _planner_llm.bind_tools(ALL_TOOLS)
 
 
 class DashboardState(MessagesState):