From 7c4cbe12ea22abe7977858aad35a42c2cf7723a8 Mon Sep 17 00:00:00 2001
From: Brian Love <brian@liveloveapp.com>
Date: Sat, 16 May 2026 09:16:49 -0700
Subject: [PATCH] fix(c-generative-ui): tighten plan_tools to call ONE tool for
 filter/scope
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"Filter to cancelled flights only" was calling all 4 tools (kpis +
trend + airlines + disruptions) and dumping the filtered result as
plaintext instead of letting the data_grid component re-render. The
per-turn plan_tools context only said "decide which tools to call",
which is too permissive: gpt-5-mini defaults to refreshing everything.

Tighter rules now in the per-turn system context (not just the static
prompt file, which the model demonstrably ignores for tool-selection):

  1) FILTER / SCOPE  → exactly ONE tool, the one backing the affected
     component, with new parameters. No spec regeneration.
  2) STRUCTURAL      → regenerate the spec, then call only the tools
     needed for NEW components.
  3) QUESTION        → no tools, no JSON, just prose.

Calling all four tools is now explicitly reserved for "refresh" /
"reload" / "update everything" requests. The change is applied to both
the umbrella backend and the standalone one.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../chat/generative-ui/python/src/graph.py    | 23 ++++++++++++++++---
 .../streaming/python/src/dashboard_graph.py   | 23 ++++++++++++++++---
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/cockpit/chat/generative-ui/python/src/graph.py b/cockpit/chat/generative-ui/python/src/graph.py
index af02264d6..850bf8b35 100644
--- a/cockpit/chat/generative-ui/python/src/graph.py
+++ b/cockpit/chat/generative-ui/python/src/graph.py
@@ -47,11 +47,28 @@ async def generate_shell(state: DashboardState) -> DashboardState:
 
 
 async def plan_tools(state: DashboardState) -> DashboardState:
-    """On follow-up turns, let the LLM decide which tools to call."""
+    """On follow-up turns, pick the MINIMAL set of tools — usually one."""
     context = (
         f"The current dashboard spec is:\n{state['dashboard_spec']}\n\n"
-        "Based on the user's message, decide which tools to call to update the dashboard data. "
-        "If the user asks a question about the data that doesn't need fresh data, just respond conversationally."
+        "Classify the user's message and act ONCE — do not refetch everything.\n"
+        "\n"
+        "1) FILTER / SCOPE existing data (e.g. 'filter to cancelled flights only',\n"
+        "   'show last 6 months', 'limit to enterprise', 'sort by date',\n"
+        "   'only show delayed', 'top 3'): call EXACTLY ONE tool — the one that\n"
+        "   backs the affected component — with the new parameters. Do NOT call\n"
+        "   the other tools. Do NOT regenerate the spec.\n"
+        "\n"
+        "2) STRUCTURAL change (e.g. 'add a card for X', 'remove the table',\n"
+        "   'split this into two columns'): regenerate the spec, then call only\n"
+        "   the tools needed to populate NEW components.\n"
+        "\n"
+        "3) QUESTION about existing data (e.g. 'why', 'how', 'explain',\n"
+        "   'what does this mean'): respond conversationally in plain prose.\n"
+        "   Call NO tools. Output NO JSON.\n"
+        "\n"
+        "If none of these fit, call only the smallest set of tools you need.\n"
+        "Calling all four tools is reserved for an explicit 'refresh' /\n"
+        "'reload' / 'update everything' request."
     )
     messages = [SystemMessage(content=_PROMPT + "\n\n" + context)] + state["messages"]
     response = await _llm_with_tools.ainvoke(messages)
diff --git a/cockpit/langgraph/streaming/python/src/dashboard_graph.py b/cockpit/langgraph/streaming/python/src/dashboard_graph.py
index d9e849f54..56e18e715 100644
--- a/cockpit/langgraph/streaming/python/src/dashboard_graph.py
+++ b/cockpit/langgraph/streaming/python/src/dashboard_graph.py
@@ -47,11 +47,28 @@ async def generate_shell(state: DashboardState) -> DashboardState:
 
 
 async def plan_tools(state: DashboardState) -> DashboardState:
-    """On follow-up turns, let the LLM decide which tools to call."""
+    """On follow-up turns, pick the MINIMAL set of tools — usually one."""
     context = (
         f"The current dashboard spec is:\n{state['dashboard_spec']}\n\n"
-        "Based on the user's message, decide which tools to call to update the dashboard data. "
-        "If the user asks a question about the data that doesn't need fresh data, just respond conversationally."
+        "Classify the user's message and act ONCE — do not refetch everything.\n"
+        "\n"
+        "1) FILTER / SCOPE existing data (e.g. 'filter to cancelled flights only',\n"
+        "   'show last 6 months', 'limit to enterprise', 'sort by date',\n"
+        "   'only show delayed', 'top 3'): call EXACTLY ONE tool — the one that\n"
+        "   backs the affected component — with the new parameters. Do NOT call\n"
+        "   the other tools. Do NOT regenerate the spec.\n"
+        "\n"
+        "2) STRUCTURAL change (e.g. 'add a card for X', 'remove the table',\n"
+        "   'split this into two columns'): regenerate the spec, then call only\n"
+        "   the tools needed to populate NEW components.\n"
+        "\n"
+        "3) QUESTION about existing data (e.g. 'why', 'how', 'explain',\n"
+        "   'what does this mean'): respond conversationally in plain prose.\n"
+        "   Call NO tools. Output NO JSON.\n"
+        "\n"
+        "If none of these fit, call only the smallest set of tools you need.\n"
+        "Calling all four tools is reserved for an explicit 'refresh' /\n"
+        "'reload' / 'update everything' request."
     )
     messages = [SystemMessage(content=_PROMPT + "\n\n" + context)] + state["messages"]
     response = await _llm_with_tools.ainvoke(messages)