From d347a9f46c02e6bbccf7bfb83e927f5c3522bd24 Mon Sep 17 00:00:00 2001 From: Yash Verma Date: Thu, 12 Feb 2026 16:58:28 -0600 Subject: [PATCH] feat: automation script with MAB modes and context passing - automation_runner.py: run test cases with multiple models, optional feedback (stationary/non-stationary MAB) - main.py: allStepsAtOnceWithoutKnowledgeAgent, return debug/verification for context, handle dict knowledge_response - agents.py: model/temperature override, tool_call_limit, timeout 180s, generic verification prompt - metrics_db.py: temperature column, attempt_context table for storing/retrieving context between attempts - .gitignore: logs, debug_logs, db, cache, env, docs --- .gitignore | 23 ++- debug_assistant_latest/agents.py | 23 ++- debug_assistant_latest/automation_runner.py | 217 ++++++++++++++++++++ debug_assistant_latest/main.py | 124 ++++++++++- debug_assistant_latest/metrics_db.py | 95 ++++++++- 5 files changed, 463 insertions(+), 19 deletions(-) create mode 100644 debug_assistant_latest/automation_runner.py diff --git a/.gitignore b/.gitignore index 99c520d..d749db3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,25 @@ -debug_assistant_latest/result_logs/ +# Build / cache __pycache__/ +*.py[cod] +*.pyo +.Python +*.so + +# Logs and generated data +*.log nohup.out debug_assistant_latest/nohup.out -README.md +debug_assistant_latest/result_logs/ +debug_assistant_latest/debug_logs/ +debug_assistant_latest/docs/ + +# Database (local metrics) token_metrics.db + +# Environment / secrets (if present) +.env +.env.local +*.pem + +# Repo-specific (existing) +README.md diff --git a/debug_assistant_latest/agents.py b/debug_assistant_latest/agents.py index ae3b73f..528f3f0 100644 --- a/debug_assistant_latest/agents.py +++ b/debug_assistant_latest/agents.py @@ -114,23 +114,27 @@ def askQuestion(self): class AgentDebug(Agent): - def __init__(self, agentType, config): + def __init__(self, agentType, config, model_override=None, temperature_override=None): super().__init__(agentType, config) self.agentAPIResponse = None self.debugStatus = None self.response = None # Store the debug agent's response for verification + self.model_override = model_override # Allow model override for automation + self.temperature_override = temperature_override # Allow temperature override for automation def prepareAgent(self): """ Prepare the debug assistant based on the config file """ try: + # Use override if provided, otherwise use config + model_name = self.model_override if self.model_override else self.agentProperties["model"] + temperature = self.temperature_override if self.temperature_override is not None else self.agentProperties.get("temperature", 0.0) - model_name = self.agentProperties["model"] if any(token in model_name for token in ['gpt', 'o3', 'o4', 'o1']): - model = OpenAIChat(id=model_name) + model = OpenAIChat(id=model_name, temperature=temperature) elif 'llama' in model_name: - model = Ollama(id=model_name) + model = Ollama(id=model_name, temperature=temperature) elif 'gemini' in model_name: - model = Gemini(id=model_name) + model = Gemini(id=model_name, temperature=temperature) else: raise Exception("Invalid model name provided.") @@ -141,7 +145,7 @@ def prepareAgent(self): instructions=[x for x in self.agentProperties["instructions"]], show_tool_calls=True, #read_chat_history=True, - # tool_call_limit=1 + tool_call_limit=15, # Limit tool calls to prevent infinite loops and improve efficiency markdown=True, guidelines=[x for x in self.agentProperties["guidelines"]] 
#add_history_to_messages=True, @@ -163,7 +167,7 @@ def preparePrompt(self): sys.exit() @withTimeout(False) - @timeout_decorator.timeout(480) + @timeout_decorator.timeout(180) # Reduced from 480s to 180s (3 minutes) for better efficiency def askQuestion(self): """ Ask the formatted prepared question to the debug agent """ try: @@ -511,7 +515,8 @@ def prepareAgent(self): instructions=instructions, show_tool_calls=True, markdown=True, - guidelines=guidelines + guidelines=guidelines, + tool_call_limit=10 # Limit tool calls to prevent infinite loops and speed up verification ) except Exception as e: print(f"Error preparing verification agent: {e}") @@ -559,7 +564,7 @@ def preparePrompt(self): print(f"Error creating verification agent prompt: {e}") sys.exit() - @timeout_decorator.timeout(480) + @timeout_decorator.timeout(180) # Reduced from 480s to 180s (3 minutes) for better efficiency @withTimeout(None) def askQuestion(self): """ Ask the verification agent to verify the fix """ diff --git a/debug_assistant_latest/automation_runner.py b/debug_assistant_latest/automation_runner.py new file mode 100644 index 0000000..3e3f4e9 --- /dev/null +++ b/debug_assistant_latest/automation_runner.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +"""Automation script: runs test cases with multiple models until success or all exhausted. + +Supports two MAB modes: +- Stationary MAB (--no-feedback): Each model sees only knowledge agent's plan (fixed success probability per model) +- Non-stationary MAB (--include-feedback, default): Each model sees knowledge plan + feedback from previous attempts (success probability can change) +""" + +import os +import re +from pathlib import Path +from main import allStepsAtOnce, allStepsAtOnceWithoutKnowledgeAgent +from metrics_db import store_attempt_context, get_latest_attempt_context, clear_attempt_context + +DEFAULT_MODELS = ["gpt-5-nano", "gpt-5-mini", "gpt-4o-mini", "gpt-4o"] + +# Tokens agents use; regex to extract verdict + reasoning before it +DEBUG_TOKENS_RE = re.compile(r"(.{0,500})(<\|SOLVED\|>|<\|FAILED\|>|<\|ERROR\|>)", re.DOTALL) +VERIFICATION_TOKENS_RE = re.compile(r"(.{0,350})(<\|VERIFIED\|>|<\|FAILED\|>|<\|VERIFICATION_ERROR\|>)\s*$", re.DOTALL) + + +def _extract_debug_excerpt(text, max_fallback=500): + """Extract reasoning + verdict from debug response using regex (last occurrence of token).""" + if not (text or "").strip(): + return "" + text = text.strip() + matches = list(DEBUG_TOKENS_RE.finditer(text)) + if matches: + m = matches[-1] + excerpt = (m.group(1) + m.group(2)).strip() + return excerpt if len(excerpt) <= max_fallback + 50 else "..." + excerpt[-max_fallback:] + return text[-max_fallback:] if len(text) > max_fallback else text + + +def _extract_verification_excerpt(text, max_fallback=400): + """Extract reasoning + verdict from verification report (token at end).""" + if not (text or "").strip(): + return "" + text = text.strip() + m = VERIFICATION_TOKENS_RE.search(text) + if m: + return (m.group(1) + m.group(2)).strip() + return text[-max_fallback:] if len(text) > max_fallback else text + + +def build_attempt_summary(debug_response, verification_report, model_used, max_debug_chars=500, max_verification_chars=400): + """Build a short bounded summary of a failed attempt for the next model. + Uses regex to extract verdict tokens and the reasoning just before them.""" + if not debug_response and not verification_report: + return f"Previous attempt (model: {model_used}) did not resolve the issue." 
+ parts = [f"Previous attempt (model: {model_used}) did not resolve the issue."] + if debug_response: + s = _extract_debug_excerpt(debug_response, max_fallback=max_debug_chars) + if s: + parts.append("What was done: " + s) + if verification_report: + s = _extract_verification_excerpt(verification_report, max_fallback=max_verification_chars) + if s: + parts.append("Verification outcome: " + s) + return "\n".join(parts) + + +def teardown_environment(test_name): + """Teardown the environment for a test case using Python function from kube_test.""" + try: + from kube_test import tearDownEnviornment + tearDownEnviornment(test_name) + print(f" Teardown completed") + return True + except ImportError: + print(f" Warning: Could not import kube_test.tearDownEnviornment") + return False + except Exception as e: + print(f" Warning: Teardown failed: {e}") + return False + +def get_test_cases(troubleshooting_dir): + """Get all test cases from troubleshooting directory.""" + test_cases = [] + for test_dir in Path(troubleshooting_dir).iterdir(): + if test_dir.is_dir(): + config_file = test_dir / "config_step.json" + if config_file.exists(): + test_cases.append({"name": test_dir.name, "config_path": str(config_file)}) + return test_cases + +def run_automation_suite(troubleshooting_dir=None, models=None, temperature=None, test_cases=None, test_mode="allStepsAtOnce", include_feedback=True): + """Run test cases with multiple models until success or all exhausted. + + Note: Only 'allStepsAtOnce' mode is supported as verification agent only works with this mode. + Temperature is only changed if explicitly provided via --temperature argument. + + Args: + include_feedback: If True (default), next debug agent sees knowledge plan + feedback from previous attempts (non-stationary MAB). + If False, next debug agent sees ONLY knowledge plan (stationary MAB - fixed success probability per model). + """ + if test_mode != "allStepsAtOnce": + raise ValueError(f"Automation only supports 'allStepsAtOnce' mode. Verification agent requires this mode. 
Got: {test_mode}") + + troubleshooting_dir = troubleshooting_dir or os.path.expanduser("~/KubeLLM/debug_assistant_latest/troubleshooting") + models = models or DEFAULT_MODELS + + if test_cases is None: + test_cases = get_test_cases(troubleshooting_dir) + else: + test_cases = [{"name": tc, "config_path": os.path.join(troubleshooting_dir, tc, "config_step.json")} for tc in test_cases] + + for test_case in test_cases: + test_name = test_case["name"] + config_path = test_case["config_path"] + + if not os.path.exists(config_path): + continue + + print(f"\n{'='*80}\nTEST: {test_name}\n{'='*80}") + + # Clear any old context for this test case (fresh start) + db_path = os.path.expanduser("~/KubeLLM/token_metrics.db") + clear_attempt_context(db_path, test_name) + + # Run knowledge agent once + # Use temperature override if provided, otherwise uses config default + try: + result = allStepsAtOnce(config_path, model_override=None, temperature_override=temperature) + knowledge_response = result.get("knowledge_response") + if result.get("verification_status"): + print(f"[SUCCESS] {test_name} succeeded") + clear_attempt_context(db_path, test_name) # Clear context on success + continue + else: + # First attempt failed; build short summary, store in DB, then teardown + last_attempt_summary = build_attempt_summary( + result.get("debug_response", ""), + result.get("verification_report", ""), + result.get("debug_model_used", "unknown"), + ) + # Store context in DB for next model iteration (always store, but may not use if --no-feedback) + store_attempt_context(db_path, test_name, result.get("debug_model_used", "unknown"), last_attempt_summary) + mode_str = "non-stationary MAB" if include_feedback else "stationary MAB" + print(f" First attempt failed, context stored in DB ({mode_str} mode), tearing down environment...") + teardown_environment(test_name) + except Exception as e: + print(f"Error: {e}") + # Teardown even if there was an error + print(f" Tearing down environment after error...") + teardown_environment(test_name) + continue + + # Try different models if first attempt failed + # Temperature is only used if explicitly provided, otherwise uses config/default + for model in models: + print(f"Trying {model}...") + # Retrieve context from DB only if include_feedback is True (for non-stationary MAB) + # If False (stationary MAB), next agent sees only knowledge plan (no feedback) + last_attempt_summary = None + if include_feedback: + last_attempt_summary = get_latest_attempt_context(db_path, test_name) + if last_attempt_summary: + print(f" Retrieved context from DB for previous attempt (non-stationary MAB mode)") + else: + print(f" Using only knowledge plan (stationary MAB mode - no feedback from previous attempts)") + + try: + result = allStepsAtOnceWithoutKnowledgeAgent( + config_path, knowledge_response, model, temperature, + previous_attempt_summary=last_attempt_summary, + ) + if result.get("verification_status"): + print(f"[SUCCESS] {test_name} succeeded with {model}") + clear_attempt_context(db_path, test_name) # Clear context on success + break + else: + # Build short summary, store in DB for next model (only if include_feedback is True) + # Even if not used immediately, storing allows switching modes mid-run + last_attempt_summary = build_attempt_summary( + result.get("debug_response", ""), + result.get("verification_report", ""), + result.get("debug_model_used", "unknown"), + ) + # Always store context in DB (allows switching between stationary/non-stationary modes) + 
store_attempt_context(db_path, test_name, result.get("debug_model_used", "unknown"), last_attempt_summary) + if include_feedback: + print(f" Attempt with {model} failed, context stored in DB, tearing down environment...") + else: + print(f" Attempt with {model} failed (context stored but not used in stationary mode), tearing down environment...") + teardown_environment(test_name) + except Exception as e: + print(f"Error: {e}") + # Teardown even if there was an error + print(f" Tearing down environment after error...") + teardown_environment(test_name) + else: + print(f"[FAILED] {test_name} failed after trying all models") + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Automation script for running test cases with multiple models. Only supports 'allStepsAtOnce' mode.") + parser.add_argument("--troubleshooting-dir", type=str, help="Path to troubleshooting directory") + parser.add_argument("--models", nargs="+", help="List of models to try (default: gpt-5-nano, gpt-5-mini, gpt-4o-mini, gpt-4o)") + parser.add_argument("--temperature", type=float, help="Temperature to use for all attempts (if not provided, uses config file default)") + parser.add_argument("--test-cases", nargs="+", help="Specific test cases to run") + parser.add_argument("--test-mode", type=str, default="allStepsAtOnce", + help="Test mode (default: allStepsAtOnce). Only 'allStepsAtOnce' is supported as verification requires this mode.") + parser.add_argument("--include-feedback", action="store_true", default=True, + help="Include feedback from previous attempts (default: True). Non-stationary MAB - success probability can change.") + parser.add_argument("--no-feedback", dest="include_feedback", action="store_false", + help="Do NOT include feedback from previous attempts. Stationary MAB - fixed success probability per model.") + args = parser.parse_args() + + if args.test_mode != "allStepsAtOnce": + print(f"Error: Automation only supports 'allStepsAtOnce' mode.") + print(f"Verification agent only works with 'allStepsAtOnce' mode.") + print(f"Received: {args.test_mode}") + exit(1) + + run_automation_suite(args.troubleshooting_dir, args.models, args.temperature, args.test_cases, args.test_mode, args.include_feedback) + diff --git a/debug_assistant_latest/main.py b/debug_assistant_latest/main.py index 5962e46..0a8e336 100644 --- a/debug_assistant_latest/main.py +++ b/debug_assistant_latest/main.py @@ -6,13 +6,18 @@ db_path = os.path.expanduser("~/KubeLLM/token_metrics.db") -def allStepsAtOnce(configFile = None): +def allStepsAtOnce(configFile = None, model_override=None, temperature_override=None): """ This function will run the knowledge agent and debug agent. When the debug agent receives the response from the knowledge agent, the debug agent will run all the commands all at once. 
Approach by: William Clifford + + Args: + configFile: Path to config file + model_override: Optional model name to override config (for automation) + temperature_override: Optional temperature to override config (for automation) """ #read config to initilize enviornment @@ -20,7 +25,7 @@ def allStepsAtOnce(configFile = None): setUpEnvironment(config) #initilize needed LLMs apiAgent = AgentAPI("api-agent" , config) - debugAgent = AgentDebug("debug-agent" , config) + debugAgent = AgentDebug("debug-agent" , config, model_override=model_override, temperature_override=temperature_override) #set up the LLMs apiAgent.setupAgent() debugAgent.setupAgent() @@ -33,6 +38,9 @@ def allStepsAtOnce(configFile = None): debug_end_time = time.perf_counter() debug_duration_s = debug_end_time - debug_start_time + # Get temperature from override or config + temperature = temperature_override if temperature_override is not None else config.get("debug-agent", {}).get("temperature", 0.0) + # Calculate the cost debug_cost = calculate_cost(debug_metrics.get("model"), debug_metrics.get("input_tokens"), debug_metrics.get("output_tokens")) @@ -68,9 +76,10 @@ def allStepsAtOnce(configFile = None): # Calculate the cost verification_cost = calculate_cost(verification_metrics.get("model"), verification_metrics.get("input_tokens"), verification_metrics.get("output_tokens")) - # Update debug_metrics and verification_metrics + # Update debug_metrics and verification_metrics with temperature debug_metrics["duration_s"] = round(debug_duration_s, 2) debug_metrics["cost"] = round(debug_cost, 4) + debug_metrics["temperature"] = temperature verification_metrics["duration_s"] = round(verification_duration_s, 2) verification_metrics["cost"] = round(verification_cost, 4) @@ -79,7 +88,14 @@ def allStepsAtOnce(configFile = None): store_metrics_entry(db_path, verification_metrics, verification_metrics.get("task_status")) printFinishMessage() - return verificationAgent.verificationStatus # Return verification result instead of debug agent's self-report + # Ensure knowledge_response is stored as-is (dict or string) for reuse + return { + "verification_status": verificationAgent.verificationStatus, + "knowledge_response": apiAgent.response, # Keep as-is (dict or string) + "debug_response": debugAgent.response if debugAgent.response else "", + "verification_report": verificationAgent.verificationReport if verificationAgent.verificationReport else "", + "debug_model_used": debug_metrics.get("model"), + } # Return for automation (and for building next-iteration summary on failure) def stepByStep( configFile = None ): """ @@ -111,6 +127,103 @@ def stepByStep( configFile = None ): return debugAgent.debugStatus +def allStepsAtOnceWithoutKnowledgeAgent(configFile=None, knowledge_response=None, model_override=None, temperature_override=None, previous_attempt_summary=None): + """ + This function runs the debug agent and verification agent WITHOUT calling the knowledge agent. + It reuses a previously obtained knowledge response. This is used for automation when + we want to try different models/temperatures with the same knowledge plan. + Optionally passes a short summary of the last failed attempt so the next model can avoid repeating it. 
+ + Args: + configFile: Path to config file + knowledge_response: The knowledge agent's response to reuse + model_override: Model name to use (overrides config) + temperature_override: Temperature to use (overrides config) + previous_attempt_summary: Optional short summary of the last failed attempt (for next model context) + + Returns: + dict with verification_status, knowledge_response, debug_response, verification_report + """ + if knowledge_response is None: + raise ValueError("knowledge_response is required for allStepsAtOnceWithoutKnowledgeAgent") + + #read config to initialize environment + config = readTheJSONConfigFile(configFile=configFile) + setUpEnvironment(config) + + # Initialize debug agent with overrides + debugAgent = AgentDebug("debug-agent", config, model_override=model_override, temperature_override=temperature_override) + debugAgent.setupAgent() + + # Use the provided knowledge response; append summary of previous attempt so next model has context + # Handle both dict and string responses from knowledge agent + if isinstance(knowledge_response, dict): + knowledge_text = knowledge_response.get("response", str(knowledge_response)) + else: + knowledge_text = str(knowledge_response) + + if previous_attempt_summary: + debugAgent.agentAPIResponse = knowledge_text + "\n\n---\nContext from previous attempt(s) (use to avoid repeating failed approaches):\n" + previous_attempt_summary + else: + debugAgent.agentAPIResponse = knowledge_text + + # Run debug agent + debug_start_time = time.perf_counter() + debug_metrics = debugAgent.askQuestion() + debug_end_time = time.perf_counter() + debug_duration_s = debug_end_time - debug_start_time + + # Get temperature from override or config + temperature = temperature_override if temperature_override is not None else config.get("debug-agent", {}).get("temperature", 0.0) + + # Calculate the cost + debug_cost = calculate_cost(debug_metrics.get("model"), debug_metrics.get("input_tokens"), debug_metrics.get("output_tokens")) + + # Call the verification agent + print("\n" + "="*80) + print("STARTING VERIFICATION PHASE") + print("="*80 + "\n") + + verificationAgent = AgentVerification_v2("verification-agent", config) + verificationAgent.setupAgent() + + # Pass the debug agent's response to the verification agent + verificationAgent.debugAgentResponse = debugAgent.response if debugAgent.response else "Debug agent completed execution" + + # Run verification + verification_start_time = time.perf_counter() + verification_metrics = verificationAgent.askQuestion() + verification_end_time = time.perf_counter() + verification_duration_s = verification_end_time - verification_start_time + + print(f"\nFinal Task Status: {'SUCCESS' if verificationAgent.verificationStatus else 'FAILURE'}") + print(f"Debug Agent Self-Report: {'SUCCESS' if debugAgent.debugStatus else 'FAILURE'}") + print(f"Verification Agent Report: {'VERIFIED' if verificationAgent.verificationStatus else 'FAILED' if verificationAgent.verificationStatus is False else 'UNKNOWN'}\n") + + # Calculate the cost + verification_cost = calculate_cost(verification_metrics.get("model"), verification_metrics.get("input_tokens"), verification_metrics.get("output_tokens")) + + # Update metrics with temperature + debug_metrics["duration_s"] = round(debug_duration_s, 2) + debug_metrics["cost"] = round(debug_cost, 4) + debug_metrics["temperature"] = temperature + verification_metrics["duration_s"] = round(verification_duration_s, 2) + verification_metrics["cost"] = round(verification_cost, 4) + + # Store 
metrics entry into the database + store_metrics_entry(db_path, debug_metrics, verification_metrics.get("task_status")) + store_metrics_entry(db_path, verification_metrics, verification_metrics.get("task_status")) + printFinishMessage() + + return { + "verification_status": verificationAgent.verificationStatus, + "knowledge_response": knowledge_response, + "debug_response": debugAgent.response if debugAgent.response else "", + "verification_report": verificationAgent.verificationReport if verificationAgent.verificationReport else "", + "debug_model_used": debug_metrics.get("model"), + } + + def singleAgentApproach( configFile = None ): """ This function will run a single agent which will do the @@ -134,7 +247,8 @@ def singleAgentApproach( configFile = None ): def run( debugType, configFile ): if debugType == "allStepsAtOnce": - allStepsAtOnce(configFile) + result = allStepsAtOnce(configFile) + return result.get("verification_status", False) if isinstance(result, dict) else result elif debugType == "stepByStep": stepByStep(configFile) elif debugType == "singleAgent": diff --git a/debug_assistant_latest/metrics_db.py b/debug_assistant_latest/metrics_db.py index 1bd7c84..c5ef6dc 100644 --- a/debug_assistant_latest/metrics_db.py +++ b/debug_assistant_latest/metrics_db.py @@ -40,6 +40,7 @@ def store_metrics_entry(db_path, metrics, task_status_verified): test_case TEXT NOT NULL, model TEXT, agent_type TEXT, + temperature REAL DEFAULT 0.0, input_tokens INTEGER DEFAULT 0, output_tokens INTEGER DEFAULT 0, total_tokens INTEGER DEFAULT 0, @@ -49,13 +50,19 @@ def store_metrics_entry(db_path, metrics, task_status_verified): cost REAL DEFAULT 0.0 ) ''') + + # Add temperature column if it doesn't exist (migration for existing databases) + try: + cursor.execute('ALTER TABLE metrics ADD COLUMN temperature REAL DEFAULT 0.0') + except sqlite3.OperationalError: + pass # Column already exists # Insert the entry timestamp = datetime.now().isoformat() cursor.execute(''' - INSERT INTO metrics (timestamp, test_case, model, agent_type, input_tokens, output_tokens, total_tokens, task_status, task_status_verified, duration_s, cost) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ''', (timestamp, metrics.get("test_case"), metrics.get("model"), metrics.get("agent_type"), metrics.get("input_tokens"), metrics.get("output_tokens"), metrics.get("total_tokens"), metrics.get("task_status"), task_status_verified, metrics.get("duration_s"), metrics.get("cost"))) + INSERT INTO metrics (timestamp, test_case, model, agent_type, temperature, input_tokens, output_tokens, total_tokens, task_status, task_status_verified, duration_s, cost) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', (timestamp, metrics.get("test_case"), metrics.get("model"), metrics.get("agent_type"), metrics.get("temperature", 0.0), metrics.get("input_tokens"), metrics.get("output_tokens"), metrics.get("total_tokens"), metrics.get("task_status"), task_status_verified, metrics.get("duration_s"), metrics.get("cost"))) conn.commit() conn.close() @@ -129,3 +136,85 @@ def calculate_totals(db_path): "total_successes": total_successes, "total_verified_successes": total_verified_successes } + +def store_attempt_context(db_path, test_case, model_used, context_summary): + """Store context summary for a failed attempt. 
Used to pass context to next model iteration.""" + os.makedirs(os.path.dirname(db_path), exist_ok=True) + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # Create table if not exists + cursor.execute(''' + CREATE TABLE IF NOT EXISTS attempt_context ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL, + test_case TEXT NOT NULL, + model_used TEXT, + context_summary TEXT + ) + ''') + + # Insert the context + timestamp = datetime.now().isoformat() + cursor.execute(''' + INSERT INTO attempt_context (timestamp, test_case, model_used, context_summary) + VALUES (?, ?, ?, ?) + ''', (timestamp, test_case, model_used, context_summary)) + + conn.commit() + conn.close() + +def get_latest_attempt_context(db_path, test_case): + """Retrieve the most recent context summary for a test case. Returns None if none exists.""" + os.makedirs(os.path.dirname(db_path), exist_ok=True) + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # Create table if not exists (in case it doesn't exist yet) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS attempt_context ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL, + test_case TEXT NOT NULL, + model_used TEXT, + context_summary TEXT + ) + ''') + + # Get most recent context for this test case + cursor.execute(''' + SELECT context_summary FROM attempt_context + WHERE test_case = ? + ORDER BY timestamp DESC + LIMIT 1 + ''', (test_case,)) + + result = cursor.fetchone() + conn.close() + + return result[0] if result else None + +def clear_attempt_context(db_path, test_case): + """Clear all context entries for a test case (e.g., when test succeeds or starts fresh).""" + os.makedirs(os.path.dirname(db_path), exist_ok=True) + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # Create table if not exists (in case it doesn't exist yet) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS attempt_context ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL, + test_case TEXT NOT NULL, + model_used TEXT, + context_summary TEXT + ) + ''') + + cursor.execute('DELETE FROM attempt_context WHERE test_case = ?', (test_case,)) + + conn.commit() + conn.close()
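
Usage sketch (illustrative, not part of the patch): how the two MAB modes added by automation_runner.py might be exercised, assuming the defaults defined above; the model names below come from DEFAULT_MODELS and any omitted arguments fall back to the script's defaults.

    # Non-stationary MAB (default): each retry model sees the knowledge plan plus a
    # bounded summary of the previous failed attempt, read from attempt_context.
    #   python automation_runner.py --models gpt-4o-mini gpt-4o
    #
    # Stationary MAB: each retry model sees only the knowledge plan; per-attempt
    # context is still stored in the DB but not fed back.
    #   python automation_runner.py --models gpt-4o-mini gpt-4o --no-feedback

    # Programmatic equivalent of the two CLI invocations above.
    from automation_runner import run_automation_suite

    run_automation_suite(models=["gpt-4o-mini", "gpt-4o"], include_feedback=True)   # non-stationary MAB
    run_automation_suite(models=["gpt-4o-mini", "gpt-4o"], include_feedback=False)  # stationary MAB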