From 6879f36f8ff6b20efe55c6fb904cb3b4719d1c66 Mon Sep 17 00:00:00 2001
From: Alex Mikhalev <alex@metacortex.engineer>
Date: Thu, 26 Mar 2026 13:44:42 +0100
Subject: [PATCH] feat(terraphim-agent): add learn auto-extract subcommand for
 session transcripts

Implements automatic extraction of corrections from JSONL session transcripts:

- Add auto_extract_corrections() function that scans JSONL files
- Detects failed Bash commands followed by successful variants
- Identifies explicit correction phrases: 'instead use', 'should be',
  'correct way', 'use X not Y'
- Add CLI subcommand 'learn auto-extract <transcript-path>' with --save flag
- Export auto_extract_corrections in learnings/mod.rs
- Add comprehensive unit tests for pattern detection

Refs #89
---
 .../terraphim_agent/src/learnings/capture.rs  | 352 ++++++++++++++++++
 crates/terraphim_agent/src/learnings/mod.rs   |   6 +-
 crates/terraphim_agent/src/main.rs            |  62 +++
 3 files changed, 417 insertions(+), 3 deletions(-)
diff --git a/crates/terraphim_agent/src/learnings/capture.rs b/crates/terraphim_agent/src/learnings/capture.rs
index c28824cb3..8cf483d32 100644
--- a/crates/terraphim_agent/src/learnings/capture.rs
+++ b/crates/terraphim_agent/src/learnings/capture.rs
@@ -979,6 +979,223 @@ impl ScoredEntry {
     }
 }
 
+/// JSONL transcript entry types for auto-extraction.
+#[derive(Debug, Clone, Deserialize)]
+pub struct TranscriptEntry {
+    #[serde(default)]
+    pub r#type: Option<String>,
+    #[serde(default)]
+    pub content: Option<String>,
+    #[serde(default)]
+    pub tool_name: Option<String>,
+    #[serde(default)]
+    pub tool_input: Option<serde_json::Value>,
+    #[serde(default)]
+    pub tool_result: Option<serde_json::Value>,
+    #[serde(default)]
+    pub exit_code: Option<i32>,
+    #[serde(default)]
+    pub error: Option<String>,
+}
+
+/// Check if content contains explicit correction phrases.
+fn contains_correction_phrase(content: &str) -> Option<(String, String)> {
+    let lower = content.to_lowercase();
+
+    // Pattern: "instead use X" or "use X instead"
+    if let Some(idx) = lower.find("instead use") {
+        let after = &content[idx + 11..];
+        return Some((content.to_string(), after.trim().to_string()));
+    }
+    if let Some(idx) = lower.find("use ") {
+        let rest = &lower[idx + 4..];
+        if rest.contains("instead") {
+            let end = rest.find("instead").unwrap_or(rest.len());
+            let tool = &content[idx + 4..idx + 4 + end].trim();
+            return Some((content.to_string(), tool.to_string()));
+        }
+    }
+
+    // Pattern: "should be"
+    if let Some(idx) = lower.find("should be") {
+        let after = &content[idx + 9..];
+        return Some((content.to_string(), after.trim().to_string()));
+    }
+
+    // Pattern: "correct way"
+    if let Some(idx) = lower.find("correct way") {
+        let after = &content[idx + 11..];
+        // Look for "is to" or "to"
+        if after.contains("is to") {
+            let start = after.find("is to").unwrap_or(0) + 5;
+            return Some((content.to_string(), after[start..].trim().to_string()));
+        }
+        return Some((content.to_string(), after.trim().to_string()));
+    }
+
+    // Pattern: "use X not Y" or "use X, not Y"
+    if let Some(idx) = lower.find("use ") {
+        let rest = &content[idx + 4..];
+        let lower_rest = rest.to_lowercase();
+        if let Some(not_idx) = lower_rest.find(" not ") {
+            let tool = rest[..not_idx].trim();
+            // Find the end of the old tool (rest of string or next word boundary)
+            let old_tool_rest = &rest[not_idx + 5..];
+            let old_tool = old_tool_rest
+                .split_whitespace()
+                .next()
+                .unwrap_or(old_tool_rest)
+                .trim();
+            return Some((old_tool.to_string(), tool.to_string()));
+        }
+    }
+
+    None
+}
+
+/// Extract command from Bash tool input.
+fn extract_command_from_input(input: &serde_json::Value) -> Option<String> {
+    input
+        .get("command")
+        .or_else(|| input.get("cmd"))
+        .and_then(|v| v.as_str())
+        .map(|s| s.to_string())
+}
+
+/// Auto-extract corrections from a JSONL session transcript.
+///
+/// Scans the transcript line by line and identifies:
+/// 1. Failed Bash commands (exit code != 0) followed by successful variants
+/// 2. Explicit correction phrases like "instead use", "should be", etc.
+///
+/// # Arguments
+///
+/// * `transcript_path` - Path to the JSONL transcript file
+///
+/// # Returns
+///
+/// Vector of extracted CorrectionEvent objects.
+pub fn auto_extract_corrections(
+    transcript_path: &std::path::Path,
+) -> Result<Vec<CorrectionEvent>, LearningError> {
+    use std::io::BufRead;
+
+    let file = fs::File::open(transcript_path)?;
+    let reader = std::io::BufReader::new(file);
+
+    let mut corrections = Vec::new();
+    let mut last_failed_command: Option<(String, i32, String)> = None; // (command, exit_code, error)
+
+    for line in reader.lines() {
+        let line = line?;
+        if line.trim().is_empty() {
+            continue;
+        }
+
+        let entry: TranscriptEntry = match serde_json::from_str(&line) {
+            Ok(e) => e,
+            Err(_) => continue, // Skip malformed lines
+        };
+
+        // Check for Bash tool results with exit codes
+        if entry.tool_name.as_deref() == Some("Bash")
+            || entry.r#type.as_deref() == Some("tool_result")
+        {
+            // Check if this is a failed Bash command
+            if let Some(exit_code) = entry.exit_code {
+                if exit_code != 0 {
+                    // Extract the command from tool_input in previous context or from error
+                    if let Some(ref tool_input) = entry.tool_input {
+                        if let Some(cmd) = extract_command_from_input(tool_input) {
+                            let error = entry
+                                .error
+                                .clone()
+                                .or_else(|| entry.content.clone())
+                                .unwrap_or_default();
+                            last_failed_command = Some((cmd, exit_code, error));
+                        }
+                    }
+                } else if exit_code == 0 {
+                    // Successful command - check if we had a previous failure
+                    if let Some((failed_cmd, failed_exit, failed_error)) =
+                        last_failed_command.take()
+                    {
+                        // Extract the successful command
+                        if let Some(ref tool_input) = entry.tool_input {
+                            if let Some(success_cmd) = extract_command_from_input(tool_input) {
+                                // Only create correction if commands are different
+                                if failed_cmd != success_cmd {
+                                    let context = format!(
+                                        "Auto-extracted from session transcript. Failed with exit {}: {}",
+                                        failed_exit, failed_error
+                                    );
+                                    let correction = CorrectionEvent::new(
+                                        CorrectionType::ToolPreference,
+                                        failed_cmd,
+                                        success_cmd,
+                                        context,
+                                        LearningSource::Project,
+                                    )
+                                    .with_tags(vec![
+                                        "auto-extracted".to_string(),
+                                        "transcript".to_string(),
+                                    ]);
+                                    corrections.push(correction);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Check for explicit correction phrases in content
+        if let Some(ref content) = entry.content {
+            if let Some((original, corrected)) = contains_correction_phrase(content) {
+                let context = format!(
+                    "Auto-extracted from session transcript content: {}",
+                    content.chars().take(100).collect::<String>()
+                );
+                let correction = CorrectionEvent::new(
+                    CorrectionType::Other("phrase-detected".to_string()),
+                    original,
+                    corrected,
+                    context,
+                    LearningSource::Project,
+                )
+                .with_tags(vec!["auto-extracted".to_string(), "phrase".to_string()]);
+                corrections.push(correction);
+            }
+        }
+
+        // Also check in tool_result if it's a string
+        if let Some(ref tool_result) = entry.tool_result {
+            if let Some(content) = tool_result.as_str() {
+                if let Some((original, corrected)) = contains_correction_phrase(content) {
+                    let context = format!(
+                        "Auto-extracted from tool result: {}",
+                        content.chars().take(100).collect::<String>()
+                    );
+                    let correction = CorrectionEvent::new(
+                        CorrectionType::Other("phrase-detected".to_string()),
+                        original,
+                        corrected,
+                        context,
+                        LearningSource::Project,
+                    )
+                    .with_tags(vec![
+                        "auto-extracted".to_string(),
+                        "tool-result".to_string(),
+                    ]);
+                    corrections.push(correction);
+                }
+            }
+        }
+    }
+
+    Ok(corrections)
+}
+
 /// Suggest learnings based on context relevance.
 ///
 /// Takes a context string (e.g., current working directory or task description),
@@ -1439,4 +1656,139 @@ mod tests {
         assert!(correction_entry.summary().contains("npm"));
         assert!(correction_entry.summary().contains("bun"));
     }
+
+    #[test]
+    fn test_contains_correction_phrase_instead_use() {
+        let content = "You should instead use cargo build";
+        let result = contains_correction_phrase(content);
+        assert!(result.is_some());
+        let (original, _corrected) = result.unwrap();
+        assert!(original.contains("You should"));
+    }
+
+    #[test]
+    fn test_contains_correction_phrase_use_instead() {
+        let content = "Use bun instead of npm for faster installs";
+        let result = contains_correction_phrase(content);
+        assert!(result.is_some());
+        let (original, _corrected) = result.unwrap();
+        assert!(original.contains("Use bun"));
+    }
+
+    #[test]
+    fn test_contains_correction_phrase_should_be() {
+        let content = "The variable name should be user_count";
+        let result = contains_correction_phrase(content);
+        assert!(result.is_some());
+        let (original, _corrected) = result.unwrap();
+        assert!(original.contains("variable name"));
+    }
+
+    #[test]
+    fn test_contains_correction_phrase_correct_way() {
+        let content = "The correct way is to use cargo check first";
+        let result = contains_correction_phrase(content);
+        assert!(result.is_some());
+        let (original, _corrected) = result.unwrap();
+        assert!(original.contains("The correct way"));
+    }
+
+    #[test]
+    fn test_contains_correction_phrase_use_not() {
+        let content = "Use yarn not npm for this project";
+        let result = contains_correction_phrase(content);
+        assert!(result.is_some());
+        let (original, corrected) = result.unwrap();
+        assert_eq!(original, "npm");
+        assert_eq!(corrected, "yarn");
+    }
+
+    #[test]
+    fn test_contains_correction_phrase_no_match() {
+        let content = "This is just a normal sentence without corrections";
+        let result = contains_correction_phrase(content);
+        assert!(result.is_none());
+    }
+
+    #[test]
+    fn test_auto_extract_corrections_from_transcript() {
+        use std::io::Write;
+
+        let temp_dir = TempDir::new().unwrap();
+        let storage = temp_dir.path().join("learnings");
+        fs::create_dir(&storage).unwrap();
+
+        // Create a mock transcript with failed then successful commands
+        let transcript_path = temp_dir.path().join("session.jsonl");
+        let transcript_content = r#"
+{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git push -f"}}
+{"type": "tool_result", "tool_name": "Bash", "exit_code": 1, "error": "remote: rejected", "tool_input": {"command": "git push -f"}}
+{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git push origin main"}}
+{"type": "tool_result", "tool_name": "Bash", "exit_code": 0, "tool_input": {"command": "git push origin main"}}
+{"content": "You should instead use cargo check before building"}
+"#;
+        let mut file = fs::File::create(&transcript_path).unwrap();
+        file.write_all(transcript_content.as_bytes()).unwrap();
+
+        let corrections = auto_extract_corrections(&transcript_path).unwrap();
+
+        // Should find at least 2 corrections: the command fix + the phrase
+        assert!(
+            corrections.len() >= 2,
+            "Expected at least 2 corrections, got {}",
+            corrections.len()
+        );
+
+        // Check for the command correction
+        let cmd_correction = corrections
+            .iter()
+            .find(|c| c.original == "git push -f" && c.corrected == "git push origin main");
+        assert!(
+            cmd_correction.is_some(),
+            "Should find command correction: git push -f -> git push origin main"
+        );
+
+        // Check for the phrase correction
+        let phrase_correction = corrections
+            .iter()
+            .find(|c| c.corrected.contains("cargo check"));
+        assert!(
+            phrase_correction.is_some(),
+            "Should find phrase correction containing 'cargo check'"
+        );
+    }
+
+    #[test]
+    fn test_auto_extract_corrections_empty_transcript() {
+        let temp_dir = TempDir::new().unwrap();
+
+        // Create an empty transcript
+        let transcript_path = temp_dir.path().join("empty.jsonl");
+        fs::write(&transcript_path, "").unwrap();
+
+        let corrections = auto_extract_corrections(&transcript_path).unwrap();
+        assert!(corrections.is_empty());
+    }
+
+    #[test]
+    fn test_auto_extract_corrections_no_failures() {
+        use std::io::Write;
+
+        let temp_dir = TempDir::new().unwrap();
+
+        // Create a transcript with only successful commands
+        let transcript_path = temp_dir.path().join("success.jsonl");
+        let transcript_content = r#"
+{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git status"}}
+{"type": "tool_result", "tool_name": "Bash", "exit_code": 0, "tool_input": {"command": "git status"}}
+{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git log"}}
+{"type": "tool_result", "tool_name": "Bash", "exit_code": 0, "tool_input": {"command": "git log"}}
+"#;
+        let mut file = fs::File::create(&transcript_path).unwrap();
+        file.write_all(transcript_content.as_bytes()).unwrap();
+
+        let corrections = auto_extract_corrections(&transcript_path).unwrap();
+        // No corrections since all commands succeeded
+        assert!(corrections.is_empty());
+    }
 }
diff --git a/crates/terraphim_agent/src/learnings/mod.rs b/crates/terraphim_agent/src/learnings/mod.rs
index ff139165d..20931c779 100644
--- a/crates/terraphim_agent/src/learnings/mod.rs
+++ b/crates/terraphim_agent/src/learnings/mod.rs
@@ -32,9 +32,9 @@ mod redaction;
 
 #[allow(unused_imports)]
 pub use capture::{
-    CorrectionType, LearningEntry, LearningSource, ScoredEntry, capture_correction,
-    capture_failed_command, correct_learning, list_all_entries, query_all_entries,
-    suggest_learnings,
+    CorrectionType, LearningEntry, LearningSource, ScoredEntry, auto_extract_corrections,
+    capture_correction, capture_failed_command, correct_learning, list_all_entries,
+    query_all_entries, suggest_learnings,
 };
 
 // Re-export for testing - not used by CLI yet
diff --git a/crates/terraphim_agent/src/main.rs b/crates/terraphim_agent/src/main.rs
index d806ff362..3aae92f36 100644
--- a/crates/terraphim_agent/src/main.rs
+++ b/crates/terraphim_agent/src/main.rs
@@ -800,6 +800,17 @@ enum LearnSub {
         #[arg(value_enum)]
         agent: learnings::AgentType,
     },
+    /// Auto-extract corrections from session transcript
+    AutoExtract {
+        /// Path to JSONL transcript file
+        transcript_path: String,
+        /// Save extracted corrections to storage
+        #[arg(long, default_value_t = false)]
+        save: bool,
+        /// Show global learnings instead of project
+        #[arg(long, default_value_t = false)]
+        global: bool,
+    },
 }
 
 #[derive(Subcommand, Debug)]
@@ -2155,6 +2166,57 @@ async fn run_learn_command(sub: LearnSub) -> Result<()> {
         LearnSub::InstallHook { agent } => {
             learnings::install_hook(agent).await.map_err(|e| e.into())
         }
+        LearnSub::AutoExtract {
+            transcript_path,
+            save,
+            global,
+        } => {
+            use learnings::auto_extract_corrections;
+            use std::path::PathBuf;
+
+            let storage_loc = config.storage_location();
+            let storage_dir = if global {
+                &config.global_dir
+            } else {
+                &storage_loc
+            };
+            let transcript_path = PathBuf::from(transcript_path);
+
+            match auto_extract_corrections(&transcript_path) {
+                Ok(corrections) => {
+                    if corrections.is_empty() {
+                        println!("No corrections found in transcript.");
+                    } else {
+                        println!("Extracted {} correction(s):", corrections.len());
+                        for (i, correction) in corrections.iter().enumerate() {
+                            println!("  {}. Type: {}", i + 1, correction.correction_type);
+                            println!("     Original: {}", correction.original);
+                            println!("     Corrected: {}", correction.corrected);
+                            if !correction.context_description.is_empty() {
+                                println!("     Context: {}", correction.context_description);
+                            }
+                            if save {
+                                // Save the correction to storage
+                                let filename = format!("correction-{}.md", correction.id);
+                                let filepath = storage_dir.join(&filename);
+                                match std::fs::write(&filepath, correction.to_markdown()) {
+                                    Ok(_) => println!("     Saved: {}", filepath.display()),
+                                    Err(e) => eprintln!("     Error saving: {}", e),
+                                }
+                            }
+                        }
+                        if save {
+                            println!("\nAll corrections saved to: {}", storage_dir.display());
+                        }
+                    }
+                    Ok(())
+                }
+                Err(e) => {
+                    eprintln!("Failed to extract corrections: {}", e);
+                    Err(e.into())
+                }
+            }
+        }
     }
 }