From 6879f36f8ff6b20efe55c6fb904cb3b4719d1c66 Mon Sep 17 00:00:00 2001 From: Alex Mikhalev Date: Thu, 26 Mar 2026 13:44:42 +0100 Subject: [PATCH] feat(terraphim-agent): add learn auto-extract subcommand for session transcripts Implements automatic extraction of corrections from JSONL session transcripts: - Add auto_extract_corrections() function that scans JSONL files - Detects failed Bash commands followed by successful variants - Identifies explicit correction phrases: 'instead use', 'should be', 'correct way', 'use X not Y' - Add CLI subcommand 'learn auto-extract ' with --save flag - Export auto_extract_corrections in learnings/mod.rs - Add comprehensive unit tests for pattern detection Refs #89 --- .../terraphim_agent/src/learnings/capture.rs | 352 ++++++++++++++++++ crates/terraphim_agent/src/learnings/mod.rs | 6 +- crates/terraphim_agent/src/main.rs | 62 +++ 3 files changed, 417 insertions(+), 3 deletions(-) diff --git a/crates/terraphim_agent/src/learnings/capture.rs b/crates/terraphim_agent/src/learnings/capture.rs index c28824cb3..8cf483d32 100644 --- a/crates/terraphim_agent/src/learnings/capture.rs +++ b/crates/terraphim_agent/src/learnings/capture.rs @@ -979,6 +979,223 @@ impl ScoredEntry { } } +/// JSONL transcript entry types for auto-extraction. +#[derive(Debug, Clone, Deserialize)] +pub struct TranscriptEntry { + #[serde(default)] + pub r#type: Option, + #[serde(default)] + pub content: Option, + #[serde(default)] + pub tool_name: Option, + #[serde(default)] + pub tool_input: Option, + #[serde(default)] + pub tool_result: Option, + #[serde(default)] + pub exit_code: Option, + #[serde(default)] + pub error: Option, +} + +/// Check if content contains explicit correction phrases. 
+fn contains_correction_phrase(content: &str) -> Option<(String, String)> { + let lower = content.to_lowercase(); + + // Pattern: "instead use X" or "use X instead" + if let Some(idx) = lower.find("instead use") { + let after = &content[idx + 11..]; + return Some((content.to_string(), after.trim().to_string())); + } + if let Some(idx) = lower.find("use ") { + let rest = &lower[idx + 4..]; + if rest.contains("instead") { + let end = rest.find("instead").unwrap_or(rest.len()); + let tool = &content[idx + 4..idx + 4 + end].trim(); + return Some((content.to_string(), tool.to_string())); + } + } + + // Pattern: "should be" + if let Some(idx) = lower.find("should be") { + let after = &content[idx + 9..]; + return Some((content.to_string(), after.trim().to_string())); + } + + // Pattern: "correct way" + if let Some(idx) = lower.find("correct way") { + let after = &content[idx + 11..]; + // Look for "is to" or "to" + if after.contains("is to") { + let start = after.find("is to").unwrap_or(0) + 5; + return Some((content.to_string(), after[start..].trim().to_string())); + } + return Some((content.to_string(), after.trim().to_string())); + } + + // Pattern: "use X not Y" or "use X, not Y" + if let Some(idx) = lower.find("use ") { + let rest = &content[idx + 4..]; + let lower_rest = rest.to_lowercase(); + if let Some(not_idx) = lower_rest.find(" not ") { + let tool = rest[..not_idx].trim(); + // Find the end of the old tool (rest of string or next word boundary) + let old_tool_rest = &rest[not_idx + 5..]; + let old_tool = old_tool_rest + .split_whitespace() + .next() + .unwrap_or(old_tool_rest) + .trim(); + return Some((old_tool.to_string(), tool.to_string())); + } + } + + None +} + +/// Extract command from Bash tool input. +fn extract_command_from_input(input: &serde_json::Value) -> Option { + input + .get("command") + .or_else(|| input.get("cmd")) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) +} + +/// Auto-extract corrections from a JSONL session transcript. 
+/// +/// Scans the transcript line by line and identifies: +/// 1. Failed Bash commands (exit code != 0) followed by successful variants +/// 2. Explicit correction phrases like "instead use", "should be", etc. +/// +/// # Arguments +/// +/// * `transcript_path` - Path to the JSONL transcript file +/// +/// # Returns +/// +/// Vector of extracted CorrectionEvent objects. +pub fn auto_extract_corrections( + transcript_path: &std::path::Path, +) -> Result, LearningError> { + use std::io::BufRead; + + let file = fs::File::open(transcript_path)?; + let reader = std::io::BufReader::new(file); + + let mut corrections = Vec::new(); + let mut last_failed_command: Option<(String, i32, String)> = None; // (command, exit_code, error) + + for line in reader.lines() { + let line = line?; + if line.trim().is_empty() { + continue; + } + + let entry: TranscriptEntry = match serde_json::from_str(&line) { + Ok(e) => e, + Err(_) => continue, // Skip malformed lines + }; + + // Check for Bash tool results with exit codes + if entry.tool_name.as_deref() == Some("Bash") + || entry.r#type.as_deref() == Some("tool_result") + { + // Check if this is a failed Bash command + if let Some(exit_code) = entry.exit_code { + if exit_code != 0 { + // Extract the command from tool_input in previous context or from error + if let Some(ref tool_input) = entry.tool_input { + if let Some(cmd) = extract_command_from_input(tool_input) { + let error = entry + .error + .clone() + .or_else(|| entry.content.clone()) + .unwrap_or_default(); + last_failed_command = Some((cmd, exit_code, error)); + } + } + } else if exit_code == 0 { + // Successful command - check if we had a previous failure + if let Some((failed_cmd, failed_exit, failed_error)) = + last_failed_command.take() + { + // Extract the successful command + if let Some(ref tool_input) = entry.tool_input { + if let Some(success_cmd) = extract_command_from_input(tool_input) { + // Only create correction if commands are different + if failed_cmd != 
success_cmd { + let context = format!( + "Auto-extracted from session transcript. Failed with exit {}: {}", + failed_exit, failed_error + ); + let correction = CorrectionEvent::new( + CorrectionType::ToolPreference, + failed_cmd, + success_cmd, + context, + LearningSource::Project, + ) + .with_tags(vec![ + "auto-extracted".to_string(), + "transcript".to_string(), + ]); + corrections.push(correction); + } + } + } + } + } + } + } + + // Check for explicit correction phrases in content + if let Some(ref content) = entry.content { + if let Some((original, corrected)) = contains_correction_phrase(content) { + let context = format!( + "Auto-extracted from session transcript content: {}", + content.chars().take(100).collect::() + ); + let correction = CorrectionEvent::new( + CorrectionType::Other("phrase-detected".to_string()), + original, + corrected, + context, + LearningSource::Project, + ) + .with_tags(vec!["auto-extracted".to_string(), "phrase".to_string()]); + corrections.push(correction); + } + } + + // Also check in tool_result if it's a string + if let Some(ref tool_result) = entry.tool_result { + if let Some(content) = tool_result.as_str() { + if let Some((original, corrected)) = contains_correction_phrase(content) { + let context = format!( + "Auto-extracted from tool result: {}", + content.chars().take(100).collect::() + ); + let correction = CorrectionEvent::new( + CorrectionType::Other("phrase-detected".to_string()), + original, + corrected, + context, + LearningSource::Project, + ) + .with_tags(vec![ + "auto-extracted".to_string(), + "tool-result".to_string(), + ]); + corrections.push(correction); + } + } + } + } + + Ok(corrections) +} + /// Suggest learnings based on context relevance. 
/// "instead use X": the whole content is the original, X is the correction.
#[test]
fn test_contains_correction_phrase_instead_use() {
    let content = "You should instead use cargo build";
    let (original, corrected) =
        contains_correction_phrase(content).expect("'instead use' should be detected");
    assert_eq!(original, content);
    assert_eq!(corrected, "cargo build");
}

/// "use X instead": X is the text between "use" and "instead".
#[test]
fn test_contains_correction_phrase_use_instead() {
    let content = "Use bun instead of npm for faster installs";
    let (original, corrected) =
        contains_correction_phrase(content).expect("'use X instead' should be detected");
    assert!(original.contains("Use bun"));
    assert_eq!(corrected, "bun");
}

/// "should be X": X is everything after the phrase.
#[test]
fn test_contains_correction_phrase_should_be() {
    let content = "The variable name should be user_count";
    let (original, corrected) =
        contains_correction_phrase(content).expect("'should be' should be detected");
    assert!(original.contains("variable name"));
    assert_eq!(corrected, "user_count");
}

/// "correct way is to X": the "is to" lead-in is dropped from the correction.
#[test]
fn test_contains_correction_phrase_correct_way() {
    let content = "The correct way is to use cargo check first";
    let (original, corrected) =
        contains_correction_phrase(content).expect("'correct way' should be detected");
    assert!(original.contains("The correct way"));
    assert_eq!(corrected, "use cargo check first");
}

/// "use X not Y": Y is the original tool, X the corrected one.
#[test]
fn test_contains_correction_phrase_use_not() {
    let content = "Use yarn not npm for this project";
    let result = contains_correction_phrase(content);
    assert!(result.is_some());
    let (original, corrected) = result.unwrap();
    assert_eq!(original, "npm");
    assert_eq!(corrected, "yarn");
}

/// Text without any correction phrase yields no match.
#[test]
fn test_contains_correction_phrase_no_match() {
    let content = "This is just a normal sentence without corrections";
    let result = contains_correction_phrase(content);
    assert!(result.is_none());
}

/// A failed command followed by a different successful one, plus an explicit
/// phrase, produces both kinds of correction.
#[test]
fn test_auto_extract_corrections_from_transcript() {
    let temp_dir = TempDir::new().unwrap();

    // Create a mock transcript with failed then successful commands
    let transcript_path = temp_dir.path().join("session.jsonl");
    let transcript_content = r#"
{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git push -f"}}
{"type": "tool_result", "tool_name": "Bash", "exit_code": 1, "error": "remote: rejected", "tool_input": {"command": "git push -f"}}
{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git push origin main"}}
{"type": "tool_result", "tool_name": "Bash", "exit_code": 0, "tool_input": {"command": "git push origin main"}}
{"content": "You should instead use cargo check before building"}
"#;
    fs::write(&transcript_path, transcript_content).unwrap();

    let corrections = auto_extract_corrections(&transcript_path).unwrap();

    // Should find at least 2 corrections: the command fix + the phrase
    assert!(
        corrections.len() >= 2,
        "Expected at least 2 corrections, got {}",
        corrections.len()
    );

    // Check for the command correction
    let cmd_correction = corrections
        .iter()
        .find(|c| c.original == "git push -f" && c.corrected == "git push origin main");
    assert!(
        cmd_correction.is_some(),
        "Should find command correction: git push -f -> git push origin main"
    );

    // Check for the phrase correction
    let phrase_correction = corrections
        .iter()
        .find(|c| c.corrected.contains("cargo check"));
    assert!(
        phrase_correction.is_some(),
        "Should find phrase correction containing 'cargo check'"
    );
    assert_eq!(
        phrase_correction.unwrap().corrected,
        "cargo check before building"
    );
}

/// An empty transcript yields no corrections.
#[test]
fn test_auto_extract_corrections_empty_transcript() {
    let temp_dir = TempDir::new().unwrap();

    // Create an empty transcript
    let transcript_path = temp_dir.path().join("empty.jsonl");
    fs::write(&transcript_path, "").unwrap();

    let corrections = auto_extract_corrections(&transcript_path).unwrap();
    assert!(corrections.is_empty());
}

/// Successful commands alone never produce a correction.
#[test]
fn test_auto_extract_corrections_no_failures() {
    let temp_dir = TempDir::new().unwrap();

    // Create a transcript with only successful commands
    let transcript_path = temp_dir.path().join("success.jsonl");
    let transcript_content = r#"
{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git status"}}
{"type": "tool_result", "tool_name": "Bash", "exit_code": 0, "tool_input": {"command": "git status"}}
{"type": "tool_use", "tool_name": "Bash", "tool_input": {"command": "git log"}}
{"type": "tool_result", "tool_name": "Bash", "exit_code": 0, "tool_input": {"command": "git log"}}
"#;
    fs::write(&transcript_path, transcript_content).unwrap();

    let corrections = auto_extract_corrections(&transcript_path).unwrap();
    // No corrections since all commands succeeded
    assert!(corrections.is_empty());
}
a/crates/terraphim_agent/src/main.rs +++ b/crates/terraphim_agent/src/main.rs @@ -800,6 +800,17 @@ enum LearnSub { #[arg(value_enum)] agent: learnings::AgentType, }, + /// Auto-extract corrections from session transcript + AutoExtract { + /// Path to JSONL transcript file + transcript_path: String, + /// Save extracted corrections to storage + #[arg(long, default_value_t = false)] + save: bool, + /// Show global learnings instead of project + #[arg(long, default_value_t = false)] + global: bool, + }, } #[derive(Subcommand, Debug)] @@ -2155,6 +2166,57 @@ async fn run_learn_command(sub: LearnSub) -> Result<()> { LearnSub::InstallHook { agent } => { learnings::install_hook(agent).await.map_err(|e| e.into()) } + LearnSub::AutoExtract { + transcript_path, + save, + global, + } => { + use learnings::auto_extract_corrections; + use std::path::PathBuf; + + let storage_loc = config.storage_location(); + let storage_dir = if global { + &config.global_dir + } else { + &storage_loc + }; + let transcript_path = PathBuf::from(transcript_path); + + match auto_extract_corrections(&transcript_path) { + Ok(corrections) => { + if corrections.is_empty() { + println!("No corrections found in transcript."); + } else { + println!("Extracted {} correction(s):", corrections.len()); + for (i, correction) in corrections.iter().enumerate() { + println!(" {}. 
Type: {}", i + 1, correction.correction_type); + println!(" Original: {}", correction.original); + println!(" Corrected: {}", correction.corrected); + if !correction.context_description.is_empty() { + println!(" Context: {}", correction.context_description); + } + if save { + // Save the correction to storage + let filename = format!("correction-{}.md", correction.id); + let filepath = storage_dir.join(&filename); + match std::fs::write(&filepath, correction.to_markdown()) { + Ok(_) => println!(" Saved: {}", filepath.display()), + Err(e) => eprintln!(" Error saving: {}", e), + } + } + } + if save { + println!("\nAll corrections saved to: {}", storage_dir.display()); + } + } + Ok(()) + } + Err(e) => { + eprintln!("Failed to extract corrections: {}", e); + Err(e.into()) + } + } + } } }