aaif-goose · eugenio · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026 · chatgpt-codex-connector
diff --git a/crates/goose/src/agents/moim.rs b/crates/goose/src/agents/moim.rs
@@ -36,6 +36,9 @@ pub async fn inject_moim(
             !issue.contains("Merged consecutive user messages")
                 && !issue.contains("Merged consecutive assistant messages")
                 && !issue.contains("Added placeholder to empty tool result")
+                && !issue.contains("Merged text content")
+                && !issue.contains("Removed trailing assistant message")
+                && !issue.contains("Trimmed trailing whitespace from assistant message")
         });
 
         if has_unexpected_issues {

diff --git a/crates/goose/src/agents/platform_extensions/summon.rs b/crates/goose/src/agents/platform_extensions/summon.rs
@@ -149,7 +149,10 @@ struct AgentMetadata {
     model: Option<String>,
 }
 
-fn parse_frontmatter<T: for<'de> Deserialize<'de>>(content: &str) -> Option<(T, String)> {
+fn parse_frontmatter<T: for<'de> Deserialize<'de>>(
+    content: &str,
+    source_path: Option<&Path>,
+) -> Option<(T, String)> {
     let parts: Vec<&str> = content.split("---").collect();
     if parts.len() < 3 {
         return None;
@@ -159,7 +162,11 @@ fn parse_frontmatter<T: for<'de> Deserialize<'de>>(content: &str) -> Option<(T,
     let metadata: T = match serde_yaml::from_str(yaml_content) {
         Ok(m) => m,
         Err(e) => {
-            warn!("Failed to parse frontmatter: {}", e);
+            if let Some(path) = source_path {
+                tracing::debug!("Failed to parse frontmatter in {}: {}", path.display(), e);
+            } else {
+                tracing::debug!("Failed to parse frontmatter: {}", e);
+            }
             return None;
         }
     };
@@ -169,7 +176,7 @@ fn parse_frontmatter<T: for<'de> Deserialize<'de>>(content: &str) -> Option<(T,
 }
 
 fn parse_skill_content(content: &str, path: PathBuf) -> Option<Source> {
-    let (metadata, body): (SkillMetadata, String) = parse_frontmatter(content)?;
+    let (metadata, body): (SkillMetadata, String) = parse_frontmatter(content, Some(&path))?;
 
     if metadata.name.contains('/') {
         warn!(
@@ -190,7 +197,7 @@ fn parse_skill_content(content: &str, path: PathBuf) -> Option<Source> {
 }
 
 fn parse_agent_content(content: &str, path: PathBuf) -> Option<Source> {
-    let (metadata, body): (AgentMetadata, String) = parse_frontmatter(content)?;
+    let (metadata, body): (AgentMetadata, String) = parse_frontmatter(content, Some(&path))?;
 
     let description = metadata.description.unwrap_or_else(|| {
         let model_info = metadata
@@ -1589,7 +1596,7 @@ impl SummonClient {
         };
 
         let (metadata, _): (AgentMetadata, String) =
-            parse_frontmatter(&agent_content).ok_or("Failed to parse agent frontmatter")?;
+            parse_frontmatter(&agent_content, None).ok_or("Failed to parse agent frontmatter")?;
 
         let model = metadata.model;
 

diff --git a/crates/goose/src/agents/reply_parts.rs b/crates/goose/src/agents/reply_parts.rs
@@ -13,13 +13,15 @@
 use crate::conversation::message::{Message, MessageContent, ToolRequest};
 use crate::conversation::Conversation;
 #[cfg(test)]
 use crate::providers::base::stream_from_single_message;
 use crate::providers::base::{MessageStream, Provider, ProviderUsage};
 use crate::providers::errors::ProviderError;
 use crate::providers::toolshim::{
     augment_message_with_tool_calls, convert_tool_messages_to_text,
     modify_system_prompt_for_tool_json, OllamaInterpreter,
 };
+#[cfg(feature = "local-inference")]
+use crate::providers::toolshim::LlamaCppInterpreter;
 use rmcp::model::Tool;
 
 async fn enhance_model_error(error: ProviderError, provider: &Arc<dyn Provider>) -> ProviderError {
@@ -123,6 +125,32 @@
     response: Message,
     toolshim_tools: &[Tool],
 ) -> Result<Message, ProviderError> {
+    // Try llama.cpp interpreter first (no external server needed).
+    // Fall back to Ollama if creation OR augmentation fails.
+    #[cfg(feature = "local-inference")]
+    {
+        match LlamaCppInterpreter::new() {
+            Ok(interpreter) => {
+                match augment_message_with_tool_calls(&interpreter, response.clone(), toolshim_tools).await {
+                    Ok(msg) => return Ok(msg),
+                    Err(e) => {
+                        tracing::debug!(
+                            "LlamaCpp augmentation failed ({}), falling back to Ollama",
+                            e
+                        );
+                    }
+                }
+            }
+            Err(e) => {
+                tracing::debug!(
+                    "LlamaCppInterpreter unavailable ({}), falling back to Ollama",
+                    e
+                );
+            }
+        }
+    }
+
+    // Fallback to Ollama interpreter
     let interpreter = OllamaInterpreter::new().map_err(|e| {
         ProviderError::ExecutionError(format!("Failed to create OllamaInterpreter: {}", e))
     })?;
@@ -312,20 +340,53 @@
         };
 
         Ok(Box::pin(try_stream! {
+            let mut accumulated_message: Option<Message> = None;
+            let mut stream_done = false;
             while let Some(result) = stream.next().await {
-                let (mut message, usage) = result?;
+                let (message, usage) = result?;
 
                 // Store the model information in the global store
                 if let Some(usage) = usage.as_ref() {
                     crate::providers::base::set_current_model(&usage.model);
                 }
 
-                // Post-process / structure the response only if tool interpretation is enabled
-                if message.is_some() && config.toolshim {
-                    message = Some(toolshim_postprocess(message.unwrap(), &toolshim_tools).await?);
+                // Remember the most recent `Some` message seen on the stream.
+                // NOTE: this overwrites rather than merges, so it holds the full
+                // text only if chunks are cumulative snapshots, not deltas.
+                if let Some(msg) = &message {
+                    accumulated_message = Some(msg.clone());
                 }
 
-                yield (message, usage);
+                // A chunk that carries usage is treated as the end of the
+                // stream. Only apply toolshim on the final message so
+                // the interpreter sees complete text, not fragments.
+                let is_final = usage.is_some();
+
+                if config.toolshim && is_final {
+                    if let Some(msg) = accumulated_message.take() {
+                        let augmented = toolshim_postprocess(msg, &toolshim_tools).await?;
+                        yield (Some(augmented), usage);
+                        stream_done = true;
+                        continue;
+                    }
+                }
+
+                if !config.toolshim {
+                    yield (message, usage);
+                } else {
+                    // In toolshim mode, yield intermediate chunks as-is for
+                    // streaming display, but defer tool interpretation to the end.
+                    yield (message, usage);
+                }
+            }
+
+            // If the stream ended without usage (some OpenAI-compatible providers
+            // omit it), run toolshim on whatever we accumulated.
+            if config.toolshim && !stream_done {
+                if let Some(msg) = accumulated_message.take() {
+                    let augmented = toolshim_postprocess(msg, &toolshim_tools).await?;
+                    yield (Some(augmented), None);
+                }
             }
         }))
     }

diff --git a/crates/goose/src/providers/local_inference.rs b/crates/goose/src/providers/local_inference.rs
@@ -1,6 +1,6 @@
 pub mod hf_models;
 mod inference_emulated_tools;
-mod inference_engine;
+pub(crate) mod inference_engine;
 mod inference_native_tools;
 pub mod local_model_registry;
 mod tool_parsing;
@@ -93,7 +93,7 @@
         &self.backend
     }
 
-    fn get_or_create_model_slot(&self, model_id: &str) -> ModelSlot {
+    pub fn get_or_create_model_slot(&self, model_id: &str) -> ModelSlot {
         let mut map = self.models.lock().expect("model cache lock poisoned");
         map.entry(model_id.to_string())
             .or_insert_with(|| Arc::new(Mutex::new(None)))
@@ -318,6 +318,15 @@
         })
     }
 
+    /// Crate-visible wrapper around `load_model_sync`, used by the toolshim LlamaCppInterpreter.
+    pub(crate) fn load_model_sync_public(
+        runtime: &InferenceRuntime,
+        model_id: &str,
+        settings: &crate::providers::local_inference::local_model_registry::ModelSettings,
+    ) -> Result<LoadedModel, ProviderError> {
+        Self::load_model_sync(runtime, model_id, settings)
+    }
+
     fn load_model_sync(
         runtime: &InferenceRuntime,
         model_id: &str,

diff --git a/crates/goose/src/providers/local_inference/inference_engine.rs b/crates/goose/src/providers/local_inference/inference_engine.rs
@@ -21,7 +21,7 @@ pub(super) struct GenerationContext<'a> {
     pub log: &'a mut RequestLog,
 }
 
-pub(super) struct LoadedModel {
+pub(crate) struct LoadedModel {
     pub model: LlamaModel,
     pub template: LlamaChatTemplate,
 }
@@ -30,7 +30,7 @@ pub(super) struct LoadedModel {
 /// memory based on the model's KV cache requirements.
 ///
 /// Returns `None` if the model architecture values are unavailable.
-pub(super) fn estimate_max_context_for_memory(
+pub(crate) fn estimate_max_context_for_memory(
     model: &LlamaModel,
     runtime: &InferenceRuntime,
 ) -> Option<usize> {
@@ -109,7 +109,7 @@ pub(super) fn context_cap(
     }
 }
 
-pub(super) fn effective_context_size(
+pub(crate) fn effective_context_size(
     prompt_token_count: usize,
     settings: &crate::providers::local_inference::local_model_registry::ModelSettings,
     context_limit: usize,
@@ -129,7 +129,7 @@ pub(super) fn effective_context_size(
     needed.min(limit)
 }
 
-pub(super) fn build_context_params(
+pub(crate) fn build_context_params(
     ctx_size: u32,
     settings: &crate::providers::local_inference::local_model_registry::ModelSettings,
 ) -> LlamaContextParams {
@@ -201,7 +201,7 @@ pub(super) fn build_sampler(
 
 /// Validate prompt tokens against memory limits and compute the effective
 /// context size. Returns `(prompt_token_count, effective_ctx)`.
-pub(super) fn validate_and_compute_context(
+pub(crate) fn validate_and_compute_context(
     loaded: &LoadedModel,
     runtime: &InferenceRuntime,
     prompt_token_count: usize,
@@ -237,7 +237,7 @@ pub(super) fn validate_and_compute_context(
 }
 
 /// Create a llama context and prefill (decode) all prompt tokens.
-pub(super) fn create_and_prefill_context<'model>(
+pub(crate) fn create_and_prefill_context<'model>(
     loaded: &'model LoadedModel,
     runtime: &InferenceRuntime,
     tokens: &[llama_cpp_2::token::LlamaToken],
@@ -262,15 +262,15 @@ pub(super) fn create_and_prefill_context<'model>(
 }
 
 /// Action to take after processing a generated token piece.
-pub(super) enum TokenAction {
+pub(crate) enum TokenAction {
     Continue,
     Stop,
 }
 
 /// Run the autoregressive generation loop. Calls `on_piece` for each non-empty
 /// token piece. The callback returns `TokenAction::Stop` to break early.
 /// Returns the total number of generated tokens.
-pub(super) fn generation_loop(
+pub(crate) fn generation_loop(
     model: &LlamaModel,
     ctx: &mut llama_cpp_2::context::LlamaContext<'_>,
     settings: &crate::providers::local_inference::local_model_registry::ModelSettings,