Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions crates/goose/src/agents/moim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ pub async fn inject_moim(
!issue.contains("Merged consecutive user messages")
&& !issue.contains("Merged consecutive assistant messages")
&& !issue.contains("Added placeholder to empty tool result")
&& !issue.contains("Merged text content")
&& !issue.contains("Removed trailing assistant message")
&& !issue.contains("Trimmed trailing whitespace from assistant message")
});

if has_unexpected_issues {
Expand Down
17 changes: 12 additions & 5 deletions crates/goose/src/agents/platform_extensions/summon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,10 @@ struct AgentMetadata {
model: Option<String>,
}

fn parse_frontmatter<T: for<'de> Deserialize<'de>>(content: &str) -> Option<(T, String)> {
fn parse_frontmatter<T: for<'de> Deserialize<'de>>(
content: &str,
source_path: Option<&Path>,
) -> Option<(T, String)> {
let parts: Vec<&str> = content.split("---").collect();
if parts.len() < 3 {
return None;
Expand All @@ -159,7 +162,11 @@ fn parse_frontmatter<T: for<'de> Deserialize<'de>>(content: &str) -> Option<(T,
let metadata: T = match serde_yaml::from_str(yaml_content) {
Ok(m) => m,
Err(e) => {
warn!("Failed to parse frontmatter: {}", e);
if let Some(path) = source_path {
tracing::debug!("Failed to parse frontmatter in {}: {}", path.display(), e);
} else {
tracing::debug!("Failed to parse frontmatter: {}", e);
}
return None;
}
};
Expand All @@ -169,7 +176,7 @@ fn parse_frontmatter<T: for<'de> Deserialize<'de>>(content: &str) -> Option<(T,
}

fn parse_skill_content(content: &str, path: PathBuf) -> Option<Source> {
let (metadata, body): (SkillMetadata, String) = parse_frontmatter(content)?;
let (metadata, body): (SkillMetadata, String) = parse_frontmatter(content, Some(&path))?;

if metadata.name.contains('/') {
warn!(
Expand All @@ -190,7 +197,7 @@ fn parse_skill_content(content: &str, path: PathBuf) -> Option<Source> {
}

fn parse_agent_content(content: &str, path: PathBuf) -> Option<Source> {
let (metadata, body): (AgentMetadata, String) = parse_frontmatter(content)?;
let (metadata, body): (AgentMetadata, String) = parse_frontmatter(content, Some(&path))?;

let description = metadata.description.unwrap_or_else(|| {
let model_info = metadata
Expand Down Expand Up @@ -1589,7 +1596,7 @@ impl SummonClient {
};

let (metadata, _): (AgentMetadata, String) =
parse_frontmatter(&agent_content).ok_or("Failed to parse agent frontmatter")?;
parse_frontmatter(&agent_content, None).ok_or("Failed to parse agent frontmatter")?;

let model = metadata.model;

Expand Down
71 changes: 66 additions & 5 deletions crates/goose/src/agents/reply_parts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
use crate::conversation::message::{Message, MessageContent, ToolRequest};
use crate::conversation::Conversation;
#[cfg(test)]
use crate::providers::base::stream_from_single_message;

Check warning on line 16 in crates/goose/src/agents/reply_parts.rs

View workflow job for this annotation

GitHub Actions / Check Rust Code Format

Diff in /home/runner/work/goose/goose/crates/goose/src/agents/reply_parts.rs
use crate::providers::base::{MessageStream, Provider, ProviderUsage};
use crate::providers::errors::ProviderError;
use crate::providers::toolshim::{
augment_message_with_tool_calls, convert_tool_messages_to_text,
modify_system_prompt_for_tool_json, OllamaInterpreter,
};

Check warning on line 22 in crates/goose/src/agents/reply_parts.rs

View workflow job for this annotation

GitHub Actions / Check Rust Code Format

Diff in /home/runner/work/goose/goose/crates/goose/src/agents/reply_parts.rs
#[cfg(feature = "local-inference")]
use crate::providers::toolshim::LlamaCppInterpreter;
use rmcp::model::Tool;

async fn enhance_model_error(error: ProviderError, provider: &Arc<dyn Provider>) -> ProviderError {
Expand Down Expand Up @@ -123,6 +125,32 @@
response: Message,
toolshim_tools: &[Tool],
) -> Result<Message, ProviderError> {
// Try llama.cpp interpreter first (no external server needed).
// Fall back to Ollama if creation OR augmentation fails.
#[cfg(feature = "local-inference")]
{

Check warning on line 131 in crates/goose/src/agents/reply_parts.rs

View workflow job for this annotation

GitHub Actions / Check Rust Code Format

Diff in /home/runner/work/goose/goose/crates/goose/src/agents/reply_parts.rs
match LlamaCppInterpreter::new() {
Ok(interpreter) => {
match augment_message_with_tool_calls(&interpreter, response.clone(), toolshim_tools).await {
Ok(msg) => return Ok(msg),
Err(e) => {
tracing::debug!(
"LlamaCpp augmentation failed ({}), falling back to Ollama",
e
);
}
}
}
Err(e) => {
tracing::debug!(
"LlamaCppInterpreter unavailable ({}), falling back to Ollama",
e
);
}
}
}

// Fallback to Ollama interpreter
let interpreter = OllamaInterpreter::new().map_err(|e| {
ProviderError::ExecutionError(format!("Failed to create OllamaInterpreter: {}", e))
})?;
Expand Down Expand Up @@ -312,20 +340,53 @@
};

Ok(Box::pin(try_stream! {
let mut accumulated_message: Option<Message> = None;
let mut stream_done = false;
while let Some(result) = stream.next().await {
let (mut message, usage) = result?;
let (message, usage) = result?;

// Store the model information in the global store
if let Some(usage) = usage.as_ref() {
crate::providers::base::set_current_model(&usage.model);
}

// Post-process / structure the response only if tool interpretation is enabled
if message.is_some() && config.toolshim {
message = Some(toolshim_postprocess(message.unwrap(), &toolshim_tools).await?);
// Accumulate message content across streaming chunks.
// Each chunk may contain only a delta; we keep the latest
// complete message snapshot for toolshim post-processing.
if let Some(msg) = &message {
accumulated_message = Some(msg.clone());
Comment on lines +356 to +357
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Accumulate full streamed text before running toolshim

accumulated_message is overwritten with each incoming chunk, so the final toolshim pass only sees the last fragment instead of the full assistant response. This breaks tool extraction for delta-style streams (for example, OpenAI-format streams emit per-chunk deltas), where tool-call JSON is split across many chunks; the interpreter then receives an incomplete suffix and typically returns no tool calls.

Useful? React with 👍 / 👎.

}

yield (message, usage);
// Detect stream completion: usage present or message without
// further chunks. Only apply toolshim on the final message so
// the interpreter sees complete text, not fragments.
let is_final = usage.is_some();

if config.toolshim && is_final {
if let Some(msg) = accumulated_message.take() {
let augmented = toolshim_postprocess(msg, &toolshim_tools).await?;
yield (Some(augmented), usage);
stream_done = true;
continue;
}
}

if !config.toolshim {
yield (message, usage);
} else {
// In toolshim mode, yield intermediate chunks as-is for
// streaming display, but defer tool interpretation to the end.
yield (message, usage);
}
}

// If the stream ended without usage (some OpenAI-compatible providers
// omit it), run toolshim on whatever we accumulated.
if config.toolshim && !stream_done {
if let Some(msg) = accumulated_message.take() {
let augmented = toolshim_postprocess(msg, &toolshim_tools).await?;
yield (Some(augmented), None);
Comment on lines +385 to +388
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Prevent duplicate output when usage metadata is absent

In toolshim mode, chunks are already yielded inside the loop, but the post-loop fallback emits accumulated_message again when no usage metadata was seen. For providers that omit usage, this produces duplicated assistant content (at least the final delta chunk, or the full message for snapshot streams), which can corrupt the final transcript and downstream parsing.

Useful? React with 👍 / 👎.

}
}
}))
}
Expand Down
13 changes: 11 additions & 2 deletions crates/goose/src/providers/local_inference.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
pub mod hf_models;
mod inference_emulated_tools;
mod inference_engine;
pub(crate) mod inference_engine;
mod inference_native_tools;
pub mod local_model_registry;
mod tool_parsing;
Expand Down Expand Up @@ -93,7 +93,7 @@
&self.backend
}

fn get_or_create_model_slot(&self, model_id: &str) -> ModelSlot {
pub fn get_or_create_model_slot(&self, model_id: &str) -> ModelSlot {

Check failure on line 96 in crates/goose/src/providers/local_inference.rs

View workflow job for this annotation

GitHub Actions / Check OpenAPI Schema is Up-to-Date

type `LoadedModel` is more private than the item `InferenceRuntime::get_or_create_model_slot`

Check failure on line 96 in crates/goose/src/providers/local_inference.rs

View workflow job for this annotation

GitHub Actions / Build and Test Rust Project

type `LoadedModel` is more private than the item `InferenceRuntime::get_or_create_model_slot`

Check failure on line 96 in crates/goose/src/providers/local_inference.rs

View workflow job for this annotation

GitHub Actions / Lint Rust Code

type `providers::local_inference::inference_engine::LoadedModel` is more private than the item `providers::local_inference::InferenceRuntime::get_or_create_model_slot`
let mut map = self.models.lock().expect("model cache lock poisoned");
map.entry(model_id.to_string())
.or_insert_with(|| Arc::new(Mutex::new(None)))
Expand Down Expand Up @@ -318,6 +318,15 @@
})
}

/// Public wrapper for loading a model, used by the toolshim LlamaCppInterpreter.
pub(crate) fn load_model_sync_public(
runtime: &InferenceRuntime,
model_id: &str,
settings: &crate::providers::local_inference::local_model_registry::ModelSettings,
) -> Result<LoadedModel, ProviderError> {
Self::load_model_sync(runtime, model_id, settings)
}

fn load_model_sync(
runtime: &InferenceRuntime,
model_id: &str,
Expand Down
16 changes: 8 additions & 8 deletions crates/goose/src/providers/local_inference/inference_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub(super) struct GenerationContext<'a> {
pub log: &'a mut RequestLog,
}

pub(super) struct LoadedModel {
pub(crate) struct LoadedModel {
pub model: LlamaModel,
pub template: LlamaChatTemplate,
}
Expand All @@ -30,7 +30,7 @@ pub(super) struct LoadedModel {
/// memory based on the model's KV cache requirements.
///
/// Returns `None` if the model architecture values are unavailable.
pub(super) fn estimate_max_context_for_memory(
pub(crate) fn estimate_max_context_for_memory(
model: &LlamaModel,
runtime: &InferenceRuntime,
) -> Option<usize> {
Expand Down Expand Up @@ -109,7 +109,7 @@ pub(super) fn context_cap(
}
}

pub(super) fn effective_context_size(
pub(crate) fn effective_context_size(
prompt_token_count: usize,
settings: &crate::providers::local_inference::local_model_registry::ModelSettings,
context_limit: usize,
Expand All @@ -129,7 +129,7 @@ pub(super) fn effective_context_size(
needed.min(limit)
}

pub(super) fn build_context_params(
pub(crate) fn build_context_params(
ctx_size: u32,
settings: &crate::providers::local_inference::local_model_registry::ModelSettings,
) -> LlamaContextParams {
Expand Down Expand Up @@ -201,7 +201,7 @@ pub(super) fn build_sampler(

/// Validate prompt tokens against memory limits and compute the effective
/// context size. Returns `(prompt_token_count, effective_ctx)`.
pub(super) fn validate_and_compute_context(
pub(crate) fn validate_and_compute_context(
loaded: &LoadedModel,
runtime: &InferenceRuntime,
prompt_token_count: usize,
Expand Down Expand Up @@ -237,7 +237,7 @@ pub(super) fn validate_and_compute_context(
}

/// Create a llama context and prefill (decode) all prompt tokens.
pub(super) fn create_and_prefill_context<'model>(
pub(crate) fn create_and_prefill_context<'model>(
loaded: &'model LoadedModel,
runtime: &InferenceRuntime,
tokens: &[llama_cpp_2::token::LlamaToken],
Expand All @@ -262,15 +262,15 @@ pub(super) fn create_and_prefill_context<'model>(
}

/// Action to take after processing a generated token piece.
pub(super) enum TokenAction {
pub(crate) enum TokenAction {
Continue,
Stop,
}

/// Run the autoregressive generation loop. Calls `on_piece` for each non-empty
/// token piece. The callback returns `TokenAction::Stop` to break early.
/// Returns the total number of generated tokens.
pub(super) fn generation_loop(
pub(crate) fn generation_loop(
model: &LlamaModel,
ctx: &mut llama_cpp_2::context::LlamaContext<'_>,
settings: &crate::providers::local_inference::local_model_registry::ModelSettings,
Expand Down
Loading
Loading