-
Notifications
You must be signed in to change notification settings - Fork 4.3k
feat: add LlamaCppInterpreter for toolshim, fix streaming chunk handling #8281
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,13 +13,15 @@ | |
| use crate::conversation::message::{Message, MessageContent, ToolRequest}; | ||
| use crate::conversation::Conversation; | ||
| #[cfg(test)] | ||
| use crate::providers::base::stream_from_single_message; | ||
| use crate::providers::base::{MessageStream, Provider, ProviderUsage}; | ||
| use crate::providers::errors::ProviderError; | ||
| use crate::providers::toolshim::{ | ||
| augment_message_with_tool_calls, convert_tool_messages_to_text, | ||
| modify_system_prompt_for_tool_json, OllamaInterpreter, | ||
| }; | ||
| #[cfg(feature = "local-inference")] | ||
| use crate::providers::toolshim::LlamaCppInterpreter; | ||
| use rmcp::model::Tool; | ||
|
|
||
| async fn enhance_model_error(error: ProviderError, provider: &Arc<dyn Provider>) -> ProviderError { | ||
|
|
@@ -123,6 +125,32 @@ | |
| response: Message, | ||
| toolshim_tools: &[Tool], | ||
| ) -> Result<Message, ProviderError> { | ||
| // Try llama.cpp interpreter first (no external server needed). | ||
| // Fall back to Ollama if creation OR augmentation fails. | ||
| #[cfg(feature = "local-inference")] | ||
| { | ||
| match LlamaCppInterpreter::new() { | ||
| Ok(interpreter) => { | ||
| match augment_message_with_tool_calls(&interpreter, response.clone(), toolshim_tools).await { | ||
| Ok(msg) => return Ok(msg), | ||
| Err(e) => { | ||
| tracing::debug!( | ||
| "LlamaCpp augmentation failed ({}), falling back to Ollama", | ||
| e | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| Err(e) => { | ||
| tracing::debug!( | ||
| "LlamaCppInterpreter unavailable ({}), falling back to Ollama", | ||
| e | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Fallback to Ollama interpreter | ||
| let interpreter = OllamaInterpreter::new().map_err(|e| { | ||
| ProviderError::ExecutionError(format!("Failed to create OllamaInterpreter: {}", e)) | ||
| })?; | ||
|
|
@@ -312,20 +340,53 @@ | |
| }; | ||
|
|
||
| Ok(Box::pin(try_stream! { | ||
| let mut accumulated_message: Option<Message> = None; | ||
| let mut stream_done = false; | ||
| while let Some(result) = stream.next().await { | ||
| let (mut message, usage) = result?; | ||
| let (message, usage) = result?; | ||
|
|
||
| // Store the model information in the global store | ||
| if let Some(usage) = usage.as_ref() { | ||
| crate::providers::base::set_current_model(&usage.model); | ||
| } | ||
|
|
||
| // Post-process / structure the response only if tool interpretation is enabled | ||
| if message.is_some() && config.toolshim { | ||
| message = Some(toolshim_postprocess(message.unwrap(), &toolshim_tools).await?); | ||
| // Accumulate message content across streaming chunks. | ||
| // Each chunk may contain only a delta; we keep the latest | ||
| // complete message snapshot for toolshim post-processing. | ||
| if let Some(msg) = &message { | ||
| accumulated_message = Some(msg.clone()); | ||
| } | ||
|
|
||
| yield (message, usage); | ||
| // Detect stream completion: usage present or message without | ||
| // further chunks. Only apply toolshim on the final message so | ||
| // the interpreter sees complete text, not fragments. | ||
| let is_final = usage.is_some(); | ||
|
|
||
| if config.toolshim && is_final { | ||
| if let Some(msg) = accumulated_message.take() { | ||
| let augmented = toolshim_postprocess(msg, &toolshim_tools).await?; | ||
| yield (Some(augmented), usage); | ||
| stream_done = true; | ||
| continue; | ||
| } | ||
| } | ||
|
|
||
| if !config.toolshim { | ||
| yield (message, usage); | ||
| } else { | ||
| // In toolshim mode, yield intermediate chunks as-is for | ||
| // streaming display, but defer tool interpretation to the end. | ||
| yield (message, usage); | ||
| } | ||
| } | ||
|
|
||
| // If the stream ended without usage (some OpenAI-compatible providers | ||
| // omit it), run toolshim on whatever we accumulated. | ||
| if config.toolshim && !stream_done { | ||
| if let Some(msg) = accumulated_message.take() { | ||
| let augmented = toolshim_postprocess(msg, &toolshim_tools).await?; | ||
| yield (Some(augmented), None); | ||
|
Comment on lines
+385
to
+388
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
In toolshim mode, chunks are already yielded inside the loop (the `else` branch forwards every intermediate chunk as-is), but the post-loop fallback then emits the augmented accumulated message as well — so when the stream ends without usage, consumers receive the final content a second time. Useful? React with 👍 / 👎. |
||
| } | ||
| } | ||
| })) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`accumulated_message` is overwritten with each incoming chunk, so the final toolshim pass only sees the last fragment instead of the full assistant response. This breaks tool extraction for delta-style streams (for example, OpenAI-format streams emit per-chunk deltas), where tool-call JSON is split across many chunks; the interpreter then receives an incomplete suffix and typically returns no tool calls. Useful? React with 👍 / 👎.