22 changes: 11 additions & 11 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion Cargo.toml
@@ -4,7 +4,8 @@ resolver = "2"

[workspace.package]
edition = "2021"
version = "0.3.24"
version = "0.3.25"




7 changes: 7 additions & 0 deletions crates/tower-runtime/src/local.rs
@@ -269,6 +269,13 @@ async fn execute_local_app(opts: StartOptions, sx: oneshot::Sender<i32>, cancel_
    return Ok(())
}

impl Drop for LocalApp {
    fn drop(&mut self) {
        // We want to ensure that we cancel the process if it is still running.
        let _ = self.terminate();
    }
}

impl App for LocalApp {
    async fn start(opts: StartOptions) -> Result<Self, Error> {
        let cancel_token = CancellationToken::new();
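Note: the new Drop impl is the standard RAII cleanup pattern. If a LocalApp handle goes out of scope while its child process is still running, terminate() gets a last chance to stop it. A minimal, self-contained sketch of the same idea (ProcGuard, its spawn() arguments, and its terminate() body are hypothetical stand-ins, not the actual LocalApp implementation):

```rust
use std::process::{Child, Command};

/// Hypothetical stand-in for a type like LocalApp that owns a child process.
struct ProcGuard {
    child: Child,
}

impl ProcGuard {
    fn spawn() -> std::io::Result<Self> {
        // "sleep 30" is only a placeholder child process for the example.
        Ok(Self { child: Command::new("sleep").arg("30").spawn()? })
    }

    /// Best-effort termination: kill the child and reap it so it cannot linger.
    fn terminate(&mut self) -> std::io::Result<()> {
        self.child.kill()?;
        self.child.wait().map(|_| ())
    }
}

impl Drop for ProcGuard {
    fn drop(&mut self) {
        // Ignore the result: the process may already have exited.
        let _ = self.terminate();
    }
}

fn main() -> std::io::Result<()> {
    let _guard = ProcGuard::spawn()?;
    // When _guard is dropped at the end of this scope (or on an early return or
    // panic unwind), Drop runs and the child process is terminated.
    Ok(())
}
```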
4 changes: 4 additions & 0 deletions crates/tower-uv/src/lib.rs
@@ -78,6 +78,7 @@ impl Uv {
debug!("Executing UV ({:?}) venv in {:?}", &self.uv_path, cwd);

let child = Command::new(&self.uv_path)
.kill_on_drop(true)
.stdin(Stdio::null())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
@@ -95,6 +96,7 @@ impl Uv {
if cwd.join("pyproject.toml").exists() {
debug!("Executing UV ({:?}) sync in {:?}", &self.uv_path, cwd);
let child = Command::new(&self.uv_path)
.kill_on_drop(true)
.stdin(Stdio::null())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
@@ -112,6 +114,7 @@

        // If there is a requirements.txt, then we can use that to sync.
        let child = Command::new(&self.uv_path)
            .kill_on_drop(true)
            .stdin(Stdio::null())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
@@ -138,6 +141,7 @@ impl Uv {
debug!("Executing UV ({:?}) run {:?} in {:?}", &self.uv_path, program, cwd);

let child = Command::new(&self.uv_path)
.kill_on_drop(true)
.stdin(Stdio::null())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
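Note: kill_on_drop(true) is an existing option on tokio::process::Command; it tells the runtime to kill the child process if the Child handle is dropped before the process exits, which covers early returns and cancelled futures. A minimal sketch of that behavior, assuming a tokio dependency with the process and time features enabled and using a placeholder sleep command:

```rust
use std::process::Stdio;
use tokio::process::Command;
use tokio::time::{timeout, Duration};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mut child = Command::new("sleep")
        .arg("60")
        .kill_on_drop(true) // kill the child if this handle is dropped before it exits
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()?;

    // Wait up to one second for the child to finish on its own.
    let result = timeout(Duration::from_secs(1), child.wait()).await;
    match result {
        Ok(status) => println!("exited: {}", status?),
        Err(_elapsed) => {
            // Dropping the handle is enough: kill_on_drop(true) terminates the
            // still-running process instead of leaving it orphaned.
            drop(child);
            println!("timed out; child killed on drop");
        }
    }
    Ok(())
}
```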
14 changes: 8 additions & 6 deletions pyproject.toml
@@ -4,7 +4,8 @@ build-backend = "maturin"

[project]
name = "tower"
version = "0.3.24"
version = "0.3.25"




@@ -43,14 +44,15 @@ dependencies = [
"attrs==24.2.0",
"httpx==0.28.1",
"huggingface-hub>=0.34.3",
"ollama>=0.4.7",
"pyiceberg==0.9.0",
"ollama>=0.5.3",
"pydantic-core==2.27.0",
"pyiceberg==0.9.1",
"python-dateutil==2.9.0.post0",
]

[project.optional-dependencies]
ai = ["huggingface-hub==0.34.3", "ollama==0.4.7"]
iceberg = ["polars==1.27.1", "pyarrow==19.0.1", "pyiceberg==0.9.0"]
ai = ["huggingface-hub==0.34.3", "ollama==0.5.3"]
iceberg = ["polars==1.27.1", "pyarrow==19.0.1", "pyiceberg==0.9.1"]
all = ["tower[ai,iceberg]"]

[tool.maturin]
@@ -71,5 +73,5 @@ dev = [
"pytest==8.3.5",
"pytest-httpx==0.35.0",
"pytest-env>=1.1.3",
"pyiceberg[sql-sqlite]==0.9.0",
"pyiceberg[sql-sqlite]==0.9.1",
]
75 changes: 73 additions & 2 deletions src/tower/_llms.py
@@ -384,6 +384,43 @@ def resolve_hugging_face_hub_model_name(ctx: TowerContext, requested_model: str)


class Llm:
"""
This class provides a unified interface for interacting with language models through
different inference providers (e.g. Ollama for local inference, Hugging Face Hub for remote).
It abstracts away model name resolution, inference provider selection, and local/remote inference API differences
to provide a consistent interface for text generation tasks.

The class supports both chat-based interactions (similar to OpenAI Chat Completions API)
and simple prompt-based interactions (similar to legacy OpenAI Completions API).

This class is typically instantiated through the llms() factory function rather than
directly.

Attributes:
context (TowerContext): The Tower context containing configuration and settings.
requested_model_name (str): The original model name requested by the user.
model_name (str): The resolved model name after provider-specific resolution.
max_tokens (int): Maximum number of tokens to generate in responses.
inference_router (str): The inference router to use (e.g., "ollama", "hugging_face_hub").
inference_provider (str): The inference provider (same as router when in local mode).
inference_router_api_key (str): API key for the inference router if required.

Example:
# Create an Llm instance (typically done via the llms() factory function)
llm = tower.llms("llama3.2", max_tokens=1000)

# Use for chat completions
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"}
]
response = llm.complete_chat(messages)

# Use for simple prompts
response = llm.prompt("What is the capital of France?")

"""

def __init__(self, context: TowerContext, model_name: str, max_tokens: int = 1000):
"""
Wraps up interfacing with a language model in the Tower system.
@@ -475,11 +512,45 @@ def prompt(self, prompt: str) -> str:
"content": prompt,
}])

def llms(model_name: str) -> Llm:
def llms(model_name: str, max_tokens: int = 1000) -> Llm:
"""
This factory function creates an Llm instance configured with the specified model parameters.
It automatically resolves the model name based on the available inference providers
(Ollama for local inference, Hugging Face Hub for remote).
The max_tokens parameter is used to set the maximum number of tokens to generate in responses.

Args:
model_name: Can be a model family name (e.g., "llama3.2", "gemma3.2", "deepseek-r1")
or a specific model identifier (e.g., "deepseek-r1:14b", "deepseek-ai/DeepSeek-R1-0528").
The function will automatically resolve the exact model name based on
available models in the configured inference provider.
max_tokens: Maximum number of tokens to generate in responses. Defaults to 1000.

Returns:
Llm: A configured language model instance that can be used for text generation,
chat completions, and other language model interactions.

Raises:
ValueError: If the configured inference router is not supported or if the model
cannot be resolved.

Example:
# Create a language model instance
llm = llms("llama3.2", max_tokens=500)

# Use for chat completions
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"}
]
response = llm.complete_chat(messages)

"""
    ctx = TowerContext.build()
    return Llm(
        context = ctx,
        model_name=model_name
        model_name=model_name,
        max_tokens=max_tokens
    )

def extract_ollama_message(resp: ChatResponse) -> str: