Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions docs/providers/dmr/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,29 +64,49 @@ models:
model: ai/qwen3
max_tokens: 8192
provider_opts:
runtime_flags: ["--ngl=33", "--top-p=0.9"]
runtime_flags: ["--threads", "8"]
```

Runtime flags also accept a single string:

```yaml
provider_opts:
runtime_flags: "--ngl=33 --top-p=0.9"
runtime_flags: "--threads 8"
```

## Parameter Mapping
Use only flags your Model Runner backend allows (see `docker model configure --help` and backend docs). **Do not** put sampling parameters (`temperature`, `top_p`, penalties) in `runtime_flags` — set them on the model (`temperature`, `top_p`, etc.); they are sent **per request** via the OpenAI-compatible chat API.

docker-agent model config fields map to llama.cpp flags automatically:
## Context Size

| Config | llama.cpp Flag |
| ------------------- | --------------------- |
| `temperature` | `--temp` |
| `top_p` | `--top-p` |
| `frequency_penalty` | `--frequency-penalty` |
| `presence_penalty` | `--presence-penalty` |
| `max_tokens` | `--context-size` |
`max_tokens` controls the **maximum output tokens** per chat completion request. To set the engine's **total context window**, use `provider_opts.context_size`:

`runtime_flags` always take priority over derived flags on conflict.
```yaml
models:
local:
provider: dmr
model: ai/qwen3
max_tokens: 4096 # max output tokens (per-request)
provider_opts:
context_size: 32768 # total context window (sent via _configure)
```

If `context_size` is omitted, Model Runner uses its default. `max_tokens` is **not** used as the context window.

## Thinking / Reasoning Budget

When using the **llama.cpp** backend, `thinking_budget` is sent as structured `llamacpp.reasoning-budget` on `_configure` (maps to `--reasoning-budget`). String effort levels use the same token mapping as other providers; `adaptive` maps to unlimited (`-1`).

When using the **vLLM** backend, `thinking_budget` is sent as `thinking_token_budget` in each chat completion request. Effort levels map to token counts using the same scale as other providers; `adaptive` maps to unlimited (`-1`).

```yaml
models:
local:
provider: dmr
model: ai/qwen3
thinking_budget: medium # llama.cpp: reasoning-budget=8192; vLLM: thinking_token_budget=8192
```

On **MLX** and **SGLang** backends, `thinking_budget` is silently ignored — those engines do not currently expose a per-request reasoning token budget knob.

## Speculative Decoding

Expand Down
25 changes: 15 additions & 10 deletions pkg/model/provider/dmr/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type Client struct {
client openai.Client
baseURL string
httpClient *http.Client
engine string
}

// NewClient creates a new DMR client from the provided configuration
Expand Down Expand Up @@ -103,18 +104,13 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt

clientOptions = append(clientOptions, option.WithBaseURL(baseURL), option.WithAPIKey("")) // DMR doesn't need auth

// Build runtime flags from ModelConfig and engine
contextSize, providerRuntimeFlags, specOpts := parseDMRProviderOpts(cfg)
configFlags := buildRuntimeFlagsFromModelConfig(engine, cfg)
finalFlags, warnings := mergeRuntimeFlagsPreferUser(configFlags, providerRuntimeFlags)
for _, w := range warnings {
slog.Warn(w)
}
slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", contextSize, "runtime_flags", finalFlags, "speculative_opts", specOpts, "engine", engine)
contextSize, runtimeFlags, specOpts, llamaCpp := parseDMRProviderOpts(engine, cfg)
backendCfg := buildConfigureBackendConfig(contextSize, runtimeFlags, specOpts, llamaCpp)
slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", derefInt64(contextSize), "runtime_flags", runtimeFlags, "speculative_opts", specOpts, "llamacpp", llamaCpp, "engine", engine)
// Skip model configuration when generating titles to avoid reconfiguring the model
// with different settings (e.g., smaller max_tokens) that would affect the main agent.
if !globalOptions.GeneratingTitle() {
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, contextSize, finalFlags, specOpts); err != nil {
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, backendCfg); err != nil {
slog.Debug("model configure via API skipped or failed", "error", err)
}
}
Expand All @@ -129,6 +125,7 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
client: openai.NewClient(clientOptions...),
baseURL: baseURL,
httpClient: httpClient,
engine: engine,
}, nil
}

Expand Down Expand Up @@ -214,6 +211,14 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
}
}

// For vLLM, apply engine-specific per-request fields (e.g. thinking_token_budget).
if c.engine == engineVLLM {
if fields := buildVLLMRequestFields(&c.ModelConfig); fields != nil {
params.SetExtraFields(fields)
slog.Debug("DMR vLLM extra request fields applied", "fields", fields)
}
}

// Log the request in JSON format for debugging
if requestJSON, err := json.Marshal(params); err == nil {
slog.Debug("DMR chat completion request", "request", string(requestJSON))
Expand All @@ -222,7 +227,7 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
}

if structuredOutput := c.ModelOptions.StructuredOutput(); structuredOutput != nil {
slog.Debug("Adding structured output to DMR request", "structured_output", structuredOutput)
slog.Debug("Adding structured output to DMR request", "name", structuredOutput.Name, "strict", structuredOutput.Strict)

params.ResponseFormat.OfJSONSchema = &openai.ResponseFormatJSONSchemaParam{
JSONSchema: openai.ResponseFormatJSONSchemaJSONSchemaParam{
Expand Down
Loading
Loading