Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions docs/providers/dmr/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,29 +64,49 @@ models:
model: ai/qwen3
max_tokens: 8192
provider_opts:
runtime_flags: ["--ngl=33", "--top-p=0.9"]
runtime_flags: ["--threads", "8"]
```

Runtime flags also accept a single string:

```yaml
provider_opts:
runtime_flags: "--ngl=33 --top-p=0.9"
runtime_flags: "--threads 8"
```

## Parameter Mapping
Use only flags your Model Runner backend allows (see `docker model configure --help` and backend docs). **Do not** put sampling parameters (`temperature`, `top_p`, penalties) in `runtime_flags` — set them on the model (`temperature`, `top_p`, etc.); they are sent **per request** via the OpenAI-compatible chat API.

docker-agent model config fields map to llama.cpp flags automatically:
## Context Size

| Config | llama.cpp Flag |
| ------------------- | --------------------- |
| `temperature` | `--temp` |
| `top_p` | `--top-p` |
| `frequency_penalty` | `--frequency-penalty` |
| `presence_penalty` | `--presence-penalty` |
| `max_tokens` | `--context-size` |
`max_tokens` controls the **maximum output tokens** per chat completion request. To set the engine's **total context window**, use `provider_opts.context_size`:

`runtime_flags` always take priority over derived flags on conflict.
```yaml
models:
local:
provider: dmr
model: ai/qwen3
max_tokens: 4096 # max output tokens (per-request)
provider_opts:
context_size: 32768 # total context window (sent via _configure)
```

If `context_size` is omitted, Model Runner uses its default. `max_tokens` is **not** used as the context window.

## Thinking / Reasoning Budget

When using the **llama.cpp** backend, `thinking_budget` is sent as structured `llamacpp.reasoning-budget` on `_configure` (maps to `--reasoning-budget`). String effort levels use the same token mapping as other providers; `adaptive` maps to unlimited (`-1`).

When using the **vLLM** backend, `thinking_budget` is sent as `thinking_token_budget` in each chat completion request. Effort levels map to token counts using the same scale as other providers; `adaptive` maps to unlimited (`-1`).

```yaml
models:
local:
provider: dmr
model: ai/qwen3
thinking_budget: medium # llama.cpp: reasoning-budget=8192; vLLM: thinking_token_budget=8192
```

On **MLX** and **SGLang** backends, `thinking_budget` is silently ignored — those engines do not currently expose a per-request reasoning token budget knob.

## Speculative Decoding

Expand Down
25 changes: 15 additions & 10 deletions pkg/model/provider/dmr/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type Client struct {
client openai.Client
baseURL string
httpClient *http.Client
engine string
}

// NewClient creates a new DMR client from the provided configuration
Expand Down Expand Up @@ -103,18 +104,13 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt

clientOptions = append(clientOptions, option.WithBaseURL(baseURL), option.WithAPIKey("")) // DMR doesn't need auth

// Build runtime flags from ModelConfig and engine
contextSize, providerRuntimeFlags, specOpts := parseDMRProviderOpts(cfg)
configFlags := buildRuntimeFlagsFromModelConfig(engine, cfg)
finalFlags, warnings := mergeRuntimeFlagsPreferUser(configFlags, providerRuntimeFlags)
for _, w := range warnings {
slog.Warn(w)
}
slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", contextSize, "runtime_flags", finalFlags, "speculative_opts", specOpts, "engine", engine)
contextSize, runtimeFlags, specOpts, llamaCpp := parseDMRProviderOpts(engine, cfg)
backendCfg := buildConfigureBackendConfig(contextSize, runtimeFlags, specOpts, llamaCpp)
slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", derefInt64(contextSize), "runtime_flags", runtimeFlags, "speculative_opts", specOpts, "llamacpp", llamaCpp, "engine", engine)
// Skip model configuration when generating titles to avoid reconfiguring the model
// with different settings (e.g., smaller max_tokens) that would affect the main agent.
if !globalOptions.GeneratingTitle() {
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, contextSize, finalFlags, specOpts); err != nil {
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, backendCfg); err != nil {
slog.Debug("model configure via API skipped or failed", "error", err)
}
}
Expand All @@ -129,6 +125,7 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
client: openai.NewClient(clientOptions...),
baseURL: baseURL,
httpClient: httpClient,
engine: engine,
}, nil
}

Expand Down Expand Up @@ -214,6 +211,14 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
}
}

// For vLLM, apply engine-specific per-request fields (e.g. thinking_token_budget).
if c.engine == engineVLLM {
if fields := buildVLLMRequestFields(&c.ModelConfig); fields != nil {
params.SetExtraFields(fields)
slog.Debug("DMR vLLM extra request fields applied", "fields", fields)
}
}

// Log the request in JSON format for debugging
if requestJSON, err := json.Marshal(params); err == nil {
slog.Debug("DMR chat completion request", "request", string(requestJSON))
Expand All @@ -222,7 +227,7 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
}

if structuredOutput := c.ModelOptions.StructuredOutput(); structuredOutput != nil {
slog.Debug("Adding structured output to DMR request", "structured_output", structuredOutput)
slog.Debug("Adding structured output to DMR request", "name", structuredOutput.Name, "strict", structuredOutput.Strict)

params.ResponseFormat.OfJSONSchema = &openai.ResponseFormatJSONSchemaParam{
JSONSchema: openai.ResponseFormatJSONSchemaJSONSchemaParam{
Expand Down
Loading
Loading