MoonshotAI · wbxl2000 · May 20, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/docs/en/configuration/env-vars.md b/docs/en/configuration/env-vars.md
@@ -17,7 +17,8 @@ The following environment variables take effect when using `kimi` type providers
 | `KIMI_MODEL_CAPABILITIES` | Model capabilities, comma-separated (e.g., `thinking,image_in`) |
 | `KIMI_MODEL_TEMPERATURE` | Generation parameter `temperature` |
 | `KIMI_MODEL_TOP_P` | Generation parameter `top_p` |
-| `KIMI_MODEL_MAX_TOKENS` | Generation parameter `max_tokens` |
+| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | Generation parameter `max_completion_tokens` |
+| `KIMI_MODEL_MAX_TOKENS` | Compatibility alias for `KIMI_MODEL_MAX_COMPLETION_TOKENS` |
 | `KIMI_MODEL_THINKING_KEEP` | Moonshot `thinking.keep` switch for preserved thinking (only applied when thinking mode is active) |
 
 ### `KIMI_BASE_URL`
@@ -76,14 +77,17 @@ Sets the generation parameter `top_p` (nucleus sampling), controlling output div
 export KIMI_MODEL_TOP_P="0.9"
 ```
 
-### `KIMI_MODEL_MAX_TOKENS`
+### `KIMI_MODEL_MAX_COMPLETION_TOKENS`
 
-Sets the generation parameter `max_tokens`, limiting the maximum tokens per response.
+Sets the generation parameter `max_completion_tokens`, limiting the maximum number of tokens generated per response.
 
 ```sh
-export KIMI_MODEL_MAX_TOKENS="4096"
+export KIMI_MODEL_MAX_COMPLETION_TOKENS="4096"
 ```
 
+`KIMI_MODEL_MAX_TOKENS` is still accepted. If both variables are set,
+`KIMI_MODEL_MAX_COMPLETION_TOKENS` takes precedence.
+
 ### `KIMI_MODEL_THINKING_KEEP`
 
 Forwards the value verbatim to the Moonshot API as `thinking.keep`, enabling Preserved Thinking (see the [Moonshot docs](https://platform.kimi.com/docs/guide/use-kimi-k2-thinking-model#preserved-thinking)). Setting it to `all` causes the provider to preserve the reasoning content of previous assistant turns across requests. The value is passed through unchanged, no validation or case normalization is performed.

diff --git a/docs/zh/configuration/env-vars.md b/docs/zh/configuration/env-vars.md
@@ -17,7 +17,8 @@ Kimi Code CLI 支持通过环境变量覆盖配置或控制运行行为。本页
 | `KIMI_MODEL_CAPABILITIES` | 模型能力，逗号分隔（如 `thinking,image_in`） |
 | `KIMI_MODEL_TEMPERATURE` | 生成参数 `temperature` |
 | `KIMI_MODEL_TOP_P` | 生成参数 `top_p` |
-| `KIMI_MODEL_MAX_TOKENS` | 生成参数 `max_tokens` |
+| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | 生成参数 `max_completion_tokens` |
+| `KIMI_MODEL_MAX_TOKENS` | `KIMI_MODEL_MAX_COMPLETION_TOKENS` 的兼容别名 |
 | `KIMI_MODEL_THINKING_KEEP` | Moonshot `thinking.keep` 开关（Preserved Thinking），仅在 Thinking 模式下生效 |
 
 ### `KIMI_BASE_URL`
@@ -76,14 +77,17 @@ export KIMI_MODEL_TEMPERATURE="0.7"
 export KIMI_MODEL_TOP_P="0.9"
 ```
 
-### `KIMI_MODEL_MAX_TOKENS`
+### `KIMI_MODEL_MAX_COMPLETION_TOKENS`
 
-设置生成参数 `max_tokens`，限制单次回复的最大 token 数。
+设置生成参数 `max_completion_tokens`，限制单次回复的最大 token 数。
 
 ```sh
-export KIMI_MODEL_MAX_TOKENS="4096"
+export KIMI_MODEL_MAX_COMPLETION_TOKENS="4096"
 ```
 
+`KIMI_MODEL_MAX_TOKENS` 仍可使用；如果两个环境变量都设置，优先使用
+`KIMI_MODEL_MAX_COMPLETION_TOKENS`。
+
 ### `KIMI_MODEL_THINKING_KEEP`
 
 将 env 值原样作为 `thinking.keep` 字段发送给 Moonshot API，用于开启 Preserved Thinking（参考 [Moonshot 官方文档](https://platform.kimi.com/docs/guide/use-kimi-k2-thinking-model#preserved-thinking)）。设为 `all` 可让模型在多轮之间保留历史 reasoning_content。值不做任何校验、不做大小写归一化，透传给 API 自己判断。
@@ -189,4 +193,3 @@ export KIMI_CLI_PASTE_LINE_THRESHOLD="2"
 
 注意：两个阈值的判断逻辑是"满足任一即折叠"（字符数 **或** 行数），因此只需调低行数阈值即可。不建议将字符数阈值设为很小的值（如 `1`），否则所有非空粘贴（包括单行短文本）都会被折叠。
 :::
-
diff --git a/packages/kosong/src/kosong/__init__.py b/packages/kosong/src/kosong/__init__.py
@@ -70,8 +70,9 @@ async def main() -> None:
 """
 
 import asyncio
-from collections.abc import Callable, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from dataclasses import dataclass
+from typing import Any
 
 from loguru import logger
 
@@ -109,6 +110,7 @@ async def step(
     *,
     on_message_part: Callback[[StreamedMessagePart], None] | None = None,
     on_tool_result: Callable[[ToolResult], None] | None = None,
+    generation_overrides: Mapping[str, Any] | None = None,
 ) -> "StepResult":
     """
     Run one agent "step". In one step, the function generates LLM response based on the given
@@ -121,6 +123,15 @@ async def step(
 
     The token usage will be returned in the `StepResult` if available.
 
+    Args:
+        chat_provider: The chat provider to use for generation.
+        system_prompt: The system prompt forwarded to the chat provider.
+        toolset: The toolset that handles tool calls and exposes available tools.
+        history: The message history forwarded to the chat provider.
+        on_message_part: Optional callback fired for each streamed message part.
+        on_tool_result: Optional callback fired when an individual tool result resolves.
+        generation_overrides: Optional per-call overrides forwarded to ``chat_provider.generate``.
+
     Raises:
         APIConnectionError: If the API connection fails.
         APITimeoutError: If the API request times out.
@@ -162,6 +173,7 @@ async def on_tool_call(tool_call: ToolCall):
             history,
             on_message_part=on_message_part,
             on_tool_call=on_tool_call,
+            generation_overrides=generation_overrides,
         )
     except (ChatProviderError, asyncio.CancelledError):
         # cancel all the futures to avoid hanging tasks

diff --git a/packages/kosong/src/kosong/__main__.py b/packages/kosong/src/kosong/__main__.py
@@ -123,7 +123,12 @@ async def main():
             assert api_key is not None, "Expect KIMI_API_KEY environment variable"
             model = model or "kimi-k2-turbo-preview"
 
-            chat_provider = Kimi(base_url=base_url, api_key=api_key, model=model)
+            # ``Kimi.generate`` no longer carries a built-in completion cap, so set a
+            # conservative default here to keep this developer-facing demo from running
+            # unbounded generations.
+            chat_provider = Kimi(
+                base_url=base_url, api_key=api_key, model=model
+            ).with_generation_kwargs(max_completion_tokens=8192)
         case "openai":
             from kosong.contrib.chat_provider.openai_responses import OpenAIResponses
 

diff --git a/packages/kosong/src/kosong/_generate.py b/packages/kosong/src/kosong/_generate.py
@@ -1,5 +1,6 @@
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
+from typing import Any
 
 from loguru import logger
 
@@ -22,6 +23,7 @@ async def generate(
     *,
     on_message_part: Callback[[StreamedMessagePart], None] | None = None,
     on_tool_call: Callback[[ToolCall], None] | None = None,
+    generation_overrides: Mapping[str, Any] | None = None,
 ) -> "GenerateResult":
     """
     Generate one message based on the given context.
@@ -34,6 +36,7 @@ async def generate(
         history: The message history to use for generation.
         on_message_part: An optional callback to be called for each raw message part.
         on_tool_call: An optional callback to be called for each complete tool call.
+        generation_overrides: Optional per-call overrides forwarded to ``chat_provider.generate``.
 
     Returns:
         A tuple of the generated message and the token usage (if available).
@@ -50,7 +53,9 @@ async def generate(
     pending_part: StreamedMessagePart | None = None  # message part that is currently incomplete
 
     logger.trace("Generating with history: {history}", history=history)
-    stream = await chat_provider.generate(system_prompt, tools, history)
+    stream = await chat_provider.generate(
+        system_prompt, tools, history, generation_overrides=generation_overrides
+    )
     async for part in stream:
         logger.trace("Received part: {part}", part=part)
         if on_message_part:

diff --git a/packages/kosong/src/kosong/chat_provider/__init__.py b/packages/kosong/src/kosong/chat_provider/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from collections.abc import AsyncIterator, Sequence
-from typing import TYPE_CHECKING, Literal, Protocol, Self, runtime_checkable
+from collections.abc import AsyncIterator, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Literal, Protocol, Self, runtime_checkable
 
 from pydantic import BaseModel
 
@@ -40,10 +40,21 @@ async def generate(
         system_prompt: str,
         tools: Sequence[Tool],
         history: Sequence[Message],
+        *,
+        generation_overrides: Mapping[str, Any] | None = None,
     ) -> StreamedMessage:
         """
         Generate a new message based on the given system prompt, tools, and history.
 
+        Args:
+            system_prompt: The system prompt to use for generation.
+            tools: The tools available for the model to call.
+            history: The message history to use for generation.
+            generation_overrides: Optional per-call overrides merged on top of the provider's
+                generation kwargs without mutating provider state. Treat the mapping as
+                read-only and request-scoped — implementations must not alias it into any
+                long-lived state.
+
         Raises:
             APIConnectionError: If the API connection fails.
             APITimeoutError: If the API request times out.

diff --git a/packages/kosong/src/kosong/chat_provider/chaos.py b/packages/kosong/src/kosong/chat_provider/chaos.py
@@ -1,7 +1,7 @@
 import json
 import os
 import random
-from collections.abc import AsyncIterator, Sequence
+from collections.abc import AsyncIterator, Mapping, Sequence
 from typing import TYPE_CHECKING, Any
 
 import httpx
@@ -115,8 +115,12 @@ async def generate(
         system_prompt: str,
         tools: Sequence[Tool],
         history: Sequence[Message],
+        *,
+        generation_overrides: Mapping[str, Any] | None = None,
     ) -> "ChaosStreamedMessage":
-        base_stream = await self._provider.generate(system_prompt, tools, history)
+        base_stream = await self._provider.generate(
+            system_prompt, tools, history, generation_overrides=generation_overrides
+        )
         return ChaosStreamedMessage(base_stream, self._chaos_config)
 
     def _monkey_patch_client(self):

diff --git a/packages/kosong/src/kosong/chat_provider/echo/echo.py b/packages/kosong/src/kosong/chat_provider/echo/echo.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
 import copy
-from collections.abc import AsyncIterator, Sequence
-from typing import TYPE_CHECKING, Self
+from collections.abc import AsyncIterator, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Self
 
 from kosong.chat_provider import (
     ChatProvider,
@@ -72,7 +72,10 @@ async def generate(
         system_prompt: str,
         tools: Sequence[Tool],
         history: Sequence[Message],
+        *,
+        generation_overrides: Mapping[str, Any] | None = None,
     ) -> EchoStreamedMessage:
+        del generation_overrides  # echo provider has no API to forward overrides to
         if not history:
             raise ChatProviderError("EchoChatProvider requires at least one message in history.")
         if history[-1].role != "user":

diff --git a/packages/kosong/src/kosong/chat_provider/echo/scripted_echo.py b/packages/kosong/src/kosong/chat_provider/echo/scripted_echo.py
@@ -3,8 +3,8 @@
 import copy
 import json
 from collections import deque
-from collections.abc import AsyncIterator, Iterable, Sequence
-from typing import TYPE_CHECKING, Self
+from collections.abc import AsyncIterator, Iterable, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Self
 
 from kosong.chat_provider import (
     ChatProvider,
@@ -49,7 +49,10 @@ async def generate(
         system_prompt: str,
         tools: Sequence[Tool],
         history: Sequence[Message],
+        *,
+        generation_overrides: Mapping[str, Any] | None = None,
     ) -> ScriptedEchoStreamedMessage:
+        del generation_overrides  # scripted echo replays canned scripts; overrides not applicable
         if not self._scripts:
             raise ChatProviderError(f"ScriptedEchoChatProvider exhausted at turn {self._turn + 1}.")
         script_text = self._scripts.popleft()

diff --git a/packages/kosong/src/kosong/chat_provider/kimi.py b/packages/kosong/src/kosong/chat_provider/kimi.py
@@ -2,7 +2,7 @@
 import mimetypes
 import os
 import uuid
-from collections.abc import AsyncIterator, Sequence
+from collections.abc import AsyncIterator, Mapping, Sequence
 from typing import TYPE_CHECKING, Any, Literal, Self, Unpack, cast
 
 import httpx
@@ -85,7 +85,9 @@ class GenerationKwargs(TypedDict, total=False):
         See https://platform.moonshot.ai/docs/api/chat#request-body.
         """
 
+        max_completion_tokens: int | None
         max_tokens: int | None
+        """Deprecated alias. Normalized to ``max_completion_tokens`` before requests."""
         temperature: float | None
         top_p: float | None
         n: int | None
@@ -154,17 +156,21 @@ async def generate(
         system_prompt: str,
         tools: Sequence[Tool],
         history: Sequence[Message],
+        *,
+        generation_overrides: Mapping[str, Any] | None = None,
     ) -> "KimiStreamedMessage":
         messages: list[ChatCompletionMessageParam] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
         messages.extend(_convert_message(message) for message in history)
 
-        generation_kwargs: dict[str, Any] = {
-            # default kimi generation kwargs
-            "max_tokens": 32000,
-        }
-        generation_kwargs.update(self._generation_kwargs)
+        generation_kwargs: dict[str, Any] = dict(self._generation_kwargs)
+        if generation_overrides:
+            generation_kwargs.update(
+                _normalize_generation_kwargs(
+                    cast(Kimi.GenerationKwargs, dict(generation_overrides))
+                )
+            )
 
         try:
             response = await self.client.chat.completions.create(
@@ -221,7 +227,7 @@ def with_generation_kwargs(self, **kwargs: Unpack[GenerationKwargs]) -> Self:
         """
         new_self = copy.copy(self)
         new_self._generation_kwargs = copy.deepcopy(self._generation_kwargs)
-        new_self._generation_kwargs.update(kwargs)
+        new_self._generation_kwargs.update(_normalize_generation_kwargs(kwargs))
         return new_self
 
     def with_extra_body(self, extra_body: ExtraBody) -> Self:
@@ -304,6 +310,14 @@ def _guess_filename(mime_type: str) -> str:
     return f"upload{extension}"
 
 
+def _normalize_generation_kwargs(kwargs: Kimi.GenerationKwargs) -> Kimi.GenerationKwargs:
+    normalized: dict[str, Any] = dict(kwargs)  # shallow copy; the caller's mapping is not mutated
+    max_tokens = normalized.pop("max_tokens", None)  # deprecated alias is always removed
+    if max_tokens is not None and "max_completion_tokens" not in normalized:  # explicit key wins
+        normalized["max_completion_tokens"] = max_tokens  # carry the alias value to the new key
+    return cast(Kimi.GenerationKwargs, normalized)
+
+
 def _convert_message(message: Message) -> ChatCompletionMessageParam:
     message = message.model_copy(deep=True)
     reasoning_content: str = ""
@@ -507,7 +521,7 @@ async def _dev_main():
         ]
         stream = await chat.with_generation_kwargs(
             temperature=0,
-            max_tokens=1000,
+            max_completion_tokens=1000,
         ).generate(system_prompt, [], history)
         async for part in stream:
             print(part.model_dump(exclude_none=True))

diff --git a/packages/kosong/src/kosong/chat_provider/mock.py b/packages/kosong/src/kosong/chat_provider/mock.py
@@ -1,6 +1,6 @@
 import copy
-from collections.abc import AsyncIterator, Sequence
-from typing import TYPE_CHECKING, Self
+from collections.abc import AsyncIterator, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Self
 
 from kosong.chat_provider import (
     ChatProvider,
@@ -45,8 +45,11 @@ async def generate(
         system_prompt: str,
         tools: Sequence[Tool],
         history: Sequence[Message],
+        *,
+        generation_overrides: Mapping[str, Any] | None = None,
     ) -> "MockStreamedMessage":
         """Always return the predefined message parts."""
+        del generation_overrides  # mock provider ignores per-call overrides
         return MockStreamedMessage(self._message_parts)
 
     def with_thinking(self, effort: ThinkingEffort) -> Self:

diff --git a/packages/kosong/src/kosong/contrib/chat_provider/anthropic.py b/packages/kosong/src/kosong/contrib/chat_provider/anthropic.py
@@ -281,6 +281,8 @@ async def generate(
         system_prompt: str,
         tools: Sequence[Tool],
         history: Sequence[Message],
+        *,
+        generation_overrides: Mapping[str, Any] | None = None,
     ) -> "AnthropicStreamedMessage":
         # https://docs.claude.com/en/api/messages#body-messages
         # Anthropic API does not support system roles, but just a system prompt.
@@ -341,6 +343,8 @@ async def generate(
                         pass
         generation_kwargs: dict[str, Any] = {}
         generation_kwargs.update(self._generation_kwargs)
+        if generation_overrides:
+            generation_kwargs.update(generation_overrides)
         betas = generation_kwargs.pop("beta_features", [])
         extra_headers = {
             **{"anthropic-beta": ",".join(str(e) for e in betas)},