diff --git a/packages/kosong/src/kosong/contrib/chat_provider/openai_legacy.py b/packages/kosong/src/kosong/contrib/chat_provider/openai_legacy.py index ef736360b..1ba804001 100644 --- a/packages/kosong/src/kosong/contrib/chat_provider/openai_legacy.py +++ b/packages/kosong/src/kosong/contrib/chat_provider/openai_legacy.py @@ -30,7 +30,15 @@ tool_to_openai, ) from kosong.contrib.chat_provider.common import ToolMessageConversion -from kosong.message import ContentPart, Message, TextPart, ThinkPart, ToolCall, ToolCallPart +from kosong.message import ( + ContentPart, + ImageURLPart, + Message, + TextPart, + ThinkPart, + ToolCall, + ToolCallPart, +) from kosong.tooling import Tool if TYPE_CHECKING: @@ -40,6 +48,11 @@ def type_check(openai_legacy: "OpenAILegacy"): _: RetryableChatProvider = openai_legacy +# ContentPart subclasses that can be filtered by supported_content_types +# Note: ToolCall and ToolCallPart are NOT ContentPart subclasses, they are handled separately +_CONTENT_PART_TYPES: set[type] = {TextPart, ImageURLPart} + + class OpenAILegacy: """ A chat provider that uses the OpenAI Chat Completions API. @@ -77,6 +90,7 @@ def __init__( stream: bool = True, reasoning_key: str | None = None, tool_message_conversion: ToolMessageConversion | None = None, + supported_content_types: set[type] | None = None, **client_kwargs: Any, ): """ @@ -84,6 +98,12 @@ def __init__( To support OpenAI-compatible APIs that inject reasoning content in a extra field in the message, such as `{"reasoning": ...}`, `reasoning_key` can be set to the key name. + + Args: + supported_content_types: Set of ContentPart subclass types that the API supports. + If None, defaults to {TextPart, ImageURLPart}. + Only ContentPart subclasses (TextPart, ImageURLPart, etc.) should be included. + ToolCall and ToolCallPart are handled separately by message serialization. """ self.model = model self.stream = stream @@ -100,6 +120,13 @@ def __init__( self._reasoning_key = reasoning_key self._tool_message_conversion: ToolMessageConversion | None = tool_message_conversion self._generation_kwargs: OpenAILegacy.GenerationKwargs = {} + # Use explicit None check instead of `or` to handle empty set correctly. + # An empty set is falsy in Python, which would incorrectly trigger the default. + self._supported_content_types: set[type] = ( + supported_content_types + if supported_content_types is not None + else {TextPart, ImageURLPart} + ) @property def model_name(self) -> str: @@ -139,10 +166,18 @@ async def generate( reasoning_effort = "medium" try: + # Build tools parameter: omit if empty to avoid API errors. + # Some APIs (DashScope, Xunfei) reject empty tools array with errors like: + # - "[] is too short - 'tools'" (DashScope) + # - "EngineInternalError: Bad Request" (Xunfei) + # See: https://github.com/MoonshotAI/kimi-cli/issues/1344 + tools_param = ( + [tool_to_openai(tool) for tool in tools] if tools else omit + ) response = await self.client.chat.completions.create( model=self.model, messages=messages, - tools=(tool_to_openai(tool) for tool in tools), + tools=tools_param, stream=self.stream, stream_options={"include_usage": True} if self.stream else omit, reasoning_effort=reasoning_effort, @@ -193,7 +228,12 @@ def model_parameters(self) -> dict[str, Any]: return model_parameters def _convert_message(self, message: Message) -> ChatCompletionMessageParam: - """Convert a Kosong message to OpenAI message.""" + """Convert a Kosong message to OpenAI message. + + Filters out content parts that are not supported by the API (e.g., VideoURLPart, AudioURLPart). + This prevents API errors when sending messages containing multimedia content to APIs that + don't support them. See: https://github.com/MoonshotAI/kimi-cli/issues/796 + """ # Note: for openai, `developer` role is more standard, but `system` is still accepted. # And many openai-compatible models do not accept `developer` role. # So we use `system` role here. OpenAIResponses will use `developer` role. @@ -204,8 +244,10 @@ def _convert_message(self, message: Message) -> ChatCompletionMessageParam: for part in message.content: if isinstance(part, ThinkPart): reasoning_content += part.think - else: + elif type(part) in self._supported_content_types: + # Only include content types supported by this API content.append(part) + # else: filter out unsupported content types (e.g., VideoURLPart, AudioURLPart) # if tool message and `tool_result_conversion` is `extract_text`, patch all text parts into # one so that we can make use of the serialization process of `Message` to output string if message.role == "tool" and self._tool_message_conversion == "extract_text": diff --git a/packages/kosong/tests/api_snapshot_tests/test_openai_legacy.py b/packages/kosong/tests/api_snapshot_tests/test_openai_legacy.py index 2e2020d79..c5321d1da 100644 --- a/packages/kosong/tests/api_snapshot_tests/test_openai_legacy.py +++ b/packages/kosong/tests/api_snapshot_tests/test_openai_legacy.py @@ -27,26 +27,20 @@ async def test_openai_legacy_message_conversion(): "messages": [ {"role": "system", "content": "You are helpful."}, {"role": "user", "content": "Hello!"}, - ], - "tools": [], - }, + ]}, "multi_turn_conversation": { "messages": [ {"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "2+2 equals 4."}, {"role": "user", "content": "And 3+3?"}, - ], - "tools": [], - }, + ]}, "multi_turn_with_system": { "messages": [ {"role": "system", "content": "You are a math tutor."}, {"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "2+2 equals 4."}, {"role": "user", "content": "And 3+3?"}, - ], - "tools": [], - }, + ]}, "image_url": { "messages": [ { @@ -62,9 +56,7 @@ async def test_openai_legacy_message_conversion(): }, ], } - ], - "tools": [], - }, + ]}, "tool_definition": { "messages": [{"role": "user", "content": "Add 2 and 3"}], "tools": [ @@ -134,9 +126,7 @@ async def test_openai_legacy_message_conversion(): ], "tool_call_id": "call_abc123", }, - ], - "tools": [], - }, + ]}, "tool_call": { "messages": [ {"role": "user", "content": "Add 2 and 3"}, @@ -152,9 +142,7 @@ async def test_openai_legacy_message_conversion(): ], }, {"role": "tool", "content": "5", "tool_call_id": "call_abc123"}, - ], - "tools": [], - }, + ]}, "parallel_tool_calls": { "messages": [ {"role": "user", "content": "Calculate 2+3 and 4*5"}, diff --git a/src/kimi_cli/config.py b/src/kimi_cli/config.py index 78019555a..d2bd9a032 100644 --- a/src/kimi_cli/config.py +++ b/src/kimi_cli/config.py @@ -47,6 +47,17 @@ class LLMProvider(BaseModel): """Custom headers to include in API requests""" oauth: OAuthRef | None = None """OAuth credential reference (do not store tokens here).""" + reasoning_key: str | None = None + """Key name for reasoning/thinking content in API response (e.g., 'reasoning_content'). + Used by openai_legacy provider to extract thinking content from OpenAI-compatible APIs. + See: https://github.com/MoonshotAI/kimi-cli/issues/1155""" + supported_content_types: list[str] | None = None + """List of supported content part types for openai_legacy provider. + Valid values: 'text', 'image_url'. Invalid values will raise ValueError at startup. + Note: 'text' is ALWAYS included to ensure text content is never dropped. + If not specified, auto-detects from model capabilities (image_in). + 'video_url' and 'audio_url' are filtered out as most APIs don't support them. + See: https://github.com/MoonshotAI/kimi-cli/issues/796""" @field_serializer("api_key", when_used="json") def dump_secret(self, v: SecretStr): diff --git a/src/kimi_cli/llm.py b/src/kimi_cli/llm.py index 4e9801965..282b9e757 100644 --- a/src/kimi_cli/llm.py +++ b/src/kimi_cli/llm.py @@ -147,12 +147,42 @@ def create_llm( chat_provider = chat_provider.with_generation_kwargs(**gen_kwargs) case "openai_legacy": from kosong.contrib.chat_provider.openai_legacy import OpenAILegacy + from kosong.message import ImageURLPart, TextPart + + # Build supported content types based on config or model capabilities. + # TextPart is ALWAYS included to ensure text content is never dropped. + # Note: Only ContentPart subclasses (TextPart, ImageURLPart) should be here. + # ToolCall and ToolCallPart are handled separately by message serialization. + + # Valid content type names that users can configure + VALID_CONTENT_TYPES = {"text", "image_url"} + + if provider.supported_content_types is not None: + # Validate configured values + unknown_types = set(provider.supported_content_types) - VALID_CONTENT_TYPES + if unknown_types: + raise ValueError( + f"Invalid supported_content_types in provider config: {unknown_types}. " + f"Valid values are: {VALID_CONTENT_TYPES}" + ) + # Build supported types from user config, always including TextPart + supported_types: set[type] = {TextPart} + if "image_url" in provider.supported_content_types: + supported_types.add(ImageURLPart) + else: + # Auto-detect from model capabilities + model_caps = derive_model_capabilities(model) + supported_types = {TextPart} + if "image_in" in model_caps: + supported_types.add(ImageURLPart) chat_provider = OpenAILegacy( model=model.model, base_url=provider.base_url, api_key=resolved_api_key, default_headers=dict(provider.custom_headers) if provider.custom_headers else None, + reasoning_key=provider.reasoning_key, + supported_content_types=supported_types, ) case "openai_responses": from kosong.contrib.chat_provider.openai_responses import OpenAIResponses