diff --git a/openevolve/config.py b/openevolve/config.py
index bef193da21..5681298163 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -83,6 +83,12 @@ class LLMModelConfig:
     manual_mode: Optional[bool] = None
     _manual_queue_dir: Optional[str] = None
 
+    # OpenAI API mode
+    # - "chat_completions": use client.chat.completions.create(...)
+    # - "responses": use client.responses.create(...)
+    # - "auto": choose based on endpoint compatibility
+    api_mode: Optional[str] = None
+
     def __post_init__(self):
         """Post-initialization to resolve ${VAR} env var references in api_key"""
         self.api_key = _resolve_env_var(self.api_key)
@@ -124,6 +130,9 @@ class LLMConfig(LLMModelConfig):
     # Manual mode switch
     manual_mode: bool = False
 
+    # OpenAI API mode (inherited by per-model config when unset there)
+    api_mode: str = "chat_completions"
+
     def __post_init__(self):
         """Post-initialization to set up model configurations"""
         super().__post_init__()  # Resolve ${VAR} in api_key at LLMConfig level
@@ -179,6 +188,7 @@ def __post_init__(self):
             "random_seed": self.random_seed,
             "reasoning_effort": self.reasoning_effort,
             "manual_mode": self.manual_mode,
+            "api_mode": self.api_mode,
         }
 
         self.update_model_params(shared_config)
@@ -232,6 +242,7 @@ def rebuild_models(self) -> None:
             "retry_delay": self.retry_delay,
             "random_seed": self.random_seed,
             "reasoning_effort": self.reasoning_effort,
+            "api_mode": self.api_mode,
         }
 
         self.update_model_params(shared_config)
diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py
index 7477e5b349..737662df31 100644
--- a/openevolve/llm/openai.py
+++ b/openevolve/llm/openai.py
@@ -63,6 +63,7 @@ def __init__(
         self.api_key = model_cfg.api_key
         self.random_seed = getattr(model_cfg, "random_seed", None)
         self.reasoning_effort = getattr(model_cfg, "reasoning_effort", None)
+        self.api_mode = str(getattr(model_cfg, "api_mode", "chat_completions") or "chat_completions")
 
         # Manual mode: enabled via llm.manual_mode in config.yaml
         self.manual_mode = (getattr(model_cfg, "manual_mode", False) is True)
@@ -97,26 +98,28 @@ def __init__(
         logger.info(f"Initialized OpenAI LLM with model: {self.model}")
         logger._initialized_models.add(self.model)
 
-    async def generate(self, prompt: str, **kwargs) -> str:
-        """Generate text from a prompt"""
-        return await self.generate_with_context(
-            system_message=self.system_message,
-            messages=[{"role": "user", "content": prompt}],
-            **kwargs,
-        )
+    def _resolve_api_mode(self) -> str:
+        """Resolve effective API mode for this request."""
+        mode = str(self.api_mode).lower()
+        valid_modes = {"chat_completions", "responses", "auto"}
+        if mode not in valid_modes:
+            raise ValueError(
+                f"Invalid api_mode '{self.api_mode}'. Expected one of: {sorted(valid_modes)}"
+            )
 
-    async def generate_with_context(
-        self, system_message: str, messages: List[Dict[str, str]], **kwargs
-    ) -> str:
-        """Generate text using a system message and conversational context"""
-        # Prepare messages with system message
-        formatted_messages = [{"role": "system", "content": system_message}]
-        formatted_messages.extend(messages)
+        if mode != "auto":
+            return mode
+
+        base = (self.api_base or "").lower().rstrip("/")
+        # Conservative auto mode: prefer Responses API only for official OpenAI endpoints.
+        if base in {"https://api.openai.com/v1", "https://api.openai.com"}:
+            return "responses"
+        return "chat_completions"
 
-        # Set up generation parameters
-        # Define OpenAI reasoning models that require max_completion_tokens
-        # These models don't support temperature/top_p and use different parameters
-        OPENAI_REASONING_MODEL_PREFIXES = (
+    @staticmethod
+    def _is_openai_reasoning_model(model_name: str) -> bool:
+        """Check whether model follows OpenAI reasoning model naming patterns."""
+        openai_reasoning_model_prefixes = (
             # O-series reasoning models
             "o1-", "o1",  # o1, o1-mini, o1-preview
@@ -126,43 +129,128 @@ async def generate_with_context
             # GPT-5 series are also reasoning models
             "gpt-5-", "gpt-5",  # gpt-5, gpt-5-mini, gpt-5-nano
-            # The GPT OSS series are also reasoning models
+            # GPT OSS series
             "gpt-oss-120b", "gpt-oss-20b",
         )
+        return str(model_name).lower().startswith(openai_reasoning_model_prefixes)
 
-        # Check if this is an OpenAI reasoning model based on model name pattern
-        # This works for all endpoints (OpenAI, Azure, OptiLLM, OpenRouter, etc.)
-        model_lower = str(self.model).lower()
-        is_openai_reasoning_model = model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
+    def _build_chat_completion_params(
+        self, formatted_messages: List[Dict[str, str]], **kwargs
+    ) -> Dict[str, Any]:
+        """Build request params for Chat Completions API."""
+        is_openai_reasoning_model = self._is_openai_reasoning_model(self.model)
 
         if is_openai_reasoning_model:
-            # For OpenAI reasoning models
             params = {
                 "model": self.model,
                 "messages": formatted_messages,
                 "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens),
             }
-            # Add optional reasoning parameters if provided
             reasoning_effort = kwargs.get("reasoning_effort", self.reasoning_effort)
             if reasoning_effort is not None:
                 params["reasoning_effort"] = reasoning_effort
             if "verbosity" in kwargs:
                 params["verbosity"] = kwargs["verbosity"]
-        else:
-            # Standard parameters for all other models
-            params = {
-                "model": self.model,
-                "messages": formatted_messages,
-                "temperature": kwargs.get("temperature", self.temperature),
-                "top_p": kwargs.get("top_p", self.top_p),
-                "max_tokens": kwargs.get("max_tokens", self.max_tokens),
-            }
+            return params
+
+        params = {
+            "model": self.model,
+            "messages": formatted_messages,
+            "temperature": kwargs.get("temperature", self.temperature),
+            "top_p": kwargs.get("top_p", self.top_p),
+            "max_tokens": kwargs.get("max_tokens", self.max_tokens),
+        }
+        reasoning_effort = kwargs.get("reasoning_effort", self.reasoning_effort)
+        if reasoning_effort is not None:
+            params["reasoning_effort"] = reasoning_effort
+        return params
+
+    def _build_responses_params(
+        self, formatted_messages: List[Dict[str, str]], **kwargs
+    ) -> Dict[str, Any]:
+        """Build request params for Responses API."""
+        # Responses API accepts role/content messages via the input field.
+        input_messages = [
+            {"role": m.get("role", "user"), "content": m.get("content", "")}
+            for m in formatted_messages
+        ]
+
+        params = {
+            "model": self.model,
+            "input": input_messages,
+            "max_output_tokens": kwargs.get("max_tokens", self.max_tokens),
+        }
 
-        # Handle reasoning_effort for open source reasoning models.
-        reasoning_effort = kwargs.get("reasoning_effort", self.reasoning_effort)
-        if reasoning_effort is not None:
-            params["reasoning_effort"] = reasoning_effort
+        is_openai_reasoning_model = self._is_openai_reasoning_model(self.model)
+        if not is_openai_reasoning_model:
+            params["temperature"] = kwargs.get("temperature", self.temperature)
+            params["top_p"] = kwargs.get("top_p", self.top_p)
+
+        reasoning_effort = kwargs.get("reasoning_effort", self.reasoning_effort)
+        if reasoning_effort is not None:
+            params["reasoning"] = {"effort": reasoning_effort}
+
+        if "verbosity" in kwargs:
+            params["verbosity"] = kwargs["verbosity"]
+
+        return params
+
+    @staticmethod
+    def _extract_responses_text(response: Any) -> str:
+        """Extract text content from OpenAI Responses API result."""
+        output_text = getattr(response, "output_text", None)
+        if isinstance(output_text, str) and output_text:
+            return output_text
+
+        # Fallback for SDK objects and dict-like responses.
+        output_items = getattr(response, "output", None)
+        if output_items is None and isinstance(response, dict):
+            output_items = response.get("output")
+
+        text_chunks: List[str] = []
+        for item in output_items or []:
+            content_list = getattr(item, "content", None)
+            if content_list is None and isinstance(item, dict):
+                content_list = item.get("content")
+
+            for content in content_list or []:
+                content_type = getattr(content, "type", None)
+                if content_type is None and isinstance(content, dict):
+                    content_type = content.get("type")
+
+                if content_type in {"output_text", "text"}:
+                    text_value = getattr(content, "text", None)
+                    if text_value is None and isinstance(content, dict):
+                        text_value = content.get("text")
+                    if isinstance(text_value, str) and text_value:
+                        text_chunks.append(text_value)
+
+        if text_chunks:
+            return "\n".join(text_chunks)
+
+        raise ValueError("Responses API returned no text output")
+
+    async def generate(self, prompt: str, **kwargs) -> str:
+        """Generate text from a prompt"""
+        return await self.generate_with_context(
+            system_message=self.system_message,
+            messages=[{"role": "user", "content": prompt}],
+            **kwargs,
+        )
+
+    async def generate_with_context(
+        self, system_message: str, messages: List[Dict[str, str]], **kwargs
+    ) -> str:
+        """Generate text using a system message and conversational context"""
+        # Prepare messages with system message
+        formatted_messages = [{"role": "system", "content": system_message}]
+        formatted_messages.extend(messages)
+        effective_api_mode = self._resolve_api_mode()
+        if effective_api_mode == "responses":
+            params = self._build_responses_params(formatted_messages, **kwargs)
+        else:
+            params = self._build_chat_completion_params(formatted_messages, **kwargs)
 
         # Add seed parameter for reproducibility if configured
         # Skip seed parameter for Google AI Studio endpoint as it doesn't support it
@@ -184,13 +272,17 @@ async def generate_with_context(
         # Manual mode: no timeout unless explicitly passed by the caller
         if self.manual_mode:
             timeout = kwargs.get("timeout", None)
-            return await self._manual_wait_for_answer(params, timeout=timeout)
+            manual_params = params.copy()
+            manual_params["messages"] = formatted_messages
+            return await self._manual_wait_for_answer(manual_params, timeout=timeout)
 
         timeout = kwargs.get("timeout", self.timeout)
 
         for attempt in range(retries + 1):
             try:
-                response = await asyncio.wait_for(self._call_api(params), timeout=timeout)
+                response = await asyncio.wait_for(
+                    self._call_api(params, api_mode=effective_api_mode), timeout=timeout
+                )
                 return response
             except asyncio.TimeoutError:
                 if attempt < retries:
@@ -209,18 +301,22 @@
                 logger.error(f"All {retries + 1} attempts failed with error: {str(e)}")
                 raise
 
-    async def _call_api(self, params: Dict[str, Any]) -> str:
+    async def _call_api(self, params: Dict[str, Any], api_mode: str = "chat_completions") -> str:
         """Make the actual API call"""
         if self.client is None:
             raise RuntimeError("OpenAI client is not initialized (manual_mode enabled?)")
 
         # Use asyncio to run the blocking API call in a thread pool
         loop = asyncio.get_event_loop()
-        response = await loop.run_in_executor(
-            None, lambda: self.client.chat.completions.create(**params)
-        )
+        if api_mode == "responses":
+            response = await loop.run_in_executor(None, lambda: self.client.responses.create(**params))
+            response_text = self._extract_responses_text(response)
+            logger.debug(f"API parameters: {params}")
+            logger.debug(f"API response: {response_text}")
+            return response_text
+
+        response = await loop.run_in_executor(None, lambda: self.client.chat.completions.create(**params))
         # Logging of system prompt, user message and response content
-        logger = logging.getLogger(__name__)
         logger.debug(f"API parameters: {params}")
         logger.debug(f"API response: {response.choices[0].message.content}")
         return response.choices[0].message.content
@@ -249,9 +345,11 @@ async def _manual_wait_for_answer(
             "meta": {
                 "max_tokens": params.get("max_tokens"),
                 "max_completion_tokens": params.get("max_completion_tokens"),
+                "max_output_tokens": params.get("max_output_tokens"),
                 "temperature": params.get("temperature"),
                 "top_p": params.get("top_p"),
-                "reasoning_effort": params.get("reasoning_effort"),
+                "reasoning_effort": params.get("reasoning_effort")
+                or (params.get("reasoning", {}) or {}).get("effort"),
                 "verbosity": params.get("verbosity"),
             },
         }
diff --git a/tests/test_reasoning_effort_config.py b/tests/test_reasoning_effort_config.py
index 584c7ddfdf..4956e553c5 100644
--- a/tests/test_reasoning_effort_config.py
+++ b/tests/test_reasoning_effort_config.py
@@ -116,6 +116,27 @@ def test_reasoning_effort_model_override(self):
         self.assertEqual(config.llm.reasoning_effort, "low")
         self.assertEqual(config.llm.models[0].reasoning_effort, "high")
 
+    def test_api_mode_in_llm_config(self):
+        """Test that api_mode can be set at LLM level and inherited by models."""
+        yaml_config = {
+            "llm": {
+                "api_base": "https://api.openai.com/v1",
+                "api_key": "test-key",
+                "api_mode": "responses",
+                "models": [
+                    {
+                        "name": "gpt-4o-mini",
+                        "weight": 1.0
+                    }
+                ]
+            }
+        }
+
+        config = Config.from_dict(yaml_config)
+
+        self.assertEqual(config.llm.api_mode, "responses")
+        self.assertEqual(config.llm.models[0].api_mode, "responses")
+
     def test_openai_llm_uses_reasoning_effort(self):
         """Test that OpenAILLM stores and uses reasoning_effort from config"""
         # Create a mock model config with reasoning_effort
@@ -178,6 +199,118 @@ def test_reasoning_effort_passed_to_api_params(self):
         # Verify the API was called with reasoning_effort
         llm.client.chat.completions.create.assert_called_once_with(**test_params)
 
+    def test_responses_api_mode_uses_responses_client(self):
+        """Test that api_mode=responses calls client.responses.create."""
+        model_cfg = Mock()
+        model_cfg.name = "gpt-4o-mini"
+        model_cfg.system_message = "system"
+        model_cfg.temperature = 0.7
+        model_cfg.top_p = 0.95
+        model_cfg.max_tokens = 512
+        model_cfg.timeout = 60
+        model_cfg.retries = 0
+        model_cfg.retry_delay = 0
+        model_cfg.api_base = "https://api.openai.com/v1"
+        model_cfg.api_key = "test-key"
+        model_cfg.random_seed = None
+        model_cfg.reasoning_effort = None
+        model_cfg.api_mode = "responses"
+
+        with unittest.mock.patch('openai.OpenAI'):
+            llm = OpenAILLM(model_cfg)
+
+        mock_response = Mock()
+        mock_response.output_text = "Responses API output"
+        llm.client.responses.create.return_value = mock_response
+
+        result = asyncio.run(
+            llm.generate_with_context(
+                system_message="You are helpful.",
+                messages=[{"role": "user", "content": "Hello"}],
+            )
+        )
+
+        self.assertEqual(result, "Responses API output")
+        llm.client.responses.create.assert_called_once()
+        llm.client.chat.completions.create.assert_not_called()
+
+        called_kwargs = llm.client.responses.create.call_args.kwargs
+        self.assertEqual(called_kwargs["model"], "gpt-4o-mini")
+        self.assertIn("input", called_kwargs)
+        self.assertEqual(called_kwargs["input"][0]["role"], "system")
+        self.assertEqual(called_kwargs["input"][1]["role"], "user")
+
+    def test_auto_api_mode_prefers_responses_for_openai_base(self):
+        """Test that api_mode=auto selects Responses API for official OpenAI base URL."""
+        model_cfg = Mock()
+        model_cfg.name = "gpt-4o-mini"
+        model_cfg.system_message = "system"
+        model_cfg.temperature = 0.7
+        model_cfg.top_p = 0.95
+        model_cfg.max_tokens = 128
+        model_cfg.timeout = 60
+        model_cfg.retries = 0
+        model_cfg.retry_delay = 0
+        model_cfg.api_base = "https://api.openai.com/v1"
+        model_cfg.api_key = "test-key"
+        model_cfg.random_seed = None
+        model_cfg.reasoning_effort = None
+        model_cfg.api_mode = "auto"
+
+        with unittest.mock.patch('openai.OpenAI'):
+            llm = OpenAILLM(model_cfg)
+
+        mock_response = Mock()
+        mock_response.output_text = "Auto mode output"
+        llm.client.responses.create.return_value = mock_response
+
+        result = asyncio.run(
+            llm.generate_with_context(
+                system_message="System",
+                messages=[{"role": "user", "content": "Hello"}],
+            )
+        )
+
+        self.assertEqual(result, "Auto mode output")
+        llm.client.responses.create.assert_called_once()
+        llm.client.chat.completions.create.assert_not_called()
+
+    def test_auto_api_mode_uses_chat_for_non_openai_base(self):
+        """Test that api_mode=auto falls back to Chat Completions for non-OpenAI endpoints."""
+        model_cfg = Mock()
+        model_cfg.name = "gpt-4o-mini"
+        model_cfg.system_message = "system"
+        model_cfg.temperature = 0.7
+        model_cfg.top_p = 0.95
+        model_cfg.max_tokens = 128
+        model_cfg.timeout = 60
+        model_cfg.retries = 0
+        model_cfg.retry_delay = 0
+        model_cfg.api_base = "http://localhost:11434/v1"
+        model_cfg.api_key = "test-key"
+        model_cfg.random_seed = None
+        model_cfg.reasoning_effort = None
+        model_cfg.api_mode = "auto"
+
+        with unittest.mock.patch('openai.OpenAI'):
+            llm = OpenAILLM(model_cfg)
+
+        mock_response = Mock()
+        mock_response.choices = [Mock()]
+        mock_response.choices[0].message.content = "Chat mode output"
+        llm.client.chat.completions.create.return_value = mock_response
+
+        result = asyncio.run(
+            llm.generate_with_context(
+                system_message="System",
+                messages=[{"role": "user", "content": "Hello"}],
+            )
+        )
+
+        self.assertEqual(result, "Chat mode output")
+        llm.client.chat.completions.create.assert_called_once()
+        llm.client.responses.create.assert_not_called()
+
     def test_yaml_file_loading_with_reasoning_effort(self):
         """Test loading reasoning_effort from actual YAML file"""
         yaml_content = """
@@ -208,4 +341,4 @@ def test_yaml_file_loading_with_reasoning_effort(self):
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
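
Usage note (not part of the patch): a minimal sketch of configuring the new api_mode option, using Config.from_dict exactly as the tests above do. The model name and API key below are placeholders; per the patch, "auto" selects the Responses API only when api_base is the official OpenAI endpoint and otherwise falls back to Chat Completions.

    from openevolve.config import Config

    # api_mode is one of "chat_completions" (default), "responses", or "auto".
    config = Config.from_dict(
        {
            "llm": {
                "api_base": "https://api.openai.com/v1",
                "api_key": "${OPENAI_API_KEY}",  # ${VAR} references resolve from the environment
                "api_mode": "auto",
                "models": [{"name": "gpt-4o-mini", "weight": 1.0}],
            }
        }
    )

    # The LLM-level setting is inherited by each per-model config.
    assert config.llm.models[0].api_mode == "auto"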