From 2a2f927aa4d4bfdcfe63f860a9aa97890b9a90f5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 18 May 2026 02:44:36 -0500 Subject: [PATCH] fix(benchmarks): guard against empty choices and message=None in LLM eval calls client.chat.completions.create() can return choices=[] on content-policy rejections or provider errors, and choices[0].message=None on filtered responses (e.g. Gemini PROHIBITED_CONTENT via OpenAI-compatible endpoint). Unguarded, the first raises IndexError on choices[0] and the second raises AttributeError on .content. The existing try/except blocks catch these as generic 'LLM evaluation failed' errors, making them hard to diagnose. Explicit guards surface the root cause clearly. --- .../EvoAgentBench/src/domains/information_retrieval/judge.py | 2 ++ benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py | 2 ++ benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py b/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py index 0269fe33..213a5fb9 100644 --- a/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py +++ b/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py @@ -118,6 +118,8 @@ def call_judge(question: str, response: str, correct_answer: str, temperature=temperature, extra_body=extra_body if extra_body else None, ) + if not resp.choices or resp.choices[0].message is None: + raise ValueError("LLM returned empty or filtered response") judge_text = resp.choices[0].message.content or "" except Exception as e: log.error(f"Judge API error: {e}") diff --git a/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py b/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py index 2aed41ad..26f06271 100644 --- a/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py +++ b/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py @@ -370,6 +370,8 @@ def evaluate_artifact(self, task: Dict, 
artifact_paths: list[str], {"role": "user", "content": content}, ], ) + if not resp.choices or resp.choices[0].message is None: + raise ValueError("LLM returned empty or filtered response") eval_text = resp.choices[0].message.content except Exception as e: raise RuntimeError(f"LLM evaluation failed: {e}") from e diff --git a/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py b/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py index 3c908a71..6ff7dd37 100644 --- a/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py +++ b/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py @@ -149,6 +149,8 @@ def _llm_verify(expected: str, actual: str, problem: str, temperature=0.0, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) + if not response.choices or response.choices[0].message is None: + raise ValueError("LLM returned empty or filtered response") text = (response.choices[0].message.content or "").strip() if not text: raise ValueError("Empty response from judge model")