diff --git a/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py b/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py index 0269fe33..213a5fb9 100644 --- a/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py +++ b/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py @@ -118,6 +118,8 @@ def call_judge(question: str, response: str, correct_answer: str, temperature=temperature, extra_body=extra_body if extra_body else None, ) + if not resp.choices or resp.choices[0].message is None: + raise ValueError("LLM returned empty or filtered response") judge_text = resp.choices[0].message.content or "" except Exception as e: log.error(f"Judge API error: {e}") diff --git a/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py b/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py index 2aed41ad..26f06271 100644 --- a/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py +++ b/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py @@ -370,6 +370,8 @@ def evaluate_artifact(self, task: Dict, artifact_paths: list[str], {"role": "user", "content": content}, ], ) + if not resp.choices or resp.choices[0].message is None: + raise ValueError("LLM returned empty or filtered response") eval_text = resp.choices[0].message.content except Exception as e: raise RuntimeError(f"LLM evaluation failed: {e}") from e diff --git a/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py b/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py index 3c908a71..6ff7dd37 100644 --- a/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py +++ b/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py @@ -149,6 +149,8 @@ def _llm_verify(expected: str, actual: str, problem: str, temperature=0.0, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) + if not response.choices or response.choices[0].message is None: + raise ValueError("LLM returned empty or filtered response") text = (response.choices[0].message.content or "").strip() if not text: raise ValueError("Empty response from judge model")