EverMind-AI · qizwiz · May 18, 2026
diff --git a/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py b/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py
@@ -118,6 +118,8 @@ def call_judge(question: str, response: str, correct_answer: str,
             temperature=temperature,
             extra_body=extra_body if extra_body else None,
         )
+        if not resp.choices or resp.choices[0].message is None:
+            raise ValueError("LLM returned empty or filtered response")
         judge_text = resp.choices[0].message.content or ""
     except Exception as e:
         log.error(f"Judge API error: {e}")

diff --git a/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py b/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py
@@ -370,6 +370,8 @@ def evaluate_artifact(self, task: Dict, artifact_paths: list[str],
                     {"role": "user", "content": content},
                 ],
             )
+            if not resp.choices or resp.choices[0].message is None:
+                raise ValueError("LLM returned empty or filtered response")
             eval_text = resp.choices[0].message.content
         except Exception as e:
             raise RuntimeError(f"LLM evaluation failed: {e}") from e

diff --git a/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py b/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py
@@ -149,6 +149,8 @@ def _llm_verify(expected: str, actual: str, problem: str,
                 temperature=0.0,
                 extra_body={"chat_template_kwargs": {"enable_thinking": False}},
             )
+            if not response.choices or response.choices[0].message is None:
+                raise ValueError("LLM returned empty or filtered response")
             text = (response.choices[0].message.content or "").strip()
             if not text:
                 raise ValueError("Empty response from judge model")