From 2a2f927aa4d4bfdcfe63f860a9aa97890b9a90f5 Mon Sep 17 00:00:00 2001
From: Your Name <jonathan.f.hill@gmail.com>
Date: Mon, 18 May 2026 02:44:36 -0500
Subject: [PATCH] fix(benchmarks): guard against empty choices and message=None
 in LLM eval calls

client.chat.completions.create() can return choices=[] on content-policy
rejections or provider errors, and choices[0].message=None on filtered
responses (e.g. Gemini PROHIBITED_CONTENT via OpenAI-compatible endpoint).
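Indexing choices[0] then raises IndexError, and reading .content on a
None message raises AttributeError. The existing try/except blocks
catch these but report only the low-level exception text (e.g.
'LLM evaluation failed: list index out of range'), making them hard
to diagnose. Explicit guards raise a ValueError that names the root
cause instead.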
---
 .../EvoAgentBench/src/domains/information_retrieval/judge.py    | 2 ++
 benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py | 2 ++
 benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py      | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py b/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py
index 0269fe33..213a5fb9 100644
--- a/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py
+++ b/benchmarks/EvoAgentBench/src/domains/information_retrieval/judge.py
@@ -118,6 +118,8 @@ def call_judge(question: str, response: str, correct_answer: str,
             temperature=temperature,
             extra_body=extra_body if extra_body else None,
         )
+        if not resp.choices or resp.choices[0].message is None:
+            raise ValueError("LLM returned empty or filtered response")
         judge_text = resp.choices[0].message.content or ""
     except Exception as e:
         log.error(f"Judge API error: {e}")
diff --git a/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py b/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py
index 2aed41ad..26f06271 100644
--- a/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py
+++ b/benchmarks/EvoAgentBench/src/domains/knowledge_work/evaluate.py
@@ -370,6 +370,8 @@ def evaluate_artifact(self, task: Dict, artifact_paths: list[str],
                     {"role": "user", "content": content},
                 ],
             )
+            if not resp.choices or resp.choices[0].message is None:
+                raise ValueError("LLM returned empty or filtered response")
             eval_text = resp.choices[0].message.content
         except Exception as e:
             raise RuntimeError(f"LLM evaluation failed: {e}") from e
diff --git a/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py b/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py
index 3c908a71..6ff7dd37 100644
--- a/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py
+++ b/benchmarks/EvoAgentBench/src/domains/reasoning/evaluate.py
@@ -149,6 +149,8 @@ def _llm_verify(expected: str, actual: str, problem: str,
                 temperature=0.0,
                 extra_body={"chat_template_kwargs": {"enable_thinking": False}},
             )
+            if not response.choices or response.choices[0].message is None:
+                raise ValueError("LLM returned empty or filtered response")
             text = (response.choices[0].message.content or "").strip()
             if not text:
                 raise ValueError("Empty response from judge model")