From 4b7276b537d9117c8f681c15571e96f9759cc611 Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Thu, 14 May 2026 06:47:46 +0800
Subject: [PATCH 1/2] fix(eval): include intermediate text in final response match

---
 src/google/adk/evaluation/eval_metrics.py     | 10 ++++
 .../adk/evaluation/final_response_match_v2.py | 13 +++-
 .../adk/evaluation/llm_as_judge_utils.py      | 23 ++++++-
 .../rubric_based_final_response_quality_v1.py | 11 +++-
 .../test_final_response_match_v2.py           | 60 ++++++++++++++++++-
 .../evaluation/test_llm_as_judge_utils.py     | 42 ++++++++++++-
 6 files changed, 151 insertions(+), 8 deletions(-)

diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 50c3473c3a..e9af586833 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -115,6 +115,16 @@ class BaseCriterion(BaseModel):
       description="The threshold to be used by the metric.",
   )
 
+  include_intermediate_responses_in_final: bool = Field(
+      default=False,
+      description=(
+          "Whether to evaluate the full agent response including intermediate"
+          " natural language text, such as text emitted before tool calls, in"
+          " addition to the final response. By default, only the final"
+          " response text is sent to the judge."
+      ),
+  )
+
 
 class LlmAsAJudgeCriterion(BaseCriterion):
   """Criterion when using LLM-As-A-Judge metric."""
diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
index 713b421e3d..4ca3a9883f 100644
--- a/src/google/adk/evaluation/final_response_match_v2.py
+++ b/src/google/adk/evaluation/final_response_match_v2.py
@@ -159,8 +159,17 @@ def format_auto_rater_prompt(
     if expected_invocation is None:
       raise ValueError("expected_invocation is required for this metric.")
 
-    reference = get_text_from_content(expected_invocation.final_response)
-    response = get_text_from_content(actual_invocation.final_response)
+    include_intermediate = (
+        self._criterion.include_intermediate_responses_in_final
+    )
+    reference = get_text_from_content(
+        expected_invocation,
+        include_intermediate_responses_in_final=include_intermediate,
+    )
+    response = get_text_from_content(
+        actual_invocation,
+        include_intermediate_responses_in_final=include_intermediate,
+    )
     user_prompt = get_text_from_content(expected_invocation.user_content)
     return self._auto_rater_prompt_template.format(
         prompt=user_prompt,
diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index cf1309ca38..0518d896e5 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -26,6 +26,8 @@
 from .common import EvalBaseModel
 from .eval_case import get_all_tool_calls_with_responses
 from .eval_case import IntermediateDataType
+from .eval_case import Invocation
+from .eval_case import InvocationEvents
 from .eval_metrics import RubricScore
 from .evaluator import EvalStatus
 
@@ -44,8 +46,27 @@ class Label(enum.Enum):
 
 
 def get_text_from_content(
-    content: Optional[genai_types.Content],
+    content: Optional[Union[genai_types.Content, Invocation]],
+    *,
+    include_intermediate_responses_in_final: bool = False,
 ) -> Optional[str]:
+  if isinstance(content, Invocation):
+    if not include_intermediate_responses_in_final:
+      return get_text_from_content(content.final_response)
+
+    parts: list[str] = []
+    if isinstance(content.intermediate_data, InvocationEvents):
+      for event in content.intermediate_data.invocation_events:
+        text = get_text_from_content(event.content)
+        if text:
+          parts.append(text)
+
+    final_text = get_text_from_content(content.final_response)
+    if final_text:
+      parts.append(final_text)
+
+    return "\n".join(parts) if parts else None
+
   if content and content.parts:
     return "\n".join([p.text for p in content.parts if p.text])
 
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index df01aba4ff..db7318c562 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -25,7 +25,6 @@
 from .eval_case import InvocationEvents
 from .eval_metrics import EvalMetric
 from .eval_metrics import RubricsBasedCriterion
-from .eval_rubrics import Rubric
 from .llm_as_judge_utils import get_text_from_content
 from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
 from .llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -274,7 +273,15 @@ def format_auto_rater_prompt(
     """Returns the autorater prompt."""
     self.create_effective_rubrics_list(actual_invocation.rubrics)
     user_input = get_text_from_content(actual_invocation.user_content)
-    final_response = get_text_from_content(actual_invocation.final_response)
+    final_response = (
+        get_text_from_content(
+            actual_invocation,
+            include_intermediate_responses_in_final=(
+                self._criterion.include_intermediate_responses_in_final
+            ),
+        )
+        or ""
+    )
 
     rubrics_text = "\n".join([
         f"* {r.rubric_content.text_property}"
diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py
index ce44901ab5..4384e327c0 100644
--- a/tests/unittests/evaluation/test_final_response_match_v2.py
+++ b/tests/unittests/evaluation/test_final_response_match_v2.py
@@ -15,11 +15,11 @@
 from __future__ import annotations
 
 from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_case import InvocationEvent
+from google.adk.evaluation.eval_case import InvocationEvents
 from google.adk.evaluation.eval_metrics import BaseCriterion
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import EvalStatus
-from google.adk.evaluation.eval_metrics import JudgeModelOptions
-from google.adk.evaluation.eval_metrics import PrebuiltMetrics
 from google.adk.evaluation.evaluator import PerInvocationResult
 from google.adk.evaluation.final_response_match_v2 import _parse_critique
 from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
@@ -127,6 +127,8 @@ def create_test_template() -> str:
 
 def _create_test_evaluator_gemini(
     threshold: float,
+    *,
+    include_intermediate_responses_in_final: bool = False,
 ) -> FinalResponseMatchV2Evaluator:
   evaluator = FinalResponseMatchV2Evaluator(
       EvalMetric(
@@ -134,6 +136,9 @@
           threshold=threshold,
           criterion=BaseCriterion(
               threshold=0.5,
+              include_intermediate_responses_in_final=(
+                  include_intermediate_responses_in_final
+              ),
           ),
       ),
   )
@@ -168,6 +173,21 @@ def _create_test_invocations(
   return actual_invocation, expected_invocation
 
 
+def _add_intermediate_text(invocation: Invocation, text: str) -> Invocation:
+  invocation.intermediate_data = InvocationEvents(
+      invocation_events=[
+          InvocationEvent(
+              author="agent",
+              content=genai_types.Content(
+                  parts=[genai_types.Part(text=text)],
+                  role="model",
+              ),
+          ),
+      ]
+  )
+  return invocation
+
+
 def test_format_auto_rater_prompt():
   evaluator = _create_test_evaluator_gemini(threshold=0.8)
   actual_invocation, expected_invocation = _create_test_invocations(
@@ -193,6 +213,42 @@ def test_format_auto_rater_prompt():
   """
 
 
+def test_format_auto_rater_prompt_ignores_intermediate_by_default():
+  evaluator = _create_test_evaluator_gemini(threshold=0.8)
+  actual_invocation, expected_invocation = _create_test_invocations(
+      "candidate final", "reference final"
+  )
+  _add_intermediate_text(actual_invocation, "candidate intro")
+  _add_intermediate_text(expected_invocation, "reference intro")
+
+  prompt = evaluator.format_auto_rater_prompt(
+      actual_invocation, expected_invocation
+  )
+
+  assert "candidate final" in prompt
+  assert "reference final" in prompt
+  assert "candidate intro" not in prompt
+  assert "reference intro" not in prompt
+
+
+def test_format_auto_rater_prompt_includes_intermediate_when_enabled():
+  evaluator = _create_test_evaluator_gemini(
+      threshold=0.8, include_intermediate_responses_in_final=True
+  )
+  actual_invocation, expected_invocation = _create_test_invocations(
+      "candidate final", "reference final"
+  )
+  _add_intermediate_text(actual_invocation, "candidate intro")
+  _add_intermediate_text(expected_invocation, "reference intro")
+
+  prompt = evaluator.format_auto_rater_prompt(
+      actual_invocation, expected_invocation
+  )
+
+  assert "candidate intro\ncandidate final" in prompt
+  assert "reference intro\nreference final" in prompt
+
+
 def test_convert_auto_rater_response_to_score_valid():
   evaluator = _create_test_evaluator_gemini(threshold=0.8)
   auto_rater_response = """```json
diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py
index e5327cf454..22ac74f201 100644
--- a/tests/unittests/evaluation/test_llm_as_judge_utils.py
+++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -19,6 +19,7 @@
 from google.adk.evaluation.app_details import AgentDetails
 from google.adk.evaluation.app_details import AppDetails
 from google.adk.evaluation.eval_case import IntermediateData
+from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_case import InvocationEvent
 from google.adk.evaluation.eval_case import InvocationEvents
 from google.adk.evaluation.eval_rubrics import RubricScore
@@ -45,7 +46,7 @@ def test_get_text_from_content_with_content_and_none_parts():
 def test_get_text_from_content_with_empty_parts():
   """Tests get_text_from_content with an empty parts list."""
   content = genai_types.Content(parts=[])
-  assert get_text_from_content(content) == None
+  assert get_text_from_content(content) is None
 
 
 def test_get_text_from_content_with_parts_but_no_text():
@@ -88,6 +89,45 @@ def test_get_text_from_content_with_mixed_parts():
   assert get_text_from_content(content) == "Hello\nWorld"
 
 
+def test_get_text_from_content_with_invocation_full_response():
+  invocation = Invocation(
+      user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="agent",
+                  content=genai_types.Content(
+                      parts=[genai_types.Part(text="thinking aloud")]
+                  ),
+              ),
+              InvocationEvent(
+                  author="tool",
+                  content=genai_types.Content(
+                      parts=[
+                          genai_types.Part(
+                              function_call=genai_types.FunctionCall(
+                                  name="lookup"
+                              )
+                          )
+                      ]
+                  ),
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="final answer")]
+      ),
+  )
+
+  assert get_text_from_content(invocation) == "final answer"
+  assert (
+      get_text_from_content(
+          invocation, include_intermediate_responses_in_final=True
+      )
+      == "thinking aloud\nfinal answer"
+  )
+
+
 def test_get_eval_status_with_none_score():
   """Tests get_eval_status returns NOT_EVALUATED for a None score."""
   assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED

From e88194bcb76644398d84b14b2dbb345e984381ec Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Thu, 14 May 2026 15:17:35 +0800
Subject: [PATCH 2/2] fix(eval): cover legacy intermediate responses

---
 .../adk/evaluation/final_response_match_v2.py |  4 +--
 .../adk/evaluation/llm_as_judge_utils.py      |  6 ++++
 .../test_final_response_match_v2.py           | 17 +++++++++++
 .../evaluation/test_llm_as_judge_utils.py     | 30 +++++++++++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
index 4ca3a9883f..445d65c13d 100644
--- a/src/google/adk/evaluation/final_response_match_v2.py
+++ b/src/google/adk/evaluation/final_response_match_v2.py
@@ -173,8 +173,8 @@ def format_auto_rater_prompt(
     user_prompt = get_text_from_content(expected_invocation.user_content)
     return self._auto_rater_prompt_template.format(
         prompt=user_prompt,
-        response=response,
-        golden_response=reference,
+        response=response or "",
+        golden_response=reference or "",
     )
 
   @override
diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index 0518d896e5..4ac6cc6638 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -25,6 +25,7 @@
 from .app_details import AppDetails
 from .common import EvalBaseModel
 from .eval_case import get_all_tool_calls_with_responses
+from .eval_case import IntermediateData
 from .eval_case import IntermediateDataType
 from .eval_case import Invocation
 from .eval_case import InvocationEvents
@@ -60,6 +61,11 @@ def get_text_from_content(
         text = get_text_from_content(event.content)
         if text:
           parts.append(text)
+    elif isinstance(content.intermediate_data, IntermediateData):
+      for _, response_parts in content.intermediate_data.intermediate_responses:
+        text = get_text_from_content(genai_types.Content(parts=response_parts))
+        if text:
+          parts.append(text)
 
     final_text = get_text_from_content(content.final_response)
     if final_text:
diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py
index 4384e327c0..b4ab173405 100644
--- a/tests/unittests/evaluation/test_final_response_match_v2.py
+++ b/tests/unittests/evaluation/test_final_response_match_v2.py
@@ -213,6 +213,23 @@ def test_format_auto_rater_prompt():
   """
 
 
+def test_format_auto_rater_prompt_uses_empty_text_for_missing_final_response():
+  evaluator = _create_test_evaluator_gemini(threshold=0.8)
+  actual_invocation, expected_invocation = _create_test_invocations(
+      "candidate text", "reference text"
+  )
+  actual_invocation.final_response = None
+  expected_invocation.final_response = None
+
+  prompt = evaluator.format_auto_rater_prompt(
+      actual_invocation, expected_invocation
+  )
+
+  assert "None" not in prompt
+  assert '"Agent response": ,' in prompt
+  assert '"Reference response": ,' in prompt
+
+
 def test_format_auto_rater_prompt_ignores_intermediate_by_default():
   evaluator = _create_test_evaluator_gemini(threshold=0.8)
   actual_invocation, expected_invocation = _create_test_invocations(
diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py
index 22ac74f201..d06eafed4b 100644
--- a/tests/unittests/evaluation/test_llm_as_judge_utils.py
+++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -128,6 +128,36 @@ def test_get_text_from_content_with_invocation_full_response():
   )
 
 
+def test_get_text_from_content_with_intermediate_data_full_response():
+  invocation = Invocation(
+      user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
+      intermediate_data=IntermediateData(
+          intermediate_responses=[
+              ("agent", [genai_types.Part(text="legacy intro")]),
+              (
+                  "tool",
+                  [
+                      genai_types.Part(
+                          function_call=genai_types.FunctionCall(name="lookup")
+                      )
+                  ],
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="final answer")]
+      ),
+  )
+
+  assert get_text_from_content(invocation) == "final answer"
+  assert (
+      get_text_from_content(
+          invocation, include_intermediate_responses_in_final=True
+      )
+      == "legacy intro\nfinal answer"
+  )
+
+
 def test_get_eval_status_with_none_score():
   """Tests get_eval_status returns NOT_EVALUATED for a None score."""
   assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED
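
Usage sketch (not part of the patches above): a minimal example of how the new criterion flag would be switched on when wiring up the metric, mirroring the test helper in PATCH 1. The metric_name value is a placeholder; the diffs do not show which registered metric name FinalResponseMatchV2Evaluator is used with.

    from google.adk.evaluation.eval_metrics import BaseCriterion
    from google.adk.evaluation.eval_metrics import EvalMetric
    from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator

    evaluator = FinalResponseMatchV2Evaluator(
        EvalMetric(
            metric_name="final_response_match_v2",  # placeholder, not taken from the diff
            threshold=0.8,
            criterion=BaseCriterion(
                threshold=0.5,
                # New flag from PATCH 1: also pass intermediate natural-language
                # text (e.g. text emitted before tool calls) to the LLM judge.
                include_intermediate_responses_in_final=True,
            ),
        ),
    )

With the flag left at its default (False), only the final_response text reaches the judge, so existing eval sets keep their previous behavior.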