From 4b7276b537d9117c8f681c15571e96f9759cc611 Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Thu, 14 May 2026 06:47:46 +0800
Subject: [PATCH 1/2] fix(eval): include intermediate text in final response match

---
 src/google/adk/evaluation/eval_metrics.py     | 10 ++++
 .../adk/evaluation/final_response_match_v2.py | 13 +++-
 .../adk/evaluation/llm_as_judge_utils.py      | 23 ++++++-
 .../rubric_based_final_response_quality_v1.py | 11 +++-
 .../test_final_response_match_v2.py           | 60 ++++++++++++++++++-
 .../evaluation/test_llm_as_judge_utils.py     | 42 ++++++++++++-
 6 files changed, 151 insertions(+), 8 deletions(-)

diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 50c3473c3a..e9af586833 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -115,6 +115,16 @@ class BaseCriterion(BaseModel):
       description="The threshold to be used by the metric.",
   )
 
+  include_intermediate_responses_in_final: bool = Field(
+      default=False,
+      description=(
+          "Whether to evaluate the full agent response including intermediate"
+          " natural language text, such as text emitted before tool calls, in"
+          " addition to the final response. By default, only the final"
+          " response text is sent to the judge."
+      ),
+  )
+
 
 class LlmAsAJudgeCriterion(BaseCriterion):
   """Criterion when using LLM-As-A-Judge metric."""
diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
index 713b421e3d..4ca3a9883f 100644
--- a/src/google/adk/evaluation/final_response_match_v2.py
+++ b/src/google/adk/evaluation/final_response_match_v2.py
@@ -159,8 +159,17 @@ def format_auto_rater_prompt(
     if expected_invocation is None:
       raise ValueError("expected_invocation is required for this metric.")
 
-    reference = get_text_from_content(expected_invocation.final_response)
-    response = get_text_from_content(actual_invocation.final_response)
+    include_intermediate = (
+        self._criterion.include_intermediate_responses_in_final
+    )
+    reference = get_text_from_content(
+        expected_invocation,
+        include_intermediate_responses_in_final=include_intermediate,
+    )
+    response = get_text_from_content(
+        actual_invocation,
+        include_intermediate_responses_in_final=include_intermediate,
+    )
     user_prompt = get_text_from_content(expected_invocation.user_content)
     return self._auto_rater_prompt_template.format(
         prompt=user_prompt,
diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index cf1309ca38..0518d896e5 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -26,6 +26,8 @@
 from .common import EvalBaseModel
 from .eval_case import get_all_tool_calls_with_responses
 from .eval_case import IntermediateDataType
+from .eval_case import Invocation
+from .eval_case import InvocationEvents
 from .eval_metrics import RubricScore
 from .evaluator import EvalStatus
 
@@ -44,8 +46,27 @@ class Label(enum.Enum):
 
 
 def get_text_from_content(
-    content: Optional[genai_types.Content],
+    content: Optional[Union[genai_types.Content, Invocation]],
+    *,
+    include_intermediate_responses_in_final: bool = False,
 ) -> Optional[str]:
+  if isinstance(content, Invocation):
+    if not include_intermediate_responses_in_final:
+      return get_text_from_content(content.final_response)
+
+    parts: list[str] = []
+    if isinstance(content.intermediate_data, InvocationEvents):
+      for event in content.intermediate_data.invocation_events:
+        text = get_text_from_content(event.content)
+        if text:
+          parts.append(text)
+
+    final_text = get_text_from_content(content.final_response)
+    if final_text:
+      parts.append(final_text)
+
+    return "\n".join(parts) if parts else None
+
   if content and content.parts:
     return "\n".join([p.text for p in content.parts if p.text])
 
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index df01aba4ff..db7318c562 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -25,7 +25,6 @@
 from .eval_case import InvocationEvents
 from .eval_metrics import EvalMetric
 from .eval_metrics import RubricsBasedCriterion
-from .eval_rubrics import Rubric
 from .llm_as_judge_utils import get_text_from_content
 from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
 from .llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -274,7 +273,15 @@ def format_auto_rater_prompt(
     """Returns the autorater prompt."""
     self.create_effective_rubrics_list(actual_invocation.rubrics)
     user_input = get_text_from_content(actual_invocation.user_content)
-    final_response = get_text_from_content(actual_invocation.final_response)
+    final_response = (
+        get_text_from_content(
+            actual_invocation,
+            include_intermediate_responses_in_final=(
+                self._criterion.include_intermediate_responses_in_final
+            ),
+        )
+        or ""
+    )
 
     rubrics_text = "\n".join([
         f"* {r.rubric_content.text_property}"
diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py
index ce44901ab5..4384e327c0 100644
--- a/tests/unittests/evaluation/test_final_response_match_v2.py
+++ b/tests/unittests/evaluation/test_final_response_match_v2.py
@@ -15,11 +15,11 @@
 from __future__ import annotations
 
 from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_case import InvocationEvent
+from google.adk.evaluation.eval_case import InvocationEvents
 from google.adk.evaluation.eval_metrics import BaseCriterion
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import EvalStatus
-from google.adk.evaluation.eval_metrics import JudgeModelOptions
-from google.adk.evaluation.eval_metrics import PrebuiltMetrics
 from google.adk.evaluation.evaluator import PerInvocationResult
 from google.adk.evaluation.final_response_match_v2 import _parse_critique
 from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
@@ -127,6 +127,8 @@ def create_test_template() -> str:
 
 def _create_test_evaluator_gemini(
     threshold: float,
+    *,
+    include_intermediate_responses_in_final: bool = False,
 ) -> FinalResponseMatchV2Evaluator:
   evaluator = FinalResponseMatchV2Evaluator(
       EvalMetric(
@@ -134,6 +136,9 @@
           threshold=threshold,
           criterion=BaseCriterion(
               threshold=0.5,
+              include_intermediate_responses_in_final=(
+                  include_intermediate_responses_in_final
+              ),
           ),
       ),
   )
@@ -168,6 +173,21 @@ def _create_test_invocations(
   return actual_invocation, expected_invocation
 
 
+def _add_intermediate_text(invocation: Invocation, text: str) -> Invocation:
+  invocation.intermediate_data = InvocationEvents(
+      invocation_events=[
+          InvocationEvent(
+              author="agent",
+              content=genai_types.Content(
+                  parts=[genai_types.Part(text=text)],
+                  role="model",
+              ),
+          ),
+      ]
+  )
+  return invocation
+
+
 def test_format_auto_rater_prompt():
   evaluator = _create_test_evaluator_gemini(threshold=0.8)
   actual_invocation, expected_invocation = _create_test_invocations(
@@ -193,6 +213,42 @@ def test_format_auto_rater_prompt():
   """
 
 
+def test_format_auto_rater_prompt_ignores_intermediate_by_default():
+  evaluator = _create_test_evaluator_gemini(threshold=0.8)
+  actual_invocation, expected_invocation = _create_test_invocations(
+      "candidate final", "reference final"
+  )
+  _add_intermediate_text(actual_invocation, "candidate intro")
+  _add_intermediate_text(expected_invocation, "reference intro")
+
+  prompt = evaluator.format_auto_rater_prompt(
+      actual_invocation, expected_invocation
+  )
+
+  assert "candidate final" in prompt
+  assert "reference final" in prompt
+  assert "candidate intro" not in prompt
+  assert "reference intro" not in prompt
+
+
+def test_format_auto_rater_prompt_includes_intermediate_when_enabled():
+  evaluator = _create_test_evaluator_gemini(
+      threshold=0.8, include_intermediate_responses_in_final=True
+  )
+  actual_invocation, expected_invocation = _create_test_invocations(
+      "candidate final", "reference final"
+  )
+  _add_intermediate_text(actual_invocation, "candidate intro")
+  _add_intermediate_text(expected_invocation, "reference intro")
+
+  prompt = evaluator.format_auto_rater_prompt(
+      actual_invocation, expected_invocation
+  )
+
+  assert "candidate intro\ncandidate final" in prompt
+  assert "reference intro\nreference final" in prompt
+
+
 def test_convert_auto_rater_response_to_score_valid():
   evaluator = _create_test_evaluator_gemini(threshold=0.8)
   auto_rater_response = """```json
diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py
index e5327cf454..22ac74f201 100644
--- a/tests/unittests/evaluation/test_llm_as_judge_utils.py
+++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -19,6 +19,7 @@
 from google.adk.evaluation.app_details import AgentDetails
 from google.adk.evaluation.app_details import AppDetails
 from google.adk.evaluation.eval_case import IntermediateData
+from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_case import InvocationEvent
 from google.adk.evaluation.eval_case import InvocationEvents
 from google.adk.evaluation.eval_rubrics import RubricScore
@@ -45,7 +46,7 @@ def test_get_text_from_content_with_content_and_none_parts():
 def test_get_text_from_content_with_empty_parts():
   """Tests get_text_from_content with an empty parts list."""
   content = genai_types.Content(parts=[])
-  assert get_text_from_content(content) == None
+  assert get_text_from_content(content) is None
 
 
 def test_get_text_from_content_with_parts_but_no_text():
@@ -88,6 +89,45 @@ def test_get_text_from_content_with_mixed_parts():
   assert get_text_from_content(content) == "Hello\nWorld"
 
 
+def test_get_text_from_content_with_invocation_full_response():
+  invocation = Invocation(
+      user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="agent",
+                  content=genai_types.Content(
+                      parts=[genai_types.Part(text="thinking aloud")]
+                  ),
+              ),
+              InvocationEvent(
+                  author="tool",
+                  content=genai_types.Content(
+                      parts=[
+                          genai_types.Part(
+                              function_call=genai_types.FunctionCall(
+                                  name="lookup"
+                              )
+                          )
+                      ]
+                  ),
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="final answer")]
+      ),
+  )
+
+  assert get_text_from_content(invocation) == "final answer"
+  assert (
+      get_text_from_content(
+          invocation, include_intermediate_responses_in_final=True
+      )
+      == "thinking aloud\nfinal answer"
+  )
+
+
 def test_get_eval_status_with_none_score():
   """Tests get_eval_status returns NOT_EVALUATED for a None score."""
   assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED

From e88194bcb76644398d84b14b2dbb345e984381ec Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Thu, 14 May 2026 15:17:35 +0800
Subject: [PATCH 2/2] fix(eval): cover legacy intermediate responses

---
 .../adk/evaluation/final_response_match_v2.py |  4 +--
 .../adk/evaluation/llm_as_judge_utils.py      |  6 ++++
 .../test_final_response_match_v2.py           | 17 +++++++++++
 .../evaluation/test_llm_as_judge_utils.py     | 30 +++++++++++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
index 4ca3a9883f..445d65c13d 100644
--- a/src/google/adk/evaluation/final_response_match_v2.py
+++ b/src/google/adk/evaluation/final_response_match_v2.py
@@ -173,8 +173,8 @@ def format_auto_rater_prompt(
     user_prompt = get_text_from_content(expected_invocation.user_content)
     return self._auto_rater_prompt_template.format(
         prompt=user_prompt,
-        response=response,
-        golden_response=reference,
+        response=response or "",
+        golden_response=reference or "",
     )
 
   @override
diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index 0518d896e5..4ac6cc6638 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -25,6 +25,7 @@
 from .app_details import AppDetails
 from .common import EvalBaseModel
 from .eval_case import get_all_tool_calls_with_responses
+from .eval_case import IntermediateData
 from .eval_case import IntermediateDataType
 from .eval_case import Invocation
 from .eval_case import InvocationEvents
@@ -60,6 +61,11 @@ def get_text_from_content(
         text = get_text_from_content(event.content)
         if text:
           parts.append(text)
+    elif isinstance(content.intermediate_data, IntermediateData):
+      for _, response_parts in content.intermediate_data.intermediate_responses:
+        text = get_text_from_content(genai_types.Content(parts=response_parts))
+        if text:
+          parts.append(text)
 
     final_text = get_text_from_content(content.final_response)
     if final_text:
diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py
index 4384e327c0..b4ab173405 100644
--- a/tests/unittests/evaluation/test_final_response_match_v2.py
+++ b/tests/unittests/evaluation/test_final_response_match_v2.py
@@ -213,6 +213,23 @@ def test_format_auto_rater_prompt():
   """
 
 
+def test_format_auto_rater_prompt_uses_empty_text_for_missing_final_response():
+  evaluator = _create_test_evaluator_gemini(threshold=0.8)
+  actual_invocation, expected_invocation = _create_test_invocations(
+      "candidate text", "reference text"
+  )
+  actual_invocation.final_response = None
+  expected_invocation.final_response = None
+
+  prompt = evaluator.format_auto_rater_prompt(
+      actual_invocation, expected_invocation
+  )
+
+  assert "None" not in prompt
+  assert '"Agent response": ,' in prompt
+  assert '"Reference response": ,' in prompt
+
+
 def test_format_auto_rater_prompt_ignores_intermediate_by_default():
   evaluator = _create_test_evaluator_gemini(threshold=0.8)
   actual_invocation, expected_invocation = _create_test_invocations(
diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py
index 22ac74f201..d06eafed4b 100644
--- a/tests/unittests/evaluation/test_llm_as_judge_utils.py
+++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -128,6 +128,36 @@ def test_get_text_from_content_with_invocation_full_response():
   )
 
 
+def test_get_text_from_content_with_intermediate_data_full_response():
+  invocation = Invocation(
+      user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
+      intermediate_data=IntermediateData(
+          intermediate_responses=[
+              ("agent", [genai_types.Part(text="legacy intro")]),
+              (
+                  "tool",
+                  [
+                      genai_types.Part(
+                          function_call=genai_types.FunctionCall(name="lookup")
+                      )
+                  ],
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="final answer")]
+      ),
+  )
+
+  assert get_text_from_content(invocation) == "final answer"
+  assert (
+      get_text_from_content(
+          invocation, include_intermediate_responses_in_final=True
+      )
+      == "legacy intro\nfinal answer"
+  )
+
+
 def test_get_eval_status_with_none_score():
   """Tests get_eval_status returns NOT_EVALUATED for a None score."""
   assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED
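
Usage sketch (not part of the patches above): a minimal example of how the new criterion flag would be switched on when wiring up the metric, mirroring the test helper in PATCH 1. The metric_name value is a placeholder; the diffs do not show which registered metric name FinalResponseMatchV2Evaluator is used with.

    from google.adk.evaluation.eval_metrics import BaseCriterion
    from google.adk.evaluation.eval_metrics import EvalMetric
    from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator

    evaluator = FinalResponseMatchV2Evaluator(
        EvalMetric(
            metric_name="final_response_match_v2",  # placeholder, not taken from the diff
            threshold=0.8,
            criterion=BaseCriterion(
                threshold=0.5,
                # New flag from PATCH 1: also pass intermediate natural-language
                # text (e.g. text emitted before tool calls) to the LLM judge.
                include_intermediate_responses_in_final=True,
            ),
        ),
    )

With the flag left at its default (False), only the final_response text reaches the judge, so existing eval sets keep their previous behavior.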