10 changes: 10 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
@@ -115,6 +115,16 @@ class BaseCriterion(BaseModel):
description="The threshold to be used by the metric.",
)

include_intermediate_responses_in_final: bool = Field(
default=False,
description=(
"Whether to evaluate the full agent response including intermediate"
" natural language text, such as text emitted before tool calls, in"
" addition to the final response. By default, only the final"
" response text is sent to the judge."
),
)


class LlmAsAJudgeCriterion(BaseCriterion):
"""Criterion when using LLM-As-A-Judge metric."""
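For illustration, a minimal sketch of how the new flag could be enabled when configuring the metric. This mirrors the evaluator setup used in the updated tests later in this diff, so the field names come from this PR rather than being new API:

    from google.adk.evaluation.eval_metrics import BaseCriterion
    from google.adk.evaluation.eval_metrics import EvalMetric

    # Opt in to judging intermediate natural language text (e.g. text emitted
    # before tool calls) together with the final response. The default (False)
    # keeps the existing behavior of judging only the final response.
    metric = EvalMetric(
        metric_name="final_response_match_v2",
        threshold=0.8,
        criterion=BaseCriterion(
            threshold=0.5,
            include_intermediate_responses_in_final=True,
        ),
    )
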
17 changes: 13 additions & 4 deletions src/google/adk/evaluation/final_response_match_v2.py
@@ -159,13 +159,22 @@ def format_auto_rater_prompt(
if expected_invocation is None:
raise ValueError("expected_invocation is required for this metric.")

reference = get_text_from_content(expected_invocation.final_response)
response = get_text_from_content(actual_invocation.final_response)
include_intermediate = (
self._criterion.include_intermediate_responses_in_final
)
reference = get_text_from_content(
expected_invocation,
include_intermediate_responses_in_final=include_intermediate,
)
response = get_text_from_content(
actual_invocation,
include_intermediate_responses_in_final=include_intermediate,
)
Consider adding `or ""` for reference and response to ensure the prompt remains a valid JSON-like structure even if `get_text_from_content` returns None.

    return self._auto_rater_prompt_template.format(
        prompt=user_prompt,
        response=response or "",
        golden_response=reference or "",
    )

This would be consistent with the change you made in rubric_based_final_response_quality_v1.py.

user_prompt = get_text_from_content(expected_invocation.user_content)
return self._auto_rater_prompt_template.format(
prompt=user_prompt,
response=response,
golden_response=reference,
response=response or "",
golden_response=reference or "",
)

@override
29 changes: 28 additions & 1 deletion src/google/adk/evaluation/llm_as_judge_utils.py
@@ -25,7 +25,10 @@
from .app_details import AppDetails
from .common import EvalBaseModel
from .eval_case import get_all_tool_calls_with_responses
from .eval_case import IntermediateData
from .eval_case import IntermediateDataType
from .eval_case import Invocation
from .eval_case import InvocationEvents
from .eval_metrics import RubricScore
from .evaluator import EvalStatus

@@ -44,8 +47,32 @@ class Label(enum.Enum):


def get_text_from_content(
content: Optional[genai_types.Content],
content: Optional[Union[genai_types.Content, Invocation]],
*,
include_intermediate_responses_in_final: bool = False,
) -> Optional[str]:
if isinstance(content, Invocation):
if not include_intermediate_responses_in_final:
return get_text_from_content(content.final_response)

parts: list[str] = []
if isinstance(content.intermediate_data, InvocationEvents):
for event in content.intermediate_data.invocation_events:
text = get_text_from_content(event.content)
if text:
parts.append(text)

While InvocationEvents is the newer format, many parts of the codebase still use IntermediateData. Consider adding support for it here as well:

    elif isinstance(content.intermediate_data, IntermediateData):
      for author, parts in content.intermediate_data.intermediate_responses:
        text = get_text_from_content(genai_types.Content(parts=parts))
        if text:
          parts.append(text)

(Note: you'll need to import IntermediateData from .eval_case)

elif isinstance(content.intermediate_data, IntermediateData):
for _, response_parts in content.intermediate_data.intermediate_responses:
text = get_text_from_content(genai_types.Content(parts=response_parts))
if text:
parts.append(text)

final_text = get_text_from_content(content.final_response)
if final_text:
parts.append(final_text)

return "\n".join(parts) if parts else None

if content and content.parts:
return "\n".join([p.text for p in content.parts if p.text])

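As a quick sketch of the new behavior, condensed from the unit tests added further down in this diff (absolute import paths are assumed to match those test files):

    from google.adk.evaluation.eval_case import Invocation
    from google.adk.evaluation.eval_case import InvocationEvent
    from google.adk.evaluation.eval_case import InvocationEvents
    from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
    from google.genai import types as genai_types

    invocation = Invocation(
        user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
        intermediate_data=InvocationEvents(
            invocation_events=[
                InvocationEvent(
                    author="agent",
                    content=genai_types.Content(
                        parts=[genai_types.Part(text="thinking aloud")]
                    ),
                ),
            ]
        ),
        final_response=genai_types.Content(
            parts=[genai_types.Part(text="final answer")]
        ),
    )

    # Default: only the final response text is returned.
    assert get_text_from_content(invocation) == "final answer"

    # With the flag set, intermediate text is joined in front of the final
    # response with newlines.
    assert (
        get_text_from_content(
            invocation, include_intermediate_responses_in_final=True
        )
        == "thinking aloud\nfinal answer"
    )
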
src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -25,7 +25,6 @@
from .eval_case import InvocationEvents
from .eval_metrics import EvalMetric
from .eval_metrics import RubricsBasedCriterion
from .eval_rubrics import Rubric
from .llm_as_judge_utils import get_text_from_content
from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from .llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -274,7 +273,15 @@ def format_auto_rater_prompt(
"""Returns the autorater prompt."""
self.create_effective_rubrics_list(actual_invocation.rubrics)
user_input = get_text_from_content(actual_invocation.user_content)
final_response = get_text_from_content(actual_invocation.final_response)
final_response = (
get_text_from_content(
actual_invocation,
include_intermediate_responses_in_final=(
self._criterion.include_intermediate_responses_in_final
),
)
or ""
)

rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
77 changes: 75 additions & 2 deletions tests/unittests/evaluation/test_final_response_match_v2.py
@@ -15,11 +15,11 @@
from __future__ import annotations

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_case import InvocationEvent
from google.adk.evaluation.eval_case import InvocationEvents
from google.adk.evaluation.eval_metrics import BaseCriterion
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import EvalStatus
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.evaluator import PerInvocationResult
from google.adk.evaluation.final_response_match_v2 import _parse_critique
from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
@@ -127,13 +127,18 @@ def create_test_template() -> str:

def _create_test_evaluator_gemini(
threshold: float,
*,
include_intermediate_responses_in_final: bool = False,
) -> FinalResponseMatchV2Evaluator:
evaluator = FinalResponseMatchV2Evaluator(
EvalMetric(
metric_name="final_response_match_v2",
threshold=threshold,
criterion=BaseCriterion(
threshold=0.5,
include_intermediate_responses_in_final=(
include_intermediate_responses_in_final
),
),
),
)
@@ -168,6 +173,21 @@ def _create_test_invocations(
return actual_invocation, expected_invocation


def _add_intermediate_text(invocation: Invocation, text: str) -> Invocation:
invocation.intermediate_data = InvocationEvents(
invocation_events=[
InvocationEvent(
author="agent",
content=genai_types.Content(
parts=[genai_types.Part(text=text)],
role="model",
),
),
]
)
return invocation


def test_format_auto_rater_prompt():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
actual_invocation, expected_invocation = _create_test_invocations(
@@ -193,6 +213,59 @@ def test_format_auto_rater_prompt():
"""


def test_format_auto_rater_prompt_uses_empty_text_for_missing_final_response():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate text", "reference text"
)
actual_invocation.final_response = None
expected_invocation.final_response = None

prompt = evaluator.format_auto_rater_prompt(
actual_invocation, expected_invocation
)

assert "None" not in prompt
assert '"Agent response": ,' in prompt
assert '"Reference response": ,' in prompt


def test_format_auto_rater_prompt_ignores_intermediate_by_default():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate final", "reference final"
)
_add_intermediate_text(actual_invocation, "candidate intro")
_add_intermediate_text(expected_invocation, "reference intro")

prompt = evaluator.format_auto_rater_prompt(
actual_invocation, expected_invocation
)

assert "candidate final" in prompt
assert "reference final" in prompt
assert "candidate intro" not in prompt
assert "reference intro" not in prompt


def test_format_auto_rater_prompt_includes_intermediate_when_enabled():
evaluator = _create_test_evaluator_gemini(
threshold=0.8, include_intermediate_responses_in_final=True
)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate final", "reference final"
)
_add_intermediate_text(actual_invocation, "candidate intro")
_add_intermediate_text(expected_invocation, "reference intro")

prompt = evaluator.format_auto_rater_prompt(
actual_invocation, expected_invocation
)

assert "candidate intro\ncandidate final" in prompt
assert "reference intro\nreference final" in prompt


def test_convert_auto_rater_response_to_score_valid():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
auto_rater_response = """```json
72 changes: 71 additions & 1 deletion tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -19,6 +19,7 @@
from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails
from google.adk.evaluation.eval_case import IntermediateData
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_case import InvocationEvent
from google.adk.evaluation.eval_case import InvocationEvents
from google.adk.evaluation.eval_rubrics import RubricScore
@@ -45,7 +46,7 @@ def test_get_text_from_content_with_content_and_none_parts():
def test_get_text_from_content_with_empty_parts():
"""Tests get_text_from_content with an empty parts list."""
content = genai_types.Content(parts=[])
assert get_text_from_content(content) == None
assert get_text_from_content(content) is None
Good catch on updating this to use is None for better Python idiomaticity.



def test_get_text_from_content_with_parts_but_no_text():
@@ -88,6 +89,75 @@ def test_get_text_from_content_with_mixed_parts():
assert get_text_from_content(content) == "Hello\nWorld"


def test_get_text_from_content_with_invocation_full_response():
invocation = Invocation(
user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
intermediate_data=InvocationEvents(
invocation_events=[
InvocationEvent(
author="agent",
content=genai_types.Content(
parts=[genai_types.Part(text="thinking aloud")]
),
),
InvocationEvent(
author="tool",
content=genai_types.Content(
parts=[
genai_types.Part(
function_call=genai_types.FunctionCall(
name="lookup"
)
)
]
),
),
]
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="final answer")]
),
)

assert get_text_from_content(invocation) == "final answer"
assert (
get_text_from_content(
invocation, include_intermediate_responses_in_final=True
)
== "thinking aloud\nfinal answer"
)


def test_get_text_from_content_with_intermediate_data_full_response():
invocation = Invocation(
user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
intermediate_data=IntermediateData(
intermediate_responses=[
("agent", [genai_types.Part(text="legacy intro")]),
(
"tool",
[
genai_types.Part(
function_call=genai_types.FunctionCall(name="lookup")
)
],
),
]
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="final answer")]
),
)

assert get_text_from_content(invocation) == "final answer"
assert (
get_text_from_content(
invocation, include_intermediate_responses_in_final=True
)
== "legacy intro\nfinal answer"
)


def test_get_eval_status_with_none_score():
"""Tests get_eval_status returns NOT_EVALUATED for a None score."""
assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED