Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/google/adk/evaluation/eval_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ class InvocationEvent(EvalBaseModel):
content: Optional[genai_types.Content]
"""The content of the event."""

grounding_metadata: Optional[genai_types.GroundingMetadata] = None
"""Grounding metadata emitted with the event."""


class InvocationEvents(EvalBaseModel):
"""A container for events that occur during the course of an invocation."""
Expand Down
25 changes: 18 additions & 7 deletions src/google/adk/evaluation/evaluation_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,21 +653,32 @@ def convert_events_to_eval_invocations(
final_response = event.content
final_event = event

should_add_event = event.grounding_metadata is not None
for p in event.content.parts:
if (
p.function_call
or p.function_response
or p.text
or p.inline_data
):
events_to_add.append(event)
should_add_event = True
break

invocation_events = [
InvocationEvent(author=e.author, content=e.content)
for e in events_to_add
if e is not final_event
]
if should_add_event:
events_to_add.append(event)
elif event.grounding_metadata is not None:
events_to_add.append(event)

invocation_events = []
for e in events_to_add:
if e is final_event and not e.grounding_metadata:
continue
invocation_events.append(
InvocationEvent(
author=e.author,
content=None if e is final_event else e.content,
grounding_metadata=e.grounding_metadata,
)
)
invocations.append(
Invocation(
invocation_id=invocation_id,
Expand Down
45 changes: 45 additions & 0 deletions src/google/adk/evaluation/llm_as_judge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,20 @@ class _ToolCallsAndResponses(EvalBaseModel):
tool_calls_and_response: list[_ToolCallAndResponse]


class _GroundingMetadataEntry(EvalBaseModel):
  """Grounding metadata attached to a single invocation event.

  Internal model: one entry is built for every invocation event that carries
  grounding metadata.
  """

  # Index of the originating event within the invocation's event list.
  step: int
  # Author recorded on the originating event.
  author: str
  # The grounding metadata exactly as it was carried by the event.
  grounding_metadata: genai_types.GroundingMetadata


class _GroundingMetadataEntries(EvalBaseModel):
  """Internal wrapper model that is serialized when emitting grounding metadata."""

  # Collected per-event entries, kept in the order they were gathered.
  grounding_metadata: list[_GroundingMetadataEntry]


def get_tool_calls_and_responses_as_json_str(
intermediate_data: Optional[IntermediateDataType],
) -> str:
Expand Down Expand Up @@ -180,3 +194,34 @@ def get_tool_calls_and_responses_as_json_str(
exclude_defaults=True,
exclude_none=True,
)


def get_grounding_metadata_as_json_str(
    intermediate_data: Optional[IntermediateDataType],
) -> str:
  """Serializes the grounding metadata found in invocation events to JSON.

  Args:
    intermediate_data: Intermediate data of an invocation. Only
      `InvocationEvents` instances are inspected; any other value, including
      None, is treated as carrying no grounding metadata.

  Returns:
    An indented JSON document holding one entry (step index, author and
    metadata) for each event that has grounding metadata, or a fixed
    placeholder sentence when there is nothing to report.
  """
  no_metadata_message = "No grounding metadata was provided."
  if not isinstance(intermediate_data, InvocationEvents):
    return no_metadata_message

  # `step` is the event's index among all invocation events, not just among
  # the ones that carry grounding metadata.
  entries = [
      _GroundingMetadataEntry(
          step=position,
          author=event.author,
          grounding_metadata=event.grounding_metadata,
      )
      for position, event in enumerate(intermediate_data.invocation_events)
      if event.grounding_metadata
  ]
  if not entries:
    return no_metadata_message

  payload = _GroundingMetadataEntries(grounding_metadata=entries)
  return payload.model_dump_json(
      indent=2,
      exclude_unset=True,
      exclude_defaults=True,
      exclude_none=True,
  )
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from .eval_case import InvocationEvents
from .eval_metrics import EvalMetric
from .eval_metrics import RubricsBasedCriterion
from .eval_rubrics import Rubric
from .llm_as_judge_utils import get_grounding_metadata_as_json_str
from .llm_as_judge_utils import get_text_from_content
from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from .llm_as_judge_utils import get_tool_declarations_as_json_str
Expand All @@ -46,8 +46,9 @@

# Key Evaluation Principles
Your evaluation must follow a two-part process: first, collect trusted evidence from the agent's work, and second, judge the final answer against it.
1. **Establish Trusted Evidence from Tool Calls**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt.
* Your ONLY sources of truth are the <user_prompt> and the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>. Examples of procedural flaws include:
1. **Establish Trusted Evidence from Tool Calls and Grounding**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt.
* Your ONLY sources of truth are the <user_prompt>, the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>, and model-supplied grounding metadata found in <grounding_metadata>.
* Grounding metadata is trusted evidence for model-internal tools such as google_search whose raw search results may not appear as function tool responses. Examples of procedural flaws include:
* The agent failed to call a tool that will enable it to answer the user's prompt despite having all the necessary parameters to do so.
* The agent called the tool with incorrect or missing parameters.
* The agent called a tool that does not exist, or called a tool with a parameter that does not exist.
Expand Down Expand Up @@ -215,6 +216,9 @@
<response_steps>
{response_steps}
</response_steps>
<grounding_metadata>
{grounding_metadata}
</grounding_metadata>
<final_answer>
{final_response}
</final_answer>
Expand Down Expand Up @@ -297,6 +301,9 @@ def format_auto_rater_prompt(
response_steps = get_tool_calls_and_responses_as_json_str(
actual_invocation.intermediate_data
)
grounding_metadata = get_grounding_metadata_as_json_str(
actual_invocation.intermediate_data
)

app_details = actual_invocation.app_details
if app_details:
Expand All @@ -316,6 +323,7 @@ def format_auto_rater_prompt(
tool_declarations=tool_declarations,
user_input=user_input,
response_steps=response_steps,
grounding_metadata=grounding_metadata,
final_response=final_response,
rubrics=rubrics_text,
)
Expand Down
25 changes: 25 additions & 0 deletions tests/unittests/evaluation/test_evaluation_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,31 @@ def test_convert_multi_agent_final_responses(
assert intermediate_events[0].author == "agent1"
assert intermediate_events[0].content.parts[0].text == "First response"

def test_convert_preserves_grounding_metadata_from_final_response(
    self,
):
  """Checks grounding metadata on the final response reaches evaluators."""
  expected_metadata = types.GroundingMetadata(
      web_search_queries=["recent AI news"]
  )
  user_event = _build_event(
      "user", [types.Part(text="What's new in AI?")], "inv1"
  )
  agent_event = Event(
      author="agent",
      content=types.Content(parts=[types.Part(text="Here are sources.")]),
      invocation_id="inv1",
      grounding_metadata=expected_metadata,
  )

  invocations = EvaluationGenerator.convert_events_to_eval_invocations(
      [user_event, agent_event]
  )

  assert len(invocations) == 1
  recorded_events = invocations[0].intermediate_data.invocation_events
  assert len(recorded_events) == 1
  # The final event is expected to be kept for its metadata only, with no
  # content attached.
  only_event = recorded_events[0]
  assert only_event.content is None
  assert only_event.grounding_metadata == expected_metadata


class TestGetAppDetailsByInvocationId:
"""Test cases for EvaluationGenerator._get_app_details_by_invocation_id method."""
Expand Down
34 changes: 34 additions & 0 deletions tests/unittests/evaluation/test_llm_as_judge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
from google.adk.evaluation.llm_as_judge_utils import get_eval_status
from google.adk.evaluation.llm_as_judge_utils import get_grounding_metadata_as_json_str
from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str
Expand Down Expand Up @@ -332,3 +333,36 @@ def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multipl
]
}
assert json.loads(json_str) == expected_json


def test_get_grounding_metadata_as_json_str_with_invocation_events():
  """Verifies grounding metadata on invocation events is rendered as JSON."""
  metadata = genai_types.GroundingMetadata(
      web_search_queries=["recent AI news"]
  )
  event = InvocationEvent(
      author="agent",
      content=None,
      grounding_metadata=metadata,
  )
  events = InvocationEvents(invocation_events=[event])

  parsed = json.loads(get_grounding_metadata_as_json_str(events))

  first_entry = parsed["grounding_metadata"][0]
  assert first_entry["step"] == 0
  assert first_entry["author"] == "agent"
  assert first_entry["grounding_metadata"]["web_search_queries"] == [
      "recent AI news"
  ]


def test_get_grounding_metadata_as_json_str_without_metadata():
  """Verifies the placeholder text for an `InvocationEvents` built with no arguments."""
  rendered = get_grounding_metadata_as_json_str(InvocationEvents())

  assert rendered == "No grounding metadata was provided."
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,37 @@ def test_format_auto_rater_prompt_with_intermediate_data(
assert '"result": "ok"' in prompt


def test_format_auto_rater_prompt_with_grounding_metadata(
    evaluator: RubricBasedFinalResponseQualityV1Evaluator,
):
  """Checks the auto-rater prompt presents grounding metadata as evidence."""
  metadata = genai_types.GroundingMetadata(
      web_search_queries=["recent AI news"]
  )
  user_content = genai_types.Content(
      parts=[genai_types.Part(text="What's new in AI?")]
  )
  final_response = genai_types.Content(
      parts=[genai_types.Part(text="Here are sources.")]
  )
  grounded_event = InvocationEvent(
      author="agent",
      content=None,
      grounding_metadata=metadata,
  )
  invocation = Invocation(
      user_content=user_content,
      final_response=final_response,
      intermediate_data=InvocationEvents(invocation_events=[grounded_event]),
  )

  prompt = evaluator.format_auto_rater_prompt(invocation, None)

  # The section tag, the serialized query and the instruction wording must all
  # appear in the rendered prompt.
  expected_fragments = (
      "<grounding_metadata>",
      "recent AI news",
      "model-supplied grounding metadata",
  )
  for fragment in expected_fragments:
    assert fragment in prompt


def test_format_auto_rater_prompt_with_app_details_no_tools(
evaluator: RubricBasedFinalResponseQualityV1Evaluator,
):
Expand Down