diff --git a/src/google/adk/evaluation/eval_case.py b/src/google/adk/evaluation/eval_case.py index 8560762483..983becffee 100644 --- a/src/google/adk/evaluation/eval_case.py +++ b/src/google/adk/evaluation/eval_case.py @@ -66,6 +66,9 @@ class InvocationEvent(EvalBaseModel): content: Optional[genai_types.Content] """The content of the event.""" + grounding_metadata: Optional[genai_types.GroundingMetadata] = None + """Grounding metadata emitted with the event.""" + class InvocationEvents(EvalBaseModel): """A container for events that occur during the course of an invocation.""" diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 5b0100818c..d8f0374c0d 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -653,6 +653,7 @@ def convert_events_to_eval_invocations( final_response = event.content final_event = event + should_add_event = event.grounding_metadata is not None for p in event.content.parts: if ( p.function_call @@ -660,14 +661,22 @@ def convert_events_to_eval_invocations( or p.text or p.inline_data ): - events_to_add.append(event) + should_add_event = True break - - invocation_events = [ - InvocationEvent(author=e.author, content=e.content) - for e in events_to_add - if e is not final_event - ] + if should_add_event: + events_to_add.append(event) + + invocation_events = [] + for e in events_to_add: + if e is final_event and e.grounding_metadata is None: + continue + invocation_events.append( + InvocationEvent( + author=e.author, + content=None if e is final_event else e.content, + grounding_metadata=e.grounding_metadata, + ) + ) invocations.append( Invocation( invocation_id=invocation_id, diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index 0986f2bed0..06c452dc42 100644 --- 
a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -146,6 +146,20 @@ class _ToolCallsAndResponses(EvalBaseModel): tool_calls_and_response: list[_ToolCallAndResponse] +class _GroundingMetadataEntry(EvalBaseModel): + """Internal data model to capture grounding metadata from an invocation.""" + + step: int + author: str + grounding_metadata: genai_types.GroundingMetadata + + +class _GroundingMetadataEntries(EvalBaseModel): + """Internal data model used for serializing grounding metadata.""" + + grounding_metadata: list[_GroundingMetadataEntry] + + def get_tool_calls_and_responses_as_json_str( intermediate_data: Optional[IntermediateDataType], ) -> str: @@ -180,3 +194,34 @@ def get_tool_calls_and_responses_as_json_str( exclude_defaults=True, exclude_none=True, ) + + +def get_grounding_metadata_as_json_str( + intermediate_data: Optional[IntermediateDataType], +) -> str: + """Returns a JSON string representation of grounding metadata.""" + if not isinstance(intermediate_data, InvocationEvents): + return "No grounding metadata was provided." + + grounding_metadata = [] + for idx, invocation_event in enumerate(intermediate_data.invocation_events): + if invocation_event.grounding_metadata: + grounding_metadata.append( + _GroundingMetadataEntry( + step=idx, + author=invocation_event.author, + grounding_metadata=invocation_event.grounding_metadata, + ) + ) + + if not grounding_metadata: + return "No grounding metadata was provided." 
+ + return _GroundingMetadataEntries( + grounding_metadata=grounding_metadata + ).model_dump_json( + indent=2, + exclude_unset=True, + exclude_defaults=True, + exclude_none=True, + ) diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py index 135b2b9593..54ee97855f 100644 --- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -25,7 +25,7 @@ from .eval_case import InvocationEvents from .eval_metrics import EvalMetric from .eval_metrics import RubricsBasedCriterion -from .eval_rubrics import Rubric +from .llm_as_judge_utils import get_grounding_metadata_as_json_str from .llm_as_judge_utils import get_text_from_content from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str from .llm_as_judge_utils import get_tool_declarations_as_json_str @@ -46,8 +46,9 @@ # Key Evaluation Principles Your evaluation must follow a two-part process: first, collect trusted evidence from the agent's work, and second, judge the final answer against it. -1. **Establish Trusted Evidence from Tool Calls**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt. - * Your ONLY sources of truth are the and the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the . Examples of procedural flaws include: +1. **Establish Trusted Evidence from Tool Calls and Grounding**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt. 
+ * Your ONLY sources of truth are the , the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the , and model-supplied grounding metadata found in . + * Grounding metadata is trusted evidence for model-internal tools such as google_search whose raw search results may not appear as function tool responses. Examples of procedural flaws include: * The agent failed to call a tool that will enable it to answer the user's prompt despite having all the necessary parameters to do so. * The agent called the tool with incorrect or missing parameters. * The agent called a tool that does not exist, or called a tool with a parameter that does not exist. @@ -215,6 +216,9 @@ {response_steps} + + {grounding_metadata} + {final_response} @@ -297,6 +301,9 @@ def format_auto_rater_prompt( response_steps = get_tool_calls_and_responses_as_json_str( actual_invocation.intermediate_data ) + grounding_metadata = get_grounding_metadata_as_json_str( + actual_invocation.intermediate_data + ) app_details = actual_invocation.app_details if app_details: @@ -316,6 +323,7 @@ def format_auto_rater_prompt( tool_declarations=tool_declarations, user_input=user_input, response_steps=response_steps, + grounding_metadata=grounding_metadata, final_response=final_response, rubrics=rubrics_text, ) diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py index 05ab25cc72..eb3e29c15f 100644 --- a/tests/unittests/evaluation/test_evaluation_generator.py +++ b/tests/unittests/evaluation/test_evaluation_generator.py @@ -229,6 +229,31 @@ def test_convert_multi_agent_final_responses( assert intermediate_events[0].author == "agent1" assert intermediate_events[0].content.parts[0].text == "First response" + def test_convert_preserves_grounding_metadata_from_final_response( + self, + ): + """Tests final grounding metadata is available to evaluators.""" + grounding_metadata = types.GroundingMetadata( + 
web_search_queries=["recent AI news"] + ) + events = [ + _build_event("user", [types.Part(text="What's new in AI?")], "inv1"), + Event( + author="agent", + content=types.Content(parts=[types.Part(text="Here are sources.")]), + invocation_id="inv1", + grounding_metadata=grounding_metadata, + ), + ] + + invocations = EvaluationGenerator.convert_events_to_eval_invocations(events) + + assert len(invocations) == 1 + invocation_events = invocations[0].intermediate_data.invocation_events + assert len(invocation_events) == 1 + assert invocation_events[0].content is None + assert invocation_events[0].grounding_metadata == grounding_metadata + class TestGetAppDetailsByInvocationId: """Test cases for EvaluationGenerator._get_app_details_by_invocation_id method.""" diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py index 4b53a2dc43..37efdf96a6 100644 --- a/tests/unittests/evaluation/test_llm_as_judge_utils.py +++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py @@ -26,6 +26,7 @@ from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score from google.adk.evaluation.llm_as_judge_utils import get_eval_status +from google.adk.evaluation.llm_as_judge_utils import get_grounding_metadata_as_json_str from google.adk.evaluation.llm_as_judge_utils import get_text_from_content from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str @@ -332,3 +333,36 @@ def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multipl ] } assert json.loads(json_str) == expected_json + + +def test_get_grounding_metadata_as_json_str_with_invocation_events(): + """Tests grounding metadata is serialized for LLM-as-judge prompts.""" + grounding_metadata = genai_types.GroundingMetadata( + web_search_queries=["recent AI 
news"] + ) + intermediate_data = InvocationEvents( + invocation_events=[ + InvocationEvent( + author="agent", + content=None, + grounding_metadata=grounding_metadata, + ) + ] + ) + + json_str = get_grounding_metadata_as_json_str(intermediate_data) + parsed = json.loads(json_str) + + assert parsed["grounding_metadata"][0]["step"] == 0 + assert parsed["grounding_metadata"][0]["author"] == "agent" + assert parsed["grounding_metadata"][0]["grounding_metadata"][ + "web_search_queries" + ] == ["recent AI news"] + + +def test_get_grounding_metadata_as_json_str_without_metadata(): + """Tests empty grounding metadata serialization.""" + assert ( + get_grounding_metadata_as_json_str(InvocationEvents()) + == "No grounding metadata was provided." + ) diff --git a/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py b/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py index e100f9c06a..8166b8b62a 100644 --- a/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py +++ b/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py @@ -182,6 +182,37 @@ def test_format_auto_rater_prompt_with_intermediate_data( assert '"result": "ok"' in prompt +def test_format_auto_rater_prompt_with_grounding_metadata( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests grounding metadata is included as trusted evidence.""" + grounding_metadata = genai_types.GroundingMetadata( + web_search_queries=["recent AI news"] + ) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="What's new in AI?")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="Here are sources.")] + ), + intermediate_data=InvocationEvents( + invocation_events=[ + InvocationEvent( + author="agent", + content=None, + grounding_metadata=grounding_metadata, + ) + ] + ), + ) + prompt = evaluator.format_auto_rater_prompt(invocation, None) + + assert '"web_search_queries"' in prompt + assert "recent 
AI news" in prompt + assert "model-supplied grounding metadata" in prompt + + def test_format_auto_rater_prompt_with_app_details_no_tools( evaluator: RubricBasedFinalResponseQualityV1Evaluator, ):