Skip to content
Open
8 changes: 8 additions & 0 deletions src/google/adk/evaluation/final_response_match_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,14 @@ def aggregate_invocation_results(
continue
num_evaluated += 1
num_valid += result.score

if num_evaluated == 0:
return EvaluationResult(
overall_score=None,
overall_eval_status=EvalStatus.NOT_EVALUATED,
per_invocation_results=per_invocation_results,
)

overall_score = num_valid / num_evaluated
return EvaluationResult(
overall_score=overall_score,
Expand Down
31 changes: 31 additions & 0 deletions tests/unittests/evaluation/test_final_response_match_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,34 @@ def test_aggregate_invocation_results():
# Only 4 / 8 invocations are evaluated, and 2 / 4 are valid.
assert aggregated_result.overall_score == 0.5
assert aggregated_result.overall_eval_status == EvalStatus.PASSED


def test_aggregate_invocation_results_none_evaluated():
    """Check aggregation when every invocation is NOT_EVALUATED.

    The aggregated result is expected to carry no overall score, report
    NOT_EVALUATED as its status, and hand back the per-invocation results
    it was given.
    """
    judge = _create_test_evaluator_gemini(threshold=0.5)

    actual, expected = _create_test_invocations(
        "candidate text", "reference text"
    )

    # Both entries are NOT_EVALUATED; the second one still carries a score,
    # and the assertions below expect the overall score to stay None.
    results = [
        PerInvocationResult(
            actual_invocation=actual,
            expected_invocation=expected,
            score=score,
            eval_status=EvalStatus.NOT_EVALUATED,
        )
        for score in (None, 1.0)
    ]

    outcome = judge.aggregate_invocation_results(results)

    assert outcome.overall_score is None
    assert outcome.overall_eval_status == EvalStatus.NOT_EVALUATED
    assert outcome.per_invocation_results == results