Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/google/adk/dependencies/rouge_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
from __future__ import annotations

from rouge_score import rouge_scorer as rouge_scorer
from rouge_score import tokenizers as tokenizers
39 changes: 38 additions & 1 deletion src/google/adk/evaluation/final_response_match_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
from __future__ import annotations

from typing import Optional
import unicodedata

from google.genai import types as genai_types
from typing_extensions import override

from ..dependencies.rouge_scorer import rouge_scorer
from ..dependencies.rouge_scorer import tokenizers
from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import EvalMetric
Expand Down Expand Up @@ -92,6 +94,39 @@ def _get_eval_status(score: float, threshold: float):
return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED


def _is_word_char(char: str) -> bool:
# Combining marks (e.g. Thai vowel signs, Devanagari matras) are not
# alphanumeric on their own but must stay attached to their base character.
return char.isalnum() or unicodedata.category(char).startswith("M")


class _UnicodeAwareTokenizer(tokenizers.Tokenizer):
  """Tokenizer that keeps non-ASCII word characters.

  The default rouge_score tokenizer discards any character outside [a-z0-9],
  so text in non-Latin scripts (e.g. Thai, Chinese, Arabic) tokenizes to
  nothing and always scores 0. This tokenizer keeps Unicode word characters
  and delegates ASCII tokens to the default tokenizer, preserving its
  stemming and scoring behavior for English text.

  Input is lowercased and then normalized to Unicode NFC, so canonically
  equivalent spellings of the same text (for example a precomposed accented
  letter versus a base letter followed by a combining mark) produce the same
  tokens. NFC leaves pure-ASCII text unchanged.
  """

  def __init__(self, use_stemmer: bool = False):
    """Creates the tokenizer.

    Args:
      use_stemmer: Forwarded to the default rouge_score tokenizer that handles
        the ASCII words.
    """
    self._default_tokenizer = tokenizers.DefaultTokenizer(use_stemmer)

  def tokenize(self, text: str) -> list[str]:
    """Splits `text` into lowercase word tokens.

    Args:
      text: The text to tokenize.

    Returns:
      The tokens in order of appearance. ASCII-only words are passed through
      the default tokenizer; words containing non-ASCII characters are kept
      verbatim.
    """
    # Normalize after lowercasing: lower() may itself emit a base character
    # followed by a combining mark, and non-ASCII words are compared as-is,
    # so both inputs must end up in one canonical form to be able to match.
    text = unicodedata.normalize("NFC", text.lower())
    # Every non-word character becomes a separator.
    words = "".join(
        char if _is_word_char(char) else " " for char in text
    ).split()
    tokens = []
    for word in words:
      if word.isascii():
        tokens.extend(self._default_tokenizer.tokenize(word))
      else:
        tokens.append(word)
    return tokens


def _calculate_rouge_1_scores(candidate: str, reference: str):
"""Calculates the ROUGE-1 score between a candidate and reference text.

Expand All @@ -110,7 +145,9 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
Returns:
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
"""
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
scorer = rouge_scorer.RougeScorer(
["rouge1"], tokenizer=_UnicodeAwareTokenizer(use_stemmer=True)
)

# The score method returns a dictionary where keys are the ROUGE types
# and values are Score objects (tuples) with precision, recall, and fmeasure.
Expand Down
60 changes: 60 additions & 0 deletions tests/unittests/evaluation/test_final_response_match_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
from google.adk.evaluation.final_response_match_v1 import _UnicodeAwareTokenizer
from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
from google.genai import types as genai_types
import pytest
from rouge_score import tokenizers


def _create_test_rouge_evaluator(threshold: float) -> RougeEvaluator:
Expand Down Expand Up @@ -87,6 +89,58 @@ def test_calculate_rouge_1_scores():
assert rouge_1_score.fmeasure == pytest.approx(8 / 11)


@pytest.mark.parametrize(
    "text",
    [
        "สวัสดี",  # Thai
        "你好世界",  # Chinese
        "مرحبا بالعالم",  # Arabic
        "こんにちは",  # Japanese
        "Здравствуйте",  # Russian
    ],
)
def test_calculate_rouge_1_scores_identical_non_english_text(text: str):
  """Scoring non-English text against itself must give perfect ROUGE-1."""
  score = _calculate_rouge_1_scores(text, text)
  perfect = pytest.approx(1)
  assert score.precision == perfect
  assert score.recall == perfect
  assert score.fmeasure == perfect


def test_calculate_rouge_1_scores_different_non_english_text():
  """Two-word Cyrillic texts sharing one word score 1/2 on every measure."""
  score = _calculate_rouge_1_scores(
      "мир привет",
      "привет только",
  )
  half = pytest.approx(1 / 2)
  assert score.precision == half
  assert score.recall == half
  assert score.fmeasure == half


def test_calculate_rouge_1_scores_mixed_language_text():
  """A mixed English/Thai candidate sharing one word with an English reference scores 1/2."""
  score = _calculate_rouge_1_scores(
      "hello สวัสดี",
      "hello world",
  )
  half = pytest.approx(1 / 2)
  assert score.precision == half
  assert score.recall == half
  assert score.fmeasure == half


@pytest.mark.parametrize(
    "text",
    [
        "The quick brown fox jumps over the lazy dog.",
        "Testing stemmed words like running and jumped, don't split!",
        "Numbers 123 and mixed a1b2 tokens under_scored.",
        "",
    ],
)
def test_unicode_aware_tokenizer_matches_default_tokenizer_for_ascii(
    text: str,
):
  """For ASCII input the Unicode-aware tokenizer must equal the default one."""
  expected = tokenizers.DefaultTokenizer(use_stemmer=True).tokenize(text)
  actual = _UnicodeAwareTokenizer(use_stemmer=True).tokenize(text)
  assert actual == expected


@pytest.mark.parametrize(
"candidates, references, expected_score, expected_status",
[
Expand Down Expand Up @@ -114,6 +168,12 @@ def test_calculate_rouge_1_scores():
1.0,
EvalStatus.PASSED,
),
(
["สวัสดี", "你好"],
["สวัสดี", "你好"],
1.0,
EvalStatus.PASSED,
),
],
)
def test_rouge_evaluator_multiple_invocations(
Expand Down