From 3f5519a355edb70f0a09092c2500f9d1f750becb Mon Sep 17 00:00:00 2001 From: agharsallah <17379925+agharsallah@users.noreply.github.com> Date: Sat, 4 Jul 2026 23:53:56 +0200 Subject: [PATCH] fix(evaluation): Support non-English responses in ROUGE-1 matching The default rouge_score tokenizer drops every character outside [a-z0-9], so responses in non-Latin scripts (Thai, Chinese, Arabic, etc.) tokenize to nothing and response_match_score is always 0, even for identical texts. Use a Unicode-aware tokenizer that keeps non-ASCII word characters (including combining marks such as Thai vowel signs) and delegates ASCII tokens to the default tokenizer, so stemming and scores for English text are unchanged. Fixes #3111 Signed-off-by: agharsallah <17379925+agharsallah@users.noreply.github.com> --- src/google/adk/dependencies/rouge_scorer.py | 1 + .../adk/evaluation/final_response_match_v1.py | 39 +++++++++++- .../test_final_response_match_v1.py | 60 +++++++++++++++++++ 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/google/adk/dependencies/rouge_scorer.py b/src/google/adk/dependencies/rouge_scorer.py index 622a190ab73..e7e6cfcd8c0 100644 --- a/src/google/adk/dependencies/rouge_scorer.py +++ b/src/google/adk/dependencies/rouge_scorer.py @@ -15,3 +15,4 @@ from __future__ import annotations from rouge_score import rouge_scorer as rouge_scorer +from rouge_score import tokenizers as tokenizers diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 24b77da1499..8c1d5e24dbf 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -15,11 +15,13 @@ from __future__ import annotations from typing import Optional +import unicodedata from google.genai import types as genai_types from typing_extensions import override from ..dependencies.rouge_scorer import rouge_scorer +from ..dependencies.rouge_scorer import tokenizers from .eval_case import ConversationScenario from .eval_case import Invocation from .eval_metrics import EvalMetric @@ -92,6 +94,39 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED +def _is_word_char(char: str) -> bool: + # Combining marks (e.g. Thai vowel signs, Devanagari matras) are not + # alphanumeric on their own but must stay attached to their base character. + return char.isalnum() or unicodedata.category(char).startswith("M") + + +class _UnicodeAwareTokenizer(tokenizers.Tokenizer): + """Tokenizer that keeps non-ASCII word characters. + + The default rouge_score tokenizer discards any character outside [a-z0-9], + so text in non-Latin scripts (e.g. Thai, Chinese, Arabic) tokenizes to + nothing and always scores 0. This tokenizer keeps Unicode word characters + and delegates ASCII tokens to the default tokenizer, preserving its + stemming and scoring behavior for English text. + """ + + def __init__(self, use_stemmer: bool = False): + self._default_tokenizer = tokenizers.DefaultTokenizer(use_stemmer) + + def tokenize(self, text: str) -> list[str]: + text = text.lower() + words = "".join( + char if _is_word_char(char) else " " for char in text + ).split() + tokens = [] + for word in words: + if word.isascii(): + tokens.extend(self._default_tokenizer.tokenize(word)) + else: + tokens.append(word) + return tokens + + def _calculate_rouge_1_scores(candidate: str, reference: str): """Calculates the ROUGE-1 score between a candidate and reference text. @@ -110,7 +145,9 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): Returns: A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + scorer = rouge_scorer.RougeScorer( + ["rouge1"], tokenizer=_UnicodeAwareTokenizer(use_stemmer=True) + ) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure. diff --git a/tests/unittests/evaluation/test_final_response_match_v1.py b/tests/unittests/evaluation/test_final_response_match_v1.py index b60c4b46be9..2a210f1e550 100644 --- a/tests/unittests/evaluation/test_final_response_match_v1.py +++ b/tests/unittests/evaluation/test_final_response_match_v1.py @@ -19,9 +19,11 @@ from google.adk.evaluation.eval_metrics import PrebuiltMetrics from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores +from google.adk.evaluation.final_response_match_v1 import _UnicodeAwareTokenizer from google.adk.evaluation.final_response_match_v1 import RougeEvaluator from google.genai import types as genai_types import pytest +from rouge_score import tokenizers def _create_test_rouge_evaluator(threshold: float) -> RougeEvaluator: @@ -87,6 +89,58 @@ def test_calculate_rouge_1_scores(): assert rouge_1_score.fmeasure == pytest.approx(8 / 11) +@pytest.mark.parametrize( + "text", + [ + "สวัสดี", # Thai + "你好世界", # Chinese + "مرحبا بالعالم", # Arabic + "こんにちは", # Japanese + "Здравствуйте", # Russian + ], +) +def test_calculate_rouge_1_scores_identical_non_english_text(text: str): + rouge_1_score = _calculate_rouge_1_scores(text, text) + assert rouge_1_score.precision == pytest.approx(1) + assert rouge_1_score.recall == pytest.approx(1) + assert rouge_1_score.fmeasure == pytest.approx(1) + + +def test_calculate_rouge_1_scores_different_non_english_text(): + candidate = "мир привет" + reference = "привет только" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.precision == pytest.approx(1 / 2) + assert rouge_1_score.recall == pytest.approx(1 / 2) + assert rouge_1_score.fmeasure == pytest.approx(1 / 2) + + +def test_calculate_rouge_1_scores_mixed_language_text(): + candidate = "hello สวัสดี" + reference = "hello world" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.precision == pytest.approx(1 / 2) + assert rouge_1_score.recall == pytest.approx(1 / 2) + assert rouge_1_score.fmeasure == pytest.approx(1 / 2) + + +@pytest.mark.parametrize( + "text", + [ + "The quick brown fox jumps over the lazy dog.", + "Testing stemmed words like running and jumped, don't split!", + "Numbers 123 and mixed a1b2 tokens under_scored.", + "", + ], +) +def test_unicode_aware_tokenizer_matches_default_tokenizer_for_ascii( + text: str, +): + default_tokens = tokenizers.DefaultTokenizer(use_stemmer=True).tokenize(text) + unicode_tokens = _UnicodeAwareTokenizer(use_stemmer=True).tokenize(text) + assert unicode_tokens == default_tokens + + @pytest.mark.parametrize( "candidates, references, expected_score, expected_status", [ @@ -114,6 +168,12 @@ def test_calculate_rouge_1_scores(): 1.0, EvalStatus.PASSED, ), + ( + ["สวัสดี", "你好"], + ["สวัสดี", "你好"], + 1.0, + EvalStatus.PASSED, + ), ], ) def test_rouge_evaluator_multiple_invocations(