google · agharsallah · Jul 4, 2026
diff --git a/src/google/adk/dependencies/rouge_scorer.py b/src/google/adk/dependencies/rouge_scorer.py
@@ -15,3 +15,4 @@
 from __future__ import annotations
 
 from rouge_score import rouge_scorer as rouge_scorer
+from rouge_score import tokenizers as tokenizers
diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
@@ -15,11 +15,13 @@
 from __future__ import annotations
 
 from typing import Optional
+import unicodedata
 
 from google.genai import types as genai_types
 from typing_extensions import override
 
 from ..dependencies.rouge_scorer import rouge_scorer
+from ..dependencies.rouge_scorer import tokenizers
 from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
@@ -92,6 +94,39 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+def _is_word_char(char: str) -> bool:
+  """Returns True if `char` should be kept as part of a word token.
+
+  Args:
+    char: A single-character string.
+
+  Returns:
+    True for alphanumeric characters and for combining marks (Unicode
+    general categories starting with "M"); False otherwise.
+  """
+  if char.isalnum():
+    return True
+  # Combining marks such as Thai vowel signs or Devanagari matras fail
+  # isalnum(), yet they must stay attached to their base character.
+  return unicodedata.category(char)[0] == "M"
+
+
+class _UnicodeAwareTokenizer(tokenizers.Tokenizer):
+  """Tokenizer that keeps non-ASCII word characters.
+
+  The default rouge_score tokenizer discards any character outside [a-z0-9],
+  so text in non-Latin scripts (e.g. Thai, Chinese, Arabic) tokenizes to
+  nothing and always scores 0. This tokenizer keeps Unicode word characters
+  and delegates ASCII tokens to the default tokenizer, preserving its
+  stemming and scoring behavior for English text.
+
+  Text is lowercased and then normalized to NFC, so canonically equivalent
+  spellings (a precomposed accented letter versus the base letter followed
+  by a combining accent) produce the same token.
+
+  No word segmentation is performed: tokens are the runs of word characters
+  between whitespace or punctuation. For scripts written without spaces
+  (e.g. Chinese, Japanese, Thai) a whole phrase therefore becomes a single
+  token, so partially overlapping phrases do not share tokens.
+  """
+
+  def __init__(self, use_stemmer: bool = False):
+    """Initializes the tokenizer.
+
+    Args:
+      use_stemmer: Forwarded to the default rouge_score tokenizer, which is
+        applied only to pure-ASCII words.
+    """
+    self._default_tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
+
+  def tokenize(self, text: str) -> list[str]:
+    """Splits `text` into lowercase word tokens.
+
+    Args:
+      text: The text to tokenize.
+
+    Returns:
+      The tokens in order of appearance. Pure-ASCII words are passed through
+      the default tokenizer; any other word is kept as is, without stemming.
+    """
+    # Lowercase before normalizing: lower() can emit decomposed sequences
+    # (U+0130 lowercases to "i" followed by U+0307). NFC then gives candidate
+    # and reference one canonical form, so equivalent text yields equal tokens.
+    text = unicodedata.normalize("NFC", text.lower())
+    words = "".join(
+        char if _is_word_char(char) else " " for char in text
+    ).split()
+    tokens = []
+    for word in words:
+      if word.isascii():
+        tokens.extend(self._default_tokenizer.tokenize(word))
+      else:
+        tokens.append(word)
+    return tokens
+
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -110,7 +145,9 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  scorer = rouge_scorer.RougeScorer(
+      ["rouge1"], tokenizer=_UnicodeAwareTokenizer(use_stemmer=True)
+  )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.

diff --git a/tests/unittests/evaluation/test_final_response_match_v1.py b/tests/unittests/evaluation/test_final_response_match_v1.py
@@ -19,9 +19,11 @@
 from google.adk.evaluation.eval_metrics import PrebuiltMetrics
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
+from google.adk.evaluation.final_response_match_v1 import _UnicodeAwareTokenizer
 from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
 from google.genai import types as genai_types
 import pytest
+from rouge_score import tokenizers
 
 
 def _create_test_rouge_evaluator(threshold: float) -> RougeEvaluator:
@@ -87,6 +89,58 @@ def test_calculate_rouge_1_scores():
   assert rouge_1_score.fmeasure == pytest.approx(8 / 11)
 
 
+@pytest.mark.parametrize(
+    "text",
+    [
+        "สวัสดี",  # Thai
+        "你好世界",  # Chinese
+        "مرحبا بالعالم",  # Arabic
+        "こんにちは",  # Japanese
+        "Здравствуйте",  # Russian
+    ],
+)
+def test_calculate_rouge_1_scores_identical_non_english_text(text: str):
+  """Scoring non-English text against itself gives a perfect ROUGE-1 score."""
+  score = _calculate_rouge_1_scores(candidate=text, reference=text)
+  perfect = pytest.approx(1)
+  assert score.precision == perfect
+  assert score.recall == perfect
+  assert score.fmeasure == perfect
+
+
+def test_calculate_rouge_1_scores_different_non_english_text():
+  """Two Cyrillic texts sharing one of two words score 1/2 on each measure."""
+  score = _calculate_rouge_1_scores(
+      candidate="мир привет", reference="привет только"
+  )
+  half = pytest.approx(1 / 2)
+  assert score.precision == half
+  assert score.recall == half
+  assert score.fmeasure == half
+
+
+def test_calculate_rouge_1_scores_mixed_language_text():
+  """Only the shared English word matches when a Thai word is mixed in."""
+  score = _calculate_rouge_1_scores(
+      candidate="hello สวัสดี", reference="hello world"
+  )
+  half = pytest.approx(1 / 2)
+  assert score.precision == half
+  assert score.recall == half
+  assert score.fmeasure == half
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "The quick brown fox jumps over the lazy dog.",
+        "Testing stemmed words like running and jumped, don't split!",
+        "Numbers 123 and mixed a1b2 tokens under_scored.",
+        "",
+    ],
+)
+def test_unicode_aware_tokenizer_matches_default_tokenizer_for_ascii(
+    text: str,
+):
+  """For ASCII-only input both tokenizers must return identical tokens."""
+  expected = tokenizers.DefaultTokenizer(use_stemmer=True).tokenize(text)
+  actual = _UnicodeAwareTokenizer(use_stemmer=True).tokenize(text)
+  assert actual == expected
+
+
 @pytest.mark.parametrize(
     "candidates, references, expected_score, expected_status",
     [
@@ -114,6 +168,12 @@ def test_calculate_rouge_1_scores():
             1.0,
             EvalStatus.PASSED,
         ),
+        (
+            ["สวัสดี", "你好"],
+            ["สวัสดี", "你好"],
+            1.0,
+            EvalStatus.PASSED,
+        ),
     ],
 )
 def test_rouge_evaluator_multiple_invocations(
Original file line number	Diff line number	Diff line change
Expand Up		@@ -15,3 +15,4 @@
		from __future__ import annotations

		from rouge_score import rouge_scorer as rouge_scorer
		from rouge_score import tokenizers as tokenizers