From 1ac82065d8ac6e9e74c8a512349229ca45e582c6 Mon Sep 17 00:00:00 2001
From: Khoi-Nguyen Tran
Date: Mon, 27 Apr 2026 21:16:19 +0000
Subject: [PATCH 1/2] test: add tokenizer regression test for
 granite-4.0-micro (#947)

Assert golden token IDs and tokenizer class to catch transformers
upgrades that change AutoTokenizer resolution for granite-4.0-micro.

Motivated by transformers v5 returning GPT2Tokenizer (constructs BPE
from vocab.json/merges.txt) instead of GPT2TokenizerFast (loads
tokenizer.json), which produced different token IDs and broke 3/6 RAG
intrinsics.

Closes #947

Assisted-by: Claude Code
Signed-off-by: Khoi-Nguyen Tran
---
 .../granite/test_tokenizer_regression.py      | 90 +++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 test/formatters/granite/test_tokenizer_regression.py

diff --git a/test/formatters/granite/test_tokenizer_regression.py b/test/formatters/granite/test_tokenizer_regression.py
new file mode 100644
index 000000000..e3c058915
--- /dev/null
+++ b/test/formatters/granite/test_tokenizer_regression.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+
+"""Regression tests for granite-4.0-micro tokenizer stability.
+
+Transformers v5 removed ``tokenization_gpt2_fast.py`` and merged GPT2Tokenizer
+into a single class that inherits from ``TokenizersBackend``. The new
+GPT2Tokenizer constructs BPE from ``vocab.json`` + ``merges.txt``
+(``VOCAB_FILES_NAMES`` in ``tokenization_gpt2.py``), whereas the old
+GPT2TokenizerFast loaded the pre-built ``tokenizer.json``
+(``VOCAB_FILES_NAMES`` in ``tokenization_gpt2_fast.py``). The two file sources
+produce different token IDs for granite-4.0-micro, causing regressions in 3/6
+RAG intrinsics. Because the v5 GPT2Tokenizer still uses the Rust backend,
+``is_fast`` reports ``True`` -- so the attribute is unreliable and golden token
+ID assertions are the real guard.
+
+Fix: use ``PreTrainedTokenizerFast.from_pretrained()`` directly, which loads
+``tokenizer.json``.
+
+References (line numbers approximate):
+    tf4 GPT2TokenizerFast -- tokenization_gpt2_fast.py:
+        VOCAB_FILES_NAMES = {..., "tokenizer_file": "tokenizer.json"}
+    tf5 GPT2Tokenizer -- tokenization_gpt2.py:
+        VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+"""
+
+# Third Party
+import pytest
+
+transformers = pytest.importorskip(
+    "transformers", reason="transformers not installed -- install mellea[hf]"
+)
+
+pytestmark = pytest.mark.integration
+
+MODEL_ID = "ibm-granite/granite-4.0-micro"
+
+GOLDEN_TOKEN_IDS = {
+    "2023": [2366, 18],
+    "d.o.o": [67, 14778, 14778],
+    "60-138-3818": [1399, 12, 10350, 12, 19162, 23],
+    "Hello world": [9906, 1917],
+    "relevant": [98673],
+    "irrelevant": [404, 98673],
+    "partially relevant": [4581, 34575, 9959],
+    "answerable": [9399, 481],
+    "unanswerable": [359, 9399, 481],
+    "CLEAR": [91449],
+    "faithful": [75710, 1285],
+    "unfaithful": [359, 75710, 1285],
+}
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    return transformers.AutoTokenizer.from_pretrained(MODEL_ID)
+
+
+class TestTokenizerType:
+    def test_tokenizer_is_fast(self, tokenizer):
+        assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast), (
+            f"Expected a PreTrainedTokenizerFast subclass but got "
+            f"{type(tokenizer).__name__}. The v5 GPT2Tokenizer constructs BPE "
+            f"from vocab.json/merges.txt instead of loading tokenizer.json, "
+            f"producing different token IDs that break RAG intrinsics."
+        )
+
+
+class TestTokenizationStability:
+    @pytest.mark.parametrize(
+        "text, expected_ids",
+        list(GOLDEN_TOKEN_IDS.items()),
+        ids=list(GOLDEN_TOKEN_IDS.keys()),
+    )
+    def test_encode_golden_ids(self, tokenizer, text, expected_ids):
+        actual = tokenizer.encode(text, add_special_tokens=False)
+        assert actual == expected_ids
+
+    @pytest.mark.parametrize(
+        "text, expected_ids",
+        list(GOLDEN_TOKEN_IDS.items()),
+        ids=list(GOLDEN_TOKEN_IDS.keys()),
+    )
+    def test_roundtrip_decode(self, tokenizer, text, expected_ids):
+        encoded = tokenizer.encode(text, add_special_tokens=False)
+        decoded = tokenizer.decode(encoded)
+        assert decoded == text
+
+    def test_special_tokens(self, tokenizer):
+        assert tokenizer.eos_token_id == 100257
+        assert tokenizer.bos_token_id == 100257
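
A minimal sketch of the fix the docstring points at -- loading the pre-built
tokenizer.json through PreTrainedTokenizerFast instead of relying on
AutoTokenizer class resolution. The model ID and the golden ID come from the
test above; the snippet itself is illustrative and not part of the patch:

    # Bypass AutoTokenizer's class resolution (which transformers v5 changed
    # for this model) and load the pre-built tokenizer.json directly.
    from transformers import PreTrainedTokenizerFast

    tok = PreTrainedTokenizerFast.from_pretrained("ibm-granite/granite-4.0-micro")

    # Golden check from the table above: "2023" must encode to [2366, 18].
    assert tok.encode("2023", add_special_tokens=False) == [2366, 18]
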
From db2967eab40a29088e6df40b84f6fd26ce5d9ccc Mon Sep 17 00:00:00 2001
From: Khoi-Nguyen Tran
Date: Tue, 28 Apr 2026 18:20:49 +0000
Subject: [PATCH 2/2] test: diversify golden token IDs with real RAG
 corruption patterns (#947)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace full-query golden entries and intrinsic output labels with 20
diverse corrupted snippets extracted from context_relevance and
query_rewrite eval data. Coverage now spans pure numbers, DUNS numbers,
company suffixes, fiscal year compounds, industry codes, standards, and
mixed text+number phrases -- all confirmed to diverge under tf5.

Assisted-by: Claude Code
---
 .../granite/test_tokenizer_regression.py      | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/test/formatters/granite/test_tokenizer_regression.py b/test/formatters/granite/test_tokenizer_regression.py
index e3c058915..35aeea34c 100644
--- a/test/formatters/granite/test_tokenizer_regression.py
+++ b/test/formatters/granite/test_tokenizer_regression.py
@@ -35,18 +35,34 @@
 MODEL_ID = "ibm-granite/granite-4.0-micro"
 
 GOLDEN_TOKEN_IDS = {
+    # Pure numbers from query_rewrite corruptions
     "2023": [2366, 18],
-    "d.o.o": [67, 14778, 14778],
+    "650841823": [13655, 25496, 23848],
+    "409473852": [12378, 21505, 24571],
+    "914588298": [24579, 20691, 17690],
+    # Hyphenated numbers (DUNS) from query_rewrite corruptions
     "60-138-3818": [1399, 12, 10350, 12, 19162, 23],
+    "65-005-6716": [2397, 12, 8504, 12, 23403, 21],
+    # Text + dots/symbols from query_rewrite corruptions
+    "d.o.o": [67, 14778, 14778],
+    "D&B Score": [35, 49339, 18607],
+    "corp.net": [81827, 5181],
+    # Fiscal year compounds from context_relevance (financebench)
+    "FY2020": [82029, 2366, 15],
+    "FY2023": [82029, 2366, 18],
+    "Q3 2024": [48, 18, 220, 2366, 19],
+    "H1 2025": [39, 16, 220, 2366, 20],
+    # Text phrases with numbers from real RAG queries
+    "Broadcom in 2023": [69424, 884, 304, 220, 2366, 18],
+    "Maruti Enterprises in 2022": [12331, 32973, 67056, 304, 220, 2366, 17],
+    "spend in 2023": [2203, 408, 304, 220, 2366, 18],
+    "NAICS 541512": [7476, 19645, 220, 22058, 8358],
+    "IMAGINE d.o.o": [1829, 1929, 4069, 294, 14778, 14778],
+    # Standards and references
+    "ISO 9001:2015": [25141, 220, 7467, 16, 25, 679, 20],
+    "ref#2847": [1116, 2, 17058, 22],
+    # Baseline (matches across versions)
     "Hello world": [9906, 1917],
-    "relevant": [98673],
-    "irrelevant": [404, 98673],
-    "partially relevant": [4581, 34575, 9959],
-    "answerable": [9399, 481],
-    "unanswerable": [359, 9399, 481],
-    "CLEAR": [91449],
-    "faithful": [75710, 1285],
-    "unfaithful": [359, 75710, 1285],
 }
 
 
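
For maintaining the golden table above, a hedged sketch of how the entries
could be regenerated under a known-good transformers install; the snippet
list and output format here are illustrative, not part of the patch:

    # Re-encode each snippet and print dict entries ready to paste into
    # GOLDEN_TOKEN_IDS. Run under a transformers version known to load
    # tokenizer.json for granite-4.0-micro.
    from transformers import PreTrainedTokenizerFast

    tok = PreTrainedTokenizerFast.from_pretrained("ibm-granite/granite-4.0-micro")
    for text in ["2023", "60-138-3818", "FY2023", "NAICS 541512", "Hello world"]:
        ids = tok.encode(text, add_special_tokens=False)
        print(f'    "{text}": {ids},')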