From 35a41e42232908c57f0b136b65ef5812c5802a9e Mon Sep 17 00:00:00 2001 From: Jurie Smit Date: Mon, 11 May 2026 07:03:37 +0200 Subject: [PATCH 1/2] Add tokenutil alias backend support --- src/tokenutil/_backends.py | 10 ++++++++- src/tokenutil/_sentencepiece.py | 38 +++++++++++++++++++++++++++++++++ tests/test_count_tokens.py | 8 ++----- 3 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 src/tokenutil/_sentencepiece.py diff --git a/src/tokenutil/_backends.py b/src/tokenutil/_backends.py index f25d57e..4956853 100644 --- a/src/tokenutil/_backends.py +++ b/src/tokenutil/_backends.py @@ -32,6 +32,14 @@ "gpt-3.5", "text-embedding-3", "text-embedding-ada", + # Sluice policy aliases. + "auto", + "cheap-fast", + "cheap-reasoning", + "cheap-long-context", + "cheap-coding", + "premium", + "openrouter-free", # Inference aggregators hosting OpenAI-compatible open-weight models "groq", "together", @@ -97,7 +105,7 @@ def count_tokens_for_text(text: str, model: str) -> int: # Optional SentencePiece path for Gemini if "gemini" in model.lower(): try: - from tokenutil._sentencepiece import count_sp # type: ignore[import-not-found] + from tokenutil._sentencepiece import count_sp return count_sp(text) except ImportError: diff --git a/src/tokenutil/_sentencepiece.py b/src/tokenutil/_sentencepiece.py new file mode 100644 index 0000000..adbb833 --- /dev/null +++ b/src/tokenutil/_sentencepiece.py @@ -0,0 +1,38 @@ +"""Optional SentencePiece backend for Gemini-family models. + +This module is imported only when a Gemini model is requested and the +``tokenutil[gemini]`` extra is installed. The initial implementation uses a +generic SentencePieceProcessor path configured by environment variable because +Google's Gemini tokenizer model is not distributed by this package. +""" + +from __future__ import annotations + +import os +import warnings + + +def count_sp(text: str) -> int: + """Count tokens with a SentencePiece model configured at runtime. + + Set ``TOKENUTIL_SENTENCEPIECE_MODEL`` to the local ``.model`` file. If the + optional dependency or model file is missing, this function raises + ``ImportError`` so the caller can fall back to the conservative heuristic. + """ + try: + import sentencepiece as spm # type: ignore[import-not-found] + except ImportError as exc: # pragma: no cover - optional dependency + raise ImportError("sentencepiece is not installed") from exc + + model_path = os.getenv("TOKENUTIL_SENTENCEPIECE_MODEL", "") + if not model_path: + warnings.warn( + "tokenutil: TOKENUTIL_SENTENCEPIECE_MODEL is not set; " + "using fallback tokenizer for Gemini.", + UserWarning, + stacklevel=2, + ) + raise ImportError("TOKENUTIL_SENTENCEPIECE_MODEL is not set") + + processor = spm.SentencePieceProcessor(model_file=model_path) + return len(processor.encode(text, out_type=int)) diff --git a/tests/test_count_tokens.py b/tests/test_count_tokens.py index 17e8b1c..796aab6 100644 --- a/tests/test_count_tokens.py +++ b/tests/test_count_tokens.py @@ -87,13 +87,9 @@ def test_responses_api_input_text_type() -> None: def test_sluice_alias_cheap_fast_uses_cl100k() -> None: - # "cheap-fast" contains no known keyword; falls to heuristic BUT - # we just want a non-zero count without crash text = "The quick brown fox jumps over the lazy dog." - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - n = count_tokens(text, model="cheap-fast") - assert n > 0 + n = count_tokens(text, model="cheap-fast") + assert n == count_tokens(text, model="gpt-4") def test_sluice_alias_groq_model() -> None: From 4acb5f4932a3ed50be5264fd87c53b36190bc196 Mon Sep 17 00:00:00 2001 From: Jurie Smit Date: Mon, 11 May 2026 07:29:53 +0200 Subject: [PATCH 2/2] Handle invalid SentencePiece model fallback --- src/tokenutil/_sentencepiece.py | 6 +++++- tests/test_count_tokens.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/tokenutil/_sentencepiece.py b/src/tokenutil/_sentencepiece.py index adbb833..b053990 100644 --- a/src/tokenutil/_sentencepiece.py +++ b/src/tokenutil/_sentencepiece.py @@ -34,5 +34,9 @@ def count_sp(text: str) -> int: ) raise ImportError("TOKENUTIL_SENTENCEPIECE_MODEL is not set") - processor = spm.SentencePieceProcessor(model_file=model_path) + try: + processor = spm.SentencePieceProcessor(model_file=model_path) + except Exception as exc: + raise ImportError("failed to load TOKENUTIL_SENTENCEPIECE_MODEL") from exc + return len(processor.encode(text, out_type=int)) diff --git a/tests/test_count_tokens.py b/tests/test_count_tokens.py index 796aab6..f44675d 100644 --- a/tests/test_count_tokens.py +++ b/tests/test_count_tokens.py @@ -3,6 +3,7 @@ from __future__ import annotations import warnings +from types import SimpleNamespace import pytest @@ -105,6 +106,24 @@ def test_kimi_coding_alias() -> None: assert n == count_tokens(text, model="gpt-4") # kimi → cl100k_base +def test_gemini_sentencepiece_load_error_falls_back(monkeypatch: pytest.MonkeyPatch) -> None: + class BrokenSentencePieceProcessor: + def __init__(self, model_file: str) -> None: + raise OSError(f"missing model: {model_file}") + + monkeypatch.setenv("TOKENUTIL_SENTENCEPIECE_MODEL", "missing.model") + monkeypatch.setitem( + __import__("sys").modules, + "sentencepiece", + SimpleNamespace(SentencePieceProcessor=BrokenSentencePieceProcessor), + ) + + with pytest.warns(UserWarning, match="using 4 chars/token heuristic"): + n = count_tokens("a" * 400, model="gemini-flash") + + assert n == 100 + + # --------------------------------------------------------------------------- # Unknown model — should warn and return a positive heuristic count # ---------------------------------------------------------------------------