From 39c32bb26900a8f8cca195059ccefa8d5a2c767b Mon Sep 17 00:00:00 2001 From: J Berg Date: Tue, 31 Mar 2026 22:19:37 +0100 Subject: [PATCH 1/2] Expose _cache and test cache state --- spacy/tests/tokenizer/test_tokenizer.py | 12 ++++++++++++ spacy/tokenizer.pxd | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 1ea5f78c9a8..610531ab5a0 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -5,6 +5,7 @@ from spacy.lang.de import German from spacy.lang.en import English +from spacy.strings import hash_string from spacy.symbols import ORTH from spacy.tokenizer import Tokenizer from spacy.tokens import Doc @@ -555,3 +556,14 @@ def test_tokenizer_initial_special_case_explain(en_vocab): tokens = [t.text for t in tokenizer("id")] explain_tokens = [t[1] for t in tokenizer.explain("id")] assert tokens == explain_tokens + + +@pytest.mark.issue(13950) +def test_issue13950(en_tokenizer): + # Special contraction occurs before regular words + en_tokenizer("I can't believe you have done this") + + # "believe" and "this" appear after the special case "can't". + # They should still be cached. 
+ assert hash_string("believe") in en_tokenizer._cache assert hash_string("this") in en_tokenizer._cache diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 88e4b06b024..5b17ea42389 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -12,7 +12,7 @@ from .vocab cimport LexemesOrTokens, Vocab, _Cached cdef class Tokenizer: cdef Pool mem - cdef PreshMap _cache + cdef readonly PreshMap _cache # readonly so tests can check state cdef PreshMap _specials cdef readonly Vocab vocab From b0834be78c233c65af660a833624cff57777492f Mon Sep 17 00:00:00 2001 From: J Berg Date: Tue, 31 Mar 2026 22:32:52 +0100 Subject: [PATCH 2/2] Reset has_special before tokenizing each span --- spacy/tokenizer.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 77718a75b0c..0c16fde509a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -192,6 +192,7 @@ cdef class Tokenizer: # we don't have to create the slice when we hit the cache. span = string[start:i] key = hash_string(span) + has_special = 0 if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases): self._tokenize(doc, span, key, &has_special, with_special_cases) if uc == ' ': @@ -204,6 +205,7 @@ cdef class Tokenizer: if start < i: span = string[start:] key = hash_string(span) + has_special = 0 if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases): self._tokenize(doc, span, key, &has_special, with_special_cases) doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws