diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 1ea5f78c9a8..610531ab5a0 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -5,6 +5,7 @@
 
 from spacy.lang.de import German
 from spacy.lang.en import English
+from spacy.strings import hash_string
 from spacy.symbols import ORTH
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
@@ -555,3 +556,18 @@ def test_tokenizer_initial_special_case_explain(en_vocab):
     tokens = [t.text for t in tokenizer("id")]
     explain_tokens = [t[1] for t in tokenizer.explain("id")]
     assert tokens == explain_tokens
+
+
+@pytest.mark.issue(13950)
+def test_issue13950():
+    # Use a private tokenizer: a fixture instance shared with other tests
+    # may already hold these words in its cache, which would let the
+    # assertions below pass even without the fix.
+    tokenizer = English().tokenizer
+
+    # Special contraction occurs before regular words
+    tokenizer("I can't believe you have done this")
+
+    # "believe" and "this" appear after the special case "can't".
+    # They should still be cached.
+    assert hash_string("believe") in tokenizer._cache
+    assert hash_string("this") in tokenizer._cache
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 88e4b06b024..5b17ea42389 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -12,7 +12,7 @@ from .vocab cimport LexemesOrTokens, Vocab, _Cached
 
 cdef class Tokenizer:
     cdef Pool mem
-    cdef PreshMap _cache
+    cdef readonly PreshMap _cache  # readonly so tests can check state
     cdef PreshMap _specials
     cdef readonly Vocab vocab
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 77718a75b0c..0c16fde509a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -192,6 +192,7 @@ cdef class Tokenizer:
                     # we don't have to create the slice when we hit the cache.
                     span = string[start:i]
                     key = hash_string(span)
+                    has_special = 0
                     if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                         self._tokenize(doc, span, key, &has_special, with_special_cases)
                 if uc == ' ':
@@ -204,6 +205,7 @@ cdef class Tokenizer:
         if start < i:
             span = string[start:]
             key = hash_string(span)
+            has_special = 0
             if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                 self._tokenize(doc, span, key, &has_special, with_special_cases)
             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws