diff --git a/fast_llm/data/preprocessing/tokenizer.py b/fast_llm/data/preprocessing/tokenizer.py index 157744f51..bed121af4 100644 --- a/fast_llm/data/preprocessing/tokenizer.py +++ b/fast_llm/data/preprocessing/tokenizer.py @@ -32,6 +32,11 @@ class TokenizerConfig(PreprocessingConfig): desc="BOS token to use if the tokenizer doesn't define one; must be an existing token.", hint=FieldHint.core, ) + allow_no_bos: bool = Field( + default=False, + desc="Allow the tokenizer to not have a BOS token. Set to True for tokenizers without BOS (e.g. Qwen).", + hint=FieldHint.core, + ) max_vocab_size: int | None = Field( default=None, desc="Constrain output tokens to a specific range. Used for testing.", @@ -63,8 +68,8 @@ def __init__(self, config: ConfigType): self.tokenizer.bos_token = self._config.bos_token if self.tokenizer.eos_token_id is None: raise ValueError("Tokenizer does not have an EOS token.") - if self.tokenizer.bos_token_id is None: - raise ValueError("Tokenizer does not have an BOS token.") + if self.tokenizer.bos_token_id is None and not self._config.allow_no_bos: + raise ValueError("Tokenizer does not have a BOS token. Set allow_no_bos=True to allow this.") self.eod_id = self.tokenizer.eos_token_id self.bod_id = self.tokenizer.bos_token_id @@ -89,9 +94,9 @@ def tokenize( import torch tokens = self.tokenizer.encode(text, add_special_tokens=False) - if begin: + if begin and self.bod_id is not None: tokens.insert(0, self.bod_id) - if end: + if end and self.eod_id is not None: tokens.append(self.eod_id) if self._config.max_vocab_size is not None: @@ -271,10 +276,10 @@ def tokenize_chat( # Prepend BOS / append EOS if not already present anywhere in the sequence. # We check anywhere (not just first/last) because some chat templates add trailing # whitespace after the final EOS token, e.g. "<|im_end|>\n". 
- prepend_bos = begin and self.bod_id not in tokens - append_eos = end and self.eod_id not in tokens + prepend_bos = begin and self.bod_id is not None and self.bod_id not in tokens + append_eos = end and self.eod_id is not None and self.eod_id not in tokens tokens = [self.bod_id] * prepend_bos + list(tokens) + [self.eod_id] * append_eos - train_mask = [False] * prepend_bos + [bool(m) for m in train_mask] + [False] * append_eos + train_mask = [False] * prepend_bos + [bool(m) for m in train_mask] + [True] * append_eos # Convert boolean train mask to loss masking spans (spans where train_mask[i] == False) loss_masking_spans = _train_mask_to_loss_spans(train_mask) diff --git a/tests/data/test_tokenizer.py b/tests/data/test_tokenizer.py index 4e9e2fdd5..a1b4a0b19 100644 --- a/tests/data/test_tokenizer.py +++ b/tests/data/test_tokenizer.py @@ -79,14 +79,14 @@ def test_validate_chat_template_with_markers(common_tokenizer): ("messages", "expected_tokens", "expected_loss_masking_spans"), ( # Single turn: full assistant turn (Hello) is trainable - # 15 tokens, trainable indices 7-13, loss mask spans cover 0-6 and 14 + # 15 tokens, trainable indices 7-14, loss mask spans cover 0-6 ( [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello"}], [49152, 27, 789, 29, 16946, 750, 789, 2293, 17822, 29, 7371, 750, 17822, 29, 49152], - [(0, 7), (14, 15)], + [(0, 7)], ), # Multi-turn: both assistant turns are fully trainable - # 27 tokens, trainable indices 7-13 and 19-25 + # 27 tokens, trainable indices 7-13 and 19-26 ( [ {"role": "user", "content": "A"}, @@ -123,10 +123,10 @@ def test_validate_chat_template_with_markers(common_tokenizer): 29, 49152, ], - [(0, 7), (14, 19), (26, 27)], + [(0, 7), (14, 19)], ), # System + user + assistant: full assistant turn trainable - # 23 tokens, trainable indices 15-21 + # 23 tokens, trainable indices 15-22 ( [ {"role": "system", "content": "You are helpful."}, @@ -158,17 +158,17 @@ def 
test_validate_chat_template_with_markers(common_tokenizer): 29, 49152, ], - [(0, 15), (22, 23)], + [(0, 15)], ), - # User only: no trainable tokens - # 9 tokens, no trainable indices + # User only: no trainable tokens except EOS + # 9 tokens, trainable index 8 (EOS) ( [{"role": "user", "content": "Hi"}], [49152, 27, 789, 29, 16946, 750, 789, 29, 49152], - [(0, 9)], + [(0, 8)], ), # Long multi-turn (85 tokens, 3 assistant responses with tags, tests span machinery) - # Trainable: indices 27-40, 49-62, 70-83 + # Trainable: indices 27-40, 49-62, 70-84 ( [ {"role": "system", "content": "You are a helpful assistant that answers questions."}, @@ -266,7 +266,7 @@ def test_validate_chat_template_with_markers(common_tokenizer): 29, 49152, ], - [(0, 27), (41, 49), (63, 70), (84, 85)], + [(0, 27), (41, 49), (63, 70)], ), ), )