Commit ecb6a76

Merge pull request #16 from helpmefindaname/fix-special-token-handling
Fix handling of special tokens for tokenizers that have strange buildups.
helpmefindaname authored Jul 8, 2024
2 parents 8eadc47 + 967fed0 · commit ecb6a76
Showing 2 changed files with 2 additions and 2 deletions.
tests/test_set_tokenizer_vocab.py (1 addition, 0 deletions)
@@ -20,6 +20,7 @@
     ("microsoft/layoutlm-large-uncased", "WordPiece"),
     ("microsoft/layoutlm-base-cased", "BPE"),
     ("xlm-roberta-large", "Unigram"),
+    ("sentence-transformers/all-mpnet-base-v2", "WordPiece"),
 ]
 unsupported_tokenizers = ["google/electra-small-discriminator"]

transformer_smaller_training_vocab/token_stats.py (1 addition, 2 deletions)
@@ -8,8 +8,7 @@ def get_token_stats(
     tokenizer: PreTrainedTokenizer,
     texts: Sequence[Union[TextInput, PreTokenizedInput, TextInputPair, PreTokenizedInputPair]],
 ) -> List[int]:
-    used = set()
-    used.update(tokenizer.all_special_ids)
+    used = {token_id for token_id, token in tokenizer.added_tokens_decoder.items() if token.special}
     for text in texts:
         if isinstance(text, tuple):
             encoding = tokenizer(text[0], text[1], is_split_into_words=isinstance(text[0], list))
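To illustrate what this change does in practice, here is a minimal diagnostic sketch (an illustration, not part of the repository). It assumes a transformers release recent enough that added_tokens_decoder maps token ids to AddedToken objects carrying a special flag. For tokenizers with unusual vocabulary buildups, such as the newly tested sentence-transformers/all-mpnet-base-v2, the set derived from added_tokens_decoder can differ from all_special_ids, which is the situation this commit guards against.

# Hypothetical diagnostic snippet, not part of the repository.
# Assumes a recent transformers release where `added_tokens_decoder`
# returns a dict mapping token id -> AddedToken with a `.special` flag.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

# Old approach (removed in this commit): the ids the tokenizer advertises as special.
old_special_ids = set(tokenizer.all_special_ids)

# New approach (added in this commit): every added token explicitly flagged as special.
new_special_ids = {
    token_id
    for token_id, token in tokenizer.added_tokens_decoder.items()
    if token.special
}

# For most tokenizers both sets agree; when they diverge, the new approach
# keeps every token flagged as special in the used-token statistics.
print("only in all_special_ids:      ", old_special_ids - new_special_ids)
print("only in added_tokens_decoder: ", new_special_ids - old_special_ids)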
