From 352691ef1cb9589661fe41ed3eff34e13a69d593 Mon Sep 17 00:00:00 2001 From: zeel sheladiya <46935793+zeelsheladiya@users.noreply.github.com> Date: Tue, 8 Aug 2023 00:35:35 -0400 Subject: [PATCH 1/2] Update tokenizer.py --- llama/tokenizer.py | 50 +++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/llama/tokenizer.py b/llama/tokenizer.py index e3af01112..717ce7cde 100755 --- a/llama/tokenizer.py +++ b/llama/tokenizer.py @@ -3,7 +3,7 @@ import os from logging import getLogger -from typing import List +from typing import List, Union from sentencepiece import SentencePieceProcessor @@ -12,25 +12,35 @@ class Tokenizer: - def __init__(self, model_path: str): - # reload tokenizer - assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) - logger.info(f"Reloaded SentencePiece model from {model_path}") - - # BOS / EOS token IDs - self.n_words: int = self.sp_model.vocab_size() - self.bos_id: int = self.sp_model.bos_id() - self.eos_id: int = self.sp_model.eos_id() - self.pad_id: int = self.sp_model.pad_id() - logger.info( - f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" - ) - assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() - - def encode(self, s: str, bos: bool, eos: bool) -> List[int]: - assert type(s) is str - t = self.sp_model.encode(s) + def __init__(self, model_path: Union[str, None] = None): + + if model_path is not None: + if not os.path.isfile(model_path): + raise FileNotFoundError(f"Model file not found: {model_path}") + self.sp_model = SentencePieceProcessor(model_file=model_path) + logger.info(f"Reloaded SentencePiece model from {model_path}") + + # BOS / EOS / PAD / UNK token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.pad_id() + self.unk_id: int = self.sp_model.unk_id() + logger.info( + f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" + ) + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert isinstance(s, str), "Input 's' must be a string" + try: + t = self.sp_model.encode(s) + except Exception as e: + raise ValueError(f"Error during tokenization: {e}") + + # Handle unknown tokens + t = [token_id if token_id in range(self.n_words) else self.unk_id for token_id in t] + if bos: t = [self.bos_id] + t if eos: From f4ef6bce1f0b225e0d9bdfe98dfbb65e1e70c3d9 Mon Sep 17 00:00:00 2001 From: zeel sheladiya <46935793+zeelsheladiya@users.noreply.github.com> Date: Wed, 6 Sep 2023 23:38:33 -0400 Subject: [PATCH 2/2] Refactor: Replace Union[str, None] with Optional[str] #640 In this commit, I have addressed the feedback from the code review by replacing instances of `Union[str, None]` with `Optional[str]` in the Tokenizer class. This change aligns with the Python Typing documentation's recommendation for better type hinting. --- llama/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama/tokenizer.py b/llama/tokenizer.py index 717ce7cde..f80690d84 100755 --- a/llama/tokenizer.py +++ b/llama/tokenizer.py @@ -3,7 +3,7 @@ import os from logging import getLogger -from typing import List, Union +from typing import List, Optional from sentencepiece import SentencePieceProcessor @@ -12,7 +12,7 @@ class Tokenizer: - def __init__(self, model_path: Union[str, None] = None): + def __init__(self, model_path: Optional[str] = None): if model_path is not None: if not os.path.isfile(model_path):