From 352691ef1cb9589661fe41ed3eff34e13a69d593 Mon Sep 17 00:00:00 2001
From: zeel sheladiya <46935793+zeelsheladiya@users.noreply.github.com>
Date: Tue, 8 Aug 2023 00:35:35 -0400
Subject: [PATCH 1/2] Update tokenizer.py

---
 llama/tokenizer.py | 50 +++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/llama/tokenizer.py b/llama/tokenizer.py
index e3af01112..717ce7cde 100755
--- a/llama/tokenizer.py
+++ b/llama/tokenizer.py
@@ -3,7 +3,7 @@
 
 import os
 from logging import getLogger
-from typing import List
+from typing import List, Union
 
 from sentencepiece import SentencePieceProcessor
 
@@ -12,25 +12,35 @@
 
 
 class Tokenizer:
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-        logger.info(f"Reloaded SentencePiece model from {model_path}")
-
-        # BOS / EOS token IDs
-        self.n_words: int = self.sp_model.vocab_size()
-        self.bos_id: int = self.sp_model.bos_id()
-        self.eos_id: int = self.sp_model.eos_id()
-        self.pad_id: int = self.sp_model.pad_id()
-        logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
-        )
-        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
-
-    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
-        assert type(s) is str
-        t = self.sp_model.encode(s)
+    def __init__(self, model_path: Union[str, None] = None):
+
+        if model_path is not None:
+            if not os.path.isfile(model_path):
+                raise FileNotFoundError(f"Model file not found: {model_path}")
+            self.sp_model = SentencePieceProcessor(model_file=model_path)
+            logger.info(f"Reloaded SentencePiece model from {model_path}")
+
+            # BOS / EOS / PAD / UNK token IDs
+            self.n_words: int = self.sp_model.vocab_size()
+            self.bos_id: int = self.sp_model.bos_id()
+            self.eos_id: int = self.sp_model.eos_id()
+            self.pad_id: int = self.sp_model.pad_id()
+            self.unk_id: int = self.sp_model.unk_id()
+            logger.info(
+                f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+            )
+            assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        assert isinstance(s, str), "Input 's' must be a string"
+        try:
+            t = self.sp_model.encode(s)
+        except Exception as e:
+            raise ValueError(f"Error during tokenization: {e}")
+        
+        # Handle unknown tokens
+        t = [token_id if token_id in range(self.n_words) else self.unk_id for token_id in t]
+        
         if bos:
             t = [self.bos_id] + t
         if eos:

From f4ef6bce1f0b225e0d9bdfe98dfbb65e1e70c3d9 Mon Sep 17 00:00:00 2001
From: zeel sheladiya <46935793+zeelsheladiya@users.noreply.github.com>
Date: Wed, 6 Sep 2023 23:38:33 -0400
Subject: [PATCH 2/2] Refactor: Replace Union[str, None] with Optional[str]

#640: In this commit, I have addressed the code review feedback by replacing `Union[str, None]` with `Optional[str]` in the `Tokenizer` class. This change aligns with the Python typing documentation's recommendation for clearer type hints.
---
 llama/tokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama/tokenizer.py b/llama/tokenizer.py
index 717ce7cde..f80690d84 100755
--- a/llama/tokenizer.py
+++ b/llama/tokenizer.py
@@ -3,7 +3,7 @@
 
 import os
 from logging import getLogger
-from typing import List, Union
+from typing import List, Optional
 
 from sentencepiece import SentencePieceProcessor
 
@@ -12,7 +12,7 @@
 
 
 class Tokenizer:
-    def __init__(self, model_path: Union[str, None] = None):
+    def __init__(self, model_path: Optional[str] = None):
 
         if model_path is not None:
             if not os.path.isfile(model_path):