From 471a10caa7c71e25880df1034d9c967b36ba0ba8 Mon Sep 17 00:00:00 2001
From: Lingxiao Ma <xysmlx@gmail.com>
Date: Fri, 9 Aug 2024 12:12:53 +0000
Subject: [PATCH 1/3] fix BitNet integration for vLLM

---
 integration/BitNet/README.md                    |  2 +-
 integration/BitNet/maint/create_bitblas_ckpt.py |  9 ++++++---
 .../generate_bitnet_model_bitblas_format.sh     |  6 ++++++
 .../generate_bitnet_model_native_format.sh      |  6 +++---
 .../{quant_config.json => quantize_config.json} |  0
 .../inference_with_compress_format.py           |  4 ++--
 .../inference_with_native_format.py             | 17 ++---------------
 7 files changed, 20 insertions(+), 24 deletions(-)
 rename integration/BitNet/maint/{quant_config.json => quantize_config.json} (100%)

diff --git a/integration/BitNet/README.md b/integration/BitNet/README.md
index 78d8a7eb..4cac4984 100644
--- a/integration/BitNet/README.md
+++ b/integration/BitNet/README.md
@@ -24,7 +24,7 @@ cd /root/to/BitBLAS/integration/BitNet
 The second script is `generate_bitnet_model_bitblas_format.sh`, which is used to make a checkpoint with BitBLAS compressed metadata, which can avoid the online dequantize sage for the profiling of vLLM, which lead to more efficient memory utilization.
 
 ```bash
-./maint/generate_bitnet_model_bitblas_format.sh ./models/bitnet_3B_1.58bit ./models/bitnet_3B_1.58bit_bitblas
+./maint/generate_bitnet_model_bitblas_format.sh ./models/bitnet_b1_58-3B ./models/bitnet_b1_58-3B_bitblas
 # the output ckpy will be saved in the `./models/bitnet_b1_58-3B_bitblas` directory
 ```
 
diff --git a/integration/BitNet/maint/create_bitblas_ckpt.py b/integration/BitNet/maint/create_bitblas_ckpt.py
index d71f5958..6d5b3d59 100644
--- a/integration/BitNet/maint/create_bitblas_ckpt.py
+++ b/integration/BitNet/maint/create_bitblas_ckpt.py
@@ -4,14 +4,17 @@
 import argparse
 import torch
 import bitblas
-from modeling_bitnet import BitnetForCausalLM
-from tokenization_bitnet import BitnetTokenizer
 from transformers.utils.hub import cached_file
 import os
 from transformers import GenerationConfig
 import time
 import json
 
+import sys
+sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))+"/../")
+from modeling_bitnet import BitnetForCausalLM
+from tokenization_bitnet import BitnetTokenizer
+
 filepath = os.path.abspath(__file__)
 dirpath = os.path.dirname(filepath)
 
@@ -19,7 +22,7 @@
 bitblas.set_log_level("INFO")
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--model_name_or_path", type=str, default="BitBLASModel/open_llama_3b_1.58bits")
+parser.add_argument("--model_name_or_path", type=str, default="1bitLLM/bitnet_b1_58-3B")
 parser.add_argument("--saved_model_path", type=str, default=None)
 args = parser.parse_args()
 
diff --git a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh
index aea62db9..3ace5803 100755
--- a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh
+++ b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh
@@ -24,4 +24,10 @@ fi
 # get the realpath of the saved model directory
 SAVED_MODEL_DIR=$(realpath $SAVED_MODEL_DIR)
 
+# copy the quantize config and tokenizer files next to the converted checkpoint
+cp "$MODEL_DIR/quantize_config.json" "$SAVED_MODEL_DIR/"
+cp "$MODEL_DIR/tokenizer.json" "$SAVED_MODEL_DIR/"
+cp "$MODEL_DIR/tokenizer.model" "$SAVED_MODEL_DIR/"
+cp "$MODEL_DIR/tokenizer_config.json" "$SAVED_MODEL_DIR/"
+
 echo "Model has been converted and save to $SAVED_MODEL_DIR"
diff --git a/integration/BitNet/maint/generate_bitnet_model_native_format.sh b/integration/BitNet/maint/generate_bitnet_model_native_format.sh
index 75bac8a7..e066033b 100755
--- a/integration/BitNet/maint/generate_bitnet_model_native_format.sh
+++ b/integration/BitNet/maint/generate_bitnet_model_native_format.sh
@@ -14,13 +14,13 @@ mkdir -p models
 cd models
 
 # download the model
-git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B bitnet_3B_1.58bits --depth 1
+git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B bitnet_b1_58-3B --depth 1
 
 # copy quantized config into the model directory
-cp ../maint/quant_config.json bitnet_3B_1.58bits
+cp ../maint/quantize_config.json bitnet_b1_58-3B
 
 # get the realpath of the model directory
-MODEL_DIR=$(realpath bitnet_3B_1.58bits)
+MODEL_DIR=$(realpath bitnet_b1_58-3B)
 
 cd ..
 
diff --git a/integration/BitNet/maint/quant_config.json b/integration/BitNet/maint/quantize_config.json
similarity index 100%
rename from integration/BitNet/maint/quant_config.json
rename to integration/BitNet/maint/quantize_config.json
diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
index 45426d65..fcd728fc 100644
--- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
@@ -19,7 +19,7 @@
 current_file_path = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file_path)
 
-ckpt_path = os.path.join(current_dir, "../models/bitnet_3b_1.58bits_bitblas")
+ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas")
 parser = argparse.ArgumentParser(description="Inference with BitNet")
 parser.add_argument(
     "--ckpt_path",
@@ -35,7 +35,7 @@
     ckpt_path,
     dtype="half",
     quantization="bitblas",
-    enforce_eager=True,
+    enforce_eager=True, # set False to enable cuda graph
 ) as bitnet_model:
     bitbnet_outputs = bitnet_model.generate_greedy(
         ["Hi, tell me about microsoft?"], max_tokens=1024
diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py
index 07aefeec..85f409fb 100644
--- a/integration/BitNet/vllm_workspace/inference_with_native_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py
@@ -19,7 +19,7 @@
 # get the path of the current file
 current_file_path = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file_path)
-ckpt_path = os.path.join(current_dir, "../models/bitnet_3b_1.58bits")
+ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas")
 
 parser = argparse.ArgumentParser(description="Inference with BitNet")
 parser.add_argument(
@@ -46,17 +46,4 @@
     print(bitbnet_outputs[0][0])
     print(bitbnet_outputs[0][1])
 
-# with VllmRunner(
-#     "BitBLASModel/open_llama_3b_1.58bits_bitblas",
-#     dtype="half",
-#     quantization="bitblas",
-#     enforce_eager=True,
-# ) as bitnet_model:
-#     torch.cuda.profiler.start()
-#     bitbnet_outputs = bitnet_model.generate_greedy(
-#         ["Hi, tell me about microsoft?"], max_tokens=1024
-#     )
-#     torch.cuda.profiler.stop()
-#     print("bitnet:")
-#     print(bitbnet_outputs[0][0])
-#     print(bitbnet_outputs[0][1])
+

From 7164521a81da37425e35a7e5f0ffd58d6ff6a42f Mon Sep 17 00:00:00 2001
From: Lingxiao Ma <xysmlx@gmail.com>
Date: Fri, 9 Aug 2024 12:23:46 +0000
Subject: [PATCH 2/3] update ckpt name of BitNet integration for vLLM

---
 integration/BitNet/README.md                                | 6 +++---
 .../BitNet/maint/generate_bitnet_model_native_format.sh     | 6 +++---
 .../BitNet/vllm_workspace/inference_with_compress_format.py | 2 +-
 .../BitNet/vllm_workspace/inference_with_native_format.py   | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/integration/BitNet/README.md b/integration/BitNet/README.md
index 4cac4984..63cc3e27 100644
--- a/integration/BitNet/README.md
+++ b/integration/BitNet/README.md
@@ -18,14 +18,14 @@ We provide two scripts to make the checkpoints for vLLM. The first script is `ge
 cd /root/to/BitBLAS/integration/BitNet
 # make the checkpoint
 ./maint/generate_bitnet_model_native_format.sh
-# the output ckpy will be saved in the `./models/bitnet_b1_58-3B` directory
+# the output ckpt will be saved in the `./models/ckpt_bitnet_b1_58-3B` directory
 ```
 
 The second script is `generate_bitnet_model_bitblas_format.sh`, which is used to make a checkpoint with BitBLAS compressed metadata, which can avoid the online dequantize sage for the profiling of vLLM, which lead to more efficient memory utilization.
 
 ```bash
-./maint/generate_bitnet_model_bitblas_format.sh ./models/bitnet_b1_58-3B ./models/bitnet_b1_58-3B_bitblas
-# the output ckpy will be saved in the `./models/bitnet_b1_58-3B_bitblas` directory
+./maint/generate_bitnet_model_bitblas_format.sh ./models/ckpt_bitnet_b1_58-3B ./models/ckpt_bitnet_b1_58-3B_bitblas
+# the output ckpt will be saved in the `./models/ckpt_bitnet_b1_58-3B_bitblas` directory
 ```
 
 Finnaly, you can use the ckpt in vLLM with:
diff --git a/integration/BitNet/maint/generate_bitnet_model_native_format.sh b/integration/BitNet/maint/generate_bitnet_model_native_format.sh
index e066033b..c002f6e1 100755
--- a/integration/BitNet/maint/generate_bitnet_model_native_format.sh
+++ b/integration/BitNet/maint/generate_bitnet_model_native_format.sh
@@ -14,13 +14,13 @@ mkdir -p models
 cd models
 
 # download the model
-git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B bitnet_b1_58-3B --depth 1
+git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B ckpt_bitnet_b1_58-3B --depth 1
 
 # copy quantized config into the model directory
-cp ../maint/quantize_config.json bitnet_b1_58-3B
+cp ../maint/quantize_config.json ckpt_bitnet_b1_58-3B
 
 # get the realpath of the model directory
-MODEL_DIR=$(realpath bitnet_b1_58-3B)
+MODEL_DIR=$(realpath ckpt_bitnet_b1_58-3B)
 
 cd ..
 
diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
index fcd728fc..d99eb493 100644
--- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
@@ -19,7 +19,7 @@
 current_file_path = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file_path)
 
-ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas")
+ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B_bitblas")
 parser = argparse.ArgumentParser(description="Inference with BitNet")
 parser.add_argument(
     "--ckpt_path",
diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py
index 85f409fb..e6db3a6c 100644
--- a/integration/BitNet/vllm_workspace/inference_with_native_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py
@@ -19,7 +19,7 @@
 # get the path of the current file
 current_file_path = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file_path)
-ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas")
+ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B")
 
 parser = argparse.ArgumentParser(description="Inference with BitNet")
 parser.add_argument(

From 5eae019ef33863075ee78311667a04d23d344d73 Mon Sep 17 00:00:00 2001
From: Lingxiao Ma <xysmlx@gmail.com>
Date: Fri, 9 Aug 2024 12:34:27 +0000
Subject: [PATCH 3/3] format code

---
 integration/BitNet/maint/create_bitblas_ckpt.py   |  7 +++++--
 .../inference_with_compress_format.py             | 13 ++++++-------
 .../inference_with_native_format.py               | 15 +++++----------
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/integration/BitNet/maint/create_bitblas_ckpt.py b/integration/BitNet/maint/create_bitblas_ckpt.py
index 6d5b3d59..0bf603e0 100644
--- a/integration/BitNet/maint/create_bitblas_ckpt.py
+++ b/integration/BitNet/maint/create_bitblas_ckpt.py
@@ -11,7 +11,8 @@
 import json
 
 import sys
-sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))+"/../")
+
+sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + "/../")
 from modeling_bitnet import BitnetForCausalLM
 from tokenization_bitnet import BitnetTokenizer
 
@@ -27,7 +28,9 @@
 args = parser.parse_args()
 
 model_name_or_path = args.model_name_or_path
-saved_model_path = os.path.join(dirpath, "models", f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path
+saved_model_path = os.path.join(
+    dirpath, "models",
+    f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path
 
 
 def generate_text(model, tokenizer, prompt, max_length=100):
diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
index d99eb493..9e60fa97 100644
--- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
@@ -32,14 +32,13 @@
 
 ckpt_path = args.ckpt_path
 with VllmRunner(
-    ckpt_path,
-    dtype="half",
-    quantization="bitblas",
-    enforce_eager=True, # set False to enable cuda graph
+        ckpt_path,
+        dtype="half",
+        quantization="bitblas",
+        enforce_eager=True,  # set False to enable cuda graph
 ) as bitnet_model:
-    bitbnet_outputs = bitnet_model.generate_greedy(
-        ["Hi, tell me about microsoft?"], max_tokens=1024
-    )
+    bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"],
+                                                   max_tokens=1024)
     print("bitnet inference:")
     print(bitbnet_outputs[0][0])
     print(bitbnet_outputs[0][1])
diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py
index e6db3a6c..579c5e17 100644
--- a/integration/BitNet/vllm_workspace/inference_with_native_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py
@@ -15,7 +15,6 @@
 import os
 import argparse
 
-
 # get the path of the current file
 current_file_path = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file_path)
@@ -34,16 +33,12 @@
 ckpt_path = args.ckpt_path
 
 with VllmRunner(
-    ckpt_path,
-    dtype="half",
-    quantization="bitnet",
-    gpu_memory_utilization=0.5,
+        ckpt_path,
+        dtype="half",
+        quantization="bitnet",
+        gpu_memory_utilization=0.5,
 ) as bitnet_model:
-    bitbnet_outputs = bitnet_model.generate_greedy(
-        ["Hi, tell me about microsoft?"], max_tokens=128
-    )
+    bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128)
     print("bitnet inference output:")
     print(bitbnet_outputs[0][0])
     print(bitbnet_outputs[0][1])
-
-