From 471a10caa7c71e25880df1034d9c967b36ba0ba8 Mon Sep 17 00:00:00 2001 From: Lingxiao Ma Date: Fri, 9 Aug 2024 12:12:53 +0000 Subject: [PATCH 1/3] fix BitNet integration for vLLM --- integration/BitNet/README.md | 2 +- integration/BitNet/maint/create_bitblas_ckpt.py | 9 ++++++--- .../generate_bitnet_model_bitblas_format.sh | 6 ++++++ .../generate_bitnet_model_native_format.sh | 6 +++--- .../{quant_config.json => quantize_config.json} | 0 .../inference_with_compress_format.py | 4 ++-- .../inference_with_native_format.py | 17 ++--------------- 7 files changed, 20 insertions(+), 24 deletions(-) rename integration/BitNet/maint/{quant_config.json => quantize_config.json} (100%) diff --git a/integration/BitNet/README.md b/integration/BitNet/README.md index 78d8a7eb..4cac4984 100644 --- a/integration/BitNet/README.md +++ b/integration/BitNet/README.md @@ -24,7 +24,7 @@ cd /root/to/BitBLAS/integration/BitNet The second script is `generate_bitnet_model_bitblas_format.sh`, which is used to make a checkpoint with BitBLAS compressed metadata, which can avoid the online dequantize sage for the profiling of vLLM, which lead to more efficient memory utilization. 
```bash -./maint/generate_bitnet_model_bitblas_format.sh ./models/bitnet_3B_1.58bit ./models/bitnet_3B_1.58bit_bitblas +./maint/generate_bitnet_model_bitblas_format.sh ./models/bitnet_b1_58-3B ./models/bitnet_b1_58-3B_bitblas # the output ckpy will be saved in the `./models/bitnet_b1_58-3B_bitblas` directory ``` diff --git a/integration/BitNet/maint/create_bitblas_ckpt.py b/integration/BitNet/maint/create_bitblas_ckpt.py index d71f5958..6d5b3d59 100644 --- a/integration/BitNet/maint/create_bitblas_ckpt.py +++ b/integration/BitNet/maint/create_bitblas_ckpt.py @@ -4,14 +4,17 @@ import argparse import torch import bitblas -from modeling_bitnet import BitnetForCausalLM -from tokenization_bitnet import BitnetTokenizer from transformers.utils.hub import cached_file import os from transformers import GenerationConfig import time import json +import sys +sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))+"/../") +from modeling_bitnet import BitnetForCausalLM +from tokenization_bitnet import BitnetTokenizer + filepath = os.path.abspath(__file__) dirpath = os.path.dirname(filepath) @@ -19,7 +22,7 @@ bitblas.set_log_level("INFO") parser = argparse.ArgumentParser() -parser.add_argument("--model_name_or_path", type=str, default="BitBLASModel/open_llama_3b_1.58bits") +parser.add_argument("--model_name_or_path", type=str, default="1bitLLM/bitnet_b1_58-3B") parser.add_argument("--saved_model_path", type=str, default=None) args = parser.parse_args() diff --git a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh index aea62db9..3ace5803 100755 --- a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh +++ b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh @@ -24,4 +24,10 @@ fi # get the realpath of the saved model directory SAVED_MODEL_DIR=$(realpath $SAVED_MODEL_DIR) +# cp files +cp $MODEL_DIR/quantize_config.json $SAVED_MODEL_DIR/ +cp $MODEL_DIR/tokenizer.json 
$SAVED_MODEL_DIR/ +cp $MODEL_DIR/tokenizer.model $SAVED_MODEL_DIR/ +cp $MODEL_DIR/tokenizer_config.json $SAVED_MODEL_DIR/ + echo "Model has been converted and save to $SAVED_MODEL_DIR" diff --git a/integration/BitNet/maint/generate_bitnet_model_native_format.sh b/integration/BitNet/maint/generate_bitnet_model_native_format.sh index 75bac8a7..e066033b 100755 --- a/integration/BitNet/maint/generate_bitnet_model_native_format.sh +++ b/integration/BitNet/maint/generate_bitnet_model_native_format.sh @@ -14,13 +14,13 @@ mkdir -p models cd models # download the model -git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B bitnet_3B_1.58bits --depth 1 +git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B bitnet_b1_58-3B --depth 1 # copy quantized config into the model directory -cp ../maint/quant_config.json bitnet_3B_1.58bits +cp ../maint/quantize_config.json bitnet_b1_58-3B # get the realpath of the model directory -MODEL_DIR=$(realpath bitnet_3B_1.58bits) +MODEL_DIR=$(realpath bitnet_b1_58-3B) cd .. 
diff --git a/integration/BitNet/maint/quant_config.json b/integration/BitNet/maint/quantize_config.json similarity index 100% rename from integration/BitNet/maint/quant_config.json rename to integration/BitNet/maint/quantize_config.json diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py index 45426d65..fcd728fc 100644 --- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py @@ -19,7 +19,7 @@ current_file_path = os.path.realpath(__file__) current_dir = os.path.dirname(current_file_path) -ckpt_path = os.path.join(current_dir, "../models/bitnet_3b_1.58bits_bitblas") +ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas") parser = argparse.ArgumentParser(description="Inference with BitNet") parser.add_argument( "--ckpt_path", @@ -35,7 +35,7 @@ ckpt_path, dtype="half", quantization="bitblas", - enforce_eager=True, + enforce_eager=True, # set False to enable cuda graph ) as bitnet_model: bitbnet_outputs = bitnet_model.generate_greedy( ["Hi, tell me about microsoft?"], max_tokens=1024 diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py index 07aefeec..85f409fb 100644 --- a/integration/BitNet/vllm_workspace/inference_with_native_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py @@ -19,7 +19,7 @@ # get the path of the current file current_file_path = os.path.realpath(__file__) current_dir = os.path.dirname(current_file_path) -ckpt_path = os.path.join(current_dir, "../models/bitnet_3b_1.58bits") +ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas") parser = argparse.ArgumentParser(description="Inference with BitNet") parser.add_argument( @@ -46,17 +46,4 @@ print(bitbnet_outputs[0][0]) print(bitbnet_outputs[0][1]) -# with 
VllmRunner( -# "BitBLASModel/open_llama_3b_1.58bits_bitblas", -# dtype="half", -# quantization="bitblas", -# enforce_eager=True, -# ) as bitnet_model: -# torch.cuda.profiler.start() -# bitbnet_outputs = bitnet_model.generate_greedy( -# ["Hi, tell me about microsoft?"], max_tokens=1024 -# ) -# torch.cuda.profiler.stop() -# print("bitnet:") -# print(bitbnet_outputs[0][0]) -# print(bitbnet_outputs[0][1]) + From 7164521a81da37425e35a7e5f0ffd58d6ff6a42f Mon Sep 17 00:00:00 2001 From: Lingxiao Ma Date: Fri, 9 Aug 2024 12:23:46 +0000 Subject: [PATCH 2/3] update ckpt name of BitNet integration for vLLM --- integration/BitNet/README.md | 6 +++--- .../BitNet/maint/generate_bitnet_model_native_format.sh | 6 +++--- .../BitNet/vllm_workspace/inference_with_compress_format.py | 2 +- .../BitNet/vllm_workspace/inference_with_native_format.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/integration/BitNet/README.md b/integration/BitNet/README.md index 4cac4984..63cc3e27 100644 --- a/integration/BitNet/README.md +++ b/integration/BitNet/README.md @@ -18,14 +18,14 @@ We provide two scripts to make the checkpoints for vLLM. The first script is `ge cd /root/to/BitBLAS/integration/BitNet # make the checkpoint ./maint/generate_bitnet_model_native_format.sh -# the output ckpy will be saved in the `./models/bitnet_b1_58-3B` directory +# the output ckpt will be saved in the `./models/ckpt_bitnet_b1_58-3B` directory ``` The second script is `generate_bitnet_model_bitblas_format.sh`, which is used to make a checkpoint with BitBLAS compressed metadata, which can avoid the online dequantize sage for the profiling of vLLM, which lead to more efficient memory utilization. 
```bash -./maint/generate_bitnet_model_bitblas_format.sh ./models/bitnet_b1_58-3B ./models/bitnet_b1_58-3B_bitblas -# the output ckpy will be saved in the `./models/bitnet_b1_58-3B_bitblas` directory +./maint/generate_bitnet_model_bitblas_format.sh ./models/ckpt_bitnet_b1_58-3B ./models/ckpt_bitnet_b1_58-3B_bitblas +# the output ckpt will be saved in the `./models/ckpt_bitnet_b1_58-3B_bitblas` directory ``` Finnaly, you can use the ckpt in vLLM with: diff --git a/integration/BitNet/maint/generate_bitnet_model_native_format.sh b/integration/BitNet/maint/generate_bitnet_model_native_format.sh index e066033b..c002f6e1 100755 --- a/integration/BitNet/maint/generate_bitnet_model_native_format.sh +++ b/integration/BitNet/maint/generate_bitnet_model_native_format.sh @@ -14,13 +14,13 @@ mkdir -p models cd models # download the model -git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B bitnet_b1_58-3B --depth 1 +git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B ckpt_bitnet_b1_58-3B --depth 1 # copy quantized config into the model directory -cp ../maint/quantize_config.json bitnet_b1_58-3B +cp ../maint/quantize_config.json ckpt_bitnet_b1_58-3B # get the realpath of the model directory -MODEL_DIR=$(realpath bitnet_b1_58-3B) +MODEL_DIR=$(realpath ckpt_bitnet_b1_58-3B) cd .. 
diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py index fcd728fc..d99eb493 100644 --- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py @@ -19,7 +19,7 @@ current_file_path = os.path.realpath(__file__) current_dir = os.path.dirname(current_file_path) -ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas") +ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B_bitblas") parser = argparse.ArgumentParser(description="Inference with BitNet") parser.add_argument( "--ckpt_path", diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py index 85f409fb..e6db3a6c 100644 --- a/integration/BitNet/vllm_workspace/inference_with_native_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py @@ -19,7 +19,7 @@ # get the path of the current file current_file_path = os.path.realpath(__file__) current_dir = os.path.dirname(current_file_path) -ckpt_path = os.path.join(current_dir, "../models/bitnet_b1_58-3B_bitblas") +ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B") parser = argparse.ArgumentParser(description="Inference with BitNet") parser.add_argument( From 5eae019ef33863075ee78311667a04d23d344d73 Mon Sep 17 00:00:00 2001 From: Lingxiao Ma Date: Fri, 9 Aug 2024 12:34:27 +0000 Subject: [PATCH 3/3] format code --- integration/BitNet/maint/create_bitblas_ckpt.py | 7 +++++-- .../inference_with_compress_format.py | 13 ++++++------- .../inference_with_native_format.py | 15 +++++---------- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/integration/BitNet/maint/create_bitblas_ckpt.py b/integration/BitNet/maint/create_bitblas_ckpt.py index 6d5b3d59..0bf603e0 100644 ---
a/integration/BitNet/maint/create_bitblas_ckpt.py +++ b/integration/BitNet/maint/create_bitblas_ckpt.py @@ -11,7 +11,8 @@ import json import sys -sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))+"/../") + +sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + "/../") from modeling_bitnet import BitnetForCausalLM from tokenization_bitnet import BitnetTokenizer @@ -27,7 +28,9 @@ args = parser.parse_args() model_name_or_path = args.model_name_or_path -saved_model_path = os.path.join(dirpath, "models", f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path +saved_model_path = os.path.join( + dirpath, "models", + f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path def generate_text(model, tokenizer, prompt, max_length=100): diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py index d99eb493..9e60fa97 100644 --- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py @@ -32,14 +32,13 @@ ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitblas", - enforce_eager=True, # set False to enable cuda graph + ckpt_path, + dtype="half", + quantization="bitblas", + enforce_eager=True, # set False to enable cuda graph ) as bitnet_model: - bitbnet_outputs = bitnet_model.generate_greedy( - ["Hi, tell me about microsoft?"], max_tokens=1024 - ) + bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], + max_tokens=1024) print("bitnet inference:") print(bitbnet_outputs[0][0]) print(bitbnet_outputs[0][1]) diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py index e6db3a6c..579c5e17 100644 --- 
a/integration/BitNet/vllm_workspace/inference_with_native_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py @@ -15,7 +15,6 @@ import os import argparse - # get the path of the current file current_file_path = os.path.realpath(__file__) current_dir = os.path.dirname(current_file_path) @@ -34,16 +33,12 @@ ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitnet", - gpu_memory_utilization=0.5, + ckpt_path, + dtype="half", + quantization="bitnet", + gpu_memory_utilization=0.5, ) as bitnet_model: - bitbnet_outputs = bitnet_model.generate_greedy( - ["Hi, tell me about microsoft?"], max_tokens=128 - ) + bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128) print("bitnet inference output:") print(bitbnet_outputs[0][0]) print(bitbnet_outputs[0][1]) - -