Enable more LLM examples #28

Merged
merged 20 commits on Jul 26, 2024
75 changes: 59 additions & 16 deletions examples/.config/model_params_onnxrt.json
@@ -1,60 +1,103 @@
{
"onnxrt": {
"llama-2-7b-rtn": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "RTN"
},
"llama-2-7b-rtn-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "RTN"
},
"llama-2-7b-awq": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "AWQ"
},
"llama-2-7b-awq-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "AWQ"
},
"llama-2-7b-gptq": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "GPTQ"
},
"llama-2-7b-gptq-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "GPTQ"
},
"llama-2-7b-woq_tune": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "WOQ_TUNE"
},
"llama-2-7b-woq_tune-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "WOQ_TUNE"
},
"llama-3-8b-gptq-with-past": {
"model_name": "meta-llama/Meta-Llama-3-8B",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Meta-Llama-3-8B-with-past",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "GPTQ"
},
"phi-3-mini-128k-instruct-rtn-with-past": {
"model_name": "microsoft/Phi-3-mini-128k-instruct",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Phi-3-mini-128k-instruct-with-past",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "RTN"
},
"qwen2-7b-instruct-rtn-with-past": {
"model_name": "Qwen/Qwen2-7B-Instruct",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Qwen2-7B-Instruct-with-past",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "RTN"
},
"bert_base_MRPC": {
"model_src_dir": "nlp/bert/quantization/ptq_static",
@@ -14,7 +14,7 @@ pip install -r requirements.txt

## 2. Prepare Model

Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are other models available that can be used for weight-only quantization. The following table shows a few models' configurations:
Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. We have also verified weight-only quantization on the following models.

| Model | Num Hidden Layers| Num Attention Heads | Hidden Size |
| --- | --- | --- | --- |
@@ -24,14 +24,17 @@
| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |
| [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 32 | 32 | 4096 |
| [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | 32 | 32 | 3072 |
| [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 28 | 28 | 3584 |

Export to ONNX model:
```bash
python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \
                        --output_model="./llama-2-7b-hf" \
                        --task=text-generation-with-past  # or text-generation
```
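
The same export step applies to the newly added models. A minimal sketch (the model ID comes from the table above; per the `prepare_model.py` change later in this PR, `--output_model` may be omitted and defaults to the model's base name plus an `-onnx` suffix):

```bash
# Export Qwen2-7B-Instruct together with its KV cache (past key/values).
# With the new default, the output lands in ./Qwen2-7B-Instruct-onnx.
python prepare_model.py --input_model="Qwen/Qwen2-7B-Instruct" \
                        --task=text-generation-with-past
```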


# Run

## 1. Quantization
@@ -53,7 +56,7 @@ Accuracy:

```bash
bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
--batch_size=batch_size \ # optional
--batch_size=batch_size \ # optional
--mode=accuracy \
--tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
--tasks=lambada_openai
@@ -44,7 +44,7 @@
parser.add_argument("--model_path", type=str, help="Folder path of pre-trained onnx model")
parser.add_argument("--benchmark", action="store_true", default=False)
parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model")
parser.add_argument("--output_model", type=str, default=None, help="output model path")
parser.add_argument("--output_model", type=str, default=None, help="path of output dircectory")
parser.add_argument(
"--batch_size",
default=1,
@@ -92,11 +92,27 @@
parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy")
parser.add_argument("--intra_op_num_threads", type=int, default=24)
parser.add_argument("--trust_remote_code", type=bool, default=False)
parser.add_argument("--layer_wise", action="store_true", default=False)
parser.add_argument(
"--quantize_lm_head",
action="store_true",
default=False,
help="language modeling head will not be quantized by default. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
default=[],
help="nodes that will not be quantized. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
)
args = parser.parse_args()

if args.tune and not os.path.exists(args.output_model):
os.makedirs(args.output_model)

# load model
tokenizer = transformers.LlamaTokenizer.from_pretrained(args.tokenizer)
model_config = transformers.LlamaConfig.from_pretrained(args.model_path)
tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer)
model_config = transformers.AutoConfig.from_pretrained(args.model_path, trust_remote_code=args.trust_remote_code)


def tokenize_function(examples):
@@ -110,7 +126,8 @@ def replace_architectures(json_path):
# refer to https://github.com/huggingface/transformers/issues/22222#issuecomment-1477171703
with open(json_path, "r") as file:
data = json.load(file)
data["architectures"] = ["LlamaForCausalLM"]
if data["architectures"] == ["LLaMATokenizer"]:
data["architectures"] = ["LlamaForCausalLM"]

with open(json_path, "w") as file:
json.dump(data, file, indent=4)
@@ -327,14 +344,18 @@ def rewind(self):
model_name = "model.onnx" # require optimum >= 1.14.0
model_path = os.path.join(args.model_path, model_name)
best_model = None

nodes_to_exclude = ["/lm_head/MatMul"] if not args.quantize_lm_head else []
nodes_to_exclude = list(set(args.nodes_to_exclude + nodes_to_exclude))
if args.algorithm.upper() == "RTN":
algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True)
algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=args.layer_wise)
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
model_path,
n_bits=4,
block_size=32,
is_symmetric=True,
algo_config=algo_config,
nodes_to_exclude=nodes_to_exclude,
)
quant.process()
best_model = quant.model
@@ -350,21 +371,23 @@ def rewind(self):
block_size=32,
is_symmetric=True,
algo_config=algo_config,
nodes_to_exclude=nodes_to_exclude,
)
quant.process()
best_model = quant.model

elif args.algorithm.upper() == "GPTQ":
calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1)
algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig(
calibration_data_reader=calibration_data_reader, layer_wise_quant=True
calibration_data_reader=calibration_data_reader, layer_wise_quant=args.layer_wise
)
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
model_path,
n_bits=4,
block_size=32,
is_symmetric=False,
algo_config=algo_config,
nodes_to_exclude=nodes_to_exclude,
)
quant.process()
best_model = quant.model
@@ -10,16 +10,19 @@

def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--input_model", type=str, required=False, default="")
parser.add_argument("--output_model", type=str, required=True)
parser.add_argument("--input_model", type=str, required=True, default="")
parser.add_argument("--output_model", type=str, required=False, default=None)
parser.add_argument(
"--task",
type=str,
required=False,
default="text-generation-with-past",
choices=["text-generation-with-past", "text-generation"],
)
return parser.parse_args()
args = parser.parse_args()
if args.output_model is None:
args.output_model = os.path.basename(args.input_model) + "-onnx"
return args


def prepare_model(input_model, output_model, task):
@@ -37,6 +40,7 @@ def prepare_model(input_model, output_model, task):
"--task",
task,
f"{output_model}",
"--trust-remote-code",
],
stdout=subprocess.PIPE,
text=True,
@@ -35,23 +35,27 @@ function init_params {

# run_benchmark
function run_benchmark {

# Check if the input_model ends with the filename extension ".onnx"
if [[ $input_model =~ \.onnx$ ]]; then
# If the string ends with the filename extension, get the path of the file
input_model=$(dirname "$input_model")
fi

python main.py \
if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
extra_cmd="--trust_remote_code True"
fi

eval "python main.py \
--model_path ${input_model} \
--batch_size=${batch_size-1} \
--tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \
--tasks=${tasks-lambada_openai} \
--mode=${mode} \
--intra_op_num_threads=${intra_op_num_threads-24} \
--benchmark

--benchmark \
${extra_cmd}"

}

main "$@"

@@ -56,16 +56,30 @@ function run_tuning {
echo "Created directory $output_model"
fi

python main.py \
if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul"
extra_cmd="--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True"
fi
if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then
nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul"
extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
fi
if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then
nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul"
extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
fi

eval "python main.py \
--model_path ${input_model} \
--tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
--output_model ${output_model} \
--batch_size ${batch_size-1} \
--dataset ${dataset-NeelNanda/pile-10k} \
--algorithm ${algorithm-WOQ_TUNE} \
--tasks ${tasks-lambada_openai} \
--tune
--layer_wise \
--tune \
${extra_cmd}"
}

main "$@"

11 changes: 1 addition & 10 deletions onnx_neural_compressor/algorithms/layer_wise/core.py
@@ -66,16 +66,7 @@ def layer_wise_quant(
# get and check split nodes
split_nodes = origin_model.find_split_nodes()
if len(split_nodes) == 0:
logger.error(
"Can't find split nodes for layer-wise quantization. "
"We recommend applying graph optimization for your model like follows: \n"
"import onnxruntime as ort \n"
"sess_options = ort.SessionOptions() \n"
"sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED "
"# or ORT_ENABLE_BASIC \n"
"sess_options.optimized_model_filepath = 'optimized_model_path' \n"
"ort.InferenceSession(infer_shape_model_path, sess_options)"
)
logger.error("Can't find split nodes for layer-wise quantization.")
raise ValueError("Fail to run layer-wise quantization.")
logger.info(
"Will split model into {} parts to do layer-wise quantization".format(
4 changes: 2 additions & 2 deletions onnx_neural_compressor/onnx_model.py
@@ -241,7 +241,7 @@ def save(self, root):
root,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=root.split("/")[-1] + "_data",
location=os.path.basename(root) + "_data",
size_threshold=1024,
convert_attribute=False,
)
@@ -1001,7 +1001,7 @@ def _save_split_model(self, save_path):
save_path,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=save_path.split("/")[-1] + "_data",
location=os.path.basename(save_path) + "_data",
size_threshold=1024,
convert_attribute=False,
)
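
Both hunks make the same fix: `split("/")` misses Windows path separators, while `os.path.basename` handles them. A small standalone illustration (using `ntpath` to force Windows semantics on any host):

```python
import ntpath  # Windows path rules regardless of the host OS

root = r"C:\models\llama-2-7b\model.onnx"
print(root.split("/")[-1] + "_data")    # C:\models\llama-2-7b\model.onnx_data  (whole path, wrong)
print(ntpath.basename(root) + "_data")  # model.onnx_data  (expected external-data file name)
```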