# smooth_convert: export a quantized checkpoint (SmoothQuant / weight-only) from a HuggingFace model via vLLM.
import argparse
|
||
import os
|
||
|
||
import sys
|
||
import time
|
||
import safetensors
|
||
import logging
|
||
import json
|
||
from huggingface_hub import split_torch_state_dict_into_shards, constants
|
||
|
||
from vllm import LLM
|
||
from vllm.transformers_utils.config import get_config, get_hf_text_config
|
||
from vllm.config import _get_and_verify_max_len
|
||
import transformers
|
||
from transformers.modeling_utils import SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
|
||
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||
|
||
from smooth_quant import generate_weights_of_smoothquant
|
||
from weight_only import generate_weights_of_weight_only
|
||
from utils_internal import (read_model_name, load_tokenizer, torch_dtype_to_str, str_dtype_to_torch,
|
||
copy_files_except_extensions, generate_datetime, get_hf_config_sliding_window)
|
||
from utils_internal import get_skip_patterns, should_skip
|
||
from model_special import smooth_model_config
|
||
from vllm.engine.arg_utils import EngineArgs
|
||
|
||
# Make the current working directory importable so sibling helper modules
# (smooth_quant, weight_only, utils_internal, model_special) resolve when
# the script is launched from another directory.
sys.path.append(os.getcwd())

# Module-level logger; configured via logging.basicConfig() in main().
logger = logging.getLogger("smooth_convert")
|
||
|
||
def load_skip_params_from_hf(args):
    '''
    Load parameters from transformers that do not need to be quantized.

    Resolves the model class named ``args.model_name`` from ``transformers``
    (falling back to ``AutoModelForCausalLM``), loads the checkpoint on CPU,
    and collects every parameter whose name matches the skip patterns
    registered for ``args.model_type``.

    Returns:
        dict: parameter name -> parameter tensor for every weight that must
        bypass quantization; empty dict when the model type registers no
        skip patterns (the expensive HF load is skipped entirely).
    '''
    model_type = args.model_type
    if not get_skip_patterns(model_type):
        # Nothing to skip for this model type: avoid loading the HF model.
        return {}
    try:
        model_cls = getattr(transformers, args.model_name, None)
        if model_cls is None:
            model_cls = AutoModelForCausalLM
        model = model_cls.from_pretrained(
            args.hf_model_dir,
            trust_remote_code=True,
            torch_dtype=args.torch_dtype,
            device_map="cpu")
    except Exception as e:
        # logger.fatal is a deprecated alias; logger.critical is the
        # documented spelling of the same severity.
        logger.critical(f"Unsupported model {args.model_name}, error message: {e}")
        sys.exit(1)

    params_map = {}
    # Iterate the generator directly; no need to materialize a dict first.
    for name, param in model.named_parameters():
        if should_skip(model_type, name):
            # Lazy %-style args avoid building the string when INFO is off.
            logger.info("load parameters from transformers, name: %s", name)
            params_map[name] = param
    return params_map
|
||
|
||
def save_quantized_weights_to_safetensors(quantized_weights, args):
    '''
    save quantized_weights to safetensors format
    '''
    # A purely numeric --max_shard_size is interpreted as a byte count;
    # otherwise the string (e.g. "10GB") is handed through unchanged.
    shard_limit = args.max_shard_size
    if shard_limit.isdigit():
        shard_limit = int(shard_limit)

    split = split_torch_state_dict_into_shards(
        quantized_weights,
        filename_pattern=constants.SAFETENSORS_WEIGHTS_FILE_PATTERN,
        max_shard_size=shard_limit)

    # Write each shard file under the output directory.
    for fname, tensor_names in split.filename_to_tensors.items():
        payload = {key: quantized_weights[key] for key in tensor_names}
        safetensors.torch.save_file(payload, os.path.join(args.output_dir, fname), metadata={"format": "pt"})

    if not split.is_sharded:
        # Single-file checkpoint: no index required.
        logger.info(f"Model weights saved in {os.path.join(args.output_dir, SAFE_WEIGHTS_NAME)}")
        return

    # Sharded checkpoint: emit the weight-name -> shard-file index.
    index = {
        "metadata": split.metadata,
        "weight_map": split.tensor_to_filename,
    }
    save_index_file = os.path.join(args.output_dir, SAFE_WEIGHTS_INDEX_NAME)
    with open(save_index_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
    logger.info(
        f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be "
        f"split in {len(split.filename_to_tensors)} checkpoint shards. You can find where "
        f"each parameters has been saved in the index located at {save_index_file}."
    )
|
||
|
||
|
||
def main(args):
    '''
    Main quantization logic.

    Loads the model with vLLM, generates quantized weights (weight-only
    and/or SmoothQuant), restores the unquantized "skip" parameters, and
    writes the safetensors checkpoint plus quantize_config.json and an
    updated config.json into args.output_dir.
    '''
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=args.log_level,
        force=True,
    )

    # Fail fast: without a quantization mode, quantized_weights would be
    # undefined below and the run would only crash with a NameError after
    # the expensive vLLM model load.
    if not (args.use_weight_only or args.use_smoothquant):
        logger.error("No quantization mode selected: pass --use_weight_only and/or --use_smoothquant")
        sys.exit(1)

    tik = time.time()

    skip_params = load_skip_params_from_hf(args)

    # Create an LLM, clamping lengths to what the HF config allows.
    max_model_len = max(args.max_input_length + args.output_len, 2048)
    args.max_model_len = min(max_model_len, args.hf_max_model_len)

    max_num_batched_tokens = max(max(args.max_input_length * args.batch_size, max_model_len), 2048)
    args.max_num_batched_tokens = min(max_num_batched_tokens, args.hf_max_model_len)
    llm = LLM(model=args.hf_model_dir,
              tokenizer=args.tokenizer_dir,
              tensor_parallel_size=args.tp_size,
              distributed_executor_backend='ray',
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              trust_remote_code=True,
              block_size=args.block_size,
              max_model_len=args.max_model_len,
              max_num_batched_tokens=args.max_num_batched_tokens,
              max_num_seqs=args.max_num_seqs,
              cpu_offload_gb=args.cpu_offload_gb)
    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))

    logger.info(f'Load vLLM model takes: {t}')

    quantize_config = {}
    if args.use_weight_only:
        quantized_weights = generate_weights_of_weight_only(llm, args)
        quantize_config['bits'] = 8 if args.weight_only_precision == "int8" else 4
        quantize_config['quant_method'] = "weightonly"
        quantize_config['quant_mode'] = "WeightOnly"

    # NOTE: when both flags are set, SmoothQuant results overwrite the
    # weight-only ones (original behavior preserved).
    if args.use_smoothquant:
        quantized_weights, smooth_info = generate_weights_of_smoothquant(llm, args)
        quantize_config['bits'] = 8
        quantize_config['quant_method'] = "smoothquant"
        quantize_config['quant_mode'] = "SmoothQuant"
        quantize_config['input_quant_method'] = "per_token" if args.per_token else "per_tensor"
        quantize_config['smooth_value'] = args.smooth_value
        with open(os.path.join(args.output_dir, 'smooth_info.json'), 'w') as f:
            json.dump(smooth_info, f, indent=4)

    # Should first copy other files from hf_model_dir, and then save weight,
    # tokenizer, config, quant_config and so on — so the freshly written
    # files below are not clobbered by stale originals.
    extensions = ['.bin', '.safetensors', ".pt", ".index.json"]
    copy_files_except_extensions(args.hf_model_dir, args.output_dir, extensions)
    logger.info('copy files except extensions success')

    # Restore the parameters that were exempted from quantization.
    for name, param in skip_params.items():
        assert name in quantized_weights
        quantized_weights[name] = param
    save_quantized_weights_to_safetensors(quantized_weights, args)
    logger.info('save quantized_weights to safetensors success')

    with open(os.path.join(args.output_dir, 'quantize_config.json'), 'w') as f:
        json.dump(quantize_config, f, indent=4)

    # Rewrite config.json with the quantization metadata embedded.
    from transformers.utils import CONFIG_NAME
    with open(os.path.join(args.hf_model_dir, CONFIG_NAME), 'r') as f:
        config = json.load(f)
    config['quantization_config'] = quantize_config
    config['generate_datetime'] = generate_datetime()
    config['torch_dtype'] = args.dtype
    with open(os.path.join(args.output_dir, CONFIG_NAME), 'w') as f:
        json.dump(config, f, indent=4)

    logger.info(f'quantized {args.hf_model_dir} finished')
|
||
|
||
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # --- model / tokenizer locations --------------------------------------
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--tokenizer_dir',
                        default=None,
                        help='tokenizer path; defaults to hf_model_dir if left unspecified')
    # NOTE(review): action="store_true" with default=True means this flag can
    # never be turned off from the command line — confirm whether eager mode
    # is intentionally forced on.
    parser.add_argument(
        '--enforce_eager',
        action="store_true",
        default=True,
        help='Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model '
        'in eager mode. If False, we will use CUDA graph and eager execution in hybrid.')
    parser.add_argument('--dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
    parser.add_argument('--scales_smooth_dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, scales and smooth weights use args.dtype, else use the setted dtype")

    # --- calibration dataset selection ------------------------------------
    parser.add_argument(
        '--eval_task',
        type=str,
        default='summarize',
        choices=['summarize', 'summarize_long', 'code_completion', 'summarize_hg', 'text_generation', 'custom'],
        help='''eval task to decide which dataset is selected. When set to custom, you must set these options
        dataset_name, dataset_revision, dataset_input_key, dataset_split to specify which dataset to use''')
    parser.add_argument("--dataset_cache_dir",
                        type=str,
                        default=None,
                        help="cache dir to load the hugging face dataset")
    parser.add_argument("--dataset_name", type=str, default=None, help="custom dataset name")
    parser.add_argument("--dataset_revision", type=str, default=None, help="custom dataset version")
    parser.add_argument("--dataset_input_key", type=str, default=None, help="custom dataset field")
    parser.add_argument("--dataset_split", type=str, default=None, help="custom dataset split")
    parser.add_argument('--log_level', type=int, default=logging.INFO)

    # --- calibration sampling / engine sizing knobs -----------------------
    parser.add_argument('--num_samples', type=int, default=512, help='num prompt sample')
    parser.add_argument('--output_len',
                        type=int,
                        default=100,
                        help="Number of output sequences to return for the given prompt")
    parser.add_argument('--max_input_length',
                        type=int,
                        default=512,
                        help='max input length of the prompt')
    # block_size -1 defers to the engine's default block size.
    parser.add_argument('--block_size', type=int, default=-1, help='Token block size for contiguous chunks of tokens.')
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_p', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=-1)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--max_num_seqs',
                        type=int,
                        default=EngineArgs.max_num_seqs,
                        help='Maximum number of sequences per iteration.')

    # --- output checkpoint options ----------------------------------------
    parser.add_argument('--output_dir',
                        type=str,
                        default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="10GB",
        help=("The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
              "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`)"),
    )

    # --- parallelism -------------------------------------------------------
    parser.add_argument('--tp_size', type=int, default=1, help='N-way tensor parallelism size')
    parser.add_argument('--pp_size', type=int, default=1, help='N-way pipeline parallelism size, now supported num')

    # --- quantization mode selection ---------------------------------------
    parser.add_argument('--use_smoothquant',
                        default=False,
                        action="store_true",
                        help='Apply smoothquant to generate weight')
    parser.add_argument("--smooth_value",
                        type=float,
                        default=0.5,
                        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
                        " to Smoothquant the model, and output int8 weights."
                        " A good first try is 0.5. Must be in [0, 1]")
    parser.add_argument('--per_channel',
                        action="store_true",
                        default=False,
                        help='By default, we use a single static scaling factor for the GEMM\'s result. '
                        'per_channel instead uses a different static scaling factor for each channel. '
                        'The latter is usually more accurate, but a little slower.')
    parser.add_argument(
        '--per_token',
        action="store_true",
        default=False,
        help='By default, we use a single static scaling factor to scale activations in the int8 range. '
        'per_token chooses at run time, and for each token, a custom scaling factor. '
        'The latter is usually more accurate, but a little slower.')
    parser.add_argument('--use_weight_only',
                        default=False,
                        action="store_true",
                        help='Quantize weights for the various GEMMs to INT4/INT8.'
                        'See --weight_only_precision to set the precision')
    parser.add_argument('--weight_only_precision',
                        const='int8',
                        type=str,
                        nargs='?',
                        default='int8',
                        choices=['int8', 'int4'],
                        help='Define the precision for the weights when using weight-only quantization.'
                        'You must also use --use_weight_only for that argument to have an impact.')
    parser.add_argument(
        '--has_qzeros',
        action="store_true",
        default=False,
        help='whether to add qzeros weight to vllm_mlu weight',
    )

    # --- model identity overrides ------------------------------------------
    parser.add_argument('--model_version',
                        type=str,
                        default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type',
                        type=str,
                        default=None,
                        help="Set model type to replace parsing from model_type in hf config."
                        "if set is None and parsed also None, then set as model_version")
    parser.add_argument('--no_add_special_tokens',
                        dest='add_special_tokens',
                        default=True,
                        action='store_false',
                        help="Whether or not to add special tokens")
    parser.add_argument(
        '--has_prompt_token_id',
        action="store_true",
        default=False,
        help='whether to give llm.generate prompt_token_id',
    )
    parser.add_argument(
        '--disable_fused_quantize_expert',
        action="store_true",
        default=False,
        help='''disable fused activation to quantize for unfused moe usage.
        Because to fused_moe smoothquant, input_smooth has shape (hidden_size), act_smooth has shape (inner_size),
        and not every expert can be routed, so we assume that all expert should use the same act_smooth by default.
        You can use this option to close the assumption.'''
    )
    parser.add_argument('--prompt_file',
                        type=str,
                        default=None,
                        help="custom prompt file, should has format that each line is one string prompt,"
                        "you can refer the format of summarize_1024_prompts.csv")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=-1,
        help="batch size, used to limit max_num_batched_tokens, -1 means batch_size equals to num_samples"
    )
    parser.add_argument(
        '--cpu_offload_gb',
        type=float,
        default=0.0,
        help='''The size (GiB) of CPU memory to use for offloading the model weights.
        This virtually increases the GPU memory space you can use to hold the model weights,
        at the cost of CPU-GPU data transfer for every forward pass.'''
    )

    # --- debug dump switches -----------------------------------------------
    parser.add_argument(
        '--dump_prompt_token_ids',
        action="store_true",
        default=False,
        help='dump prompt_token_ids used by llm.generate ',
    )
    parser.add_argument(
        '--dump_input_ids',
        action="store_true",
        default=False,
        help='dump vllm qkv used token ids at llm running',
    )
    parser.add_argument(
        '--dump_act_range',
        action="store_true",
        default=False,
        help='dump act range which is the max hidden dim value of input, output, weigth',
    )
    parser.add_argument(
        '--dump_weights',
        action="store_true",
        default=False,
        help='dump weights of the converted model',
    )
    parser.add_argument(
        '--dump_generate_weights',
        action="store_true",
        default=False,
        help='dump generate weights of the converted model',
    )
|
||
|
||
args = parser.parse_args()
|
||
|
||
assert args.hf_model_dir, "Please set model_dir by --model_dir or --hf_model_dir"
|
||
assert args.pp_size == 1, "Pipeline parallelism is not supported."
|
||
|
||
if args.tokenizer_dir is None:
|
||
args.tokenizer_dir = args.hf_model_dir
|
||
|
||
if args.has_prompt_token_id is False:
|
||
args.dump_prompt_token_ids = False
|
||
|
||
if not os.path.exists(args.output_dir):
|
||
os.makedirs(args.output_dir)
|
||
|
||
args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
|
||
args.hf_model_dir, args.model_version, args.model_type)
|
||
assert args.model_type in smooth_model_config, f'''{args.model_type} hasn't supported,
|
||
please add it's infomation in model_special.py by your self'''
|
||
|
||
args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
|
||
hf_text_config = get_hf_text_config(args.hf_config)
|
||
args.tie_word_embeddings = getattr(hf_text_config, "tie_word_embeddings", False)
|
||
sliding_window_len = get_hf_config_sliding_window(hf_text_config)
|
||
disable_sliding_window = sliding_window_len is None
|
||
if args.model_type == 'qwen2_vl':
|
||
# workround for qwen2_vl since _get_and_verify_max_len not supported for MRoPE
|
||
# remove this when it is supported.
|
||
args.hf_max_model_len = 32768
|
||
else:
|
||
if args.model_type == 'hunyuan' or args.model_type == 'deepseek_v2':
|
||
disable_sliding_window=False
|
||
args.hf_max_model_len = _get_and_verify_max_len(hf_text_config, None, disable_sliding_window, sliding_window_len)
|
||
|
||
if args.batch_size < 1:
|
||
args.batch_size = args.num_samples
|
||
|
||
args.batch_size = min(args.batch_size, args.num_samples)
|
||
if args.dtype == "auto":
|
||
args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
|
||
|
||
if args.scales_smooth_dtype == "auto":
|
||
args.scales_smooth_dtype = args.dtype
|
||
|
||
args.torch_dtype = str_dtype_to_torch(args.dtype)
|
||
args.torch_scales_smooth_dtype = str_dtype_to_torch(args.scales_smooth_dtype)
|
||
args.hf_config.torch_dtype = args.torch_dtype
|
||
|
||
args.tokenizer, args.pad_id, args.end_id = load_tokenizer(
|
||
tokenizer_dir=args.tokenizer_dir,
|
||
model_name=args.model_name,
|
||
model_version=args.model_version,
|
||
)
|
||
|
||
tik = time.time()
|
||
main(args)
|
||
|
||
tok = time.time()
|
||
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
|
||
logger.info(f'Total time of converting checkpoints: {t}')
|