"""Quantize a HuggingFace model with vLLM (smoothquant / weight-only) and save
the result as sharded safetensors plus the accompanying config files."""

import argparse
import json
import logging
import os
import sys
import time

# Import the torch submodule explicitly: `import safetensors` alone does not
# guarantee `safetensors.torch` is an attribute (it previously worked only
# because transformers imports the submodule as a side effect).
import safetensors.torch
import transformers
from huggingface_hub import split_torch_state_dict_into_shards, constants
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from transformers.modeling_utils import SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from vllm import LLM
from vllm.config import _get_and_verify_max_len
from vllm.engine.arg_utils import EngineArgs
from vllm.transformers_utils.config import get_config, get_hf_text_config

from model_special import smooth_model_config
from smooth_quant import generate_weights_of_smoothquant
from utils_internal import (read_model_name, load_tokenizer, torch_dtype_to_str,
                            str_dtype_to_torch, copy_files_except_extensions,
                            generate_datetime, get_hf_config_sliding_window)
from utils_internal import get_skip_patterns, should_skip
from weight_only import generate_weights_of_weight_only

sys.path.append(os.getcwd())

logger = logging.getLogger("smooth_convert")


def load_skip_params_from_hf(args):
    """Load parameters from transformers that do not need to be quantized.

    Returns a dict mapping parameter name -> tensor for every parameter whose
    name matches the model type's skip patterns; an empty dict when the model
    type declares no skip patterns.  Exits the process (exit code 1) when the
    HF model cannot be loaded.
    """
    model_type = args.model_type
    if not get_skip_patterns(model_type):
        return {}
    try:
        # Prefer the concrete model class when transformers exposes one under
        # args.model_name, otherwise fall back to the auto class.
        model_cls = getattr(transformers, args.model_name, None)
        if model_cls is None:
            model_cls = AutoModelForCausalLM
        model = model_cls.from_pretrained(
            args.hf_model_dir,
            trust_remote_code=True,
            torch_dtype=args.torch_dtype,
            device_map="cpu")
    except Exception as e:
        # logger.critical is the non-deprecated spelling of logger.fatal.
        logger.critical(f"Unsupported model {args.model_name}, error message: {e}")
        sys.exit(1)

    params_map = {}
    for name, param in model.named_parameters():
        if should_skip(model_type, name):
            logger.info(f"load parameters from transformers, name: {name}")
            params_map[name] = param
    return params_map


def save_quantized_weights_to_safetensors(quantized_weights, args):
    """Save quantized_weights (a name -> tensor dict) in safetensors format.

    The state dict is split into shards no larger than args.max_shard_size;
    when more than one shard is produced, an index file mapping each tensor to
    its shard is written alongside them.
    """
    # max_shard_size may be a plain byte count ("1000000") or a size string
    # with a unit ("10GB"); split_torch_state_dict_into_shards accepts both.
    max_shard_size = (int(args.max_shard_size)
                      if args.max_shard_size.isdigit() else args.max_shard_size)
    state_dict_split = split_torch_state_dict_into_shards(
        quantized_weights,
        filename_pattern=constants.SAFETENSORS_WEIGHTS_FILE_PATTERN,
        max_shard_size=max_shard_size)

    # Write each shard.
    for shard_name, tensors in state_dict_split.filename_to_tensors.items():
        shard = {tensor: quantized_weights[tensor] for tensor in tensors}
        safetensors.torch.save_file(shard,
                                    os.path.join(args.output_dir, shard_name),
                                    metadata={"format": "pt"})

    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = os.path.join(args.output_dir, SAFE_WEIGHTS_INDEX_NAME)
        with open(save_index_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
        logger.info(
            f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be "
            f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where "
            f"each parameters has been saved in the index located at {save_index_file}."
        )
    else:
        logger.info(f"Model weights saved in {os.path.join(args.output_dir, SAFE_WEIGHTS_NAME)}")


def main(args):
    """Main quantization logic: load the model with vLLM, generate quantized
    weights (weight-only and/or smoothquant), then write weights, tokenizer,
    config and quantize_config into args.output_dir."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=args.log_level,
        force=True,
    )
    # Fail fast: without one of these flags, quantized_weights would never be
    # assigned and the script would crash later with a NameError.
    if not (args.use_weight_only or args.use_smoothquant):
        logger.critical("one of --use_weight_only / --use_smoothquant must be set")
        sys.exit(1)

    tik = time.time()
    skip_params = load_skip_params_from_hf(args)

    # Create an LLM.  Context sizes are floored at 2048 and clamped to the
    # model's own maximum length.
    max_model_len = max(args.max_input_length + args.output_len, 2048)
    args.max_model_len = min(max_model_len, args.hf_max_model_len)
    max_num_batched_tokens = max(
        max(args.max_input_length * args.batch_size, max_model_len), 2048)
    args.max_num_batched_tokens = min(max_num_batched_tokens, args.hf_max_model_len)
    llm = LLM(model=args.hf_model_dir,
              tokenizer=args.tokenizer_dir,
              tensor_parallel_size=args.tp_size,
              distributed_executor_backend='ray',
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              trust_remote_code=True,
              block_size=args.block_size,
              max_model_len=args.max_model_len,
              max_num_batched_tokens=args.max_num_batched_tokens,
              max_num_seqs=args.max_num_seqs,
              cpu_offload_gb=args.cpu_offload_gb)
    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    logger.info(f'Load vLLM model takes: {t}')

    quantize_config = {}
    if args.use_weight_only:
        quantized_weights = generate_weights_of_weight_only(llm, args)
        quantize_config['bits'] = 8 if args.weight_only_precision == "int8" else 4
        quantize_config['quant_method'] = "weightonly"
        quantize_config['quant_mode'] = "WeightOnly"
    if args.use_smoothquant:
        # NOTE(review): when both flags are set, the smoothquant result
        # overwrites the weight-only result generated above (original behavior,
        # kept as-is).
        quantized_weights, smooth_info = generate_weights_of_smoothquant(llm, args)
        quantize_config['bits'] = 8
        quantize_config['quant_method'] = "smoothquant"
        quantize_config['quant_mode'] = "SmoothQuant"
        quantize_config['input_quant_method'] = ("per_token" if args.per_token
                                                 else "per_tensor")
        quantize_config['smooth_value'] = args.smooth_value
        with open(os.path.join(args.output_dir, 'smooth_info.json'), 'w') as f:
            json.dump(smooth_info, f, indent=4)

    # First copy auxiliary files from hf_model_dir, then save weights,
    # tokenizer, config, quant_config and so on, so generated files win.
    extensions = ['.bin', '.safetensors', ".pt", ".index.json"]
    copy_files_except_extensions(args.hf_model_dir, args.output_dir, extensions)
    logger.info('copy files except extensions success')

    # Restore the parameters that must stay unquantized.
    for name, param in skip_params.items():
        assert name in quantized_weights, name
        quantized_weights[name] = param
    save_quantized_weights_to_safetensors(quantized_weights, args)
    logger.info('save quantized_weights to safetensors success')

    with open(os.path.join(args.output_dir, 'quantize_config.json'), 'w') as f:
        json.dump(quantize_config, f, indent=4)

    # Rewrite config.json with the quantization metadata embedded.
    from transformers.utils import CONFIG_NAME
    with open(os.path.join(args.hf_model_dir, CONFIG_NAME), 'r', encoding='utf-8') as f:
        config = json.load(f)
    config['quantization_config'] = quantize_config
    config['generate_datetime'] = generate_datetime()
    config['torch_dtype'] = args.dtype
    with open(os.path.join(args.output_dir, CONFIG_NAME), 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=4)
    logger.info(f'quantized {args.hf_model_dir} finished')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--tokenizer_dir', default=None,
                        help='tokenizer path; defaults to hf_model_dir if left unspecified')
    # NOTE(review): store_true with default=True makes this flag a no-op —
    # eager mode can never be disabled from the CLI. Kept for compatibility.
    parser.add_argument(
        '--enforce_eager', action="store_true", default=True,
        help='Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model '
        'in eager mode. If False, we will use CUDA graph and eager execution in hybrid.')
    parser.add_argument('--dtype', type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'], default='auto',
                        help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
    parser.add_argument('--scales_smooth_dtype', type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'], default='auto',
                        help="if auto, scales and smooth weights use args.dtype, else use the setted dtype")
    parser.add_argument(
        '--eval_task', type=str, default='summarize',
        choices=['summarize', 'summarize_long', 'code_completion', 'summarize_hg',
                 'text_generation', 'custom'],
        help='''eval task to decide which dataset is selected.
        When set to custom, you must set these options dataset_name, dataset_revision,
        dataset_input_key, dataset_split to specify which dataset to use''')
    parser.add_argument("--dataset_cache_dir", type=str, default=None,
                        help="cache dir to load the hugging face dataset")
    parser.add_argument("--dataset_name", type=str, default=None, help="custom dataset name")
    parser.add_argument("--dataset_revision", type=str, default=None, help="custom dataset version")
    parser.add_argument("--dataset_input_key", type=str, default=None, help="custom dataset field")
    parser.add_argument("--dataset_split", type=str, default=None, help="custom dataset split")
    parser.add_argument('--log_level', type=int, default=logging.INFO)
    parser.add_argument('--num_samples', type=int, default=512, help='num prompt sample')
    parser.add_argument('--output_len', type=int, default=100,
                        help="Number of output sequences to return for the given prompt")
    parser.add_argument('--max_input_length', type=int, default=512,
                        help='max input length of the prompt')
    parser.add_argument('--block_size', type=int, default=-1,
                        help='Token block size for contiguous chunks of tokens.')
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_p', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=-1)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--max_num_seqs', type=int, default=EngineArgs.max_num_seqs,
                        help='Maximum number of sequences per iteration.')
    parser.add_argument('--output_dir', type=str, default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument(
        "--max_shard_size", type=str, default="10GB",
        help=("The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
              "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`)"),
    )
    parser.add_argument('--tp_size', type=int, default=1, help='N-way tensor parallelism size')
    parser.add_argument('--pp_size', type=int, default=1,
                        help='N-way pipeline parallelism size, now supported num')
    parser.add_argument('--use_smoothquant', default=False, action="store_true",
                        help='Apply smoothquant to generate weight')
    parser.add_argument("--smooth_value", type=float, default=0.5,
                        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
                        " to Smoothquant the model, and output int8 weights."
                        " A good first try is 0.5. Must be in [0, 1]")
    parser.add_argument('--per_channel', action="store_true", default=False,
                        help='By default, we use a single static scaling factor for the GEMM\'s result. '
                        'per_channel instead uses a different static scaling factor for each channel. '
                        'The latter is usually more accurate, but a little slower.')
    parser.add_argument(
        '--per_token', action="store_true", default=False,
        help='By default, we use a single static scaling factor to scale activations in the int8 range. '
        'per_token chooses at run time, and for each token, a custom scaling factor. '
        'The latter is usually more accurate, but a little slower.')
    parser.add_argument('--use_weight_only', default=False, action="store_true",
                        help='Quantize weights for the various GEMMs to INT4/INT8.'
                        'See --weight_only_precision to set the precision')
    parser.add_argument('--weight_only_precision', const='int8', type=str, nargs='?',
                        default='int8', choices=['int8', 'int4'],
                        help='Define the precision for the weights when using weight-only quantization.'
                        'You must also use --use_weight_only for that argument to have an impact.')
    parser.add_argument(
        '--has_qzeros', action="store_true", default=False,
        help='whether to add qzeros weight to vllm_mlu weight',
    )
    parser.add_argument('--model_version', type=str, default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type', type=str, default=None,
                        help="Set model type to replace parsing from model_type in hf config."
                        "if set is None and parsed also None, then set as model_version")
    parser.add_argument('--no_add_special_tokens', dest='add_special_tokens',
                        default=True, action='store_false',
                        help="Whether or not to add special tokens")
    parser.add_argument(
        '--has_prompt_token_id', action="store_true", default=False,
        help='whether to give llm.generate prompt_token_id',
    )
    parser.add_argument(
        '--disable_fused_quantize_expert', action="store_true", default=False,
        help='''disable fused activation to quantize for unfused moe usage.
        Because to fused_moe smoothquant, input_smooth has shape (hidden_size),
        act_smooth has shape (inner_size), and not every expert can be routed,
        so we assume that all expert should use the same act_smooth by default.
        You can use this option to close the assumption.'''
    )
    parser.add_argument('--prompt_file', type=str, default=None,
                        help="custom prompt file, should has format that each line is one string prompt,"
                        "you can refer the format of summarize_1024_prompts.csv")
    parser.add_argument(
        '--batch_size', type=int, default=-1,
        help="batch size, used to limit max_num_batched_tokens, -1 means batch_size equals to num_samples"
    )
    parser.add_argument(
        '--cpu_offload_gb', type=float, default=0.0,
        help='''The size (GiB) of CPU memory to use for offloading the model weights.
        This virtually increases the GPU memory space you can use to hold the model weights,
        at the cost of CPU-GPU data transfer for every forward pass.'''
    )
    parser.add_argument(
        '--dump_prompt_token_ids', action="store_true", default=False,
        help='dump prompt_token_ids used by llm.generate ',
    )
    parser.add_argument(
        '--dump_input_ids', action="store_true", default=False,
        help='dump vllm qkv used token ids at llm running',
    )
    parser.add_argument(
        '--dump_act_range', action="store_true", default=False,
        help='dump act range which is the max hidden dim value of input, output, weigth',
    )
    parser.add_argument(
        '--dump_weights', action="store_true", default=False,
        help='dump weights of the converted model',
    )
    parser.add_argument(
        '--dump_generate_weights', action="store_true", default=False,
        help='dump generate weights of the converted model',
    )
    args = parser.parse_args()

    # Use parser.error instead of assert for CLI validation: asserts are
    # stripped under `python -O`, silently disabling these checks.
    if not args.hf_model_dir:
        parser.error("Please set model_dir by --model_dir or --hf_model_dir")
    if args.pp_size != 1:
        parser.error("Pipeline parallelism is not supported.")
    if args.tokenizer_dir is None:
        args.tokenizer_dir = args.hf_model_dir
    if not args.has_prompt_token_id:
        # dump_prompt_token_ids only makes sense when prompt token ids are used.
        args.dump_prompt_token_ids = False
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_dir, exist_ok=True)

    args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
        args.hf_model_dir, args.model_version, args.model_type)
    if args.model_type not in smooth_model_config:
        parser.error(f'''{args.model_type} hasn't supported,
    please add it's infomation in model_special.py by your self''')

    args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
    hf_text_config = get_hf_text_config(args.hf_config)
    args.tie_word_embeddings = getattr(hf_text_config, "tie_word_embeddings", False)
    sliding_window_len = get_hf_config_sliding_window(hf_text_config)
    disable_sliding_window = sliding_window_len is None
    if args.model_type == 'qwen2_vl':
        # workround for qwen2_vl since _get_and_verify_max_len not supported for MRoPE
        # remove this when it is supported.
        args.hf_max_model_len = 32768
    else:
        if args.model_type in ('hunyuan', 'deepseek_v2'):
            disable_sliding_window = False
        args.hf_max_model_len = _get_and_verify_max_len(hf_text_config, None,
                                                        disable_sliding_window,
                                                        sliding_window_len)

    # -1 (or any value < 1) means "use all samples as one batch".
    if args.batch_size < 1:
        args.batch_size = args.num_samples
    args.batch_size = min(args.batch_size, args.num_samples)

    if args.dtype == "auto":
        args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
    if args.scales_smooth_dtype == "auto":
        args.scales_smooth_dtype = args.dtype
    args.torch_dtype = str_dtype_to_torch(args.dtype)
    args.torch_scales_smooth_dtype = str_dtype_to_torch(args.scales_smooth_dtype)
    args.hf_config.torch_dtype = args.torch_dtype

    args.tokenizer, args.pad_id, args.end_id = load_tokenizer(
        tokenizer_dir=args.tokenizer_dir,
        model_name=args.model_name,
        model_version=args.model_version,
    )

    tik = time.time()
    main(args)
    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    logger.info(f'Total time of converting checkpoints: {t}')