forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tools/quant_tools/__init__.py
Normal file
0
vllm-v0.6.2/tools/quant_tools/__init__.py
Normal file
419
vllm-v0.6.2/tools/quant_tools/convert_checkpoint.py
Normal file
419
vllm-v0.6.2/tools/quant_tools/convert_checkpoint.py
Normal file
@@ -0,0 +1,419 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import sys
|
||||
import time
|
||||
import safetensors
|
||||
import logging
|
||||
import json
|
||||
from huggingface_hub import split_torch_state_dict_into_shards, constants
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.transformers_utils.config import get_config, get_hf_text_config
|
||||
from vllm.config import _get_and_verify_max_len
|
||||
import transformers
|
||||
from transformers.modeling_utils import SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||||
|
||||
from smooth_quant import generate_weights_of_smoothquant
|
||||
from weight_only import generate_weights_of_weight_only
|
||||
from utils_internal import (read_model_name, load_tokenizer, torch_dtype_to_str, str_dtype_to_torch,
|
||||
copy_files_except_extensions, generate_datetime, get_hf_config_sliding_window)
|
||||
from utils_internal import get_skip_patterns, should_skip
|
||||
from model_special import smooth_model_config
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
|
||||
sys.path.append(os.getcwd())
|
||||
|
||||
logger = logging.getLogger("smooth_convert")
|
||||
|
||||
def load_skip_params_from_hf(args):
    '''
    Load, from the Hugging Face checkpoint, the parameters that must NOT be
    quantized (the ones matching the model type's skip patterns).

    Args:
        args: parsed CLI arguments; uses ``model_type``, ``model_name``,
            ``hf_model_dir`` and ``torch_dtype``.

    Returns:
        dict mapping parameter name -> tensor for every skipped parameter;
        empty dict when the model type declares no skip patterns.

    Exits the process (status 1) when the checkpoint cannot be loaded.
    '''
    model_type = args.model_type
    if not get_skip_patterns(model_type):
        return {}
    try:
        # Prefer the exact transformers class named by the config; fall back
        # to AutoModelForCausalLM when transformers does not expose it.
        model_cls = getattr(transformers, args.model_name, None) or AutoModelForCausalLM
        model = model_cls.from_pretrained(
            args.hf_model_dir,
            trust_remote_code=True,
            torch_dtype=args.torch_dtype,
            device_map="cpu")
    except Exception as e:
        # Fix: logger.fatal is a deprecated alias; logger.critical is the
        # documented API for this severity.
        logger.critical(f"Unsupported model {args.model_name}, error message: {e}")
        sys.exit(1)

    params_map = {}
    for name, param in model.named_parameters():
        if should_skip(model_type, name):
            logger.info(f"load parameters from transformers, name: {name}")
            params_map[name] = param
    return params_map
|
||||
|
||||
def save_quantized_weights_to_safetensors(quantized_weights, args):
    '''
    Save ``quantized_weights`` (a flat name -> tensor state dict) as
    safetensors shards under ``args.output_dir``, writing an index file
    when the state dict is split across several shards.

    Args:
        quantized_weights: mapping of parameter name to tensor.
        args: parsed CLI arguments; uses ``max_shard_size`` and ``output_dir``.
    '''
    # Fix: `import safetensors` at module level does not import the
    # `safetensors.torch` submodule, so `safetensors.torch.save_file` below
    # could fail with AttributeError. Import the submodule explicitly.
    import safetensors.torch

    # A bare digit string becomes an int (bytes); strings like "10GB" are
    # passed through for huggingface_hub to parse.
    max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
    state_dict_split = split_torch_state_dict_into_shards(
        quantized_weights,
        filename_pattern=constants.SAFETENSORS_WEIGHTS_FILE_PATTERN,
        max_shard_size=max_shard_size)

    # Write each shard with only the tensors assigned to it.
    for shard_name, tensors in state_dict_split.filename_to_tensors.items():
        shard = {tensor: quantized_weights[tensor] for tensor in tensors}
        safetensors.torch.save_file(shard, os.path.join(args.output_dir, shard_name), metadata={"format": "pt"})

    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = os.path.join(args.output_dir, SAFE_WEIGHTS_INDEX_NAME)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
        logger.info(
            f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be "
            f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where "
            f"each parameters has been saved in the index located at {save_index_file}."
        )
    else:
        logger.info(f"Model weights saved in {os.path.join(args.output_dir, SAFE_WEIGHTS_NAME)}")
|
||||
|
||||
|
||||
def main(args):
    '''
    Quantize the checkpoint referenced by ``args`` and write the result
    (weights, tokenizer-adjacent files, config, quantize_config) into
    ``args.output_dir``.

    Exactly one of ``args.use_weight_only`` / ``args.use_smoothquant`` is
    expected to be set; otherwise ``quantized_weights`` is never assigned.
    '''
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=args.log_level,
        force=True,
    )

    load_start = time.time()

    skip_params = load_skip_params_from_hf(args)

    # Derive engine length limits (clamped by the HF config max) before
    # constructing the vLLM engine.
    max_model_len = max(args.max_input_length + args.output_len, 2048)
    args.max_model_len = min(max_model_len, args.hf_max_model_len)

    max_num_batched_tokens = max(max(args.max_input_length * args.batch_size, max_model_len), 2048)
    args.max_num_batched_tokens = min(max_num_batched_tokens, args.hf_max_model_len)

    llm = LLM(model=args.hf_model_dir,
              tokenizer=args.tokenizer_dir,
              tensor_parallel_size=args.tp_size,
              distributed_executor_backend='ray',
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              trust_remote_code=True,
              block_size=args.block_size,
              max_model_len=args.max_model_len,
              max_num_batched_tokens=args.max_num_batched_tokens,
              max_num_seqs=args.max_num_seqs,
              cpu_offload_gb=args.cpu_offload_gb)

    t = time.strftime('%H:%M:%S', time.gmtime(time.time() - load_start))
    logger.info(f'Load vLLM model takes: {t}')

    quantize_config = {}
    if args.use_weight_only:
        st_prefix = f"weight_{args.weight_only_precision}"  # NOTE(review): unused downstream
        quantized_weights = generate_weights_of_weight_only(llm, args)
        quantize_config['bits'] = 8 if args.weight_only_precision == "int8" else 4
        quantize_config['quant_method'] = "weightonly"
        quantize_config['quant_mode'] = "WeightOnly"

    if args.use_smoothquant:
        st_prefix = f"smoothquant_{args.smooth_value}"  # NOTE(review): unused downstream
        quantized_weights, smooth_info = generate_weights_of_smoothquant(llm, args)
        quantize_config['bits'] = 8
        quantize_config['quant_method'] = "smoothquant"
        quantize_config['quant_mode'] = "SmoothQuant"
        quantize_config['input_quant_method'] = "per_token" if args.per_token else "per_tensor"
        quantize_config['smooth_value'] = args.smooth_value
        with open(os.path.join(args.output_dir, 'smooth_info.json'), 'w') as f:
            json.dump(smooth_info, f, indent=4)

    # Copy auxiliary files first (tokenizer, configs, ...), then overwrite
    # with the quantized weights / adjusted configs below.
    extensions = ['.bin', '.safetensors', ".pt", ".index.json"]
    copy_files_except_extensions(args.hf_model_dir, args.output_dir, extensions)
    logger.info(f'copy files except extensions success')

    # Restore the parameters that were deliberately skipped from quantization.
    for name, param in skip_params.items():
        assert name in quantized_weights
        quantized_weights[name] = param
    save_quantized_weights_to_safetensors(quantized_weights, args)
    logger.info(f'save quantized_weights to safetensors success')

    with open(os.path.join(args.output_dir, 'quantize_config.json'), 'w') as f:
        json.dump(quantize_config, f, indent=4)

    # Embed the quantization metadata into the exported config.json.
    from transformers.utils import CONFIG_NAME
    with open(os.path.join(args.hf_model_dir, CONFIG_NAME), 'r') as f:
        config = json.load(f)
    config['quantization_config'] = quantize_config
    config['generate_datetime'] = generate_datetime()
    config['torch_dtype'] = args.dtype
    with open(os.path.join(args.output_dir, CONFIG_NAME), 'w') as f:
        json.dump(config, f, indent=4)

    logger.info(f'quantized {args.hf_model_dir} finished')
|
||||
|
||||
if __name__ == '__main__':
    # ---- CLI ---------------------------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--tokenizer_dir',
                        default=None,
                        help='tokenizer path; defaults to hf_model_dir if left unspecified')
    parser.add_argument(
        '--enforce_eager',
        action="store_true",
        default=True,
        help='Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model '
        'in eager mode. If False, we will use CUDA graph and eager execution in hybrid.')
    parser.add_argument('--dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
    parser.add_argument('--scales_smooth_dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, scales and smooth weights use args.dtype, else use the setted dtype")
    parser.add_argument(
        '--eval_task',
        type=str,
        default='summarize',
        choices=['summarize', 'summarize_long', 'code_completion', 'summarize_hg', 'text_generation', 'custom'],
        help='''eval task to decide which dataset is selected. When set to custom, you must set these options
        dataset_name, dataset_revision, dataset_input_key, dataset_split to specify which dataset to use''')
    parser.add_argument("--dataset_cache_dir",
                        type=str,
                        default=None,
                        help="cache dir to load the hugging face dataset")
    parser.add_argument("--dataset_name", type=str, default=None, help="custom dataset name")
    parser.add_argument("--dataset_revision", type=str, default=None, help="custom dataset version")
    parser.add_argument("--dataset_input_key", type=str, default=None, help="custom dataset field")
    parser.add_argument("--dataset_split", type=str, default=None, help="custom dataset split")
    parser.add_argument('--log_level', type=int, default=logging.INFO)
    parser.add_argument('--num_samples', type=int, default=512, help='num prompt sample')
    parser.add_argument('--output_len',
                        type=int,
                        default=100,
                        help="Number of output sequences to return for the given prompt")
    parser.add_argument('--max_input_length',
                        type=int,
                        default=512,
                        help='max input length of the prompt')
    parser.add_argument('--block_size', type=int, default=-1, help='Token block size for contiguous chunks of tokens.')
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_p', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=-1)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--max_num_seqs',
                        type=int,
                        default=EngineArgs.max_num_seqs,
                        help='Maximum number of sequences per iteration.')
    parser.add_argument('--output_dir',
                        type=str,
                        default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="10GB",
        help=("The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
              "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`)"),
    )
    parser.add_argument('--tp_size', type=int, default=1, help='N-way tensor parallelism size')
    parser.add_argument('--pp_size', type=int, default=1, help='N-way pipeline parallelism size, now supported num')
    parser.add_argument('--use_smoothquant',
                        default=False,
                        action="store_true",
                        help='Apply smoothquant to generate weight')
    parser.add_argument("--smooth_value",
                        type=float,
                        default=0.5,
                        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
                        " to Smoothquant the model, and output int8 weights."
                        " A good first try is 0.5. Must be in [0, 1]")
    parser.add_argument('--per_channel',
                        action="store_true",
                        default=False,
                        help='By default, we use a single static scaling factor for the GEMM\'s result. '
                        'per_channel instead uses a different static scaling factor for each channel. '
                        'The latter is usually more accurate, but a little slower.')
    parser.add_argument(
        '--per_token',
        action="store_true",
        default=False,
        help='By default, we use a single static scaling factor to scale activations in the int8 range. '
        'per_token chooses at run time, and for each token, a custom scaling factor. '
        'The latter is usually more accurate, but a little slower.')
    parser.add_argument('--use_weight_only',
                        default=False,
                        action="store_true",
                        help='Quantize weights for the various GEMMs to INT4/INT8.'
                        'See --weight_only_precision to set the precision')
    parser.add_argument('--weight_only_precision',
                        const='int8',
                        type=str,
                        nargs='?',
                        default='int8',
                        choices=['int8', 'int4'],
                        help='Define the precision for the weights when using weight-only quantization.'
                        'You must also use --use_weight_only for that argument to have an impact.')
    parser.add_argument(
        '--has_qzeros',
        action="store_true",
        default=False,
        help='whether to add qzeros weight to vllm_mlu weight',
    )
    parser.add_argument('--model_version',
                        type=str,
                        default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type',
                        type=str,
                        default=None,
                        help="Set model type to replace parsing from model_type in hf config."
                        "if set is None and parsed also None, then set as model_version")
    parser.add_argument('--no_add_special_tokens',
                        dest='add_special_tokens',
                        default=True,
                        action='store_false',
                        help="Whether or not to add special tokens")
    parser.add_argument(
        '--has_prompt_token_id',
        action="store_true",
        default=False,
        help='whether to give llm.generate prompt_token_id',
    )
    parser.add_argument(
        '--disable_fused_quantize_expert',
        action="store_true",
        default=False,
        help='''disable fused activation to quantize for unfused moe usage.
        Because to fused_moe smoothquant, input_smooth has shape (hidden_size), act_smooth has shape (inner_size),
        and not every expert can be routed, so we assume that all expert should use the same act_smooth by default.
        You can use this option to close the assumption.'''
    )
    parser.add_argument('--prompt_file',
                        type=str,
                        default=None,
                        help="custom prompt file, should has format that each line is one string prompt,"
                        "you can refer the format of summarize_1024_prompts.csv")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=-1,
        help="batch size, used to limit max_num_batched_tokens, -1 means batch_size equals to num_samples"
    )
    parser.add_argument(
        '--cpu_offload_gb',
        type=float,
        default=0.0,
        help='''The size (GiB) of CPU memory to use for offloading the model weights.
        This virtually increases the GPU memory space you can use to hold the model weights,
        at the cost of CPU-GPU data transfer for every forward pass.'''
    )
    parser.add_argument(
        '--dump_prompt_token_ids',
        action="store_true",
        default=False,
        help='dump prompt_token_ids used by llm.generate ',
    )
    parser.add_argument(
        '--dump_input_ids',
        action="store_true",
        default=False,
        help='dump vllm qkv used token ids at llm running',
    )
    parser.add_argument(
        '--dump_act_range',
        action="store_true",
        default=False,
        help='dump act range which is the max hidden dim value of input, output, weigth',
    )
    parser.add_argument(
        '--dump_weights',
        action="store_true",
        default=False,
        help='dump weights of the converted model',
    )
    parser.add_argument(
        '--dump_generate_weights',
        action="store_true",
        default=False,
        help='dump generate weights of the converted model',
    )

    args = parser.parse_args()

    # ---- Validation and argument post-processing ---------------------------
    assert args.hf_model_dir, "Please set model_dir by --model_dir or --hf_model_dir"
    assert args.pp_size == 1, "Pipeline parallelism is not supported."

    if args.tokenizer_dir is None:
        args.tokenizer_dir = args.hf_model_dir

    # Dumping prompt token ids only makes sense when they are supplied.
    if args.has_prompt_token_id is False:
        args.dump_prompt_token_ids = False

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
        args.hf_model_dir, args.model_version, args.model_type)
    assert args.model_type in smooth_model_config, f'''{args.model_type} hasn't supported,
    please add it's infomation in model_special.py by your self'''

    # ---- Resolve model length limits from the HF config --------------------
    args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
    hf_text_config = get_hf_text_config(args.hf_config)
    args.tie_word_embeddings = getattr(hf_text_config, "tie_word_embeddings", False)
    sliding_window_len = get_hf_config_sliding_window(hf_text_config)
    disable_sliding_window = sliding_window_len is None
    if args.model_type == 'qwen2_vl':
        # workround for qwen2_vl since _get_and_verify_max_len not supported for MRoPE
        # remove this when it is supported.
        args.hf_max_model_len = 32768
    else:
        if args.model_type == 'hunyuan' or args.model_type == 'deepseek_v2':
            disable_sliding_window = False
        args.hf_max_model_len = _get_and_verify_max_len(hf_text_config, None, disable_sliding_window, sliding_window_len)

    # batch_size <= 0 means "one batch covering all samples".
    if args.batch_size < 1:
        args.batch_size = args.num_samples
    args.batch_size = min(args.batch_size, args.num_samples)

    # ---- Dtype resolution --------------------------------------------------
    if args.dtype == "auto":
        args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
    if args.scales_smooth_dtype == "auto":
        args.scales_smooth_dtype = args.dtype

    args.torch_dtype = str_dtype_to_torch(args.dtype)
    args.torch_scales_smooth_dtype = str_dtype_to_torch(args.scales_smooth_dtype)
    args.hf_config.torch_dtype = args.torch_dtype

    args.tokenizer, args.pad_id, args.end_id = load_tokenizer(
        tokenizer_dir=args.tokenizer_dir,
        model_name=args.model_name,
        model_version=args.model_version,
    )

    # ---- Run ---------------------------------------------------------------
    tik = time.time()
    main(args)

    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    logger.info(f'Total time of converting checkpoints: {t}')
|
||||
69
vllm-v0.6.2/tools/quant_tools/dump_hf_weight.py
Normal file
69
vllm-v0.6.2/tools/quant_tools/dump_hf_weight.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import argparse
|
||||
from transformers import (AutoModel, AutoModelForCausalLM,
|
||||
AutoModelForSeq2SeqLM, GenerationConfig)
|
||||
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from utils_internal import (read_model_name, torch_dtype_to_str, str_dtype_to_torch)
|
||||
from dump_smooth import save_weights
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # ---- CLI ---------------------------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--output_dir',
                        type=str,
                        default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument('--model_version',
                        type=str,
                        default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type',
                        type=str,
                        default=None,
                        help="Set model type to replace parsing from model_type in hf config."
                        "if set is None and parsed also None, then set as model_version")
    parser.add_argument('--dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
    parser.add_argument(
        '--dump_weights',
        action="store_true",
        default=True,
        help='dump weights of the converted model',
    )

    args = parser.parse_args()

    assert args.hf_model_dir, "Please set model_dir by --model_dir or --hf_model_dir"

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Resolve model identity and config from the checkpoint directory.
    args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
        args.hf_model_dir, args.model_version, args.model_type)

    args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)

    if args.dtype == "auto":
        args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)

    args.torch_dtype = str_dtype_to_torch(args.dtype)
    args.hf_config.torch_dtype = args.torch_dtype

    # Pick the transformers auto-class matching the model family.
    if args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'glm':
        auto_model_cls = AutoModelForSeq2SeqLM
    elif args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'chatglm':
        auto_model_cls = AutoModel
    else:
        auto_model_cls = AutoModelForCausalLM
    model = auto_model_cls.from_pretrained(
        args.hf_model_dir,
        trust_remote_code=True,
        torch_dtype=args.torch_dtype)

    # Dump every named parameter via the shared helper.
    named_parameters = dict(model.named_parameters())
    save_weights(named_parameters, args)
|
||||
145
vllm-v0.6.2/tools/quant_tools/dump_smooth.py
Normal file
145
vllm-v0.6.2/tools/quant_tools/dump_smooth.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import torch
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def tensor_shape_to_string(tensor):
    '''
    Render ``tensor``'s shape as a compact "AxBxC" string
    (e.g. a (2, 3) tensor -> "2x3"; a 0-d tensor -> "").
    '''
    return "x".join(str(dim) for dim in tensor.shape)
|
||||
|
||||
|
||||
def save_prompt_token_ids(prompt_input_ids, args):
    '''
    Dump each prompt token-id tensor to ``<output_dir>/prompt_input_ids``
    as an individual .pt file named with its index and shape.

    Args:
        prompt_input_ids: list of token-id tensors given to llm.generate
        args: parsed CLI arguments; reads ``dump_prompt_token_ids`` and
            ``output_dir``

    No-op unless ``args.dump_prompt_token_ids`` is True.
    '''
    if args.dump_prompt_token_ids is not True:
        return
    output_dir = os.path.join(args.output_dir, "prompt_input_ids")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for data_index, tensor in enumerate(prompt_input_ids):
        str_shape = tensor_shape_to_string(tensor)
        file_path = os.path.join(output_dir, f"prompt_input_ids_{data_index}_{str_shape}.pt")
        torch.save(tensor, file_path)
        # Fix: the original message said "input_ids", copy-pasted from
        # save_input_ids; label the dump correctly.
        logger.info(f"Saved prompt_input_ids[{data_index}] to {file_path}")
|
||||
|
||||
|
||||
def save_input_ids(input_ids, args):
    '''
    Persist the captured qkv input-id tensors (layer 0) under
    ``<output_dir>/input_ids``, one .pt file per tensor.

    Args:
        input_ids: list of captured token-id tensors
        args: parsed CLI arguments; reads ``dump_input_ids`` and ``output_dir``

    No-op unless ``args.dump_input_ids`` is True and ``input_ids`` is
    non-empty.
    '''
    if args.dump_input_ids is not True or len(input_ids) == 0:
        return
    output_dir = os.path.join(args.output_dir, "input_ids")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for data_index, tensor in enumerate(input_ids):
        str_shape = tensor_shape_to_string(tensor)
        file_path = os.path.join(output_dir, f"input_ids_{data_index}_{str_shape}.pt")
        torch.save(tensor, file_path)
        logger.info(f"Saved input_ids[{data_index}] to {file_path}")
|
||||
|
||||
|
||||
def save_act_range(act_range, args):
    '''
    Write every tensor captured in ``act_range`` to ``<output_dir>/act_range``.

    Args:
        act_range: mapping of layer name -> {key -> value}; only
            torch.Tensor values are saved, anything else is skipped
        args: parsed CLI arguments; reads ``dump_act_range`` and ``output_dir``

    No-op unless ``args.dump_act_range`` is True.
    '''
    if args.dump_act_range is not True:
        return
    output_dir = os.path.join(args.output_dir, "act_range")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for layer_name, layer_scale in act_range.items():
        for tensor_key, tensor_value in layer_scale.items():
            if not isinstance(tensor_value, torch.Tensor):
                continue
            str_shape = tensor_shape_to_string(tensor_value)
            file_path = os.path.join(output_dir, f'{layer_name}_{tensor_key}_{str_shape}.pt')
            torch.save(tensor_value, file_path)
            logger.info(f"Saved act_range[{layer_name}][{tensor_key}] to {file_path}")
|
||||
|
||||
|
||||
def save_weights(weights, args):
    '''
    Dump each named weight tensor to ``<output_dir>/weights`` as a .pt file
    named ``<param>_<shape>.pt``.

    Args:
        weights: mapping of parameter name -> tensor (e.g. from
            model.named_parameters())
        args: parsed CLI arguments; reads ``dump_weights`` and ``output_dir``

    No-op unless ``args.dump_weights`` is True.
    '''
    if args.dump_weights is not True:
        return
    output_dir = os.path.join(args.output_dir, "weights")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for tensor_key, tensor_value in weights.items():
        file_name = f'{tensor_key}_{tensor_shape_to_string(tensor_value)}.pt'
        file_path = os.path.join(output_dir, file_name)
        torch.save(tensor_value, file_path)
        logger.info(f"Saved weights[{tensor_key}] to {file_path}")
|
||||
|
||||
|
||||
def save_generate_weights(weights, args):
    '''
    Dump the generated (quantized) weights — smoothquant or weight-only —
    to ``<output_dir>/generate_weights``, one .pt file per tensor.

    Args:
        weights: mapping of parameter name -> quantized tensor
        args: parsed CLI arguments; reads ``dump_generate_weights`` and
            ``output_dir``

    No-op unless ``args.dump_generate_weights`` is True.
    '''
    if args.dump_generate_weights is not True:
        return
    output_dir = os.path.join(args.output_dir, "generate_weights")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for tensor_key, tensor_value in weights.items():
        file_name = f'{tensor_key}_{tensor_shape_to_string(tensor_value)}.pt'
        file_path = os.path.join(output_dir, file_name)
        torch.save(tensor_value, file_path)
        logger.info(f"Saved generate weights[{tensor_key}] to {file_path}")
|
||||
|
||||
|
||||
def dump_save_x_y(name, x, y, index):
    '''
    Debug helper: snapshot a layer's input ``x`` and output ``y`` during
    inference into output_dir/x_tensor and output_dir/y_tensor.

    NOTE(review): ``output_dir`` is hard-coded; edit it for your setup.
    Existing files are never overwritten, so only the first call per
    (name, index) pair is recorded. A tuple ``x`` is unwrapped to its
    first element.
    '''
    output_dir = "output_dir"
    x_output_dir = os.path.join(output_dir, "x_tensor")
    y_output_dir = os.path.join(output_dir, "y_tensor")
    for directory in (x_output_dir, y_output_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    x_file_name = os.path.join(x_output_dir, f"{name}_x_{index}.pt")
    y_file_name = os.path.join(y_output_dir, f"{name}_y_{index}.pt")
    if isinstance(x, tuple):
        x = x[0]
    if not os.path.exists(x_file_name):
        torch.save(x.cpu(), x_file_name)
    if not os.path.exists(y_file_name):
        torch.save(y.cpu(), y_file_name)
|
||||
140
vllm-v0.6.2/tools/quant_tools/input_context.py
Normal file
140
vllm-v0.6.2/tools/quant_tools/input_context.py
Normal file
@@ -0,0 +1,140 @@
|
||||
import torch
|
||||
|
||||
|
||||
def make_context(
    tokenizer,
    query,
    history,
    system,
    max_input_length,
    max_window_size: int = 6144,
    chat_format: str = "chatml",
):
    '''
    Build the prompt text and token ids for a single query.

    Args:
        tokenizer: model tokenizer
        query: current user text
        history: list of (query, response) pairs; None means no history
        system: system prompt (chatml format only)
        max_input_length: token ids are truncated to this length, keeping
            the most recent (trailing) tokens
        max_window_size: token budget for history turns (chatml only)
        chat_format: "chatml" or "raw"

    Returns:
        (raw_text, context_tokens) — the rendered prompt string and at most
        ``max_input_length`` token ids, truncated from the front.

    Raises:
        NotImplementedError: for an unknown ``chat_format``.
    '''
    if history is None:
        history = []

    if chat_format == "raw":
        raw_text = query
        context_tokens = tokenizer.encode(raw_text)
    elif chat_format == "chatml":
        im_start, im_end = "<|im_start|>", "<|im_end|>"
        im_start_tokens = [tokenizer.im_start_id]
        im_end_tokens = [tokenizer.im_end_id]
        nl_tokens = tokenizer.encode("\n")

        def _tokenize_str(role, content):
            '''Return ("role\\ncontent", its token ids).'''
            tokens = (tokenizer.encode(role, allowed_special=set()) +
                      nl_tokens +
                      tokenizer.encode(content, allowed_special=set()))
            return f"{role}\n{content}", tokens

        system_text, system_tokens_part = _tokenize_str("system", system)
        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens

        raw_text = ""
        context_tokens = []

        # Walk history newest-first, prepending turns while they still fit
        # within max_window_size (counting the system prompt).
        for turn_query, turn_response in reversed(history):
            query_text, query_tokens_part = _tokenize_str("user", turn_query)
            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
            response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
            prev_chat = f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"

            if len(system_tokens) + len(next_context_tokens) + len(context_tokens) >= max_window_size:
                break
            context_tokens = next_context_tokens + context_tokens
            raw_text = prev_chat + raw_text

        context_tokens = system_tokens + context_tokens
        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
        # Append the current user turn and the assistant header.
        context_tokens += (nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] +
                           im_end_tokens + nl_tokens + im_start_tokens +
                           tokenizer.encode("assistant") + nl_tokens)
        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")
    # truncate to max_input_length, truncate from the front
    return raw_text, context_tokens[-max_input_length:]
|
||||
|
||||
|
||||
def prepare_inputs(batch_input_texts,
                   tokenizer,
                   model_name,
                   model_version,
                   test_token_num,
                   eval_task='summarize',
                   add_special_tokens=True):
    '''
    Tokenize a batch of prompt strings into per-prompt token-id tensors,
    applying model-family-specific prompt formatting.

    Args:
        batch_input_texts: list of prompt strings
        tokenizer: model tokenizer
        model_name: model name (selects the tokenization branch)
        model_version: model version
        test_token_num: maximum number of input tokens kept per prompt
        eval_task: when 'summarize', ' TL;DR: ' is appended to each prompt
        add_special_tokens: forwarded to tokenizer.encode in the default path

    Returns:
        list of 1-D token-id tensors, one per input text.
    '''
    suffix = ' TL;DR: ' if eval_task == 'summarize' else ''
    batch_input_ids = []
    for text in batch_input_texts:
        curr_text = (text + suffix).strip().replace(" n't", "n't")

        # The branches below are kept compatible with the original
        # reference implementation for each model family.
        if 'GLM' in model_name and model_version in ['chatglm2', 'chatglm3']:
            input_ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0)
            input_ids = input_ids[:test_token_num]
        elif 'qwen' in model_name.lower() and model_version == 'qwen':
            # qwen v1: build a chatml prompt via make_context
            system_prompt = "You are a useful assistant, please directly output the corresponding " + \
                "summary according to the article entered by the user."
            _, input_id_list = make_context(
                tokenizer=tokenizer,
                query=curr_text,
                history=[],
                system=system_prompt,
                max_input_length=test_token_num,
            )
            input_ids = torch.tensor(input_id_list)
        else:
            if 'qwen' in model_name.lower() and 'qwen2' in model_version:
                messages = [{
                    "role": "system",
                    "content": "You are a helpful assistant."
                }, {
                    "role": "user",
                    "content": curr_text
                }]
                curr_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True)

            input_ids = tokenizer.encode(curr_text,
                                         return_tensors='pt',
                                         add_special_tokens=add_special_tokens,
                                         truncation=True,
                                         max_length=test_token_num).squeeze(0)

        batch_input_ids.append(input_ids)
    return batch_input_ids
|
||||
206
vllm-v0.6.2/tools/quant_tools/model_special.py
Executable file
206
vllm-v0.6.2/tools/quant_tools/model_special.py
Executable file
@@ -0,0 +1,206 @@
|
||||
import re
|
||||
|
||||
# Per-model-type layout table used by the smooth-quant conversion.
# Fields (per entry):
#   qkv_list:      attention projection layer name(s) holding q/k/v weights;
#                  a single entry (e.g. "c_attn", "W_pack") means q/k/v are
#                  packed into one layer in the HF checkpoint
#   gate_up_list:  MLP input-projection layer name(s); two entries mean
#                  separate gate/up projections, one entry a fused layer
#   is_gate_up:    NOTE(review): presumably marks a gated (SwiGLU-style) MLP
#                  — only bloom sets False here; confirm against consumers
#   moe_list:      expert-layer name mapping for MoE models (None for dense);
#                  "is_merged" flags experts whose gate/up are fused
#   skip_patterns: optional regexes for parameters excluded from quantization
#                  (consumed via get_skip_patterns/should_skip)
smooth_model_config = {
    "mllama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "llama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen2_vl": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None,
        # vision tower stays unquantized
        "skip_patterns": [r"^visual\.*"]
    },
    "qwen2": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen": {
        "qkv_list": ["c_attn"],
        "gate_up_list": ["w2", "w1"],
        "is_gate_up": True,
        "moe_list": None
    },
    "baichuan": {
        "qkv_list": ["W_pack"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "chatglm": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "gpt_neox": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": [],
        "is_gate_up": True,
        "moe_list": None
    },
    "mixtral": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["w1", "w3"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
            "down_list": ["block_sparse_moe.w2", "w2"],
            "is_merged": True
        }
    },
    "qwen2_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "deepseek_v2": {
        "qkv_list": ["q_proj", "q_b_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        },
        "skip_patterns": [r".*\.kv_b_proj\..*",]
    },
    "falcon": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "bloom": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": False,
        "moe_list": None
    },
    "internlm2": {
        "qkv_list": ["wqkv"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "hunyuan": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "phi3": {
        "qkv_list": ["qkv_proj"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
}
|
||||
|
||||
|
||||
def get_layer_weight_bias_name(model_type, layer_name):
    '''
    Resolve the weight/bias parameter names for a layer.

    Normally they are simply f"{layer_name}.weight" / f"{layer_name}.bias".
    Models whose layer name does not own its own parameters are special-cased
    here (e.g. hunyuan, whose lm_head reuses the embedding parameters).
    Since vllm 0.5.3, vllm follows the {layer_name}.weight/bias rule, so very
    few special cases remain.

    returns: (layer_name, weight_name, bias_name)
    '''
    # Special cases where layer_name must be redirected go here.
    if model_type == "hunyuan" and "lm_head" in layer_name:
        tied_name = "model.embed_tokens"
        return tied_name, "model.embed_tokens.weight", "model.embed_tokens.bias"

    return layer_name, f"{layer_name}.weight", f"{layer_name}.bias"
|
||||
|
||||
|
||||
def modify_layer_weight_bias_name(model_type, named_parameters):
    '''
    Rename parameters in-place where the vllm layer name differs from the
    hugging-face name. Mutates `named_parameters`; returns None.
    '''
    # Per-model-type old-name -> new-name renames.
    renames = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }.get(model_type, {})

    for old_key, new_key in renames.items():
        if old_key in named_parameters:
            named_parameters[new_key] = named_parameters.pop(old_key)
|
||||
|
||||
|
||||
def extract_numbers(string):
    '''
    Return the last run of digits in *string* as an int, or 0 when the
    string contains no digits (e.g. "chatglm2" -> 2, "abc" -> 0).
    '''
    # Find every digit run, keep only the final one.
    digit_runs = re.findall(r'\d+', string)
    return int(digit_runs[-1]) if digit_runs else 0
|
||||
|
||||
|
||||
def get_qkv_distribution(model_type, model_version, hf_config):
    '''
    Report which packed-qkv layout a model uses: n3sh or 3nsh.
        n3sh: [head_num, 3, head_size, hidden_size]
        3nsh: [3, head_num, head_size, hidden_size]
    vllm's default layout is 3nsh; models listed here store n3sh, so the tool
    converts 3nsh back to n3sh to match the hugging-face layout. Only
    meaningful for packed qkv layers.

    returns: (is_n3sh, head_num, kv_head_num) — (False, 0, 0) for all other
    models.
    '''
    if model_type == "falcon":
        heads = hf_config.num_attention_heads
        if hf_config.new_decoder_architecture:
            kv_heads = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_heads = 1
        else:
            kv_heads = heads
        return True, heads, kv_heads

    # chatglm v0 (no version number) and these models also use n3sh.
    # Keep the short-circuit so extract_numbers only runs for chatglm.
    if model_type in ("bloom", "gpt_neox") or (
            model_type == "chatglm" and extract_numbers(model_version) == 0):
        heads = hf_config.num_attention_heads
        return True, heads, heads

    return False, 0, 0
|
||||
418
vllm-v0.6.2/tools/quant_tools/smooth_quant.py
Normal file
418
vllm-v0.6.2/tools/quant_tools/smooth_quant.py
Normal file
@@ -0,0 +1,418 @@
|
||||
import argparse
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
import logging
|
||||
import csv
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
|
||||
|
||||
from input_context import prepare_inputs
|
||||
|
||||
from dump_smooth import save_prompt_token_ids, save_input_ids, save_act_range, save_weights, save_generate_weights
|
||||
|
||||
from model_special import smooth_model_config
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def load_prompts_from_csv(args):
    '''
    Load calibration prompts from a CSV file, keeping only the first column
    and at most args.num_samples rows. Falls back to the bundled
    summarize_1024_prompts.csv next to this script when args.prompt_file is
    not given.
    '''
    if args.prompt_file is not None:
        prompt_file = args.prompt_file
    else:
        prompt_file = os.path.join(os.path.dirname(__file__), 'summarize_1024_prompts.csv')

    # The CSV stores one prompt per row; transpose and keep column 0.
    with open(prompt_file, 'r', newline='') as fp:
        first_column = list(zip(*csv.reader(fp)))[0]

    sample_count = min(args.num_samples, len(first_column))
    return list(first_column)[:sample_count]
|
||||
|
||||
|
||||
def save_summarize_1024_prompts_as_csv(prompts):
    '''
    Write the prompt list to summarize_1024_prompts.csv in the current
    directory, one prompt per row (single column).
    '''
    with open('summarize_1024_prompts.csv', 'w', newline='') as fp:
        writer = csv.writer(fp)
        # One single-element row per prompt.
        writer.writerows((prompt,) for prompt in prompts)
|
||||
|
||||
|
||||
def generate_prompts(args: argparse.Namespace):
    '''
    Generate calibration prompts based on the evaluation task and arguments.

    Prompts come either from a CSV file (args.prompt_file, or the bundled
    summarize set for small sample counts) or from a hugging-face dataset.
    When args.has_prompt_token_id is set, the prompts are also pre-tokenized.

    returns: (prompts, prompt_token_ids) — either element is None when empty.
    '''

    # Built-in task presets; anything else is treated as a custom task whose
    # dataset info must be supplied on the command line.
    eval_task_config = {
        "code_completion": {
            "dataset_name": "openai_humaneval",
            "dataset_revision": None,
            "dataset_input_key": "prompt",
            "dataset_split": "test"
        },
        "summarize": {
            "dataset_name": "ccdv/cnn_dailymail",
            "dataset_revision": "3.0.0",
            "dataset_input_key": "article",
            "dataset_split": "train"
        },
        "summarize_long": {
            "dataset_name": "tau/zero_scrolls",
            "dataset_revision": "squality",
            "dataset_input_key": "input",
            "dataset_split": "validation"
        },
        "summarize_hg": {
            "dataset_name": "cnn_dailymail",
            "dataset_revision": "3.0.0",
            "dataset_input_key": "article",
            "dataset_split": "validation"
        },
        "text_generation": {
            "dataset_name": "lambada",
            "dataset_revision": None,
            "dataset_input_key": "text",
            "dataset_split": "validation"
        }
    }

    if args.eval_task in eval_task_config:
        config = eval_task_config[args.eval_task]
        dataset_name = config["dataset_name"]
        dataset_revision = config["dataset_revision"]
        dataset_input_key = config["dataset_input_key"]
        dataset_split = config["dataset_split"]
    else:
        # Custom task: all dataset fields are required.
        assert args.dataset_name is not None, f"dataset_name is None when eval_task == custom"
        assert args.dataset_input_key is not None, f"dataset_input_key is None when eval_task == custom"
        assert args.dataset_split is not None, f"dataset_split is None when eval_task == custom"

        dataset_name = args.dataset_name
        dataset_revision = args.dataset_revision
        dataset_input_key = args.dataset_input_key
        dataset_split = args.dataset_split

    # Prefer the local CSV when given, or the bundled 1024-prompt summarize
    # set when it covers the requested sample count.
    if args.prompt_file is not None or (args.eval_task == "summarize" and args.num_samples <= 1024):
        prompts = load_prompts_from_csv(args)
        num_samples = min(args.num_samples, len(prompts))
    else:
        dataset = load_dataset(dataset_name,
                               dataset_revision,
                               cache_dir=args.dataset_cache_dir,
                               split=dataset_split,
                               trust_remote_code=True)
        num_samples = min(args.num_samples, len(dataset))
        prompts = dataset[0:num_samples][dataset_input_key]
        # save_summarize_1024_prompts_as_csv(prompts)

    prompt_token_ids = []
    if args.has_prompt_token_id:
        # Pre-tokenize and dump the token ids alongside the run.
        batch_input_ids = prepare_inputs(prompts,
                                         args.tokenizer,
                                         args.model_name,
                                         args.model_version,
                                         args.max_input_length,
                                         eval_task=args.eval_task,
                                         add_special_tokens=args.add_special_tokens)
        save_prompt_token_ids(batch_input_ids, args)
        for i in range(num_samples):
            prompt_token_ids.append(batch_input_ids[i].tolist())

    if len(prompts) == 0:
        prompts = None
    else:
        # NOTE(review): truncates by characters, not tokens.
        prompts = [s[:args.max_input_length] for s in prompts]

    if len(prompt_token_ids) == 0:
        prompt_token_ids = None

    return prompts, prompt_token_ids
|
||||
|
||||
|
||||
@torch.no_grad()
def get_smooth_cal_weight(name, weight, name_parameters, act_range, model_type):
    '''
    Pick the weight tensor used to calibrate the smoother for this layer.

    vllm fuses q/k/v (and gate/up) projections into single layers; when the
    activation-range record marks the layer as fused, the corresponding
    separate hugging-face weights are concatenated along dim 0 to match.
    Otherwise the layer's own weight is returned unchanged.

    args:
        name: parameter name (".weight" path) of the layer
        weight: the parameter's value
        name_parameters: named-parameter dict to pull sibling weights from
        act_range: activation-range record ("is_qkv"/"is_merge" select mode)
        model_type: key into smooth_model_config
    '''
    if act_range["is_qkv"] is True:
        parent = ".".join(name.split(".")[:-2])
        qkv = smooth_model_config[model_type]["qkv_list"]
        stacked = [name_parameters[f"{parent}.{qkv[i]}.weight"] for i in range(3)]
        return torch.cat(stacked, dim=0)
    elif act_range["is_merge"] is True:
        parent = ".".join(name.split(".")[:-2])
        gate_up = smooth_model_config[model_type]["gate_up_list"]
        gate_w = name_parameters[f"{parent}.{gate_up[0]}.weight"]
        up_w = name_parameters[f"{parent}.{gate_up[1]}.weight"]
        return torch.cat([gate_w, up_w], dim=0)

    return weight
|
||||
|
||||
|
||||
@torch.no_grad()
def cal_smoother(weight, act_range_x, alpha=0.5):
    '''
    Compute the per-channel SmoothQuant factor
        s = act_max^alpha / weight_max^(1-alpha)
    both sides clamped away from zero for numerical safety.

    args:
        weight: smoother weight (last dim = channels)
        act_range_x: activation max value per channel
        alpha: smooth factor, default 0.5
    '''
    assert weight.shape[-1] == act_range_x.numel()
    # Per-channel absolute maxima of the weight, in float64.
    channel_w_max = weight.view(-1, weight.shape[-1]).abs().max(dim=0).values
    channel_w_max = channel_w_max.to(float).clamp(min=1e-6)
    act_max = act_range_x.to(channel_w_max.device).to(float)
    return (act_max.pow(alpha) / channel_w_max.pow(1 - alpha)).clamp(min=1e-6)
|
||||
|
||||
|
||||
@torch.no_grad()
def cal_qweight_scales(sweight, smooth_act_range_x, per_token, per_channel):
    '''
    Calculate the int8-quantized weight and its scales.

    args:
        sweight: weight which has already been multiplied by the smoother
        smooth_act_range_x: activation max values after dividing by the smoother
        per_token: bool, whether activation scales are applied dynamically
            (per token) at runtime
        per_channel: bool, whether weight scales are computed per output
            channel instead of per tensor

    returns: (qweight, per_channel_scale, scale_to_int, sinfo) where sinfo is
        [max_scale_w, max_scale_x, max_scale_w / max_scale_x] for diagnostics.
    '''
    # Per-tensor activation scale: one step of the int8 grid.
    scale_x_quant_orig_t = smooth_act_range_x.max() / 127.0
    # Per-output-channel weight maxima (in float64, clamped for safety).
    smooth_act_range_w = sweight.abs().max(dim=-1)[0]
    smooth_act_range_w = smooth_act_range_w.to(float).clamp(min=1e-6)
    scale_w_quant_orig_c = smooth_act_range_w / 127.0
    scale_w_quant_orig_t = smooth_act_range_w.max() / 127

    # Quantize the weight with the per-channel or per-tensor scale.
    if per_channel:
        qweight = (sweight / scale_w_quant_orig_c[..., None])
    else:
        qweight = (sweight / scale_w_quant_orig_t)

    # .to(torch.int8) truncates toward zero after clipping to int8 range.
    qweight = qweight.clip(-128, 127).to(torch.int8)

    # Multiplier that maps float activations onto the int8 grid.
    scale_to_int = 1 / scale_x_quant_orig_t

    if per_token:
        # Runtime computes the activation scale per token, so only the
        # weight dequant scale is stored here.
        if per_channel:
            per_channel_scale = scale_w_quant_orig_c
        else:
            per_channel_scale = scale_w_quant_orig_t
    else:
        # Static activation scale: fold it into the stored dequant scale.
        if per_channel:
            per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_c
            hidden_size = smooth_act_range_x.numel()
            # Broadcast the scalar activation scale across channels.
            scale_to_int = scale_to_int.repeat(hidden_size)
        else:
            per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_t

    # Normalize scalar results to 1-element 1-D tensors for serialization.
    per_channel_scale = per_channel_scale.squeeze()
    if per_channel_scale.numel() == 1 and per_channel_scale.dim() == 0:
        per_channel_scale = per_channel_scale.unsqueeze(0)

    if scale_to_int.numel() == 1 and scale_to_int.dim() == 0:
        scale_to_int = scale_to_int.unsqueeze(0)

    sinfo = [
        scale_w_quant_orig_t.item(), scale_x_quant_orig_t.item(),
        scale_w_quant_orig_t.item() / scale_x_quant_orig_t.item()
    ]
    return qweight, per_channel_scale, scale_to_int, sinfo
|
||||
|
||||
|
||||
def check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int):
    '''
    Log an error for every quantization tensor that contains inf or nan.
    Purely diagnostic: nothing is raised or returned.
    '''
    def has_bad_values(tensor):
        return torch.isinf(tensor).any() or torch.isnan(tensor).any()

    # Keep the original check/report order.
    for label, tensor in (("qweight", qweight),
                          ("per_channel_scale", per_channel_scale),
                          ("smooth", smooth),
                          ("scale_to_int", scale_to_int)):
        if has_bad_values(tensor):
            logger.error(f"name:{name} {label} has inf or nan")

    if qzeros is not None and has_bad_values(qzeros):
        logger.error(f"name:{name} qzeros has inf or nan")
|
||||
|
||||
|
||||
@torch.no_grad()
def cal_smooth_weight(name, act_range_x, weight, smooth_value, has_qzeros, per_token, per_channel, cal_weight):
    '''
    Smooth and int8-quantize one layer's weight.

    Derives the smoother from cal_weight, migrates quantization difficulty
    from activations into the weight, then quantizes via cal_qweight_scales.

    args:
        name: weight name (unused here except for identification)
        act_range_x: activation max value per channel
        weight: weight to be quantized
        smooth_value: smooth factor alpha
        has_qzeros: whether to emit a (zero-filled) qzeros tensor
        per_token: bool, whether scales are applied dynamically at runtime
        per_channel: bool, whether scales are computed per channel
        cal_weight: weight used for smoother calibration (may be a fused
            concatenation of sibling layers)
    '''
    smoother = cal_smoother(cal_weight, act_range_x, smooth_value)
    # Fold the smoother: shrink activations, grow the weight accordingly.
    smoothed_act = act_range_x / smoother
    smoothed_weight = weight * smoother.view(1, -1)
    qweight, per_channel_scale, scale_to_int, sinfo = cal_qweight_scales(
        smoothed_weight, smoothed_act, per_token, per_channel)
    qweight = qweight.reshape(weight.shape)
    # The stored "smooth" is the reciprocal, applied to activations at runtime.
    smooth = (1 / smoother).squeeze()
    qzeros = torch.zeros_like(per_channel_scale, dtype=torch.int32) if has_qzeros else None

    # check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int)

    return qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo
|
||||
|
||||
|
||||
@torch.no_grad()
def generate_smooth_weight(act_range, name_parameters, args):
    '''
    Generate the smooth-quantized state dict for the whole model.

    args:
        act_range: activation ranges collected while the model ran
            (keyed by layer name; each record has an 'x' per-channel max)
        name_parameters: hugging-face model named parameters
        args: argument namespace from main (uses model_type, has_qzeros,
            smooth_value, per_token, per_channel, torch_scales_smooth_dtype)

    returns: (smooth_weight, smooth_info) — the new state dict and a
        per-layer diagnostics table.
    '''
    smooth_weight = {}
    smooth_info = {}
    has_qzeros = args.has_qzeros
    smooth_value = args.smooth_value

    smooth_info["title"] = ["max_scale_w, max_scale_x, max_scale_w/max_scale_x"]

    for name, param in name_parameters.items():
        # Parameters excluded from quantization pass through unchanged.
        if should_skip(args.model_type, name):
            logger.info(f"skip {name}")
            smooth_weight[name] = param
            continue
        # Biases are never quantized.
        if name.endswith("bias"):
            smooth_weight[name] = param
            continue
        name_parts = name.split(".")
        layer_name = ".".join(name_parts[:-1])
        if layer_name in act_range:
            act_range_x = act_range[layer_name]['x']
            # Fused q/k/v and gate/up layers calibrate on the concatenated
            # sibling weights; others on their own weight.
            cal_weight = get_smooth_cal_weight(name, param, name_parameters, act_range[layer_name], args.model_type)
            qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo = cal_smooth_weight(
                name, act_range_x, param, smooth_value, has_qzeros, args.per_token, args.per_channel, cal_weight)

            per_channel_scale = per_channel_scale.to(args.torch_scales_smooth_dtype)
            smooth = smooth.to(args.torch_scales_smooth_dtype)
            scale_to_int = scale_to_int.to(args.torch_scales_smooth_dtype)

            smooth_weight[f'{layer_name}.qweight'] = qweight
            smooth_weight[f'{layer_name}.per_channel_scale'] = per_channel_scale

            if args.per_token is True:
                # Dynamic quantization keeps the smoother for runtime use.
                smooth_weight[f'{layer_name}.smooth'] = smooth
            else:
                # Static quantization folds the smoother into scale_to_int.
                scale_to_int = scale_to_int * smooth
                smooth_weight[f'{layer_name}.scale_to_int'] = scale_to_int

            if has_qzeros:
                smooth_weight[f'{layer_name}.qzeros'] = qzeros

            smooth_info[name] = sinfo
        else:
            # No activation stats for this layer: keep the original weight.
            smooth_weight[name] = param

    return smooth_weight, smooth_info
|
||||
|
||||
|
||||
def generate_weights_of_smoothquant(llm: LLM, args: argparse.Namespace):
    '''
    Generate smoothquant weights by running calibration prompts through the
    vllm engine, collecting activation ranges via hooks, and quantizing the
    gathered weights.

    args:
        llm: LLM instance
        args: argument from main

    returns: (smooth_weight, smooth_info)
    '''
    prompts, prompt_token_ids = generate_prompts(args)

    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=args.output_len,
                                     repetition_penalty=args.repetition_penalty,
                                     temperature=args.temperature,
                                     top_p=args.top_p,
                                     top_k=args.top_k)

    tp_size = args.tp_size

    # Install activation-range hooks on every worker before calibration.
    llm.llm_engine.model_executor._run_workers("setup_smooth_hook", args.dump_input_ids)

    llm.generate(prompts, sampling_params, prompt_token_ids=prompt_token_ids, use_tqdm=True)

    logger.info("llm generate finished")

    # Pull the collected stats and (sharded) weights back from the workers.
    llm.llm_engine.model_executor._run_workers("remove_hooks")
    act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
    named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")

    # Tear down the engine early to free device memory before quantizing.
    vllm_cleanup(llm)
    del prompts
    del prompt_token_ids
    cleanup()

    logger.info("get act_range and named_parameters from llm finished")

    # Merge per-rank (tensor-parallel) stats and weights into full tensors.
    merged_act_range, merged_named_parameters, input_id_list = convert_to_merged(act_range, named_parameters, tp_size,
                                                                                 args)

    save_input_ids(input_id_list, args)
    save_act_range(merged_act_range, args)
    save_weights(merged_named_parameters, args)

    del act_range
    del named_parameters
    cleanup()

    logger.info("get merged_act_range and merged_named_parameters finished")

    smooth_weight, smooth_info = generate_smooth_weight(merged_act_range, merged_named_parameters, args)
    save_generate_weights(smooth_weight, args)

    del merged_act_range
    del merged_named_parameters
    cleanup()

    logger.info("get smooth_weight finished")

    return smooth_weight, smooth_info
|
||||
1024
vllm-v0.6.2/tools/quant_tools/summarize_1024_prompts.csv
Normal file
1024
vllm-v0.6.2/tools/quant_tools/summarize_1024_prompts.csv
Normal file
File diff suppressed because one or more lines are too long
713
vllm-v0.6.2/tools/quant_tools/utils_internal.py
Executable file
713
vllm-v0.6.2/tools/quant_tools/utils_internal.py
Executable file
@@ -0,0 +1,713 @@
|
||||
from collections import defaultdict, OrderedDict
|
||||
import torch
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import re
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
import json
|
||||
from transformers import AutoTokenizer, T5Tokenizer
|
||||
import gc
|
||||
from datetime import datetime
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from model_special import (smooth_model_config, get_layer_weight_bias_name, get_qkv_distribution,
|
||||
modify_layer_weight_bias_name)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_str_to_torch_dtype_dict = dict(
|
||||
bfloat16=torch.bfloat16,
|
||||
float16=torch.float16,
|
||||
float32=torch.float32,
|
||||
int64=torch.int64,
|
||||
int32=torch.int32,
|
||||
int8=torch.int8,
|
||||
bool=torch.bool,
|
||||
fp8=torch.float8_e4m3fn,
|
||||
)
|
||||
|
||||
|
||||
def str_dtype_to_torch(dtype):
|
||||
'''
|
||||
convert torch dytpe to str dtype
|
||||
'''
|
||||
ret = _str_to_torch_dtype_dict.get(dtype)
|
||||
dtype = ret if ret is not None else torch.float16
|
||||
return dtype
|
||||
|
||||
|
||||
_torch_dtype_to_str_dict = {
|
||||
torch.bfloat16:"bfloat16",
|
||||
torch.float16:"float16",
|
||||
torch.float32:"float32",
|
||||
torch.int64:"int64",
|
||||
torch.int32:"int32",
|
||||
torch.int8:"int8",
|
||||
torch.bool:"bool",
|
||||
torch.float8_e4m3fn:"fp8",
|
||||
}
|
||||
|
||||
|
||||
def torch_dtype_to_str(dtype):
|
||||
'''
|
||||
convert str dytpe to torch dtype
|
||||
'''
|
||||
ret = _torch_dtype_to_str_dict.get(dtype)
|
||||
dtype = ret if ret is not None else "float16"
|
||||
return dtype
|
||||
|
||||
|
||||
def extract_model_path(name_or_path):
    '''
    Extract (model_version, model_family) from the `_name_or_path` field of
    config.json, e.g. "meta/Llama-7b" -> ("llama", "llama").

    Patterns are tried most-specific first; when none matches, the whole
    (lowercased) string is used as the version. The family is the leading
    run of ASCII letters of the version.
    '''
    patterns = [
        r"/(.*)(-[0-9]+[mMbB]{1})(-*.*)",
        r"/(.*-[0-9]+)(-*.*)",
        r"(.*)(-[0-9]+[mMbB]{1})(-*.*)",
        r"(.*-[0-9]+)(-*.*)",
        r"([^-]+)(-*.*)",
    ]
    model_version = None
    for pattern in patterns:
        match = re.search(pattern, name_or_path)
        if match:
            model_version = match.group(1)
            break

    if model_version is None:
        model_version = name_or_path

    model_version = model_version.lower()
    # Fixed: the old class [a-zA-z] also matched [ \ ] ^ _ ` via the A-z
    # range, so e.g. "qwen_vl" yielded family "qwen_vl" instead of "qwen".
    match = re.search(r"([a-zA-Z]+)(.*)", model_version)
    if match:
        model_family = match.group(1)
    else:
        model_family = model_version

    return model_version, model_family
|
||||
|
||||
|
||||
def read_model_name(model_dir: str, model_version: Optional[str] = None, model_type: Optional[str] = None):
    '''
    Read (model_arch, model_version, model_family, model_type) from
    config.json, letting explicitly passed model_version / model_type
    override what is found there.

    args:
        model_dir: model directory containing config.json
        model_version: passed from main, default None
        model_type: passed from main, default None

    raises: AssertionError when the architecture, version, or family cannot
        be determined.
    '''
    with open(Path(model_dir) / "config.json", 'r') as f:
        config = json.load(f)

    model_arch = config.get('architectures', None)
    name_or_path = config.get('_name_or_path', None)
    if model_type is None:
        model_type = config.get('model_type', None)
    if model_type:
        model_type = model_type.lower()
    model_family = None

    # Derive version/family from the checkpoint path when not given.
    if model_version is None and name_or_path:
        model_version, model_family = extract_model_path(name_or_path)

    if model_version is None:
        model_version = model_type

    if model_version:
        model_version = model_version.lower()

    if model_version and model_family is None:
        # Fixed: the old class [a-zA-z] also matched [ \ ] ^ _ ` via the
        # A-z range; restrict the family prefix to ASCII letters.
        match = re.search(r"([a-zA-Z]+)(.*)", model_version)
        if match:
            model_family = match.group(1)
        else:
            model_family = model_version

    if isinstance(model_arch, (list, tuple)) and len(model_arch) > 0:
        model_arch = model_arch[0]

    assert model_arch, "read model architectures failed"
    assert model_version, "read model version failed, please set args.version manually"
    assert model_family, "read model family failed, please set args.version manually"

    return model_arch, model_version, model_family, model_type
|
||||
|
||||
|
||||
def load_tokenizer(tokenizer_dir: Optional[str] = None,
                   vocab_file: Optional[str] = None,
                   model_name: str = 'GPTForCausalLM',
                   model_version: Optional[str] = None,
                   tokenizer_type: Optional[str] = None):
    '''
    Load the tokenizer of a model and determine its pad/end token ids.

    args:
        tokenizer_dir: tokenizer directory (used when vocab_file is None)
        vocab_file: vocabulary file, default None; when given, a
            sentencepiece-style tokenizer is built directly from it
        model_name: model architecture name (drives pad/end id lookup)
        model_version: model version (currently unused here)
        tokenizer_type: tokenizer type to be loaded

    returns: (tokenizer, pad_id, end_id)
    '''
    if vocab_file is None:
        use_fast = True
        if tokenizer_type == "llama":
            use_fast = False
        # Should set both padding_side and truncation_side to be 'left'
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                  legacy=False,
                                                  padding_side='left',
                                                  truncation_side='right',
                                                  trust_remote_code=True,
                                                  tokenizer_type=tokenizer_type,
                                                  use_fast=use_fast)
    elif model_name == 'GemmaForCausalLM':
        from transformers import GemmaTokenizer

        # Initialize tokenizer from vocab file.
        tokenizer = GemmaTokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
    else:
        # For gpt-next, directly load from tokenizer.model
        tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)

    # Resolve pad/end ids: qwen reads them from generation_config.json,
    # chatglm/glm use eop, everything else falls back to eos.
    if model_name == 'QWenForCausalLM':
        with open(Path(tokenizer_dir) / "generation_config.json") as f:
            gen_config = json.load(f)
        chat_format = gen_config['chat_format']
        assert chat_format in ('raw','chatml'), f"unknown chat format: {chat_format}"
        pad_id = gen_config['pad_token_id']
        end_id = gen_config['eos_token_id']
    elif model_name in ('ChatGLMForCausalLM', 'glm'):
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eop_token_id
    else:
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eos_token_id

    try:
        tokenizer.pad_token = tokenizer.eos_token
    except Exception as e:
        # Fixed: Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(f"set pad_token with exception:{e}")

    return tokenizer, pad_id, end_id
|
||||
|
||||
|
||||
def merge_qkv_weight(named_parameters, weight_name, tp_size, q_proj_size, num_kv_head_replicas):
    '''
    Un-shard a fused qkv parameter: gather the per-rank q/k/v slices and
    concatenate them back into full q, k, v tensors.
    Works identically for weights and biases.

    args:
        named_parameters: per-rank named parameters (indexed by rank)
        weight_name: qkv layer weight name
        tp_size: tensor parallel size
        q_proj_size: query projection size (rows of q per rank)
        num_kv_head_replicas: number of consecutive ranks sharing one k/v copy
    '''
    fused_rows = named_parameters[0][weight_name].shape[0]
    kv_proj_size = (fused_rows - q_proj_size) // 2
    section_sizes = [q_proj_size, kv_proj_size, kv_proj_size]

    q_parts, k_parts, v_parts = [], [], []
    for rank in range(tp_size):
        q_slice, k_slice, v_slice = torch.split(named_parameters[rank][weight_name], section_sizes, dim=0)
        q_parts.append(q_slice)
        # k/v are replicated across `num_kv_head_replicas` ranks (GQA);
        # keep only the first copy of each replica group.
        if rank % num_kv_head_replicas == 0:
            k_parts.append(k_slice)
            v_parts.append(v_slice)

    return (torch.cat(q_parts, dim=0),
            torch.cat(k_parts, dim=0),
            torch.cat(v_parts, dim=0))
|
||||
|
||||
|
||||
def merge_merged_weight(named_parameters, weight_name, tp_size, dim=0):
    '''
    Un-shard a fused merged-linear parameter: each rank holds the two halves
    of the fused layer stacked along `dim`; split them apart per rank and
    concatenate the shards back into two full tensors.
    Works identically for weights and biases.

    args:
        named_parameters: per-rank named parameters (indexed by rank)
        weight_name: fused layer weight name
        tp_size: tensor parallel size
        dim: dimension along which the halves are stacked/sharded

    returns: (gate_weight, up_weight) — first-half shards concatenated, then
        second-half shards concatenated.
    '''
    first_half_parts = []
    second_half_parts = []
    for rank in range(tp_size):
        first_half, second_half = torch.chunk(named_parameters[rank][weight_name], 2, dim=dim)
        first_half_parts.append(first_half)
        second_half_parts.append(second_half)

    gate_weight = torch.cat(first_half_parts, dim=dim)
    up_weight = torch.cat(second_half_parts, dim=dim)

    return gate_weight, up_weight
|
||||
|
||||
|
||||
def convert_packed_qkv(q_weight, k_weight, v_weight, dim, args):
    '''
    convert packed qkv weight or bias into a single fused tensor.

    q/k/v are concatenated along `dim`.  When get_qkv_distribution reports the
    interleaved ("n3sh") layout, the result is additionally re-ordered so each
    kv-head group holds [its query heads, its k head, its v head] contiguously
    instead of the plain [all q | all k | all v] order.

    args:
        q_weight: q weight or bias
        k_weight: k weight or bias
        v_weight: v weight or bias
        dim: dimension along which q/k/v are packed (the head axis; callers pass 0)
        args: CLI arguments; model_type, model_version and hf_config select the
              target layout
    '''
    packed_qkv = torch.cat([q_weight, k_weight, v_weight], dim=dim)
    # NOTE(review): "n3sh" semantics are defined by get_qkv_distribution
    # (elsewhere in this tool); the reshape below matches a per-kv-head
    # interleave — confirm against that helper.
    is_n3sh, head_num, kv_head_num = get_qkv_distribution(args.model_type, args.model_version, args.hf_config)
    if is_n3sh is True:
        packed_qkv_shape = packed_qkv.shape
        num_query_heads_per_kv_head = head_num // kv_head_num
        q_shape = q_weight.shape
        k_shape = k_weight.shape
        v_shape = v_weight.shape
        # Expose a (kv_head, heads_per_group, rest) view of the head axis so the
        # concat on dim+1 interleaves q/k/v per kv-head group.
        q = q_weight.view(q_shape[:dim] + (kv_head_num, num_query_heads_per_kv_head, -1) + q_shape[dim + 1:])
        k = k_weight.view(k_shape[:dim] + (kv_head_num, 1, -1) + k_shape[dim + 1:])
        v = v_weight.view(v_shape[:dim] + (kv_head_num, 1, -1) + v_shape[dim + 1:])
        tensor_n3sh = torch.cat([q, k, v], dim=dim+1)
        # Flatten back to the shape of the plain concatenation.
        packed_qkv = tensor_n3sh.reshape(packed_qkv_shape)

    return packed_qkv
|
||||
|
||||
|
||||
def convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
                                 layer_range, merged_act_range, tp_size, args):
    '''
    convert parallel qkv named parameters to non parallel qkv named parameters.

    The per-rank fused qkv tensor is first split back into full q/k/v tensors,
    then stored either as three separate entries (model config lists three qkv
    projection names) or re-packed into one fused entry (single name).

    args:
        layer_name: fused qkv layer name (e.g. "...self_attn.qkv_proj")
        weight_name: weight entry name of that layer
        bias_name: bias entry name of that layer
        named_parameters: per-rank hugging face named parameters
        merged_named_parameters: output dict for non parallel parameters
        layer_range: parallel layer range info for this layer
        merged_act_range: output dict for non parallel act ranges
        tp_size: tensor parallel size
        args: CLI arguments (model_type selects the qkv naming scheme)

    returns:
        the trailing component of the fused qkv layer name; the caller uses it
        to exclude the original fused entries from the plain copy pass
    '''
    layer_name_parts = layer_name.split(".")
    self_attn_layer_name = ".".join(layer_name_parts[:-1])
    qkv_name = layer_name_parts[-1]
    q_weight, k_weight, v_weight = merge_qkv_weight(named_parameters, weight_name, tp_size, layer_range["q_proj_size"],
                                                    layer_range["num_kv_head_replicas"])
    # qkv_list drives output naming: 3 entries -> separate q/k/v layers,
    # 1 entry -> one packed qkv layer.
    qkv_list = smooth_model_config[args.model_type]["qkv_list"]
    qkv_list_len = len(qkv_list)
    if qkv_list_len == 3:
        q_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
        k_layer_name = f"{self_attn_layer_name}.{qkv_list[1]}"
        v_layer_name = f"{self_attn_layer_name}.{qkv_list[2]}"
    elif qkv_list_len == 1:
        qkv_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"

    if qkv_list_len == 3:
        # q/k/v share the fused layer's input, hence one shared act range.
        merged_act_range[q_layer_name]["x"] = layer_range["x"]
        merged_act_range[k_layer_name]["x"] = layer_range["x"]
        merged_act_range[v_layer_name]["x"] = layer_range["x"]
        merged_act_range[q_layer_name]["is_qkv"] = True
        merged_act_range[k_layer_name]["is_qkv"] = True
        merged_act_range[v_layer_name]["is_qkv"] = True

        merged_named_parameters[f"{q_layer_name}.weight"] = q_weight
        merged_named_parameters[f"{k_layer_name}.weight"] = k_weight
        merged_named_parameters[f"{v_layer_name}.weight"] = v_weight
    elif qkv_list_len == 1:
        merged_act_range[qkv_layer_name]["x"] = layer_range["x"]
        qkv_weight = convert_packed_qkv(q_weight, k_weight, v_weight, 0, args)
        merged_named_parameters[f"{qkv_layer_name}.weight"] = qkv_weight

    # Biases (when present on rank 0) follow the same split/pack logic.
    if bias_name in named_parameters[0]:
        q_bias, k_bias, v_bias = merge_qkv_weight(named_parameters, bias_name, tp_size, layer_range["q_proj_size"],
                                                  layer_range["num_kv_head_replicas"])
        if qkv_list_len == 3:
            merged_named_parameters[f"{q_layer_name}.bias"] = q_bias
            merged_named_parameters[f"{k_layer_name}.bias"] = k_bias
            merged_named_parameters[f"{v_layer_name}.bias"] = v_bias
        elif qkv_list_len == 1:
            qkv_bias = convert_packed_qkv(q_bias, k_bias, v_bias, 0, args)
            merged_named_parameters[f"{qkv_layer_name}.bias"] = qkv_bias

    return qkv_name
|
||||
|
||||
|
||||
def convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
                                    layer_range, merged_act_range, tp_size, model_type):
    '''
    convert parallel merged (gate_up) named parameters to non parallel form.

    The per-rank fused gate_up tensor is split back into full gate/up tensors,
    then stored either as two separate entries (model config lists two names)
    or re-packed into one fused entry (single name), in the order selected by
    the model's "is_gate_up" flag.

    args:
        layer_name: fused gate_up layer name
        weight_name: weight entry name of that layer
        bias_name: bias entry name of that layer
        named_parameters: per-rank hugging face named parameters
        merged_named_parameters: output dict for non parallel parameters
        layer_range: parallel layer range info for this layer
        merged_act_range: output dict for non parallel act ranges
        tp_size: tensor parallel size
        model_type: model type key into smooth_model_config

    returns:
        the trailing component of the fused gate_up layer name; the caller uses
        it to exclude the original fused entries from the plain copy pass
    '''
    layer_name_parts = layer_name.split(".")
    mlp_layer_name = ".".join(layer_name_parts[:-1])
    gate_weight, up_weight = merge_merged_weight(named_parameters, weight_name, tp_size)
    gate_up_name = layer_name_parts[-1]
    gate_up_list = smooth_model_config[model_type]["gate_up_list"]
    gate_up_list_len = len(gate_up_list)
    is_gate_up = smooth_model_config[model_type]["is_gate_up"]
    if gate_up_list_len == 2:
        gate_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
        up_layer_name = f"{mlp_layer_name}.{gate_up_list[1]}"
    elif gate_up_list_len == 1:
        gate_up_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"

    if gate_up_list_len == 2:
        # gate and up share the fused layer's input, hence one shared act range.
        merged_act_range[gate_layer_name]["x"] = layer_range["x"]
        merged_act_range[up_layer_name]["x"] = layer_range["x"]
        merged_act_range[gate_layer_name]["is_merge"] = True
        merged_act_range[up_layer_name]["is_merge"] = True

        merged_named_parameters[f"{gate_layer_name}.weight"] = gate_weight
        merged_named_parameters[f"{up_layer_name}.weight"] = up_weight
    elif gate_up_list_len == 1:
        merged_act_range[gate_up_layer_name]["x"] = layer_range["x"]
        # is_gate_up selects the packing order inside the single fused tensor.
        merged_gate_up_weight_list = [gate_weight, up_weight] if is_gate_up is True else [up_weight, gate_weight]
        merged_named_parameters[f"{gate_up_layer_name}.weight"] = torch.cat(merged_gate_up_weight_list, dim=0)

    # Biases (when present on rank 0) follow the same split/pack logic.
    if bias_name in named_parameters[0]:
        gate_bias, up_bias = merge_merged_weight(named_parameters, bias_name, tp_size)
        if gate_up_list_len == 2:
            merged_named_parameters[f"{gate_layer_name}.bias"] = gate_bias
            merged_named_parameters[f"{up_layer_name}.bias"] = up_bias
        elif gate_up_list_len == 1:
            merged_gate_up_bias_list = [gate_bias, up_bias] if is_gate_up is True else [up_bias, gate_bias]
            merged_named_parameters[f"{gate_up_layer_name}.bias"] = torch.cat(merged_gate_up_bias_list, dim=0)

    return gate_up_name
|
||||
|
||||
|
||||
def convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
                                            merged_named_parameters, layer_range, merged_act_range, tp_size):
    """
    Merge a plain column-parallel layer (neither fused qkv nor fused gate_up)
    back into one full tensor by concatenating the rank shards on dim 0.

    For linear layers the captured activation range is carried over unchanged
    (column sharding splits outputs, so the input act range is already full).

    Args:
        layer_name: layer name used as the merged act-range key
        weight_name: weight entry name
        bias_name: bias entry name
        named_parameters: per-rank hugging face named parameters
        merged_named_parameters: output dict for the merged tensors
        layer_range: this layer's parallel range info
        merged_act_range: output dict for merged act ranges
        tp_size: tensor parallel size
    """
    if layer_range['is_linear']:
        merged_act_range[layer_name]["x"] = layer_range["x"]

    weight_shards = [named_parameters[rank][weight_name] for rank in range(tp_size)]
    merged_named_parameters[weight_name] = torch.cat(weight_shards, dim=0)

    # Column-parallel biases are sharded too; merge them when rank 0 has one.
    if bias_name in named_parameters[0]:
        bias_shards = [named_parameters[rank][bias_name] for rank in range(tp_size)]
        merged_named_parameters[bias_name] = torch.cat(bias_shards, dim=0)
|
||||
|
||||
|
||||
def convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                          merged_named_parameters, layer_range, merged_act_range, tp_size):
    """
    Merge a row-parallel layer back into one full tensor.

    Row sharding splits the input dimension, so for linear layers the per-rank
    activation ranges are concatenated as well, and the weight shards are
    joined along dim 1.  A row-parallel bias is taken from rank 0 (it is not
    sharded across ranks).

    Args:
        act_layer_name: key of this layer inside the per-rank act_range dicts
        act_range: per-rank act_range dicts
        layer_name: layer name used as the merged act-range key
        weight_name: weight entry name
        bias_name: bias entry name
        named_parameters: per-rank hugging face named parameters
        merged_named_parameters: output dict for the merged tensors
        layer_range: this layer's parallel range info
        merged_act_range: output dict for merged act ranges
        tp_size: tensor parallel size
    """
    if layer_range['is_linear']:
        if isinstance(layer_range['x'], torch.Tensor):
            per_rank_x = [act_range[rank][act_layer_name]['x'] for rank in range(tp_size)]
            merged_act_range[layer_name]['x'] = torch.cat(per_rank_x, dim=0)
        else:
            merged_act_range[layer_name]['x'] = None

    weight_shards = [named_parameters[rank][weight_name] for rank in range(tp_size)]
    merged_named_parameters[weight_name] = torch.cat(weight_shards, dim=1)
    if bias_name in named_parameters[0]:
        # Not sharded for row-parallel layers; rank 0 holds the full bias.
        merged_named_parameters[bias_name] = named_parameters[0][bias_name]
|
||||
|
||||
|
||||
def convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                            merged_named_parameters, layer_range, merged_act_range, tp_size, args):
    '''
    convert one parallel layer's named parameters to the non parallel form.

    Dispatches on the layer's parallel split type:
      * column parallel + fused qkv      -> convert_to_merged_qkv_weight
      * column parallel + fused gate_up  -> convert_to_merged_merged_weight
      * other column parallel            -> concatenate shards along dim 0
      * row parallel                     -> concatenate shards along dim 1

    args:
        act_layer_name: key of this layer inside the per-rank act_range dicts
        act_range: per-rank act_range dicts
        layer_name: layer name
        weight_name: weight entry name
        bias_name: bias entry name
        named_parameters: per-rank hugging face named parameters
        merged_named_parameters: output dict for non parallel parameters
        layer_range: parallel layer range info
        merged_act_range: output dict for non parallel act ranges
        tp_size: tensor parallel size
        args: CLI arguments

    returns:
        (qkv_name, gate_up_name) trailing name components of the fused layers,
        defaulting to "qkv_proj"/"gate_up_proj" when this layer is not fused;
        the caller uses them to exclude fused entries from plain copying.
    '''
    qkv_name = "qkv_proj"
    gate_up_name = "gate_up_proj"

    if layer_range['split'] == 'col':  # col
        # merge weight
        if layer_range["is_qkv"]:
            qkv_name = convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters,
                                                    merged_named_parameters, layer_range, merged_act_range, tp_size,
                                                    args)

        elif layer_range["is_merge"]:
            gate_up_name = convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters,
                                                           merged_named_parameters, layer_range, merged_act_range,
                                                           tp_size, args.model_type)
        else:
            convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
                                                    merged_named_parameters, layer_range, merged_act_range, tp_size)
    else:  # row
        convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                              merged_named_parameters, layer_range, merged_act_range, tp_size)

    return qkv_name, gate_up_name
|
||||
|
||||
|
||||
def collect_moe_experts_act_range_of_layer(merged_act_range, mlp_part_name, moe_list):
    '''
    Gather the per-expert act-range entries that belong to one decoder layer.

    Entries are matched by regex against the act-range keys using the layer's
    mlp path fragment plus the configured gate_up / down projection names.

    args:
        merged_act_range: merged (non parallel) act ranges, keyed by layer name
        mlp_part_name: regex fragment identifying one layer's mlp block
        moe_list: model config with "gate_up_list" and "down_list" name lists
    returns:
        (gate_up entries, down entries) as {name: act-range} dicts
    '''
    gate_up_list = moe_list["gate_up_list"]
    down_list = moe_list["down_list"]

    # Index 0 of each list holds the mlp path prefix, projection names start at 1.
    gate_up_patterns = [rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[1]}"]
    if len(gate_up_list) > 2:
        gate_up_patterns.append(rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[2]}")
    down_pattern = rf"{mlp_part_name}.experts\.\d+\.{down_list[1]}"

    experts_of_gate_up_layer = {
        name: layer_range
        for name, layer_range in merged_act_range.items()
        if any(re.search(pattern, name) for pattern in gate_up_patterns)
    }
    experts_of_down_layer = {
        name: layer_range
        for name, layer_range in merged_act_range.items()
        if re.search(down_pattern, name)
    }

    return experts_of_gate_up_layer, experts_of_down_layer
|
||||
|
||||
|
||||
def convert_moe_expert_activation_fused(experts_of_layer, merged_act_range):
    '''
    Fuse the act ranges of all experts in one layer and write the fused range
    back onto every expert (in place).

    The fused range is the element-wise maximum over every expert that has a
    tensor-valued range; experts whose range is None receive the fused value
    as well.  `merged_act_range` is accepted for call-site symmetry but is not
    read here.
    '''
    collected = [entry["x"] for entry in experts_of_layer.values() if isinstance(entry["x"], torch.Tensor)]

    assert len(collected) > 0, f"unfused_activation len is zero, this is unsupported"

    stacked = torch.stack(collected, dim=0)
    fused_activation = torch.max(stacked, dim=0)[0]

    for entry in experts_of_layer.values():
        if entry["x"] is None or isinstance(entry["x"], torch.Tensor):
            entry['x'] = fused_activation
|
||||
|
||||
def convert_moe_layer_activation_fused(merged_act_range, model_type):
    '''
    Fuse per-expert act ranges layer by layer (in place).

    Walks layer indices 0, 1, 2, ... until a layer yields no expert entries,
    fusing the gate_up and down expert act ranges of each visited layer.
    Models without an MoE config (moe_list is None) are left untouched.
    '''
    moe_list = smooth_model_config[model_type]["moe_list"]
    if moe_list is None:
        return

    # First element of gate_up_list carries the mlp path; keep its first segment.
    mlp_name = moe_list["gate_up_list"][0].split(".")[0]

    layer_idx = 0
    while True:
        mlp_part_name = rf"\.{layer_idx}\.{mlp_name}"
        experts_of_gate_up_layer, experts_of_down_layer = collect_moe_experts_act_range_of_layer(
            merged_act_range, mlp_part_name, moe_list)
        # An empty collection means we walked past the last layer index.
        if len(experts_of_gate_up_layer) < 1 or len(experts_of_down_layer) < 1:
            logger.info(f"the experts_num is {layer_idx}")
            break
        convert_moe_expert_activation_fused(experts_of_gate_up_layer, merged_act_range)
        convert_moe_expert_activation_fused(experts_of_down_layer, merged_act_range)
        layer_idx += 1
|
||||
|
||||
|
||||
def should_include(key, parameters, exclude_names):
    '''
    Decide whether a rank-0 parameter should be copied through verbatim.

    args:
        key: parameter name under consideration
        parameters: already-merged named parameters (key must be absent)
        exclude_names: name fragments of fused layers (none may occur in key)
    returns:
        True when key is not in parameters and contains no excluded fragment
    '''
    if key in parameters:
        return False
    return all(excluded not in key for excluded in exclude_names)
|
||||
|
||||
|
||||
def valid_act_range(act_layer_name, layer_range):
    '''
    Sanitize one layer's activation range in place.

    Any inf, nan or exactly-zero entry in layer_range["x"] is clamped to 1e-6
    (with a warning) so later scale computations never divide by zero or
    propagate non-finite values.  Non-tensor ranges are left untouched.

    args:
        act_layer_name: act layer name (used in the warning message only)
        layer_range: act layer value; its "x" entry may be a tensor or None
    '''
    act_values = layer_range["x"]
    if not isinstance(act_values, torch.Tensor):
        return

    invalid = torch.isinf(act_values) | torch.isnan(act_values) | (act_values == 0)
    if bool(invalid.any()):
        act_values[invalid] = 1e-6
        logger.warning(f"act_range_x in layer:{act_layer_name} has nan, inf or zero values, force to 1e-6")
|
||||
|
||||
|
||||
def convert_to_merged(act_range, named_parameters, tp_size, args):
    '''
    convert parallel act_range and named parameters to non parallel format.

    Iterates rank-0's act-range entries, merging each layer's per-rank shards
    (and act ranges) into full tensors, then copies through every remaining
    rank-0 parameter that was not produced by the fused qkv / gate_up
    conversion.

    args:
        act_range: per-rank act_range dicts (indexed by tp rank)
        named_parameters: per-rank named parameter dicts
        tp_size: tensor parallel size
        args: CLI arguments
    returns:
        (sorted merged act_range, sorted merged named parameters,
         input_id list captured from the fused qkv layer, if any)
    '''
    model_type = args.model_type
    merged_act_range = defaultdict(lambda: {"x": None, "is_qkv": False, "is_merge": False,})
    merged_named_parameters = {}
    input_id_list = []

    # Trailing name components of fused layers; their original fused entries
    # must not be copied verbatim in the pass-through step below.
    exclude_names = set()

    for act_layer_name, layer_range in act_range[0].items():
        # Clamp inf/nan/zero act values before any scale computation.
        valid_act_range(act_layer_name, layer_range)
        layer_name, weight_name, bias_name = get_layer_weight_bias_name(model_type, act_layer_name)
        # when tie_word_embeddings is True, lm_head reuses the embedding weight,
        # so no separate lm_head tensor is emitted
        if args.tie_word_embeddings is True and "lm_head" in layer_name:
            continue
        qkv_name, gate_up_name = convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name,
                                                         named_parameters, merged_named_parameters, layer_range,
                                                         merged_act_range, tp_size, args)
        exclude_names.update({qkv_name, gate_up_name})

        # The fused qkv layer carries the calibration input ids; remember them.
        if layer_range['split'] == 'col' and layer_range["is_qkv"] and len(layer_range["input_id"]) > 0:
            input_id_list = layer_range["input_id"]

    if args.use_smoothquant and args.disable_fused_quantize_expert is False:
        convert_moe_layer_activation_fused(merged_act_range, model_type)


    # Copy every rank-0 parameter that was not already merged and does not
    # belong to a fused (excluded) layer.
    merged_named_parameters.update({
        key: value
        for key, value in named_parameters[0].items()
        if should_include(key, merged_named_parameters, exclude_names)
    })

    modify_layer_weight_bias_name(model_type, merged_named_parameters)

    # Deterministic (alphabetical) ordering for reproducible checkpoints.
    sorted_named_parameters = OrderedDict(sorted(merged_named_parameters.items(), key=lambda item: item[0]))
    sorted_merged_act_range = OrderedDict(sorted(merged_act_range.items(), key=lambda item: item[0]))

    return sorted_merged_act_range, sorted_named_parameters, input_id_list
|
||||
|
||||
|
||||
def copy_files_except_extensions(input_dir, output_dir, extensions):
    '''
    Recursively copy files from input_dir to output_dir EXCEPT files whose
    name ends with one of `extensions`, preserving the sub-directory layout.
    Hidden files and top-level hidden sub-directories (names starting with
    ".") are skipped.

    args:
        input_dir: input directory
        output_dir: output directory
        extensions: file-name suffixes to exclude from the copy
    '''
    # Walk the input directory and all of its sub-directories.
    for root, dirs, files in os.walk(input_dir):
        # Path of the current directory relative to input_dir.
        rel_path = os.path.relpath(root, input_dir)
        # Skip sub-trees whose relative path starts with "." (hidden dirs);
        # rel_path "." (input_dir itself) has length 1 and is kept.
        if len(rel_path) > 1 and rel_path.startswith('.'):
            continue
        # Build the destination directory path.
        dst_dir = os.path.join(output_dir, rel_path)
        # Make sure the destination directory exists.
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        for file in files:
            # Keep only non-hidden files whose suffix is not excluded.
            if not any(file.endswith(ext) for ext in extensions) and not file.startswith('.'):
                # Full source and destination paths for this file.
                src_file = os.path.join(root, file)
                dst_file = os.path.join(dst_dir, file)
                # copy2 also preserves file metadata (timestamps etc.).
                shutil.copy2(src_file, dst_file)
                logger.info(f'Copied {src_file} to {dst_file}')
|
||||
|
||||
|
||||
def cleanup():
    '''
    cleanup memory resource: run the Python garbage collector and, on non-CPU
    platforms, release the cached device memory back to the allocator.
    '''
    gc.collect()
    # Only non-CPU platforms hold a device-side allocator cache to release.
    if not current_platform.is_cpu():
        torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def vllm_cleanup(llm):
    """Release occupied resources and reset parallel_state.

    Drops the LLM instance, tears down vLLM's model-parallel and distributed
    state, destroys the torch.distributed process group when one exists, and
    shuts down Ray if it was initialized.
    """
    del llm
    # Imported lazily so this module can be used without a live vLLM runtime.
    from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
    destroy_model_parallel()
    destroy_distributed_environment()
    import contextlib
    # destroy_process_group asserts when no group was initialized; ignore that.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    import ray
    if ray.is_initialized():
        ray.shutdown()
    logger.info('llm and distributed env is cleanup')
|
||||
|
||||
|
||||
def generate_datetime():
    '''
    Return the current local time formatted as "YYYY-mm-dd HH:MM:SS".
    '''
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
def get_hf_config_sliding_window(hf_text_config) -> Optional[int]:
    """Get the sliding window size, or None if disabled.

    Some models, like Qwen2 and Qwen1.5, carry a `use_sliding_window` switch
    in addition to the window size; a missing switch counts as enabled.
    """
    sliding_window_enabled = getattr(hf_text_config, "use_sliding_window", True)
    if not sliding_window_enabled:
        return None
    return getattr(hf_text_config, "sliding_window", None)
|
||||
|
||||
def get_skip_patterns(model_type):
    """Return the regex skip patterns configured for `model_type`.

    Falls back to an empty list when the model config declares none.
    """
    # dict.get replaces the manual membership check and avoids a double lookup.
    return smooth_model_config[model_type].get("skip_patterns", [])
|
||||
|
||||
def should_skip(model_type, weight_name):
    """Judge if the weight should be skipped (left unquantized).

    Returns True when `weight_name` matches (from the start, via re.match)
    any regex in the model's configured skip patterns.
    """
    # Hoisted out of the per-pattern loop: the original re-imported `re`
    # on every iteration.
    import re
    skip_patterns = get_skip_patterns(model_type)
    return any(re.match(pattern, weight_name) for pattern in skip_patterns)
|
||||
|
||||
152
vllm-v0.6.2/tools/quant_tools/weight_only.py
Normal file
152
vllm-v0.6.2/tools/quant_tools/weight_only.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import argparse
|
||||
import torch
|
||||
from torch import Tensor
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
|
||||
from dump_smooth import save_weights, save_generate_weights
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def merge_adjacent_low_4bit(tensor: Tensor):
    """
    Pack pairs of int4 values stored in int8 slots.

    For every adjacent pair along the last dimension, only the low nibble of
    each element is kept and the two nibbles are combined into a single int8:
    the odd element's nibble occupies the high 4 bits, the even element's the
    low 4 bits.

    Args:
        tensor: torch.int8 tensor whose last dimension has even length.

    Returns:
        int8 data with the last dimension halved.
        NOTE(review): the numpy ufuncs convert the torch input, so the result
        is a numpy array rather than a torch tensor — confirm the downstream
        save path expects this.

    Example:
        a = torch.tensor([5, 7, 12, 3], dtype=torch.int8)
        merge_adjacent_low_4bit(a)  # -> [0x75, 0x3C] == [117, 60]
    """
    assert tensor.dtype == torch.int8, "输入张量必须为int8类型"
    assert tensor.shape[-1] % 2 == 0, "输入张量最后一维长度需为偶数"

    low_nibble_even = np.bitwise_and(tensor[..., 0::2], 0x0F, dtype=np.int8)
    low_nibble_odd = np.bitwise_and(tensor[..., 1::2], 0x0F, dtype=np.int8)
    # Odd nibble goes into the high half, even nibble into the low half.
    return np.bitwise_or(np.left_shift(low_nibble_odd, 4), low_nibble_even)
|
||||
|
||||
|
||||
def cal_weightonly_weight(weight, weight_bits, qmin, qmax, has_qzeros, eps: float = 1e-8):
    '''
    Quantize a floating-point weight to symmetric per-row int8 / packed int4.

    args:
        weight: tensor to quantize (2-D or 3-D, fp32/fp16/bf16)
        weight_bits: quantized bitwidth (4 or 8)
        qmin: minimum value of the quantized range
        qmax: maximum value of the quantized range
        has_qzeros: whether to also emit an (all-zero) qzeros tensor
        eps: lower clamp for the scale denominator to avoid division by zero
    returns:
        (quantized_weight, scales, qzeros) with qzeros None when disabled
    '''
    assert weight.numel() != 0, "weight should not be empty tensor"
    assert weight.dim() == 2 or weight.dim() == 3, "Invalid dim. The dim of weight should be 2 or 3"
    assert weight.dtype in [torch.float32, torch.float16, torch.bfloat16
                            ], "Invalid datatype. Weight must be torch.float32 or torch.float16 or torch.bfloat16"

    # Symmetric scale per output row: max |w| along the last dim maps to qmax.
    scales = weight.float().abs().clamp(min=eps).max(dim=-1).values / qmax
    int_weight = torch.round((weight / scales[..., None]).float()).clip(min=qmin, max=qmax).to(torch.int8)
    scales_squeezed = scales.squeeze()

    # 4-bit mode packs two adjacent int4 values into each int8 slot.
    quantized_weight = merge_adjacent_low_4bit(int_weight) if weight_bits == 4 else int_weight

    qzeros = torch.zeros_like(scales_squeezed, dtype=torch.int32) if has_qzeros else None

    return quantized_weight, scales_squeezed, qzeros
|
||||
|
||||
|
||||
def generate_weightonly_weight(act_range, name_parameters, args):
    '''
    Quantize merged hugging face weights into weight-only checkpoint tensors.

    Parameters matching the model's skip patterns, bias tensors, and tensors
    whose layer has no recorded act range are passed through unchanged; every
    other weight is replaced by qweight/scales(/qzeros) entries.

    args:
        act_range: non parallel act_range
        name_parameters: non parallel hugging face named parameters
        args: arguments from main
    '''
    weightonly_weight = {}
    has_qzeros = args.has_qzeros
    weight_bits = 8 if args.weight_only_precision == 'int8' else 4
    qmin = float(-2**(weight_bits - 1))
    qmax = float(2**(weight_bits - 1) - 1)

    for name, param in name_parameters.items():
        # Pass-through: explicitly skipped tensors and biases stay as-is.
        if should_skip(args.model_type, name):
            logger.info(f"skip {name}")
            weightonly_weight[name] = param
            continue
        if name.endswith("bias"):
            weightonly_weight[name] = param
            continue

        layer_name = ".".join(name.split(".")[:-1])
        if layer_name not in act_range:
            # No act range recorded -> not a quantizable linear layer.
            weightonly_weight[name] = param
            continue

        qweight, scales, qzeros = cal_weightonly_weight(param, weight_bits, qmin, qmax, has_qzeros)
        weightonly_weight[f'{layer_name}.qweight'] = qweight
        weightonly_weight[f'{layer_name}.scales'] = scales.to(args.torch_scales_smooth_dtype)
        if has_qzeros:
            weightonly_weight[f'{layer_name}.qzeros'] = qzeros

    return weightonly_weight
|
||||
|
||||
|
||||
def generate_weights_of_weight_only(llm: LLM, args: argparse.Namespace):
    '''
    generate weightonly weights.

    Pipeline: hook the vLLM workers to collect activation ranges and named
    parameters, tear the engine down to free device memory, merge the
    per-rank tensors into non-parallel form and save them, then quantize to
    weight-only format and save the result.

    args:
        llm: LLM instance
        args: argument from main
    '''
    tp_size = args.tp_size

    llm.llm_engine.model_executor._run_workers("setup_smooth_hook")

    # NOTE(review): hooks are installed and immediately removed with no forward
    # pass between the two calls — confirm whether calibration is expected to
    # run here (compare the smoothquant path).
    llm.llm_engine.model_executor._run_workers("remove_hooks")
    act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
    named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")

    # The engine is no longer needed; free its device memory before the merge.
    vllm_cleanup(llm)
    cleanup()

    logger.info("get act_range and named_parameters from llm finished")

    merged_act_range, merged_named_parameters, _ = convert_to_merged(act_range, named_parameters, tp_size, args)
    save_weights(merged_named_parameters, args)

    # Drop the per-rank copies before quantization to keep peak memory down.
    del act_range
    del named_parameters
    cleanup()

    logger.info("get merged_act_range and merged_named_parameters finished")

    weightonly_weight = generate_weightonly_weight(merged_act_range, merged_named_parameters, args)
    save_generate_weights(weightonly_weight, args)

    del merged_act_range
    del merged_named_parameters
    cleanup()

    logger.info("get weightonly_weight finished")

    return weightonly_weight
|
||||
Reference in New Issue
Block a user