add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,419 @@
import argparse
import os
import sys
import time
import safetensors
import logging
import json
from huggingface_hub import split_torch_state_dict_into_shards, constants
from vllm import LLM
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.config import _get_and_verify_max_len
import transformers
from transformers.modeling_utils import SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from smooth_quant import generate_weights_of_smoothquant
from weight_only import generate_weights_of_weight_only
from utils_internal import (read_model_name, load_tokenizer, torch_dtype_to_str, str_dtype_to_torch,
copy_files_except_extensions, generate_datetime, get_hf_config_sliding_window)
from utils_internal import get_skip_patterns, should_skip
from model_special import smooth_model_config
from vllm.engine.arg_utils import EngineArgs
sys.path.append(os.getcwd())
logger = logging.getLogger("smooth_convert")
def load_skip_params_from_hf(args):
    '''
    Collect the parameters that must bypass quantization from a transformers model.

    Returns a dict mapping parameter name -> tensor for every parameter whose
    name matches the model type's skip patterns; an empty dict when the model
    type declares no skip patterns at all. Exits the process when the model
    cannot be loaded with transformers.
    '''
    if not get_skip_patterns(args.model_type):
        return {}
    try:
        # Prefer the concrete transformers class when it exists; otherwise
        # fall back to the generic causal-LM auto loader.
        model_cls = getattr(transformers, args.model_name, None) or AutoModelForCausalLM
        hf_model = model_cls.from_pretrained(
            args.hf_model_dir,
            trust_remote_code=True,
            torch_dtype=args.torch_dtype,
            device_map="cpu")
    except Exception as e:
        logger.fatal(f"Unsupported model {args.model_name}, error message: {e}")
        sys.exit(1)
    skipped = {}
    for name, param in dict(hf_model.named_parameters()).items():
        if not should_skip(args.model_type, name):
            continue
        logger.info(f"load parameters from transformers, name: {name}")
        skipped[name] = param
    return skipped
def save_quantized_weights_to_safetensors(quantized_weights, args):
    '''
    Persist the quantized state dict as (possibly sharded) safetensors files.

    Shard size comes from args.max_shard_size (either a plain digit string,
    interpreted as bytes, or a unit string like "10GB"). When sharding
    happens, a SAFE_WEIGHTS_INDEX_NAME json index is also written so loaders
    can locate each tensor's shard.
    '''
    # A pure-digit string means an explicit byte count; otherwise pass the
    # unit string (e.g. "10GB") straight through to huggingface_hub.
    shard_limit = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
    split = split_torch_state_dict_into_shards(
        quantized_weights,
        filename_pattern=constants.SAFETENSORS_WEIGHTS_FILE_PATTERN,
        max_shard_size=shard_limit)
    # Write each shard as its own safetensors file.
    for shard_name, tensor_names in split.filename_to_tensors.items():
        shard = {key: quantized_weights[key] for key in tensor_names}
        safetensors.torch.save_file(shard, os.path.join(args.output_dir, shard_name), metadata={"format": "pt"})
    if not split.is_sharded:
        logger.info(f"Model weights saved in {os.path.join(args.output_dir, SAFE_WEIGHTS_NAME)}")
        return
    index = {
        "metadata": split.metadata,
        "weight_map": split.tensor_to_filename,
    }
    save_index_file = os.path.join(args.output_dir, SAFE_WEIGHTS_INDEX_NAME)
    with open(save_index_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
    logger.info(
        f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be "
        f"split in {len(split.filename_to_tensors)} checkpoint shards. You can find where "
        f"each parameters has been saved in the index located at {save_index_file}."
    )
def main(args):
    '''
    Main quantization pipeline.

    Loads the model with vLLM, generates weight-only or smoothquant
    quantized weights, merges back the parameters that must stay
    unquantized, and writes the safetensors checkpoint together with
    quantize_config.json and an updated config.json into args.output_dir.

    Args:
        args: namespace prepared by the __main__ section; must carry the
            derived fields (hf_max_model_len, torch_dtype, tokenizer, ...).
    '''
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=args.log_level,
        force=True,
    )
    # Fail fast with a clear message: without one of the two modes the code
    # below used to crash later with a NameError on quantized_weights.
    if not (args.use_weight_only or args.use_smoothquant):
        logger.fatal("No quantization mode selected: pass --use_weight_only or --use_smoothquant")
        sys.exit(1)
    tik = time.time()
    skip_params = load_skip_params_from_hf(args)
    # Create an LLM. Derived sequence limits are capped by the model's own
    # maximum context length (hf_max_model_len).
    max_model_len = max(args.max_input_length + args.output_len, 2048)
    args.max_model_len = min(max_model_len, args.hf_max_model_len)
    max_num_batched_tokens = max(max(args.max_input_length * args.batch_size, max_model_len), 2048)
    args.max_num_batched_tokens = min(max_num_batched_tokens, args.hf_max_model_len)
    llm = LLM(model=args.hf_model_dir,
              tokenizer=args.tokenizer_dir,
              tensor_parallel_size=args.tp_size,
              distributed_executor_backend='ray',
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              trust_remote_code=True,
              block_size=args.block_size,
              max_model_len=args.max_model_len,
              max_num_batched_tokens=args.max_num_batched_tokens,
              max_num_seqs=args.max_num_seqs,
              cpu_offload_gb=args.cpu_offload_gb)
    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    logger.info(f'Load vLLM model takes: {t}')
    quantize_config = {}
    if args.use_weight_only:
        quantized_weights = generate_weights_of_weight_only(llm, args)
        quantize_config['bits'] = 8 if args.weight_only_precision == "int8" else 4
        quantize_config['quant_method'] = "weightonly"
        quantize_config['quant_mode'] = "WeightOnly"
    if args.use_smoothquant:
        # NOTE: when both modes are requested, smoothquant runs last and its
        # weights/config win (same precedence as the original code).
        quantized_weights, smooth_info = generate_weights_of_smoothquant(llm, args)
        quantize_config['bits'] = 8
        quantize_config['quant_method'] = "smoothquant"
        quantize_config['quant_mode'] = "SmoothQuant"
        quantize_config['input_quant_method'] = "per_token" if args.per_token else "per_tensor"
        quantize_config['smooth_value'] = args.smooth_value
        with open(os.path.join(args.output_dir, 'smooth_info.json'), 'w') as f:
            json.dump(smooth_info, f, indent=4)
    # Should first copy other files from hf_model_dir, and then save weight, tokenizer, config, quant_config and so on
    extensions = ['.bin', '.safetensors', ".pt", ".index.json"]
    copy_files_except_extensions(args.hf_model_dir, args.output_dir, extensions)
    logger.info('copy files except extensions success')
    # Restore the parameters that must not be quantized.
    for name, param in skip_params.items():
        assert name in quantized_weights
        quantized_weights[name] = param
    save_quantized_weights_to_safetensors(quantized_weights, args)
    logger.info('save quantized_weights to safetensors success')
    with open(os.path.join(args.output_dir, 'quantize_config.json'), 'w') as f:
        json.dump(quantize_config, f, indent=4)
    from transformers.utils import CONFIG_NAME
    with open(os.path.join(args.hf_model_dir, CONFIG_NAME), 'r') as f:
        config = json.load(f)
    # Embed the quantization metadata into the exported config.json.
    config['quantization_config'] = quantize_config
    config['generate_datetime'] = generate_datetime()
    config['torch_dtype'] = args.dtype
    with open(os.path.join(args.output_dir, CONFIG_NAME), 'w') as f:
        json.dump(config, f, indent=4)
    logger.info(f'quantized {args.hf_model_dir} finished')
if __name__ == '__main__':
    # Command-line interface for the smooth/weight-only quantization converter.
    parser = argparse.ArgumentParser()
    # --- model / tokenizer locations ---
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--tokenizer_dir',
                        default=None,
                        help='tokenizer path; defaults to hf_model_dir if left unspecified')
    # NOTE(review): store_true with default=True makes this flag a no-op —
    # enforce_eager is always True; confirm whether a store_false switch
    # was intended.
    parser.add_argument(
        '--enforce_eager',
        action="store_true",
        default=True,
        help='Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model '
        'in eager mode. If False, we will use CUDA graph and eager execution in hybrid.')
    # --- dtypes ---
    parser.add_argument('--dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
    parser.add_argument('--scales_smooth_dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, scales and smooth weights use args.dtype, else use the setted dtype")
    # --- calibration dataset selection ---
    parser.add_argument(
        '--eval_task',
        type=str,
        default='summarize',
        choices=['summarize', 'summarize_long', 'code_completion', 'summarize_hg', 'text_generation', 'custom'],
        help='''eval task to decide which dataset is selected. When set to custom, you must set these options
dataset_name, dataset_revision, dataset_input_key, dataset_split to specify which dataset to use''')
    parser.add_argument("--dataset_cache_dir",
                        type=str,
                        default=None,
                        help="cache dir to load the hugging face dataset")
    parser.add_argument("--dataset_name", type=str, default=None, help="custom dataset name")
    parser.add_argument("--dataset_revision", type=str, default=None, help="custom dataset version")
    parser.add_argument("--dataset_input_key", type=str, default=None, help="custom dataset field")
    parser.add_argument("--dataset_split", type=str, default=None, help="custom dataset split")
    parser.add_argument('--log_level', type=int, default=logging.INFO)
    # --- calibration run sizing ---
    parser.add_argument('--num_samples', type=int, default=512, help='num prompt sample')
    parser.add_argument('--output_len',
                        type=int,
                        default=100,
                        help="Number of output sequences to return for the given prompt")
    parser.add_argument('--max_input_length',
                        type=int,
                        default=512,
                        help='max input length of the prompt')
    parser.add_argument('--block_size', type=int, default=-1, help='Token block size for contiguous chunks of tokens.')
    # --- sampling parameters (forwarded to vLLM) ---
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_p', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=-1)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--max_num_seqs',
                        type=int,
                        default=EngineArgs.max_num_seqs,
                        help='Maximum number of sequences per iteration.')
    # --- output checkpoint ---
    parser.add_argument('--output_dir',
                        type=str,
                        default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="10GB",
        help=("The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
              "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`)"),
    )
    # --- parallelism ---
    parser.add_argument('--tp_size', type=int, default=1, help='N-way tensor parallelism size')
    parser.add_argument('--pp_size', type=int, default=1, help='N-way pipeline parallelism size, now supported num')
    # --- quantization mode selection ---
    parser.add_argument('--use_smoothquant',
                        default=False,
                        action="store_true",
                        help='Apply smoothquant to generate weight')
    parser.add_argument("--smooth_value",
                        type=float,
                        default=0.5,
                        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
                        " to Smoothquant the model, and output int8 weights."
                        " A good first try is 0.5. Must be in [0, 1]")
    parser.add_argument('--per_channel',
                        action="store_true",
                        default=False,
                        help='By default, we use a single static scaling factor for the GEMM\'s result. '
                        'per_channel instead uses a different static scaling factor for each channel. '
                        'The latter is usually more accurate, but a little slower.')
    parser.add_argument(
        '--per_token',
        action="store_true",
        default=False,
        help='By default, we use a single static scaling factor to scale activations in the int8 range. '
        'per_token chooses at run time, and for each token, a custom scaling factor. '
        'The latter is usually more accurate, but a little slower.')
    parser.add_argument('--use_weight_only',
                        default=False,
                        action="store_true",
                        help='Quantize weights for the various GEMMs to INT4/INT8.'
                        'See --weight_only_precision to set the precision')
    parser.add_argument('--weight_only_precision',
                        const='int8',
                        type=str,
                        nargs='?',
                        default='int8',
                        choices=['int8', 'int4'],
                        help='Define the precision for the weights when using weight-only quantization.'
                        'You must also use --use_weight_only for that argument to have an impact.')
    parser.add_argument(
        '--has_qzeros',
        action="store_true",
        default=False,
        help='whether to add qzeros weight to vllm_mlu weight',
    )
    # --- model identity overrides ---
    parser.add_argument('--model_version',
                        type=str,
                        default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type',
                        type=str,
                        default=None,
                        help="Set model type to replace parsing from model_type in hf config."
                        "if set is None and parsed also None, then set as model_version")
    parser.add_argument('--no_add_special_tokens',
                        dest='add_special_tokens',
                        default=True,
                        action='store_false',
                        help="Whether or not to add special tokens")
    parser.add_argument(
        '--has_prompt_token_id',
        action="store_true",
        default=False,
        help='whether to give llm.generate prompt_token_id',
    )
    parser.add_argument(
        '--disable_fused_quantize_expert',
        action="store_true",
        default=False,
        help='''disable fused activation to quantize for unfused moe usage.
Because to fused_moe smoothquant, input_smooth has shape (hidden_size), act_smooth has shape (inner_size),
and not every expert can be routed, so we assume that all expert should use the same act_smooth by default.
You can use this option to close the assumption.'''
    )
    parser.add_argument('--prompt_file',
                        type=str,
                        default=None,
                        help="custom prompt file, should has format that each line is one string prompt,"
                        "you can refer the format of summarize_1024_prompts.csv")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=-1,
        help="batch size, used to limit max_num_batched_tokens, -1 means batch_size equals to num_samples"
    )
    parser.add_argument(
        '--cpu_offload_gb',
        type=float,
        default=0.0,
        help='''The size (GiB) of CPU memory to use for offloading the model weights.
This virtually increases the GPU memory space you can use to hold the model weights,
at the cost of CPU-GPU data transfer for every forward pass.'''
    )
    # --- debug dump switches (see dump_smooth.py) ---
    parser.add_argument(
        '--dump_prompt_token_ids',
        action="store_true",
        default=False,
        help='dump prompt_token_ids used by llm.generate ',
    )
    parser.add_argument(
        '--dump_input_ids',
        action="store_true",
        default=False,
        help='dump vllm qkv used token ids at llm running',
    )
    parser.add_argument(
        '--dump_act_range',
        action="store_true",
        default=False,
        help='dump act range which is the max hidden dim value of input, output, weigth',
    )
    parser.add_argument(
        '--dump_weights',
        action="store_true",
        default=False,
        help='dump weights of the converted model',
    )
    parser.add_argument(
        '--dump_generate_weights',
        action="store_true",
        default=False,
        help='dump generate weights of the converted model',
    )
    args = parser.parse_args()
    # --- validate and derive the remaining run configuration on args ---
    assert args.hf_model_dir, "Please set model_dir by --model_dir or --hf_model_dir"
    assert args.pp_size == 1, "Pipeline parallelism is not supported."
    # Tokenizer defaults to the model directory.
    if args.tokenizer_dir is None:
        args.tokenizer_dir = args.hf_model_dir
    # Dumping prompt token ids only makes sense when they are passed to generate.
    if args.has_prompt_token_id is False:
        args.dump_prompt_token_ids = False
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Resolve model identity from the checkpoint (CLI overrides win).
    args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
        args.hf_model_dir, args.model_version, args.model_type)
    assert args.model_type in smooth_model_config, f'''{args.model_type} hasn't supported,
please add it's infomation in model_special.py by your self'''
    args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
    hf_text_config = get_hf_text_config(args.hf_config)
    args.tie_word_embeddings = getattr(hf_text_config, "tie_word_embeddings", False)
    sliding_window_len = get_hf_config_sliding_window(hf_text_config)
    disable_sliding_window = sliding_window_len is None
    if args.model_type == 'qwen2_vl':
        # workround for qwen2_vl since _get_and_verify_max_len not supported for MRoPE
        # remove this when it is supported.
        args.hf_max_model_len = 32768
    else:
        if args.model_type == 'hunyuan' or args.model_type == 'deepseek_v2':
            disable_sliding_window = False
        args.hf_max_model_len = _get_and_verify_max_len(hf_text_config, None, disable_sliding_window, sliding_window_len)
    # batch_size == -1 means "use num_samples"; never exceed num_samples.
    if args.batch_size < 1:
        args.batch_size = args.num_samples
    args.batch_size = min(args.batch_size, args.num_samples)
    # Resolve "auto" dtypes from the checkpoint config.
    if args.dtype == "auto":
        args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
    if args.scales_smooth_dtype == "auto":
        args.scales_smooth_dtype = args.dtype
    args.torch_dtype = str_dtype_to_torch(args.dtype)
    args.torch_scales_smooth_dtype = str_dtype_to_torch(args.scales_smooth_dtype)
    args.hf_config.torch_dtype = args.torch_dtype
    args.tokenizer, args.pad_id, args.end_id = load_tokenizer(
        tokenizer_dir=args.tokenizer_dir,
        model_name=args.model_name,
        model_version=args.model_version,
    )
    tik = time.time()
    main(args)
    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    logger.info(f'Total time of converting checkpoints: {t}')

View File

@@ -0,0 +1,69 @@
import os
import argparse
from transformers import (AutoModel, AutoModelForCausalLM,
AutoModelForSeq2SeqLM, GenerationConfig)
from vllm.transformers_utils.config import get_config
from utils_internal import (read_model_name, torch_dtype_to_str, str_dtype_to_torch)
from dump_smooth import save_weights
if __name__ == '__main__':
    # Stand-alone debugging helper: load a checkpoint with transformers and
    # dump every named parameter via dump_smooth.save_weights, so raw hugging
    # face weights can be compared against converted ones.
    parser = argparse.ArgumentParser()
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--output_dir',
                        type=str,
                        default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument('--model_version',
                        type=str,
                        default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type',
                        type=str,
                        default=None,
                        help="Set model type to replace parsing from model_type in hf config."
                        "if set is None and parsed also None, then set as model_version")
    parser.add_argument('--dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
    # NOTE(review): store_true with default=True makes this flag a no-op —
    # dump_weights is always True; confirm whether a store_false switch was
    # intended.
    parser.add_argument(
        '--dump_weights',
        action="store_true",
        default=True,
        help='dump weights of the converted model',
    )
    args = parser.parse_args()
    assert args.hf_model_dir, "Please set model_dir by --model_dir or --hf_model_dir"
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Resolve model identity and dtype the same way the converter does.
    args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
        args.hf_model_dir, args.model_version, args.model_type)
    args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
    if args.dtype == "auto":
        args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
    args.torch_dtype = str_dtype_to_torch(args.dtype)
    args.hf_config.torch_dtype = args.torch_dtype
    # ChatGLM variants need a different Auto* loader class.
    if args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'glm':
        auto_model_cls = AutoModelForSeq2SeqLM
    elif args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'chatglm':
        auto_model_cls = AutoModel
    else:
        auto_model_cls = AutoModelForCausalLM
    model = auto_model_cls.from_pretrained(
        args.hf_model_dir,
        trust_remote_code=True,
        torch_dtype=args.torch_dtype)
    named_parameters = dict(model.named_parameters())
    save_weights(named_parameters, args)

View File

@@ -0,0 +1,145 @@
import torch
import os
import logging
logger = logging.getLogger(__name__)
def tensor_shape_to_string(tensor):
    '''
    Render a tensor's shape as an "AxBxC" string (empty for 0-dim tensors).
    '''
    return "x".join(str(dim) for dim in tensor.shape)
def save_prompt_token_ids(prompt_input_ids, args):
    '''
    Dump each prompt token-id tensor handed to llm.generate as a .pt file
    under <output_dir>/prompt_input_ids.

    Args:
        prompt_input_ids: prompt token-id tensors passed to llm.generate
        args: arguments from main; no-op unless args.dump_prompt_token_ids is True
    '''
    if args.dump_prompt_token_ids is not True:
        return
    output_dir = os.path.join(args.output_dir, "prompt_input_ids")
    os.makedirs(output_dir, exist_ok=True)
    for idx, tensor in enumerate(prompt_input_ids):
        str_shape = tensor_shape_to_string(tensor)
        file_path = os.path.join(output_dir, f"prompt_input_ids_{idx}_{str_shape}.pt")
        torch.save(tensor, file_path)
        logger.info(f"Saved input_ids[{idx}] to {file_path}")
def save_input_ids(input_ids, args):
    '''
    Dump the token ids fed to qkv at layer 0 as .pt files under
    <output_dir>/input_ids.

    Args:
        input_ids: input of qkv with layer0
        args: arguments from main; no-op unless args.dump_input_ids is True
            and input_ids is non-empty
    '''
    if args.dump_input_ids is not True or len(input_ids) == 0:
        return
    output_dir = os.path.join(args.output_dir, "input_ids")
    os.makedirs(output_dir, exist_ok=True)
    for idx, tensor in enumerate(input_ids):
        str_shape = tensor_shape_to_string(tensor)
        file_path = os.path.join(output_dir, f"input_ids_{idx}_{str_shape}.pt")
        torch.save(tensor, file_path)
        logger.info(f"Saved input_ids[{idx}] to {file_path}")
def save_act_range(act_range, args):
    '''
    Dump the activation ranges collected while the model runs, one .pt file
    per tensor, under <output_dir>/act_range.

    Args:
        act_range: mapping layer_name -> {key -> tensor or other value};
            non-tensor entries are skipped
        args: arguments from main; no-op unless args.dump_act_range is True
    '''
    if args.dump_act_range is not True:
        return
    output_dir = os.path.join(args.output_dir, "act_range")
    os.makedirs(output_dir, exist_ok=True)
    for layer_name, layer_scale in act_range.items():
        for key, value in layer_scale.items():
            if not isinstance(value, torch.Tensor):
                continue
            str_shape = tensor_shape_to_string(value)
            file_path = os.path.join(output_dir, f'{layer_name}_{key}_{str_shape}.pt')
            torch.save(value, file_path)
            logger.info(f"Saved act_range[{layer_name}][{key}] to {file_path}")
def save_weights(weights, args):
    '''
    Dump hugging face weights (merged with the llm model named parameters)
    as .pt files under <output_dir>/weights.

    Args:
        weights: mapping parameter name -> tensor
        args: arguments from main; no-op unless args.dump_weights is True
    '''
    if args.dump_weights is not True:
        return
    output_dir = os.path.join(args.output_dir, "weights")
    os.makedirs(output_dir, exist_ok=True)
    for key, value in weights.items():
        str_shape = tensor_shape_to_string(value)
        file_path = os.path.join(output_dir, f'{key}_{str_shape}.pt')
        torch.save(value, file_path)
        logger.info(f"Saved weights[{key}] to {file_path}")
def save_generate_weights(weights, args):
    '''
    Dump the generated (quantized) weights of smoothquant or weightonly
    as .pt files under <output_dir>/generate_weights.

    Args:
        weights: mapping parameter name -> tensor
        args: arguments from main; no-op unless args.dump_generate_weights is True
    '''
    if args.dump_generate_weights is not True:
        return
    output_dir = os.path.join(args.output_dir, "generate_weights")
    os.makedirs(output_dir, exist_ok=True)
    for key, value in weights.items():
        str_shape = tensor_shape_to_string(value)
        file_path = os.path.join(output_dir, f'{key}_{str_shape}.pt')
        torch.save(value, file_path)
        logger.info(f"Saved generate weights[{key}] to {file_path}")
def dump_save_x_y(name, x, y, index, output_dir="output_dir"):
    '''
    Save a layer's input (x) and output (y) tensors during inference for
    offline inspection.

    Args:
        name: layer name used as the file-name prefix
        x: layer input; when it is a tuple only the first element is saved
        y: layer output
        index: call index to distinguish repeated invocations of the layer
        output_dir: root directory for the dumps. Defaults to "output_dir"
            for backward compatibility with the previous hard-coded path.
    Existing files are never overwritten, so only the first call per
    (name, index) pair is recorded.
    '''
    x_output_dir = os.path.join(output_dir, "x_tensor")
    y_output_dir = os.path.join(output_dir, "y_tensor")
    os.makedirs(x_output_dir, exist_ok=True)
    os.makedirs(y_output_dir, exist_ok=True)
    x_file_name = os.path.join(x_output_dir, f"{name}_x_{index}.pt")
    y_file_name = os.path.join(y_output_dir, f"{name}_y_{index}.pt")
    if isinstance(x, tuple):
        x = x[0]
    if not os.path.exists(x_file_name):
        torch.save(x.cpu(), x_file_name)
    if not os.path.exists(y_file_name):
        torch.save(y.cpu(), y_file_name)

View File

@@ -0,0 +1,140 @@
import torch
def make_context(
    tokenizer,
    query,
    history,
    system,
    max_input_length,
    max_window_size: int = 6144,
    chat_format: str = "chatml",
):
    '''
    Tokenize one chat context into (raw_text, token_ids).

    Args:
        tokenizer: model tokenizer (for chatml it must expose im_start_id /
            im_end_id and an encode(..., allowed_special=...) API)
        query: current text context (the user's query)
        history: list of (query, response) pairs; None is treated as empty
        system: system prompt text
        max_input_length: max length of the returned token-id list;
            truncation drops tokens from the front
        max_window_size: token budget for system prompt plus history turns
        chat_format: "chatml" builds a <|im_start|>/<|im_end|> prompt,
            "raw" tokenizes query as-is; anything else raises
            NotImplementedError
    Returns:
        (raw_text, context_tokens), with context_tokens truncated to the
        last max_input_length ids.
    '''
    if history is None:
        history = []
    if chat_format == "chatml":
        im_start, im_end = "<|im_start|>", "<|im_end|>"
        im_start_tokens = [tokenizer.im_start_id]
        im_end_tokens = [tokenizer.im_end_id]
        nl_tokens = tokenizer.encode("\n")

        def _tokenize_str(role, content):
            '''
            Return ("role\\ncontent", ids) for one chat message body.
            '''
            return (f"{role}\n{content}", tokenizer.encode(
                role,
                allowed_special=set(),
            ) + nl_tokens + tokenizer.encode(
                content,
                allowed_special=set(),
            ))

        system_text, system_tokens_part = _tokenize_str("system", system)
        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
        raw_text = ""
        context_tokens = []
        # Walk history from most recent to oldest, prepending turns while the
        # (system + history) token count stays below max_window_size.
        for turn_query, turn_response in reversed(history):
            query_text, query_tokens_part = _tokenize_str("user", turn_query)
            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
            response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
            prev_chat = (f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}")
            current_context_size = (len(system_tokens) + len(next_context_tokens) + len(context_tokens))
            if current_context_size < max_window_size:
                context_tokens = next_context_tokens + context_tokens
                raw_text = prev_chat + raw_text
            else:
                break
        context_tokens = system_tokens + context_tokens
        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
        # Append the current user query and open an assistant turn for generation.
        context_tokens += (nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens +
                           im_start_tokens + tokenizer.encode("assistant") + nl_tokens)
        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
    elif chat_format == "raw":
        raw_text = query
        context_tokens = tokenizer.encode(raw_text)
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")
    # truncate to max_input_length, truncate from the front
    return raw_text, context_tokens[-max_input_length:]
def prepare_inputs(batch_input_texts,
                   tokenizer,
                   model_name,
                   model_version,
                   test_token_num,
                   eval_task='summarize',
                   add_special_tokens=True):
    '''
    Tokenize a batch of prompt strings into a list of 1-D token-id tensors.

    Args:
        batch_input_texts: batch of prompt strings
        tokenizer: model tokenizer
        model_name: model class name, selects model-specific prompting
        model_version: model version string
        test_token_num: maximum number of tokens kept per prompt
        eval_task: eval task; 'summarize' appends a ' TL;DR: ' suffix
        add_special_tokens: forwarded to tokenizer.encode on the default path
    Returns:
        list of token-id tensors, one per input text
    '''
    suffix = ' TL;DR: ' if eval_task == 'summarize' else ''
    batch_input_ids = []
    for raw_text in batch_input_texts:
        curr_text = (raw_text + suffix).strip().replace(" n't", "n't")
        is_qwen = 'qwen' in model_name.lower()
        # The branches below keep compatibility with the original code paths.
        if 'GLM' in model_name and model_version in ['chatglm2', 'chatglm3']:
            ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0)[:test_token_num]
        elif is_qwen and model_version == 'qwen':
            # first-generation qwen: build the chatml prompt via make_context
            system_prompt = ("You are a useful assistant, please directly output the corresponding "
                            "summary according to the article entered by the user.")
            _, id_list = make_context(
                tokenizer=tokenizer,
                query=curr_text,
                history=[],
                system=system_prompt,
                max_input_length=test_token_num,
            )
            ids = torch.tensor(id_list)
        else:
            if is_qwen and 'qwen2' in model_version:
                # qwen2 family: wrap the text with the chat template first
                messages = [{
                    "role": "system",
                    "content": "You are a helpful assistant."
                }, {
                    "role": "user",
                    "content": curr_text
                }]
                curr_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True)
            ids = tokenizer.encode(curr_text,
                                   return_tensors='pt',
                                   add_special_tokens=add_special_tokens,
                                   truncation=True,
                                   max_length=test_token_num).squeeze(0)
        batch_input_ids.append(ids)
    return batch_input_ids

View File

@@ -0,0 +1,206 @@
import re
# Per-model quantization layout description, keyed by model_type.
# For each entry:
#   qkv_list:      linear layer names forming the (possibly packed) qkv projection
#   gate_up_list:  mlp layer names smoothed together as the gate/up projection
#   is_gate_up:    whether gate and up act as a fused gate_up pair
#   moe_list:      expert-layer naming for MoE models (None for dense models);
#                  contains its own gate_up_list / down_list and an is_merged flag
#   skip_patterns: (optional) regexes of parameter names excluded from quantization
smooth_model_config = {
    "mllama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "llama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen2_vl": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None,
        # the vision tower is left unquantized
        "skip_patterns": [r"^visual\.*"]
    },
    "qwen2": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen": {
        "qkv_list": ["c_attn"],
        "gate_up_list": ["w2", "w1"],
        "is_gate_up": True,
        "moe_list": None
    },
    "baichuan": {
        "qkv_list": ["W_pack"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "chatglm": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "gpt_neox": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": [],
        "is_gate_up": True,
        "moe_list": None
    },
    "mixtral": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["w1", "w3"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
            "down_list": ["block_sparse_moe.w2", "w2"],
            "is_merged": True
        }
    },
    "qwen2_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "deepseek_v2": {
        "qkv_list": ["q_proj", "q_b_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        },
        "skip_patterns": [r".*\.kv_b_proj\..*",]
    },
    "falcon": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "bloom": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": False,
        "moe_list": None
    },
    "internlm2": {
        "qkv_list": ["wqkv"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "hunyuan": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "phi3": {
        "qkv_list": ["qkv_proj"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
}
def get_layer_weight_bias_name(model_type, layer_name):
    '''
    Resolve a layer name to its weight/bias parameter names.

    By default parameters follow the "<layer>.weight" / "<layer>.bias"
    convention (vllm obeys this rule since 0.5.3), so only model-specific
    exceptions are handled explicitly: for hunyuan, lm_head is redirected
    to the embedding parameters.

    Returns:
        (layer_name, weight_name, bias_name)
    '''
    # layers which need to be modified can be listed here
    if model_type == "hunyuan" and "lm_head" in layer_name:
        return ("model.embed_tokens",
                "model.embed_tokens.weight",
                "model.embed_tokens.bias")
    return layer_name, f"{layer_name}.weight", f"{layer_name}.bias"
def modify_layer_weight_bias_name(model_type, named_parameters):
    '''
    Rename parameters whose vllm layer name differs from the hugging face
    layer name. Mutates named_parameters in place.
    '''
    # Mapping for model type specific adjustments: old vllm key -> hf key.
    renames = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }
    for old_key, new_key in renames.get(model_type, {}).items():
        if old_key in named_parameters:
            named_parameters[new_key] = named_parameters.pop(old_key)
def extract_numbers(string):
    '''
    Return the last integer embedded in the string, or 0 when it contains
    no digits (e.g. "chatglm2" -> 2, "glm" -> 0).
    '''
    # Find every digit run with a regex and keep only the final one.
    matches = re.findall(r'\d+', string)
    return int(matches[-1]) if matches else 0
def get_qkv_distribution(model_type, model_version, hf_config):
    '''
    Report whether the packed qkv weight layout is n3sh instead of 3nsh.

    n3sh: [head_num, 3, head_size, hidden_size]
    3nsh: [3, head_num, head_size, hidden_size]
    vllm packs qkv as 3nsh by default; the models listed here store n3sh in
    their hugging face checkpoints, so the converter transposes 3nsh back to
    n3sh to match the hugging face layout. Only relevant for packed qkv layers.

    Returns:
        (is_n3sh, head_num, kv_head_num); (False, 0, 0) when no conversion
        is needed.
    '''
    first_gen_chatglm = model_type == "chatglm" and extract_numbers(model_version) == 0
    if first_gen_chatglm or model_type in ["bloom", "gpt_neox"]:
        heads = hf_config.num_attention_heads
        return True, heads, heads
    if model_type == "falcon":
        heads = hf_config.num_attention_heads
        if hf_config.new_decoder_architecture:
            kv_heads = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_heads = 1
        else:
            kv_heads = heads
        return True, heads, kv_heads
    return False, 0, 0

View File

@@ -0,0 +1,418 @@
import argparse
import torch
from datasets import load_dataset
import logging
import csv
import os
from vllm import LLM, SamplingParams
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
from input_context import prepare_inputs
from dump_smooth import save_prompt_token_ids, save_input_ids, save_act_range, save_weights, save_generate_weights
from model_special import smooth_model_config
logger = logging.getLogger(__name__)
def load_prompts_from_csv(args):
    '''
    Load calibration prompts from a CSV file.

    Uses args.prompt_file when given, otherwise falls back to the bundled
    summarize_1024_prompts.csv next to this script.  Returns at most
    args.num_samples prompts (taken from the first CSV column).
    '''
    if args.prompt_file is not None:
        prompt_file = args.prompt_file
    else:
        current_dir = os.path.dirname(__file__)
        prompt_file = os.path.join(current_dir, 'summarize_1024_prompts.csv')
    # Read the column-oriented CSV back into a flat list of prompts.
    loaded_prompts = []
    # Transpose the rows and keep the first column.
    # NOTE(review): an empty CSV raises IndexError here — presumably inputs
    # are always non-empty; confirm.
    with open(prompt_file, 'r', newline='') as file:
        reader = csv.reader(file)
        loaded_prompts = list(zip(*reader))[0]
    loaded_prompts = list(loaded_prompts)
    num_samples = min(args.num_samples, len(loaded_prompts))
    prompts = loaded_prompts[0:num_samples]
    return prompts
def save_summarize_1024_prompts_as_csv(prompts):
    '''
    Persist the summarize prompts to summarize_1024_prompts.csv in the
    current directory, one prompt per row (read back column-wise by
    load_prompts_from_csv).
    '''
    with open('summarize_1024_prompts.csv', 'w', newline='') as file:
        # Each prompt becomes a single-field row.
        csv.writer(file).writerows((prompt,) for prompt in prompts)
def generate_prompts(args: argparse.Namespace):
    '''
    Generate calibration prompts based on the evaluation task and arguments.

    Built-in eval tasks map to known datasets; eval_task == "custom" requires
    args.dataset_name / dataset_input_key / dataset_split to be set.  When
    args.has_prompt_token_id is set, prompts are also tokenized up front and
    the token ids are dumped for replay.
    returns:
        (prompts, prompt_token_ids) — either may be None when empty
    '''
    # Per-task dataset descriptors: source dataset, revision, text field, split.
    eval_task_config = {
        "code_completion": {
            "dataset_name": "openai_humaneval",
            "dataset_revision": None,
            "dataset_input_key": "prompt",
            "dataset_split": "test"
        },
        "summarize": {
            "dataset_name": "ccdv/cnn_dailymail",
            "dataset_revision": "3.0.0",
            "dataset_input_key": "article",
            "dataset_split": "train"
        },
        "summarize_long": {
            "dataset_name": "tau/zero_scrolls",
            "dataset_revision": "squality",
            "dataset_input_key": "input",
            "dataset_split": "validation"
        },
        "summarize_hg": {
            "dataset_name": "cnn_dailymail",
            "dataset_revision": "3.0.0",
            "dataset_input_key": "article",
            "dataset_split": "validation"
        },
        "text_generation": {
            "dataset_name": "lambada",
            "dataset_revision": None,
            "dataset_input_key": "text",
            "dataset_split": "validation"
        }
    }
    if args.eval_task in eval_task_config:
        config = eval_task_config[args.eval_task]
        dataset_name = config["dataset_name"]
        dataset_revision = config["dataset_revision"]
        dataset_input_key = config["dataset_input_key"]
        dataset_split = config["dataset_split"]
    else:
        # Custom task: the dataset must be fully specified on the command line.
        assert args.dataset_name is not None, f"dataset_name is None when eval_task == custom"
        assert args.dataset_input_key is not None, f"dataset_input_key is None when eval_task == custom"
        assert args.dataset_split is not None, f"dataset_split is None when eval_task == custom"
        dataset_name = args.dataset_name
        dataset_revision = args.dataset_revision
        dataset_input_key = args.dataset_input_key
        dataset_split = args.dataset_split
    # Prefer the local CSV for user-supplied files and the common summarize case.
    if args.prompt_file is not None or (args.eval_task == "summarize" and args.num_samples <= 1024):
        prompts = load_prompts_from_csv(args)
        num_samples = min(args.num_samples, len(prompts))
    else:
        dataset = load_dataset(dataset_name,
                               dataset_revision,
                               cache_dir=args.dataset_cache_dir,
                               split=dataset_split,
                               trust_remote_code=True)
        num_samples = min(args.num_samples, len(dataset))
        prompts = dataset[0:num_samples][dataset_input_key]
    # save_summarize_1024_prompts_as_csv(prompts)
    prompt_token_ids = []
    if args.has_prompt_token_id:
        # Tokenize up front so the exact calibration ids can be dumped and replayed.
        batch_input_ids = prepare_inputs(prompts,
                                         args.tokenizer,
                                         args.model_name,
                                         args.model_version,
                                         args.max_input_length,
                                         eval_task=args.eval_task,
                                         add_special_tokens=args.add_special_tokens)
        save_prompt_token_ids(batch_input_ids, args)
        for i in range(num_samples):
            prompt_token_ids.append(batch_input_ids[i].tolist())
    if len(prompts) == 0:
        prompts = None
    else:
        # Character-level truncation; token-level truncation happens in prepare_inputs.
        prompts = [s[:args.max_input_length] for s in prompts]
    if len(prompt_token_ids) == 0:
        prompt_token_ids = None
    return prompts, prompt_token_ids
@torch.no_grad()
def get_smooth_cal_weight(name, weight, name_parameters, act_range, model_type):
    '''
    Build the weight used to derive the smoothing factor.  vLLM fuses q/k/v
    (and gate/up) into one linear layer, so the smoother must be computed
    from the concatenation of the corresponding HuggingFace weights; plain
    layers simply use their own weight.
    args:
        name: weight name
        weight: weight tensor for *name*
        name_parameters: all named parameters
        act_range: activation-range record for this layer (is_qkv/is_merge flags)
        model_type: model type key into smooth_model_config
    '''
    if act_range["is_qkv"] is True:
        prefix = ".".join(name.split(".")[:-2])
        qkv_list = smooth_model_config[model_type]["qkv_list"]
        fused_parts = [
            name_parameters[f"{prefix}.{qkv_list[0]}.weight"],
            name_parameters[f"{prefix}.{qkv_list[1]}.weight"],
            name_parameters[f"{prefix}.{qkv_list[2]}.weight"],
        ]
        return torch.cat(fused_parts, dim=0)
    if act_range["is_merge"] is True:
        prefix = ".".join(name.split(".")[:-2])
        gate_up_list = smooth_model_config[model_type]["gate_up_list"]
        fused_parts = [
            name_parameters[f"{prefix}.{gate_up_list[0]}.weight"],
            name_parameters[f"{prefix}.{gate_up_list[1]}.weight"],
        ]
        return torch.cat(fused_parts, dim=0)
    return weight
@torch.no_grad()
def cal_smoother(weight, act_range_x, alpha=0.5):
    '''
    Compute the per-channel SmoothQuant smoother
    s = act_max^alpha / weight_max^(1-alpha), clamped away from zero.
    args:
        weight: weight tensor whose last dim matches act_range_x
        act_range_x: activation max value per channel
        alpha: migration strength, default 0.5
    '''
    assert weight.shape[-1] == act_range_x.numel()
    # Per-input-channel absolute max of the flattened weight.
    w_max = weight.view(-1, weight.shape[-1]).abs().max(dim=0)[0]
    w_max = w_max.to(float).clamp(min=1e-6)
    a_max = act_range_x.to(w_max.device).to(float)
    return (a_max.pow(alpha) / w_max.pow(1 - alpha)).clamp(min=1e-6)
@torch.no_grad()
def cal_qweight_scales(sweight, smooth_act_range_x, per_token, per_channel):
    '''
    Quantize a smoothed weight to int8 and derive the dequantization scales.
    args:
        sweight: weight which has been multiplied by the smoother value
        smooth_act_range_x: activation max value which has been divided by the smoother value
        per_token: bool, means whether the activation scale is applied dynamically per token
        per_channel: bool, means whether weight scales are kept per output channel
    returns:
        (qweight, per_channel_scale, scale_to_int, sinfo) — sinfo logs the
        max weight/activation scales for debugging
    '''
    # Single per-tensor activation scale (int8 range is [-128, 127]).
    scale_x_quant_orig_t = smooth_act_range_x.max() / 127.0
    smooth_act_range_w = sweight.abs().max(dim=-1)[0]
    smooth_act_range_w = smooth_act_range_w.to(float).clamp(min=1e-6)
    # Per-channel (_c) and per-tensor (_t) weight scales.
    scale_w_quant_orig_c = smooth_act_range_w / 127.0
    scale_w_quant_orig_t = smooth_act_range_w.max() / 127
    if per_channel:
        qweight = (sweight / scale_w_quant_orig_c[..., None])
    else:
        qweight = (sweight / scale_w_quant_orig_t)
    qweight = qweight.clip(-128, 127).to(torch.int8)
    scale_to_int = 1 / scale_x_quant_orig_t
    if per_token:
        # Dynamic activation quantization: only the weight scale is baked in.
        if per_channel:
            per_channel_scale = scale_w_quant_orig_c
        else:
            per_channel_scale = scale_w_quant_orig_t
    else:
        # Static quantization: fold the activation scale into the weight scale.
        if per_channel:
            per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_c
            hidden_size = smooth_act_range_x.numel()
            scale_to_int = scale_to_int.repeat(hidden_size)
        else:
            per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_t
    per_channel_scale = per_channel_scale.squeeze()
    # Keep scalar results as 1-element tensors so downstream code can index them.
    if per_channel_scale.numel() == 1 and per_channel_scale.dim() == 0:
        per_channel_scale = per_channel_scale.unsqueeze(0)
    if scale_to_int.numel() == 1 and scale_to_int.dim() == 0:
        scale_to_int = scale_to_int.unsqueeze(0)
    sinfo = [
        scale_w_quant_orig_t.item(), scale_x_quant_orig_t.item(),
        scale_w_quant_orig_t.item() / scale_x_quant_orig_t.item()
    ]
    return qweight, per_channel_scale, scale_to_int, sinfo
def check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int):
    '''
    Log an error when any quantization artifact contains inf or nan.
    qzeros may be None (only produced when has_qzeros is enabled).
    '''
    def _has_bad_values(tensor):
        # True when the tensor holds any non-finite value.
        return bool(torch.isinf(tensor).any() or torch.isnan(tensor).any())

    labeled_tensors = [
        (qweight, "qweight"),
        (per_channel_scale, "per_channel_scale"),
        (smooth, "smooth"),
        (scale_to_int, "scale_to_int"),
    ]
    for tensor, label in labeled_tensors:
        if _has_bad_values(tensor):
            logger.error(f"name:{name} {label} has inf or nan")
    if qzeros is not None and _has_bad_values(qzeros):
        logger.error(f"name:{name} qzeros has inf or nan")
@torch.no_grad()
def cal_smooth_weight(name, act_range_x, weight, smooth_value, has_qzeros, per_token, per_channel, cal_weight):
    '''
    calculate qweight, scales, smooth, qzeros for one layer
    args:
        name: weight name
        act_range_x: activation max value of per channel
        weight: weight to be quantized
        smooth_value: smooth factor alpha forwarded to cal_smoother
        has_qzeros: whether to also emit an (all-zero) qzeros tensor
        per_token: bool, means whether calculate the weight and scales dynamically
        per_channel: bool, means whether calculate the weight and scales by channel
        cal_weight: weight used to derive the smoother (may be the fused
            qkv / gate_up concatenation, see get_smooth_cal_weight)
    '''
    smoother = cal_smoother(cal_weight, act_range_x, smooth_value)
    # Migrate activation magnitude into the weight: x/s and w*s.
    smooth_act_range_x = act_range_x / smoother
    sweight = weight * (smoother.view(1, -1))
    qweight, per_channel_scale, scale_to_int, sinfo = cal_qweight_scales(sweight, smooth_act_range_x, per_token,
                                                                         per_channel)
    qweight = qweight.reshape(weight.shape)
    # The runtime applies the inverse smoother to the activations.
    smooth = 1 / smoother
    smooth = smooth.squeeze()
    if has_qzeros:
        qzeros = torch.zeros_like(per_channel_scale, dtype=torch.int32)
    else:
        qzeros = None
    # check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int)
    return qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo
@torch.no_grad()
def generate_smooth_weight(act_range, name_parameters, args):
    '''
    generate smooth weight
    args:
        act_range: act_range collected in model running
        name_parameters: hugging face model named parameters
        args: argument from main
    returns:
        (smooth_weight, smooth_info): quantized state dict and per-layer scale log
    '''
    smooth_weight = {}
    smooth_info = {}
    has_qzeros = args.has_qzeros
    smooth_value = args.smooth_value
    smooth_info["title"] = ["max_scale_w, max_scale_x, max_scale_w/max_scale_x"]
    for name, param in name_parameters.items():
        # Layers on the model's skip list are kept unquantized.
        if should_skip(args.model_type, name):
            logger.info(f"skip {name}")
            smooth_weight[name] = param
            continue
        # Biases are never quantized.
        if name.endswith("bias"):
            smooth_weight[name] = param
            continue
        name_parts = name.split(".")
        layer_name = ".".join(name_parts[:-1])
        if layer_name in act_range:
            act_range_x = act_range[layer_name]['x']
            cal_weight = get_smooth_cal_weight(name, param, name_parameters, act_range[layer_name], args.model_type)
            qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo = cal_smooth_weight(
                name, act_range_x, param, smooth_value, has_qzeros, args.per_token, args.per_channel, cal_weight)
            per_channel_scale = per_channel_scale.to(args.torch_scales_smooth_dtype)
            smooth = smooth.to(args.torch_scales_smooth_dtype)
            scale_to_int = scale_to_int.to(args.torch_scales_smooth_dtype)
            smooth_weight[f'{layer_name}.qweight'] = qweight
            smooth_weight[f'{layer_name}.per_channel_scale'] = per_channel_scale
            if args.per_token is True:
                smooth_weight[f'{layer_name}.smooth'] = smooth
            else:
                # Static mode folds the smoother into the activation quant scale.
                scale_to_int = scale_to_int * smooth
                smooth_weight[f'{layer_name}.scale_to_int'] = scale_to_int
            if has_qzeros:
                smooth_weight[f'{layer_name}.qzeros'] = qzeros
            smooth_info[name] = sinfo
        else:
            # Layers without a collected activation range stay in full precision.
            smooth_weight[name] = param
    return smooth_weight, smooth_info
def generate_weights_of_smoothquant(llm: LLM, args: argparse.Namespace):
    '''
    generate smoothquant weights
    args:
        llm: LLM instance
        args: argument from main
    returns:
        (smooth_weight, smooth_info)
    '''
    prompts, prompt_token_ids = generate_prompts(args)
    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=args.output_len,
                                     repetition_penalty=args.repetition_penalty,
                                     temperature=args.temperature,
                                     top_p=args.top_p,
                                     top_k=args.top_k)
    tp_size = args.tp_size
    # Install hooks that record per-layer activation maxima during generation.
    llm.llm_engine.model_executor._run_workers("setup_smooth_hook", args.dump_input_ids)
    llm.generate(prompts, sampling_params, prompt_token_ids=prompt_token_ids, use_tqdm=True)
    logger.info("llm generate finished")
    llm.llm_engine.model_executor._run_workers("remove_hooks")
    act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
    named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")
    # Free the engine before the memory-heavy merge steps below.
    vllm_cleanup(llm)
    del prompts
    del prompt_token_ids
    cleanup()
    logger.info("get act_range and named_parameters from llm finished")
    merged_act_range, merged_named_parameters, input_id_list = convert_to_merged(act_range, named_parameters, tp_size,
                                                                                args)
    save_input_ids(input_id_list, args)
    save_act_range(merged_act_range, args)
    save_weights(merged_named_parameters, args)
    del act_range
    del named_parameters
    cleanup()
    logger.info("get merged_act_range and merged_named_parameters finished")
    smooth_weight, smooth_info = generate_smooth_weight(merged_act_range, merged_named_parameters, args)
    save_generate_weights(smooth_weight, args)
    del merged_act_range
    del merged_named_parameters
    cleanup()
    logger.info("get smooth_weight finished")
    return smooth_weight, smooth_info

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,713 @@
from collections import defaultdict, OrderedDict
import torch
from pathlib import Path
from typing import Optional
import re
import os
import shutil
import logging
import json
from transformers import AutoTokenizer, T5Tokenizer
import gc
from datetime import datetime
from vllm.platforms import current_platform
from model_special import (smooth_model_config, get_layer_weight_bias_name, get_qkv_distribution,
modify_layer_weight_bias_name)
logger = logging.getLogger(__name__)
_str_to_torch_dtype_dict = dict(
bfloat16=torch.bfloat16,
float16=torch.float16,
float32=torch.float32,
int64=torch.int64,
int32=torch.int32,
int8=torch.int8,
bool=torch.bool,
fp8=torch.float8_e4m3fn,
)
def str_dtype_to_torch(dtype):
'''
convert torch dytpe to str dtype
'''
ret = _str_to_torch_dtype_dict.get(dtype)
dtype = ret if ret is not None else torch.float16
return dtype
_torch_dtype_to_str_dict = {
torch.bfloat16:"bfloat16",
torch.float16:"float16",
torch.float32:"float32",
torch.int64:"int64",
torch.int32:"int32",
torch.int8:"int8",
torch.bool:"bool",
torch.float8_e4m3fn:"fp8",
}
def torch_dtype_to_str(dtype):
'''
convert str dytpe to torch dtype
'''
ret = _torch_dtype_to_str_dict.get(dtype)
dtype = ret if ret is not None else "float16"
return dtype
def extract_model_path(name_or_path):
    '''
    Derive (model_version, model_family) from the config.json _name_or_path
    field, e.g. "chatglm2-6b" -> ("chatglm2", "chatglm").
    '''
    # Ordered from most to least specific; the first match wins.
    patterns = (
        r"/(.*)(-[0-9]+[mMbB]{1})(-*.*)",
        r"/(.*-[0-9]+)(-*.*)",
        r"(.*)(-[0-9]+[mMbB]{1})(-*.*)",
        r"(.*-[0-9]+)(-*.*)",
        r"([^-]+)(-*.*)",
    )
    model_version = next(
        (m.group(1) for m in (re.search(p, name_or_path) for p in patterns) if m),
        name_or_path,
    )
    model_version = model_version.lower()
    # NOTE(review): [a-zA-z] also matches "[\]^_`" (so "_" is included) —
    # possibly intended [a-zA-Z]; kept as-is to preserve behavior.
    family_match = re.search(r"([a-zA-z]+)(.*)", model_version)
    model_family = family_match.group(1) if family_match else model_version
    return model_version, model_family
def read_model_name(model_dir: str, model_version: Optional[str] = None, model_type: Optional[str] = None):
    '''
    Read model identity from <model_dir>/config.json.
    args:
        model_dir: directory containing config.json
        model_version: CLI override; derived from the config when None
        model_type: CLI override; read from the config when None
    returns:
        (model_arch, model_version, model_family, model_type) — all
        lower-cased except model_arch
    raises:
        AssertionError when architecture/version/family cannot be determined
    '''
    # Explicit encoding: config.json is UTF-8 regardless of locale.
    with open(Path(model_dir) / "config.json", 'r', encoding='utf-8') as f:
        config = json.load(f)
    model_arch = config.get('architectures', None)
    name_or_path = config.get('_name_or_path', None)
    if model_type is None:
        model_type = config.get('model_type', None)
    if model_type:
        model_type = model_type.lower()
    model_family = None
    # Prefer deriving version/family from _name_or_path, then fall back to model_type.
    if model_version is None and name_or_path:
        model_version, model_family = extract_model_path(name_or_path)
    if model_version is None:
        model_version = model_type
    if model_version:
        model_version = model_version.lower()
    if model_version and model_family is None:
        # NOTE(review): [a-zA-z] also matches "[\]^_`" (so "_" is included) —
        # possibly intended [a-zA-Z]; kept as-is to preserve behavior.
        match = re.search(r"([a-zA-z]+)(.*)", model_version)
        model_family = match.group(1) if match else model_version
    if isinstance(model_arch, (list, tuple)) and len(model_arch) > 0:
        model_arch = model_arch[0]
    assert model_arch, "read model architectures failed"
    assert model_version, "read model version failed, please set args.version manually"
    assert model_family, "read model family failed, please set args.version manually"
    return model_arch, model_version, model_family, model_type
def load_tokenizer(tokenizer_dir: Optional[str] = None,
                   vocab_file: Optional[str] = None,
                   model_name: str = 'GPTForCausalLM',
                   model_version: Optional[str] = None,
                   tokenizer_type: Optional[str] = None):
    '''
    Load the tokenizer for a model and resolve its pad/end token ids.
    args:
        tokenizer_dir: tokenizer directory (HuggingFace layout)
        vocab_file: sentencepiece vocabulary file; when given, the tokenizer
            is built from it instead of tokenizer_dir
        model_name: model architecture name (drives special-case handling)
        model_version: model version (accepted for interface compatibility)
        tokenizer_type: Tokenizer type to be loaded.
    returns:
        (tokenizer, pad_id, end_id)
    '''
    if vocab_file is None:
        use_fast = True
        if tokenizer_type == "llama":
            use_fast = False
        # Should set both padding_side and truncation_side to be 'left'
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                  legacy=False,
                                                  padding_side='left',
                                                  truncation_side='right',
                                                  trust_remote_code=True,
                                                  tokenizer_type=tokenizer_type,
                                                  use_fast=use_fast)
    elif model_name == 'GemmaForCausalLM':
        from transformers import GemmaTokenizer
        # Initialize tokenizer from vocab file.
        tokenizer = GemmaTokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
    else:
        # For gpt-next, directly load from tokenizer.model
        tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
    if model_name == 'QWenForCausalLM':
        # Qwen stores its special token ids in generation_config.json.
        with open(Path(tokenizer_dir) / "generation_config.json", encoding='utf-8') as f:
            gen_config = json.load(f)
        chat_format = gen_config['chat_format']
        assert chat_format in ('raw', 'chatml'), f"unknown chat format: {chat_format}"
        pad_id = gen_config['pad_token_id']
        end_id = gen_config['eos_token_id']
    elif model_name in ('ChatGLMForCausalLM', 'glm'):
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eop_token_id
    else:
        # Fall back to eos as pad when the tokenizer defines no pad token.
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eos_token_id
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except Exception as e:
        # logger.warn is a deprecated alias of logger.warning.
        logger.warning(f"set pad_token with exception:{e}")
    return tokenizer, pad_id, end_id
def merge_qkv_weight(named_parameters, weight_name, tp_size, q_proj_size, num_kv_head_replicas):
    '''
    Reassemble the full (non-parallel) q/k/v tensors from per-rank shards of
    a fused qkv parameter.  Works identically for weights and biases.
    args:
        named_parameters: per-rank list of parameter dicts
        weight_name: fused qkv parameter name
        tp_size: tensor parallel size
        q_proj_size: per-rank size of the query projection
        num_kv_head_replicas: how many ranks share one kv shard; only the
            first rank of each replica group contributes its k/v slice
    '''
    fused_rows = named_parameters[0][weight_name].shape[0]
    # Remaining rows after q are split evenly between k and v.
    kv_proj_size = (fused_rows - q_proj_size) // 2
    section_sizes = [q_proj_size, kv_proj_size, kv_proj_size]
    q_parts, k_parts, v_parts = [], [], []
    for rank in range(tp_size):
        q_part, k_part, v_part = torch.split(named_parameters[rank][weight_name], section_sizes, dim=0)
        q_parts.append(q_part)
        # Skip replicated kv shards so they appear exactly once.
        if rank % num_kv_head_replicas == 0:
            k_parts.append(k_part)
            v_parts.append(v_part)
    return torch.cat(q_parts, dim=0), torch.cat(k_parts, dim=0), torch.cat(v_parts, dim=0)
def merge_merged_weight(named_parameters, weight_name, tp_size, dim=0):
    '''
    Reassemble the full gate and up tensors from per-rank shards of a fused
    gate_up parameter.  Each rank stores [gate_shard, up_shard] stacked along
    *dim*; works identically for weights and biases.
    args:
        named_parameters: per-rank list of parameter dicts
        weight_name: fused gate_up parameter name
        tp_size: tensor parallel size
        dim: dimension along which the fused halves are stacked
    returns:
        (gate_weight, up_weight)
    '''
    # Fix: the original collected gate shards in a list named "up_weight_list"
    # (and vice versa); behavior is unchanged, only the names were corrected.
    gate_parts = []
    up_parts = []
    for rank in range(tp_size):
        fused = named_parameters[rank][weight_name]
        gate_shard, up_shard = torch.chunk(fused, 2, dim=dim)
        gate_parts.append(gate_shard)
        up_parts.append(up_shard)
    gate_weight = torch.cat(gate_parts, dim=dim)
    up_weight = torch.cat(up_parts, dim=dim)
    return gate_weight, up_weight
def convert_packed_qkv(q_weight, k_weight, v_weight, dim, args):
    '''
    Pack separate q/k/v weights (or biases) into one fused tensor along *dim*.
    For models whose HF layout is n3sh (see get_qkv_distribution), the heads
    of q/k/v are interleaved per kv group instead of simply concatenated.
    args:
        q_weight: q weight or bias
        k_weight: k weight or bias
        v_weight: v weight or bias
        dim: dimension to pack along
        args: argument namespace (model_type, model_version, hf_config)
    '''
    packed_qkv = torch.cat([q_weight, k_weight, v_weight], dim=dim)
    is_n3sh, head_num, kv_head_num = get_qkv_distribution(args.model_type, args.model_version, args.hf_config)
    if is_n3sh is True:
        packed_qkv_shape = packed_qkv.shape
        num_query_heads_per_kv_head = head_num // kv_head_num
        q_shape = q_weight.shape
        k_shape = k_weight.shape
        v_shape = v_weight.shape
        # Split the head dimension into (kv groups, heads per group) and
        # interleave q/k/v within each kv group along the new axis.
        q = q_weight.view(q_shape[:dim] + (kv_head_num, num_query_heads_per_kv_head, -1) + q_shape[dim + 1:])
        k = k_weight.view(k_shape[:dim] + (kv_head_num, 1, -1) + k_shape[dim + 1:])
        v = v_weight.view(v_shape[:dim] + (kv_head_num, 1, -1) + v_shape[dim + 1:])
        tensor_n3sh = torch.cat([q, k, v], dim=dim+1)
        packed_qkv = tensor_n3sh.reshape(packed_qkv_shape)
    return packed_qkv
def convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
                                 layer_range, merged_act_range, tp_size, args):
    '''
    convert parallel qkv named parameters to non parallel qkv named parameters
    args:
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non parallel act range
        tp_size: tensor parallel size
        args: argument
    returns:
        the short qkv layer name (last path component); the caller uses it to
        exclude the fused parameter from the plain copy pass
    '''
    layer_name_parts = layer_name.split(".")
    self_attn_layer_name = ".".join(layer_name_parts[:-1])
    qkv_name = layer_name_parts[-1]
    q_weight, k_weight, v_weight = merge_qkv_weight(named_parameters, weight_name, tp_size, layer_range["q_proj_size"],
                                                    layer_range["num_kv_head_replicas"])
    # qkv_list has 3 entries when HF keeps separate q/k/v layers, 1 when packed.
    qkv_list = smooth_model_config[args.model_type]["qkv_list"]
    qkv_list_len = len(qkv_list)
    if qkv_list_len == 3:
        q_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
        k_layer_name = f"{self_attn_layer_name}.{qkv_list[1]}"
        v_layer_name = f"{self_attn_layer_name}.{qkv_list[2]}"
    elif qkv_list_len == 1:
        qkv_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
    if qkv_list_len == 3:
        # q/k/v share the same input activation range; the is_qkv flag makes
        # the smoother derive from the concatenated q/k/v weight.
        merged_act_range[q_layer_name]["x"] = layer_range["x"]
        merged_act_range[k_layer_name]["x"] = layer_range["x"]
        merged_act_range[v_layer_name]["x"] = layer_range["x"]
        merged_act_range[q_layer_name]["is_qkv"] = True
        merged_act_range[k_layer_name]["is_qkv"] = True
        merged_act_range[v_layer_name]["is_qkv"] = True
        merged_named_parameters[f"{q_layer_name}.weight"] = q_weight
        merged_named_parameters[f"{k_layer_name}.weight"] = k_weight
        merged_named_parameters[f"{v_layer_name}.weight"] = v_weight
    elif qkv_list_len == 1:
        merged_act_range[qkv_layer_name]["x"] = layer_range["x"]
        qkv_weight = convert_packed_qkv(q_weight, k_weight, v_weight, 0, args)
        merged_named_parameters[f"{qkv_layer_name}.weight"] = qkv_weight
    # Biases follow the same split/merge logic as the weights.
    if bias_name in named_parameters[0]:
        q_bias, k_bias, v_bias = merge_qkv_weight(named_parameters, bias_name, tp_size, layer_range["q_proj_size"],
                                                  layer_range["num_kv_head_replicas"])
        if qkv_list_len == 3:
            merged_named_parameters[f"{q_layer_name}.bias"] = q_bias
            merged_named_parameters[f"{k_layer_name}.bias"] = k_bias
            merged_named_parameters[f"{v_layer_name}.bias"] = v_bias
        elif qkv_list_len == 1:
            qkv_bias = convert_packed_qkv(q_bias, k_bias, v_bias, 0, args)
            merged_named_parameters[f"{qkv_layer_name}.bias"] = qkv_bias
    return qkv_name
def convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
                                    layer_range, merged_act_range, tp_size, model_type):
    '''
    convert parallel merged named parameters to non parallel merged named parameters
    args:
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non parallel act range
        tp_size: tensor parallel size
        model_type: model type
    returns:
        the short gate_up layer name (last path component); the caller uses
        it to exclude the fused parameter from the plain copy pass
    '''
    layer_name_parts = layer_name.split(".")
    mlp_layer_name = ".".join(layer_name_parts[:-1])
    gate_weight, up_weight = merge_merged_weight(named_parameters, weight_name, tp_size)
    gate_up_name = layer_name_parts[-1]
    # gate_up_list has 2 entries when HF keeps separate gate/up layers, 1 when fused.
    gate_up_list = smooth_model_config[model_type]["gate_up_list"]
    gate_up_list_len = len(gate_up_list)
    is_gate_up = smooth_model_config[model_type]["is_gate_up"]
    if gate_up_list_len == 2:
        gate_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
        up_layer_name = f"{mlp_layer_name}.{gate_up_list[1]}"
    elif gate_up_list_len == 1:
        gate_up_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
    if gate_up_list_len == 2:
        # gate/up share the input activation range; is_merge makes the
        # smoother derive from the concatenated gate/up weight.
        merged_act_range[gate_layer_name]["x"] = layer_range["x"]
        merged_act_range[up_layer_name]["x"] = layer_range["x"]
        merged_act_range[gate_layer_name]["is_merge"] = True
        merged_act_range[up_layer_name]["is_merge"] = True
        merged_named_parameters[f"{gate_layer_name}.weight"] = gate_weight
        merged_named_parameters[f"{up_layer_name}.weight"] = up_weight
    elif gate_up_list_len == 1:
        merged_act_range[gate_up_layer_name]["x"] = layer_range["x"]
        # is_gate_up controls whether the HF fused order is [gate, up] or [up, gate].
        merged_gate_up_weight_list = [gate_weight, up_weight] if is_gate_up is True else [up_weight, gate_weight]
        merged_named_parameters[f"{gate_up_layer_name}.weight"] = torch.cat(merged_gate_up_weight_list, dim=0)
    if bias_name in named_parameters[0]:
        gate_bias, up_bias = merge_merged_weight(named_parameters, bias_name, tp_size)
        if gate_up_list_len == 2:
            merged_named_parameters[f"{gate_layer_name}.bias"] = gate_bias
            merged_named_parameters[f"{up_layer_name}.bias"] = up_bias
        elif gate_up_list_len == 1:
            merged_gate_up_bias_list = [gate_bias, up_bias] if is_gate_up is True else [up_bias, gate_bias]
            merged_named_parameters[f"{gate_up_layer_name}.bias"] = torch.cat(merged_gate_up_bias_list, dim=0)
    return gate_up_name
def convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
                                            merged_named_parameters, layer_range, merged_act_range, tp_size):
    '''
    Merge a plain (non-qkv, non-fused) column-parallel linear layer: weight
    and bias shards are concatenated along dim 0; every rank sees the same
    input, so the activation range is taken as-is.
    args:
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non parallel act range
        tp_size: tensor parallel size
    '''
    if not layer_range['is_linear']:
        return
    merged_act_range[layer_name]["x"] = layer_range["x"]
    weight_shards = [named_parameters[rank][weight_name] for rank in range(tp_size)]
    merged_named_parameters[weight_name] = torch.cat(weight_shards, dim=0)
    if bias_name in named_parameters[0]:
        bias_shards = [named_parameters[rank][bias_name] for rank in range(tp_size)]
        merged_named_parameters[bias_name] = torch.cat(bias_shards, dim=0)
def convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                          merged_named_parameters, layer_range, merged_act_range, tp_size):
    '''
    Merge a row-parallel linear layer: weight shards are concatenated along
    dim 1 and the per-rank activation ranges are concatenated; the bias is
    not sharded for row parallel, so rank 0's copy is taken.
    args:
        act_layer_name: act layer name
        act_range: parallel act_range (per rank)
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non parallel act range
        tp_size: tensor parallel size
    '''
    if not layer_range['is_linear']:
        return
    if isinstance(layer_range['x'], torch.Tensor):
        # Each rank saw a different input slice: stitch the ranges together.
        act_shards = [act_range[rank][act_layer_name]['x'] for rank in range(tp_size)]
        merged_act_range[layer_name]['x'] = torch.cat(act_shards, dim=0)
    else:
        merged_act_range[layer_name]['x'] = None
    weight_shards = [named_parameters[rank][weight_name] for rank in range(tp_size)]
    merged_named_parameters[weight_name] = torch.cat(weight_shards, dim=1)
    if bias_name in named_parameters[0]:
        merged_named_parameters[bias_name] = named_parameters[0][bias_name]
def convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                            merged_named_parameters, layer_range, merged_act_range, tp_size, args):
    '''
    convert parallel layer named parameters to non parallel layer named parameters
    args:
        act_layer_name: act layer name
        act_range: parallel act_range
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non parallel act range
        tp_size: tensor parallel size
        args: argument namespace (model_type, model_version, hf_config, ...)
    returns:
        (qkv_name, gate_up_name): short names of the fused layers, left at
        the defaults when this layer is neither qkv nor gate_up
    '''
    qkv_name = "qkv_proj"
    gate_up_name = "gate_up_proj"
    if layer_range['split'] == 'col':  # column-parallel: shards stack output rows
        # merge weight
        if layer_range["is_qkv"]:
            qkv_name = convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters,
                                                    merged_named_parameters, layer_range, merged_act_range, tp_size,
                                                    args)
        elif layer_range["is_merge"]:
            gate_up_name = convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters,
                                                           merged_named_parameters, layer_range, merged_act_range,
                                                           tp_size, args.model_type)
        else:
            convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
                                                    merged_named_parameters, layer_range, merged_act_range, tp_size)
    else:  # row-parallel: shards stack input columns
        convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                              merged_named_parameters, layer_range, merged_act_range, tp_size)
    return qkv_name, gate_up_name
def collect_moe_experts_act_range_of_layer(merged_act_range, mlp_part_name, moe_list):
    '''
    Collect the activation-range entries of every expert in one MoE layer,
    split into gate/up entries and down entries.
    args:
        merged_act_range: merged activation-range dict keyed by layer name
        mlp_part_name: regex fragment identifying this decoder layer's mlp block
        moe_list: model config holding the "gate_up_list" and "down_list" names
    '''
    gate_up_list = moe_list["gate_up_list"]
    down_list = moe_list["down_list"]
    gate_up_pattern = re.compile(rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[1]}")
    # Some models carry a separate gate projection as a third list entry.
    gate_pattern = re.compile(rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[2]}") if len(gate_up_list) > 2 else None
    down_pattern = re.compile(rf"{mlp_part_name}.experts\.\d+\.{down_list[1]}")
    experts_of_gate_up_layer = {}
    experts_of_down_layer = {}
    for layer, info in merged_act_range.items():
        if gate_up_pattern.search(layer) or (gate_pattern is not None and gate_pattern.search(layer)):
            experts_of_gate_up_layer[layer] = info
        if down_pattern.search(layer):
            experts_of_down_layer[layer] = info
    return experts_of_gate_up_layer, experts_of_down_layer
def convert_moe_expert_activation_fused(experts_of_layer, merged_act_range):
    '''
    Fuse the per-expert activation maxima of one MoE layer into a single
    element-wise max and assign it back to every expert, so all experts of
    the layer share one smoothing scale.
    NOTE(review): merged_act_range is unused here — kept for interface
    compatibility; confirm whether callers rely on it.
    '''
    collected = [info['x'] for info in experts_of_layer.values() if isinstance(info['x'], torch.Tensor)]
    assert len(collected) > 0, f"unfused_activation len is zero, this is unsupported"
    fused = torch.stack(collected, dim=0).max(dim=0)[0]
    for info in experts_of_layer.values():
        if info['x'] is None or isinstance(info['x'], torch.Tensor):
            info['x'] = fused
def convert_moe_layer_activation_fused(merged_act_range, model_type):
    '''
    For each decoder layer, fuse the MoE experts' activation ranges so all
    experts in that layer share one range (see convert_moe_expert_activation_fused).
    No-op for models without an moe_list entry in smooth_model_config.
    '''
    moe_list = smooth_model_config[model_type]["moe_list"]
    if moe_list is None:
        return
    mlp_name = moe_list["gate_up_list"][0].split(".")[0]
    layer = 0
    while True:
        mlp_part_name = rf"\.{layer}\.{mlp_name}"
        experts_of_gate_up_layer, experts_of_down_layer = collect_moe_experts_act_range_of_layer(
            merged_act_range, mlp_part_name, moe_list)
        # An empty result means the layer index ran past the last decoder layer.
        # NOTE(review): the log says "experts_num" but the counter is the layer
        # index — confirm the intended wording.
        if len(experts_of_gate_up_layer) < 1 or len(experts_of_down_layer) < 1:
            logger.info(f"the experts_num is {layer}")
            break
        convert_moe_expert_activation_fused(experts_of_gate_up_layer, merged_act_range)
        convert_moe_expert_activation_fused(experts_of_down_layer, merged_act_range)
        layer += 1
def should_include(key, parameters, exclude_names):
    '''
    Return True when *key* is not already present in *parameters* and
    contains none of the substrings listed in *exclude_names*.
    args:
        key: parameter name to test
        parameters: named parameters already collected
        exclude_names: substrings that disqualify a key
    '''
    if key in parameters:
        return False
    return all(exclude_name not in key for exclude_name in exclude_names)
def valid_act_range(act_layer_name, layer_range):
    '''
    Sanitize an activation range in place: replace inf, nan, and zero
    entries of layer_range["x"] with 1e-6 so later divisions stay safe.
    args:
        act_layer_name: act layer name (used in the warning message)
        layer_range: activation-range record; its "x" tensor is fixed in place
    '''
    act_range_x = layer_range["x"]
    if act_range_x is None or not isinstance(act_range_x, torch.Tensor):
        return
    invalid = torch.isinf(act_range_x) | torch.isnan(act_range_x) | (act_range_x == 0)
    if bool(invalid.any()):
        act_range_x[invalid] = 1e-6
        logger.warning(f"act_range_x in layer:{act_layer_name} has nan, inf or zero values, force to 1e-6")
def convert_to_merged(act_range, named_parameters, tp_size, args):
    '''
    convert parallel act_range and named parameters to non parallel format.

    Iterates the rank-0 act_range entries, sanitizes each one, and merges
    the tensor-parallel shards (the full per-rank lists are handed to
    convert_to_layer_merged, which presumably consumes the other ranks).
    Optionally fuses MoE expert act ranges per layer, then copies over the
    rank-0 parameters that were not replaced by a merged/fused entry.

    args:
        act_range: parallel act_range (one dict per TP rank)
        named_parameters: parallel named parameters (one dict per TP rank)
        tp_size: tensor parallel size
        args: argument
    returns:
        (name-sorted merged act_range, name-sorted merged parameters,
         input_id list taken from a col-split qkv act_range entry)
    '''
    model_type = args.model_type
    # Default entry shape for layers first seen during merging.
    merged_act_range = defaultdict(lambda: {"x": None, "is_qkv": False, "is_merge": False,})
    merged_named_parameters = {}
    input_id_list = []
    # Shard names (e.g. separate q/k/v) that were folded into fused
    # entries and therefore must not be copied verbatim below.
    exclude_names = set()
    for act_layer_name, layer_range in act_range[0].items():
        # Replace inf/nan/zero values in the range before any merging math.
        valid_act_range(act_layer_name, layer_range)
        layer_name, weight_name, bias_name = get_layer_weight_bias_name(model_type, act_layer_name)
        # when tie_word_embeddings is True, lm_head uses the embedding weight
        if args.tie_word_embeddings is True and "lm_head" in layer_name:
            continue
        qkv_name, gate_up_name = convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name,
                                                         named_parameters, merged_named_parameters, layer_range,
                                                         merged_act_range, tp_size, args)
        exclude_names.update({qkv_name, gate_up_name})
        # Capture the token ids observed by a column-split qkv layer; the
        # last matching entry wins.
        if layer_range['split'] == 'col' and layer_range["is_qkv"] and len(layer_range["input_id"]) > 0:
            input_id_list = layer_range["input_id"]
    if args.use_smoothquant and args.disable_fused_quantize_expert is False:
        convert_moe_layer_activation_fused(merged_act_range, model_type)
    # Copy every rank-0 parameter that was neither merged above nor part
    # of an excluded (already-fused) shard.
    merged_named_parameters.update({
        key: value
        for key, value in named_parameters[0].items()
        if should_include(key, merged_named_parameters, exclude_names)
    })
    modify_layer_weight_bias_name(model_type, merged_named_parameters)
    # Deterministic, name-sorted ordering for reproducible serialization.
    sorted_named_parameters = OrderedDict(sorted(merged_named_parameters.items(), key=lambda item: item[0]))
    sorted_merged_act_range = OrderedDict(sorted(merged_act_range.items(), key=lambda item: item[0]))
    return sorted_merged_act_range, sorted_named_parameters, input_id_list
def copy_files_except_extensions(input_dir, output_dir, extensions):
    '''
    Recursively copy files from input_dir to output_dir, preserving the
    sub-directory layout, while skipping hidden files/directories and any
    file whose name ends with one of the given extensions.
    args:
        input_dir: input directory
        output_dir: output directory
        extensions: iterable of file-name suffixes that must NOT be copied
    '''
    for root, dirs, files in os.walk(input_dir):
        # Path of the current directory relative to input_dir ('.' at the top).
        rel_path = os.path.relpath(root, input_dir)
        # Skip hidden directories and everything nested inside them.
        if len(rel_path) > 1 and rel_path.startswith('.'):
            continue
        dst_dir = os.path.join(output_dir, rel_path)
        # exist_ok avoids the check-then-create race of the previous
        # os.path.exists()/os.makedirs() pair.
        os.makedirs(dst_dir, exist_ok=True)
        for file in files:
            # Copy only non-hidden files whose suffix is not excluded.
            if not any(file.endswith(ext) for ext in extensions) and not file.startswith('.'):
                src_file = os.path.join(root, file)
                dst_file = os.path.join(dst_dir, file)
                # copy2 preserves file metadata (mtime, permission bits).
                shutil.copy2(src_file, dst_file)
                logger.info(f'Copied {src_file} to {dst_file}')
def cleanup():
    '''
    Free host memory via the garbage collector and, on non-CPU platforms,
    also release cached accelerator memory.
    '''
    gc.collect()
    if current_platform.is_cpu():
        return
    torch.cuda.empty_cache()
def vllm_cleanup(llm):
    """Release occupied resources and reset parallel_state"""
    # Drop the local reference first so the engine (and its device memory)
    # can be reclaimed before the distributed state is torn down.
    del llm
    from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
    destroy_model_parallel()
    destroy_distributed_environment()
    import contextlib
    # Suppress the AssertionError that destroy_process_group raises when
    # no process group was ever initialized (single-device runs).
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    import ray
    # Shut down Ray if it was initialized (e.g. by a distributed executor).
    if ray.is_initialized():
        ray.shutdown()
    logger.info('llm and distributed env is cleanup')
def generate_datetime():
    '''
    Return the current local date/time formatted as "YYYY-mm-dd HH:MM:SS".
    '''
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def get_hf_config_sliding_window(hf_text_config) -> Optional[int]:
    """Get the sliding window size, or None if disabled."""
    # Qwen-style configs carry a `use_sliding_window` switch in addition
    # to the window size itself; a falsy value disables the feature.  A
    # missing attribute defaults to enabled, matching configs that only
    # declare `sliding_window`.
    if not getattr(hf_text_config, "use_sliding_window", True):
        return None
    return getattr(hf_text_config, "sliding_window", None)
def get_skip_patterns(model_type):
    """Return the regex skip patterns configured for *model_type*.

    Falls back to an empty list when the model config declares no
    "skip_patterns" entry.
    """
    # dict.get replaces the membership-test-then-index pattern.
    return smooth_model_config[model_type].get("skip_patterns", [])
def should_skip(model_type, weight_name):
    """Return True when *weight_name* matches any configured skip pattern.

    Patterns come from get_skip_patterns(model_type) and are matched with
    re.match, i.e. anchored at the start of the weight name.
    """
    # Import once per call instead of once per pattern iteration (the
    # previous code re-ran `import re` inside the loop body).
    import re
    return any(re.match(pattern, weight_name)
               for pattern in get_skip_patterns(model_type))

# ==== end of first file; next file added by this commit: weight_only.py (152 lines) ====
import argparse
import torch
from torch import Tensor
import numpy as np
import logging
from vllm import LLM
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
from dump_smooth import save_weights, save_generate_weights
logger = logging.getLogger(__name__)
def merge_adjacent_low_4bit(tensor: Tensor):
    """
    Pack pairs of adjacent int8 values into single int8 bytes by keeping
    only the low 4 bits of each element: the even-indexed element fills
    the low nibble, the odd-indexed element fills the high nibble.

    Args:
        tensor: torch.int8 tensor whose last dimension has even length.

    Returns:
        A torch.int8 tensor whose last dimension is half the input's.

    Example:
        a = torch.tensor([5, 7, 12, 3], dtype=torch.int8)
        merge_adjacent_low_4bit(a)  # -> tensor([117, 60], dtype=torch.int8)
    """
    assert tensor.dtype == torch.int8, "输入张量必须为int8类型"
    assert tensor.shape[-1] % 2 == 0, "输入张量最后一维长度需为偶数"
    # The previous implementation routed the data through NumPy ufuncs,
    # which silently returned an np.ndarray (contradicting the declared
    # Tensor interface) and would raise for CUDA tensors.  Pure torch
    # bitwise ops keep the result a Tensor on the original device.
    low = tensor[..., 0::2] & 0x0F
    high = tensor[..., 1::2] & 0x0F
    # int8 left-shift wraps modulo 256, matching the old NumPy semantics.
    return (high << 4) | low
def cal_weightonly_weight(weight, weight_bits, qmin, qmax, has_qzeros, eps: float = 1e-8):
    '''
    Symmetric per-channel weight quantization.

    Returns (quantized_weight, scales, qzeros) where qzeros is None when
    has_qzeros is False.
    args:
        weight: tensor to quantize (2-D or 3-D float32/float16/bfloat16)
        weight_bits: quantized bitwidth (4 packs nibble pairs, 8 keeps int8)
        qmin: minimum value in quantized range
        qmax: maximum value in quantized range
        has_qzeros: whether to generate an (all-zero) qzeros tensor
        eps: lower bound keeping the scale away from zero
    '''
    assert weight.numel() != 0, "weight should not be empty tensor"
    assert weight.dim() in (2, 3), "Invalid dim. The dim of weight should be 2 or 3"
    assert weight.dtype in [torch.float32, torch.float16, torch.bfloat16
                            ], "Invalid datatype. Weight must be torch.float32 or torch.float16 or torch.bfloat16"
    # One scale per output channel, derived from the channel's abs-max.
    per_channel_max = weight.float().abs().clamp(min=eps).max(dim=-1).values
    weight_scale = per_channel_max / qmax
    normalized = (weight / weight_scale[..., None]).float()
    unpacked = torch.round(normalized).clip(min=qmin, max=qmax).to(torch.int8)
    scales = weight_scale.squeeze()
    if weight_bits == 4:
        quantized = merge_adjacent_low_4bit(unpacked)
    else:
        quantized = unpacked
    qzeros = torch.zeros_like(scales, dtype=torch.int32) if has_qzeros else None
    return quantized, scales, qzeros
def generate_weightonly_weight(act_range, name_parameters, args):
    '''
    Quantize hugging face weights into weight-only format.
    args:
        act_range: non parallel act_range
        name_parameters: non parallel hugging face named parameters
        args: arguments from main
    '''
    quantized = {}
    has_qzeros = args.has_qzeros
    weight_bits = 8 if args.weight_only_precision == 'int8' else 4
    qmin = float(-2**(weight_bits - 1))
    qmax = float(2**(weight_bits - 1) - 1)
    for name, param in name_parameters.items():
        # Model-specific layers configured to bypass quantization.
        if should_skip(args.model_type, name):
            logger.info(f"skip {name}")
            quantized[name] = param
            continue
        # Biases are always kept in full precision.
        if name.endswith("bias"):
            quantized[name] = param
            continue
        layer_name = ".".join(name.split(".")[:-1])
        if layer_name not in act_range:
            # Layers without a collected act range pass through unchanged.
            quantized[name] = param
            continue
        qweight, scales, qzeros = cal_weightonly_weight(param, weight_bits, qmin, qmax, has_qzeros)
        quantized[f'{layer_name}.qweight'] = qweight
        quantized[f'{layer_name}.scales'] = scales.to(args.torch_scales_smooth_dtype)
        if has_qzeros:
            quantized[f'{layer_name}.qzeros'] = qzeros
    return quantized
def generate_weights_of_weight_only(llm: LLM, args: argparse.Namespace):
    '''
    generate weightonly weights

    Drives the full weight-only pipeline: pull act ranges and parameters
    from the vLLM workers, tear down the engine, merge the TP shards, and
    quantize/save the resulting weights.  Intermediate dicts are deleted
    eagerly to keep peak host memory down.
    args:
        llm: LLM instance
        args: argument from main
    returns:
        dict of quantized (and pass-through) weights
    '''
    tp_size = args.tp_size
    # Hooks are installed and immediately removed: weight-only needs no
    # calibration forward pass, but the setup presumably initializes the
    # per-layer act_range bookkeeping queried below.
    # NOTE(review): confirm calling remove_hooks right after setup_smooth_hook
    # is intentional here.
    llm.llm_engine.model_executor._run_workers("setup_smooth_hook")
    llm.llm_engine.model_executor._run_workers("remove_hooks")
    # Per-worker (TP-sharded) results; one entry per rank.
    act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
    named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")
    # Release the engine and distributed state before the memory-heavy merge.
    vllm_cleanup(llm)
    cleanup()
    logger.info("get act_range and named_parameters from llm finished")
    merged_act_range, merged_named_parameters, _ = convert_to_merged(act_range, named_parameters, tp_size, args)
    save_weights(merged_named_parameters, args)
    # Drop the sharded copies; only the merged views are needed from here on.
    del act_range
    del named_parameters
    cleanup()
    logger.info("get merged_act_range and merged_named_parameters finished")
    weightonly_weight = generate_weightonly_weight(merged_act_range, merged_named_parameters, args)
    save_generate_weights(weightonly_weight, args)
    del merged_act_range
    del merged_named_parameters
    cleanup()
    logger.info("get weightonly_weight finished")
    return weightonly_weight