add qwen3

Author: Chranos
Date: 2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

vllm-v0.6.2/tools/actionlint.sh Executable file

@@ -0,0 +1,13 @@
#!/bin/bash
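# Prefer a system-wide actionlint if available; otherwise fall back to a local
# ./actionlint binary, and as a last resort download a pinned build (v1.7.3)
# into the current directory and run it.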
if command -v actionlint &> /dev/null; then
actionlint "$@"
exit 0
elif [ -x ./actionlint ]; then
./actionlint "$@"
exit 0
fi
# download a binary to the current directory - v1.7.3
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
./actionlint "$@"


@@ -0,0 +1,9 @@
TORCH_MLU_OPS_VERSION=1.3.2+pt25
CATCH_VERSION=1.24.1+torch2.5.0
CNCL_VERSION=1.24.1-1
CNNL_VERSION=1.28.4-1
CNNLEXTRA_VERSION=1.12.3-1
CNTOOLKIT_VERSION=3.15.7-1
MLUOPS_VERSION=1.4.1-1
TRITON_VERSION=3.0.0+mlu1.3.1
XFORMERS_VERSION=0.0.24+mlu0.5.0.pt2.5


@@ -0,0 +1,14 @@
#!/bin/bash
# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time)
if ! git diff --quiet; then
echo "Repo is dirty" >&2
exit 1
fi
if ! git describe --tags; then
echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2
exit 1
fi


@@ -0,0 +1,8 @@
export CN_NOTIFIER_POOL_MAX=1000
export CN_TASKTOPO_RESIDENT=0
export CNCL_STANDALONE_ENABLE=1
export CNCL_TWOSHOT_ENABLE=1
export CNPERF_DEBUG_DISABLE_CHILD_PROCESS=1
export PYTORCH_CNDEV_BASED_MLU_CHECK=1
export RAY_ROTATION_BACKUP_COUNT=10
export RAY_ROTATION_MAX_BYTES=102400

vllm-v0.6.2/tools/mypy.sh Executable file

@@ -0,0 +1,31 @@
#!/bin/bash
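# Usage: tools/mypy.sh [CI] [PYTHON_VERSION]
# Defaults: CI=0, PYTHON_VERSION=3.9. With CI=1 the script stops on the first
# error (set -e) and the initial run_mypy call performs the full, stricter pass.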
CI=${1:-0}
PYTHON_VERSION=${2:-3.9}
if [ "$CI" -eq 1 ]; then
set -e
fi
run_mypy() {
echo "Running mypy on $1"
if [ "$CI" -eq 1 ] && [ -z "$1" ]; then
mypy --python-version "${PYTHON_VERSION}" "$@"
return
fi
mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@"
}
run_mypy # Note that this is less strict than CI
run_mypy tests
run_mypy vllm/attention
run_mypy vllm/compilation
run_mypy vllm/distributed
run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy vllm/plugins
run_mypy vllm/prompt_adapter
run_mypy vllm/spec_decode
run_mypy vllm/worker


@@ -0,0 +1,77 @@
import argparse
import json
from typing import Dict
from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry
from vllm.profiler.utils import TablePrinter, indent_string
def flatten_entries(entry_cls, profile_dict: Dict):
entries_and_depth = []
def get_entries(node, curr_depth=0):
entries_and_depth.append((entry_cls(**node["entry"]), curr_depth))
for child in node["children"]:
get_entries(
child,
curr_depth=curr_depth + 1,
)
for root in profile_dict:
get_entries(root)
return entries_and_depth
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--json-trace",
type=str,
required=True,
help="json trace file output by "
"examples/offline_profile.py")
parser.add_argument("--phase",
type=str,
choices=["prefill", "decode_1"],
required=True,
help="The phase to print the table for.")
parser.add_argument("--table",
type=str,
choices=["summary", "model"],
default="summary",
help="Which table to print, the summary table or the "
"layerwise model table")
args = parser.parse_args()
with open(args.json_trace) as f:
profile_data = json.load(f)
if args.table == "summary":
entries_and_depths = flatten_entries(
SummaryStatsEntry, profile_data[args.phase]["summary_stats"])
column_widths = dict(name=80,
cuda_time_us=12,
pct_cuda_time=12,
invocations=15)
elif args.table == "model":
entries_and_depths = flatten_entries(
ModelStatsEntry, profile_data[args.phase]["model_stats"])
column_widths = dict(name=60,
cpu_time_us=12,
cuda_time_us=12,
pct_cuda_time=12,
trace=60)
# indent entry names based on the depth
entries = []
for entry, depth in entries_and_depths:
entry.name = indent_string(
entry.name,
indent=depth,
indent_style=lambda indent: "|" + "-" * indent + " ")
entries.append(entry)
TablePrinter(type(entries[0]), column_widths).print_table(entries)
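
For reference, a minimal invocation sketch of the table printer above; the on-disk script name is assumed, since it is not shown here:

python print_layerwise_table.py \
    --json-trace profile.json \
    --phase prefill \
    --table summary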


@@ -0,0 +1,522 @@
import argparse
import copy
import json
import math
import os
from pathlib import Path
from typing import Any, List, Optional, Tuple
import matplotlib.pyplot as plt
import pandas as pd
## JSON parsing utils ####
def largest_dist_from_leaf(node: dict, depth: int = 0):
if len(node["children"]) == 0:
return depth
return max([
largest_dist_from_leaf(child, depth=depth + 1)
for child in node["children"]
])
def get_entries_at_depth(depth: int,
entries_and_traces: List[Tuple[Any, Any]],
node: dict,
curr_depth: int = 0,
trace=()):
# assert that the query is at kernel or module level
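# depth is measured from the leaves of the JSON tree: -1 collects the leaf
# entries (kernel level), -2 collects the nodes one level above the leaves
# (module level), matching the --level handling in main().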
assert depth == -1 or depth == -2
if curr_depth == 0 and largest_dist_from_leaf(node) <= (abs(depth) - 1):
# The tree is not tall enough!
entries_and_traces.append((node["entry"], trace))
return
if largest_dist_from_leaf(node) == (abs(depth) - 1):
entries_and_traces.append((node["entry"], trace))
trace = (node["entry"]["name"], ) + trace
for child in node["children"]:
get_entries_at_depth(depth,
entries_and_traces,
child,
curr_depth=curr_depth + 1,
trace=trace)
def fold_nodes(root: dict, nodes_to_fold: List[str]):
stack: List[dict] = [root]
while len(stack) != 0:
node = stack.pop()
if node['entry']['name'] in nodes_to_fold:
node["children"] = []
continue
for child in node["children"]:
stack.append(child)
return root
## Operation name cleanup utils ####
def trim_string_back(string: str, width: int) -> str:
if len(string) > width:
offset = len(string) - width + 3
string = string[:-offset]
if len(string) > 3:
string = string + "..."
return string
def shorten_plot_legend_strings(legend, max_char_len: int):
for t in legend.get_texts():
t.set_text(
trim_string_back(abbreviate_known_names(t.get_text()),
max_char_len))
def abbreviate_known_names(name: str) -> str:
abbreviations = {
"MergedColumnParallelLinear": "MCPLinear",
"QKVParallelLinear": "QKVPLinear",
"RowParallelLinear": "RPLinear",
"weight=": "w=",
"bfloat16": "bf16",
"float16": "f16",
}
for key, value in abbreviations.items():
name = name.replace(key, value)
return name
def attempt_to_make_names_unique(entries_and_traces):
names, non_unique_names = (set(), set())
def all_the_same(items) -> bool:
return all(i == items[0] for i in items)
for entry, _ in entries_and_traces:
if entry["name"] in names:
non_unique_names.add(entry["name"])
else:
names.add(entry["name"])
for name in non_unique_names:
entries_and_traces_with_name = [(entry, trace)
for entry, trace in entries_and_traces
if entry["name"] == name]
zipped_traces = list(
zip(*[trace for _, trace in entries_and_traces_with_name]))
first_trace_difference = next(
(i for i, trace_eles in enumerate(zipped_traces)
if not all_the_same(trace_eles)), None)
if first_trace_difference is None:
# can't create a unique name; leave the names as they
# are, they will get aggregated by the pivot_table call
continue
for entry, trace in entries_and_traces_with_name:
entry["name"] = " <- ".join((entry["name"], ) +
trace[:first_trace_difference + 1])
## Operation grouping utils ####
'''
Group operations in the given dataframe by some high-level ops like,
- gemms
- attention
- rms_norm
etc.
'''
def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:
def is_rms_norm(op_name: str):
if "rms_norm_kernel" in op_name:
return True
def is_attention_block(op_name: str):
if "flash_fwd" in op_name or \
"reshape_and_cache_flash_kernel" in op_name:
return True
def is_quant(op_name: str):
if "scaled_fp8_quant" in op_name or \
"scaled_int8_quant" in op_name:
return True
def is_gemm_op(op_name: str):
if is_quant(op_name):
return False
if "xmma_gemm" in op_name or \
"gemv2T_kernel" in op_name or \
"splitKreduce" in op_name or \
"void cutlass::Kernel" in op_name or \
"void cutlass::device_kernel" in op_name or \
"s16816gemm" in op_name:
return True
def is_elementwise_op(op_name: str):
return "elementwise_kernel" in op_name
def is_mem_op(op_name: str):
return "memcpy" in op_name.lower() or \
"memset" in op_name.lower()
def is_vocab_embedding_op(op_name: str):
return "vocabparallelembed" in op_name.lower()
# nccl ops
def is_nccl_op(op_name: str):
return "nccl" in op_name.lower()
def is_nccl_all_reduce(op_name: str):
return is_nccl_op(op_name) and \
("all_reduce" in op_name.lower() or \
"allreduce" in op_name.lower())
def is_nccl_gather(op_name: str):
return is_nccl_op(op_name) and \
"gather" in op_name.lower()
def is_nccl_broadcast(op_name: str):
return is_nccl_op(op_name) and \
"broadcast" in op_name.lower()
# Reduce ops types
def is_cross_device_reduce_1stage(op_name: str):
return "cross_device_reduce_1stage" in op_name
def is_cross_device_reduce_2stage(op_name: str):
return "cross_device_reduce_2stage" in op_name
def is_custom_ar_all_reduce(op_name: str):
return "_C_custom_ar::all_reduce" in op_name
def is_reduce_kernel(op_name: str):
return "reduce_kernel" in op_name
headers = list(trace_df)
ops = copy.deepcopy(headers)
attention_ops = list(filter(lambda x: is_attention_block(x), ops))
ops = list(filter(lambda x: x not in attention_ops, ops))
quant_ops = list(filter(lambda x: is_quant(x), ops))
ops = list(filter(lambda x: x not in quant_ops, ops))
gemm_ops = list(filter(lambda x: is_gemm_op(x), ops))
ops = list(filter(lambda x: x not in gemm_ops, ops))
rms_norm_ops = list(filter(lambda x: is_rms_norm(x), ops))
ops = list(filter(lambda x: x not in rms_norm_ops, ops))
vocab_embed_ops = list(filter(lambda x: is_vocab_embedding_op(x), ops))
ops = list(filter(lambda x: x not in vocab_embed_ops, ops))
mem_ops = list(filter(lambda x: is_mem_op(x), ops))
ops = list(filter(lambda x: x not in mem_ops, ops))
elementwise_ops = list(filter(lambda x: is_elementwise_op(x), ops))
ops = list(filter(lambda x: x not in elementwise_ops, ops))
nccl_all_reduce_ops = list(filter(lambda x: is_nccl_all_reduce(x), ops))
ops = list(filter(lambda x: x not in nccl_all_reduce_ops, ops))
nccl_gather_ops = list(filter(lambda x: is_nccl_gather(x), ops))
ops = list(filter(lambda x: x not in nccl_gather_ops, ops))
nccl_broadcast_ops = list(filter(lambda x: is_nccl_broadcast(x), ops))
ops = list(filter(lambda x: x not in nccl_broadcast_ops, ops))
nccl_other_ops = list(filter(lambda x: is_nccl_op(x), ops))
ops = list(filter(lambda x: x not in nccl_other_ops, ops))
cross_device_reduce_1stage_ops = list(
filter(lambda x: is_cross_device_reduce_1stage(x), ops))
ops = list(filter(lambda x: x not in cross_device_reduce_1stage_ops, ops))
cross_device_reduce_2stage_ops = list(
filter(lambda x: is_cross_device_reduce_2stage(x), ops))
ops = list(filter(lambda x: x not in cross_device_reduce_2stage_ops, ops))
custom_ar_all_reduce_ops = list(
filter(lambda x: is_custom_ar_all_reduce(x), ops))
ops = list(filter(lambda x: x not in custom_ar_all_reduce_ops, ops))
reduce_kernel_ops = list(filter(lambda x: is_reduce_kernel(x), ops))
ops = list(filter(lambda x: x not in reduce_kernel_ops, ops))
if len(attention_ops):
trace_df['attention'] = trace_df[attention_ops].agg("sum", axis=1)
if len(quant_ops):
trace_df['quant_ops'] = trace_df[quant_ops].agg("sum", axis=1)
if len(gemm_ops):
trace_df['gemm_ops'] = trace_df[gemm_ops].agg("sum", axis=1)
if len(rms_norm_ops):
trace_df['rms_norm_ops'] = trace_df[rms_norm_ops].agg("sum", axis=1)
if len(vocab_embed_ops):
trace_df['vocab_embed_ops'] = trace_df[vocab_embed_ops].agg("sum",
axis=1)
if len(mem_ops):
trace_df['mem_ops'] = trace_df[mem_ops].agg("sum", axis=1)
if len(elementwise_ops):
trace_df['elementwise_ops'] = trace_df[elementwise_ops].agg("sum",
axis=1)
if len(nccl_all_reduce_ops):
trace_df['nccl_all_reduce_ops'] = trace_df[nccl_all_reduce_ops].agg(
"sum", axis=1)
if len(nccl_gather_ops):
trace_df['nccl_gather_ops'] = trace_df[nccl_gather_ops].agg("sum",
axis=1)
if len(nccl_broadcast_ops):
trace_df['nccl_broadcast_ops'] = trace_df[nccl_broadcast_ops].agg(
"sum", axis=1)
if len(nccl_other_ops):
trace_df['nccl_other_ops'] = trace_df[nccl_other_ops].agg("sum",
axis=1)
if len(cross_device_reduce_1stage_ops):
trace_df['cross_device_reduce_1stage_ops'] = trace_df[
cross_device_reduce_1stage_ops].agg("sum", axis=1)
if len(cross_device_reduce_2stage_ops):
trace_df['cross_device_reduce_2stage_ops'] = trace_df[
cross_device_reduce_2stage_ops].agg("sum", axis=1)
if len(custom_ar_all_reduce_ops):
trace_df['custom_ar_all_reduce_ops'] = trace_df[
custom_ar_all_reduce_ops].agg("sum", axis=1)
if len(reduce_kernel_ops):
trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg("sum",
axis=1)
trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops +
vocab_embed_ops + mem_ops + elementwise_ops +
nccl_all_reduce_ops + nccl_gather_ops + nccl_broadcast_ops +
nccl_other_ops + cross_device_reduce_1stage_ops +
cross_device_reduce_2stage_ops + custom_ar_all_reduce_ops +
reduce_kernel_ops,
axis=1,
inplace=True)
return trace_df
## Data plotting utils ####
def plot_trace_df(traces_df: pd.DataFrame,
plot_metric: str,
plot_title: str,
output: Optional[Path] = None):
phases = traces_df['phase'].unique()
traces_df = traces_df.pivot_table(index="phase",
columns="name",
values=plot_metric,
aggfunc="sum")
traces_df = group_trace_by_operations(traces_df)
# Make the figure
fig, ax = plt.subplots(1, figsize=(5, 8), sharex=True)
# Draw the stacked bars
ops = list(traces_df)
bottom = [0] * len(phases)
for op in ops:
values = [traces_df[op][phase] for phase in phases]
values = list(map(lambda x: 0.0 if math.isnan(x) else x, values))
ax.bar(phases, values, label=op, bottom=bottom)
bottom = [bottom[j] + values[j] for j in range(len(phases))]
# Write the values as text on the bars
for bar in ax.patches:
if bar.get_height() != 0:
ax.text(bar.get_x() + bar.get_width() / 2,
bar.get_height() / 2 + bar.get_y(),
f"{round(bar.get_height(), 2)}",
ha='center',
color='w',
weight='bold',
size=5)
# Setup legend
handles, labels = plt.gca().get_legend_handles_labels()
legend = fig.legend(handles,
labels,
loc='center left',
bbox_to_anchor=(1, 1))
shorten_plot_legend_strings(legend, 50)
# Setup labels and title
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_ylabel(plot_metric)
plt.suptitle(plot_title)
plt.savefig(output, bbox_inches='tight')
print("Created: ", output)
def main(
json_trace: Path,
output_directory: Path,
depth: int, # Fetch/Plot operations at this depth of the Json tree
plot_metric: str,
make_names_unique: bool,
top_k: int,
json_nodes_to_fold: List[str]):
def prepare_data(profile_json: dict, step_keys: List[str]) -> pd.DataFrame:
def get_entries_and_traces(key: str):
entries_and_traces: List[Tuple[Any, Any]] = []
for root in profile_json[key]["summary_stats"]:
# Fold nodes in the traces as per user request. i.e. simply
# make the requested nodes leaf-nodes.
root = fold_nodes(root, json_nodes_to_fold)
get_entries_at_depth(depth, entries_and_traces, root)
return entries_and_traces
def keep_only_top_entries(df: pd.DataFrame,
metric: str,
top_k: int = 9) -> pd.DataFrame:
df.loc[df.nsmallest(len(df) - top_k + 1, metric).index,
["name"]] = "others"
return df
# Get data for each key
traces = list(map(lambda x: get_entries_and_traces(x), step_keys))
# Attempt some cleanup
if make_names_unique:
for trace in traces:
attempt_to_make_names_unique(trace)
# To pandas dataframe
trace_dfs = list(
map(lambda t: pd.DataFrame([entry for entry, _ in t]).fillna(0),
traces))
# Respect top_k
if top_k:
trace_dfs = list(
map(
lambda trace_df: keep_only_top_entries(
trace_df, "cuda_time_us", top_k), trace_dfs))
# Fill in information about the step-keys
for trace_df, step_key in zip(trace_dfs, step_keys):
trace_df['phase'] = step_key
# Combine all data frames so they can be put in a single plot
traces_df = pd.concat(trace_dfs)
# Add a derived metric `cuda_time_ms`
traces_df["cuda_time_ms"] = traces_df["cuda_time_us"] / 1000
traces_df = traces_df.fillna(0)
return traces_df
def make_plot_title_suffix(profile_json: dict) -> str:
context = profile_json["context"]
sparsity = context.get('sparsity', None)
return (f"{context['model']}\n"
f"Batch={context['batch_size']}, "
f"PromptLen={context['prompt_len']}, "
f"OutputLen={context['output_len']},"
f"NumGpus={context['tensor_parallel_size']}"
f"{', Sparsity ' + sparsity if sparsity else ''}")
profile_json = None
with open(json_trace) as f:
profile_json = json.load(f)
assert profile_json is not None
# Get all `llm.generate.step()` profile
step_traces = list(profile_json.keys())
assert (step_traces[0] == 'context')
step_traces = step_traces[1:] # have only prefill and decodes
prefills = list(filter(lambda x: "prefill" in x, step_traces))
all_decodes = list(filter(lambda x: "decode" in x, step_traces))
assert len(prefills) + len(all_decodes) == len(step_traces)
assert len(prefills) == 1
decodes = all_decodes[::args.step_plot_interval]
if decodes[-1] != all_decodes[-1]:
# Always have the last decode
decodes.append(all_decodes[-1])
prefill_traces = prepare_data(profile_json, prefills)
decode_traces = prepare_data(profile_json, decodes)
plot_title_suffix = make_plot_title_suffix(profile_json)
plot_trace_df(prefill_traces, plot_metric, "prefill " + plot_title_suffix,
output_directory / Path("prefill.png"))
plot_trace_df(decode_traces, plot_metric, "decodes " + plot_title_suffix,
output_directory / Path("decode_steps.png"))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--json-trace",
type=str,
required=True,
help="json trace file output by examples/offline_profile.py")
parser.add_argument("--output-directory",
type=str,
required=False,
help="Directory to output plots")
parser.add_argument("--level",
type=str,
default="module",
choices=["module", "kernel"])
parser.add_argument("--top-k",
type=int,
default=12,
help="Only graph the top `top_k` entries by time.")
parser.add_argument("--fold-json-node",
nargs='+',
default=['Sampler', 'LogitsProcessor'],
help='Do not plot the children of these nodes; let \
each node represent the aggregate of all its \
children')
parser.add_argument("--plot-metric",
type=str,
default="cuda_time_ms",
help='Metric to plot. Some options are cuda_time_ms, \
pct_cuda_time')
parser.add_argument(
"--step-plot-interval",
type=int,
default=4,
help="For every `step_plot_interval` steps, plot 1 step")
args = parser.parse_args()
# Prepare/Extract relevant args
make_names_unique = False
if args.level == "module":
depth = -2
make_names_unique = True
elif args.level == "kernel":
depth = -1
else:
raise Exception(f"Unexpected level value ({args.level})")
output_directory = args.output_directory if args.output_directory else Path(
args.json_trace).parent
if not os.path.exists(output_directory):
os.makedirs(output_directory)
main(Path(args.json_trace), output_directory, depth, args.plot_metric,
make_names_unique, args.top_k, args.fold_json_node)
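
For reference, a hypothetical invocation of the visualization script above (the script filename is an assumption, since it is not shown here); it would write prefill.png and decode_steps.png under the output directory:

python visualize_layerwise_profile.py \
    --json-trace profile.json \
    --output-directory ./plots \
    --level module \
    --top-k 10 \
    --plot-metric cuda_time_ms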


@@ -0,0 +1,419 @@
import argparse
import os
import sys
import time
import safetensors
import logging
import json
from huggingface_hub import split_torch_state_dict_into_shards, constants
from vllm import LLM
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.config import _get_and_verify_max_len
import transformers
from transformers.modeling_utils import SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from smooth_quant import generate_weights_of_smoothquant
from weight_only import generate_weights_of_weight_only
from utils_internal import (read_model_name, load_tokenizer, torch_dtype_to_str, str_dtype_to_torch,
copy_files_except_extensions, generate_datetime, get_hf_config_sliding_window)
from utils_internal import get_skip_patterns, should_skip
from model_special import smooth_model_config
from vllm.engine.arg_utils import EngineArgs
sys.path.append(os.getcwd())
logger = logging.getLogger("smooth_convert")
def load_skip_params_from_hf(args):
'''
load parameters from transformers that do not need to be quantized.
'''
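# Parameters matching the model's skip_patterns (see model_special.py, e.g. the
# qwen2_vl visual tower or the deepseek_v2 kv_b_proj) are loaded from the HF
# checkpoint unchanged and later copied verbatim into the quantized state dict.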
model_type = args.model_type
if not get_skip_patterns(model_type):
return {}
try:
model = getattr(transformers, args.model_name, None)
if model is None:
model = AutoModelForCausalLM
model = model.from_pretrained(
args.hf_model_dir,
trust_remote_code=True,
torch_dtype=args.torch_dtype,
device_map="cpu")
except Exception as e:
logger.fatal(f"Unsupported model {args.model_name}, error message: {e}")
sys.exit(1)
params_map = {}
hf_params = dict(model.named_parameters())
for name, param in hf_params.items():
if should_skip(model_type, name):
logger.info(f"load parameters from transformers, name: {name}")
params_map[name] = param
return params_map
def save_quantized_weights_to_safetensors(quantized_weights, args):
'''
save quantized_weights to safetensors format
'''
# Store the state_dict to file.
max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
state_dict_split = split_torch_state_dict_into_shards(quantized_weights,
filename_pattern=constants.SAFETENSORS_WEIGHTS_FILE_PATTERN,
max_shard_size=max_shard_size)
# Save the model
for shard_name, tensors in state_dict_split.filename_to_tensors.items():
shard = {tensor: quantized_weights[tensor] for tensor in tensors}
safetensors.torch.save_file(shard, os.path.join(args.output_dir, shard_name), metadata={"format": "pt"})
if state_dict_split.is_sharded:
index = {
"metadata": state_dict_split.metadata,
"weight_map": state_dict_split.tensor_to_filename,
}
save_index_file = os.path.join(args.output_dir, SAFE_WEIGHTS_INDEX_NAME)
with open(save_index_file, "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
f.write(content)
logger.info(
f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be "
f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where "
f"each parameters has been saved in the index located at {save_index_file}."
)
else:
logger.info(f"Model weights saved in {os.path.join(args.output_dir, SAFE_WEIGHTS_NAME)}")
def main(args):
'''
main quantization logic
'''
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=args.log_level,
force=True,
)
tik = time.time()
skip_params = load_skip_params_from_hf(args)
# Create an LLM.
max_model_len = max(args.max_input_length + args.output_len, 2048)
args.max_model_len = min(max_model_len, args.hf_max_model_len)
max_num_batched_tokens = max(max(args.max_input_length * args.batch_size, max_model_len), 2048)
args.max_num_batched_tokens = min(max_num_batched_tokens, args.hf_max_model_len)
llm = LLM(model=args.hf_model_dir,
tokenizer=args.tokenizer_dir,
tensor_parallel_size=args.tp_size,
distributed_executor_backend='ray',
dtype=args.dtype,
enforce_eager=args.enforce_eager,
trust_remote_code=True,
block_size=args.block_size,
max_model_len=args.max_model_len,
max_num_batched_tokens=args.max_num_batched_tokens,
max_num_seqs=args.max_num_seqs,
cpu_offload_gb=args.cpu_offload_gb)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
logger.info(f'Load vLLM model takes: {t}')
quantize_config = {}
if args.use_weight_only:
st_prefix = f"weight_{args.weight_only_precision}"
quantized_weights = generate_weights_of_weight_only(llm, args)
quantize_config['bits'] = 8 if args.weight_only_precision == "int8" else 4
quantize_config['quant_method'] = "weightonly"
quantize_config['quant_mode'] = "WeightOnly"
if args.use_smoothquant:
st_prefix = f"smoothquant_{args.smooth_value}"
quantized_weights, smooth_info = generate_weights_of_smoothquant(llm, args)
quantize_config['bits'] = 8
quantize_config['quant_method'] = "smoothquant"
quantize_config['quant_mode'] = "SmoothQuant"
quantize_config['input_quant_method'] = "per_token" if args.per_token else "per_tensor"
quantize_config['smooth_value'] = args.smooth_value
with open(os.path.join(args.output_dir, 'smooth_info.json'), 'w') as f:
json.dump(smooth_info, f, indent=4)
# Should first copy other files from hf_model_dir, and then save weight, tokenizer, config, quant_config and so on
extensions = ['.bin', '.safetensors', ".pt", ".index.json"]
copy_files_except_extensions(args.hf_model_dir, args.output_dir, extensions)
logger.info('copied files (except excluded extensions) successfully')
for name, param in skip_params.items():
assert name in quantized_weights
quantized_weights[name] = param
save_quantized_weights_to_safetensors(quantized_weights, args)
logger.info('saved quantized_weights to safetensors successfully')
with open(os.path.join(args.output_dir, 'quantize_config.json'), 'w') as f:
json.dump(quantize_config, f, indent=4)
from transformers.utils import CONFIG_NAME
with open(os.path.join(args.hf_model_dir, CONFIG_NAME), 'r') as f:
config = json.load(f)
config['quantization_config'] = quantize_config
config['generate_datetime'] = generate_datetime()
config['torch_dtype'] = args.dtype
with open(os.path.join(args.output_dir, CONFIG_NAME), 'w') as f:
json.dump(config, f, indent=4)
logger.info(f'quantized {args.hf_model_dir} finished')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--hf_model_dir', type=str, default=None)
parser.add_argument('--tokenizer_dir',
default=None,
help='tokenizer path; defaults to hf_model_dir if left unspecified')
parser.add_argument(
'--enforce_eager',
action="store_true",
default=True,
help='Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model '
'in eager mode. If False, we will use CUDA graph and eager execution in hybrid.')
parser.add_argument('--dtype',
type=str,
choices=['auto', 'float32', 'float16', 'bfloat16'],
default='auto',
help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
parser.add_argument('--scales_smooth_dtype',
type=str,
choices=['auto', 'float32', 'float16', 'bfloat16'],
default='auto',
help="if auto, scales and smooth weights use args.dtype, else use the setted dtype")
parser.add_argument(
'--eval_task',
type=str,
default='summarize',
choices=['summarize', 'summarize_long', 'code_completion', 'summarize_hg', 'text_generation', 'custom'],
help='''eval task to decide which dataset is selected. When set to custom, you must set these options
dataset_name, dataset_revision, dataset_input_key, dataset_split to specify which dataset to use''')
parser.add_argument("--dataset_cache_dir",
type=str,
default=None,
help="cache dir to load the hugging face dataset")
parser.add_argument("--dataset_name", type=str, default=None, help="custom dataset name")
parser.add_argument("--dataset_revision", type=str, default=None, help="custom dataset version")
parser.add_argument("--dataset_input_key", type=str, default=None, help="custom dataset field")
parser.add_argument("--dataset_split", type=str, default=None, help="custom dataset split")
parser.add_argument('--log_level', type=int, default=logging.INFO)
parser.add_argument('--num_samples', type=int, default=512, help='number of prompt samples')
parser.add_argument('--output_len',
type=int,
default=100,
help="Number of output sequences to return for the given prompt")
parser.add_argument('--max_input_length',
type=int,
default=512,
help='max input length of the prompt')
parser.add_argument('--block_size', type=int, default=-1, help='Token block size for contiguous chunks of tokens.')
parser.add_argument('--temperature', type=float, default=1.0)
parser.add_argument('--top_p', type=float, default=1.0)
parser.add_argument('--top_k', type=int, default=-1)
parser.add_argument('--repetition_penalty', type=float, default=1.0)
parser.add_argument('--max_num_seqs',
type=int,
default=EngineArgs.max_num_seqs,
help='Maximum number of sequences per iteration.')
parser.add_argument('--output_dir',
type=str,
default="output_dir",
help="The path to save the quantized checkpoint")
parser.add_argument(
"--max_shard_size",
type=str,
default="10GB",
help=("The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
"lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`)"),
)
parser.add_argument('--tp_size', type=int, default=1, help='N-way tensor parallelism size')
parser.add_argument('--pp_size', type=int, default=1, help='N-way pipeline parallelism size; currently only 1 is supported')
parser.add_argument('--use_smoothquant',
default=False,
action="store_true",
help='Apply smoothquant to generate weight')
parser.add_argument("--smooth_value",
type=float,
default=0.5,
help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
" to Smoothquant the model, and output int8 weights."
" A good first try is 0.5. Must be in [0, 1]")
parser.add_argument('--per_channel',
action="store_true",
default=False,
help='By default, we use a single static scaling factor for the GEMM\'s result. '
'per_channel instead uses a different static scaling factor for each channel. '
'The latter is usually more accurate, but a little slower.')
parser.add_argument(
'--per_token',
action="store_true",
default=False,
help='By default, we use a single static scaling factor to scale activations in the int8 range. '
'per_token chooses at run time, and for each token, a custom scaling factor. '
'The latter is usually more accurate, but a little slower.')
parser.add_argument('--use_weight_only',
default=False,
action="store_true",
help='Quantize weights for the various GEMMs to INT4/INT8. '
'See --weight_only_precision to set the precision')
parser.add_argument('--weight_only_precision',
const='int8',
type=str,
nargs='?',
default='int8',
choices=['int8', 'int4'],
help='Define the precision for the weights when using weight-only quantization. '
'You must also use --use_weight_only for that argument to have an impact.')
parser.add_argument(
'--has_qzeros',
action="store_true",
default=False,
help='whether to add qzeros weight to vllm_mlu weight',
)
parser.add_argument('--model_version',
type=str,
default=None,
help="Set model version to replace parsing from _name_or_path in hf config.")
parser.add_argument('--model_type',
type=str,
default=None,
help="Set model type to replace parsing from model_type in hf config."
"if set is None and parsed also None, then set as model_version")
parser.add_argument('--no_add_special_tokens',
dest='add_special_tokens',
default=True,
action='store_false',
help="Whether or not to add special tokens")
parser.add_argument(
'--has_prompt_token_id',
action="store_true",
default=False,
help='whether to give llm.generate prompt_token_id',
)
parser.add_argument(
'--disable_fused_quantize_expert',
action="store_true",
default=False,
help='''disable fused activation quantization, for unfused moe usage.
For fused_moe smoothquant, input_smooth has shape (hidden_size), act_smooth has shape (inner_size),
and not every expert is necessarily routed to, so by default we assume all experts use the same act_smooth.
You can use this option to disable that assumption.'''
)
parser.add_argument('--prompt_file',
type=str,
default=None,
help="custom prompt file, should has format that each line is one string prompt,"
"you can refer the format of summarize_1024_prompts.csv")
parser.add_argument(
'--batch_size',
type=int,
default=-1,
help="batch size, used to limit max_num_batched_tokens, -1 means batch_size equals to num_samples"
)
parser.add_argument(
'--cpu_offload_gb',
type=float,
default=0.0,
help='''The size (GiB) of CPU memory to use for offloading the model weights.
This virtually increases the GPU memory space you can use to hold the model weights,
at the cost of CPU-GPU data transfer for every forward pass.'''
)
parser.add_argument(
'--dump_prompt_token_ids',
action="store_true",
default=False,
help='dump prompt_token_ids used by llm.generate ',
)
parser.add_argument(
'--dump_input_ids',
action="store_true",
default=False,
help='dump the token ids seen by the vllm qkv projection while the llm is running',
)
parser.add_argument(
'--dump_act_range',
action="store_true",
default=False,
help='dump act range, which is the max hidden-dim value of input, output, and weight',
)
parser.add_argument(
'--dump_weights',
action="store_true",
default=False,
help='dump weights of the converted model',
)
parser.add_argument(
'--dump_generate_weights',
action="store_true",
default=False,
help='dump generate weights of the converted model',
)
args = parser.parse_args()
assert args.hf_model_dir, "Please set the model directory via --hf_model_dir"
assert args.pp_size == 1, "Pipeline parallelism is not supported."
if args.tokenizer_dir is None:
args.tokenizer_dir = args.hf_model_dir
if args.has_prompt_token_id is False:
args.dump_prompt_token_ids = False
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
args.hf_model_dir, args.model_version, args.model_type)
assert args.model_type in smooth_model_config, f'''{args.model_type} is not supported,
please add its information in model_special.py yourself'''
args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
hf_text_config = get_hf_text_config(args.hf_config)
args.tie_word_embeddings = getattr(hf_text_config, "tie_word_embeddings", False)
sliding_window_len = get_hf_config_sliding_window(hf_text_config)
disable_sliding_window = sliding_window_len is None
if args.model_type == 'qwen2_vl':
# workaround for qwen2_vl since _get_and_verify_max_len does not support MRoPE;
# remove this when it is supported.
args.hf_max_model_len = 32768
else:
if args.model_type == 'hunyuan' or args.model_type == 'deepseek_v2':
disable_sliding_window=False
args.hf_max_model_len = _get_and_verify_max_len(hf_text_config, None, disable_sliding_window, sliding_window_len)
if args.batch_size < 1:
args.batch_size = args.num_samples
args.batch_size = min(args.batch_size, args.num_samples)
if args.dtype == "auto":
args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
if args.scales_smooth_dtype == "auto":
args.scales_smooth_dtype = args.dtype
args.torch_dtype = str_dtype_to_torch(args.dtype)
args.torch_scales_smooth_dtype = str_dtype_to_torch(args.scales_smooth_dtype)
args.hf_config.torch_dtype = args.torch_dtype
args.tokenizer, args.pad_id, args.end_id = load_tokenizer(
tokenizer_dir=args.tokenizer_dir,
model_name=args.model_name,
model_version=args.model_version,
)
tik = time.time()
main(args)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
logger.info(f'Total time of converting checkpoints: {t}')
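
As a rough usage sketch only (the script filename is assumed from the logger name above, and the paths are placeholders):

python smooth_convert.py \
    --hf_model_dir /path/to/hf_model \
    --output_dir ./quantized_model \
    --use_smoothquant --smooth_value 0.5 \
    --dtype float16 --tp_size 1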


@@ -0,0 +1,69 @@
import os
import argparse
from transformers import (AutoModel, AutoModelForCausalLM,
AutoModelForSeq2SeqLM, GenerationConfig)
from vllm.transformers_utils.config import get_config
from utils_internal import (read_model_name, torch_dtype_to_str, str_dtype_to_torch)
from dump_smooth import save_weights
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--hf_model_dir', type=str, default=None)
parser.add_argument('--output_dir',
type=str,
default="output_dir",
help="The path to save the quantized checkpoint")
parser.add_argument('--model_version',
type=str,
default=None,
help="Set model version to replace parsing from _name_or_path in hf config.")
parser.add_argument('--model_type',
type=str,
default=None,
help="Set model type to replace parsing from model_type in hf config."
"if set is None and parsed also None, then set as model_version")
parser.add_argument('--dtype',
type=str,
choices=['auto', 'float32', 'float16', 'bfloat16'],
default='auto',
help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
parser.add_argument(
'--dump_weights',
action="store_true",
default=True,
help='dump weights of the converted model',
)
args = parser.parse_args()
assert args.hf_model_dir, "Please set the model directory via --hf_model_dir"
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
args.hf_model_dir, args.model_version, args.model_type)
args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
if args.dtype == "auto":
args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
args.torch_dtype = str_dtype_to_torch(args.dtype)
args.hf_config.torch_dtype = args.torch_dtype
if args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'glm':
auto_model_cls = AutoModelForSeq2SeqLM
elif args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'chatglm':
auto_model_cls = AutoModel
else:
auto_model_cls = AutoModelForCausalLM
model = auto_model_cls.from_pretrained(
args.hf_model_dir,
trust_remote_code=True,
torch_dtype=args.torch_dtype)
named_parameters = dict(model.named_parameters())
save_weights(named_parameters, args)


@@ -0,0 +1,145 @@
import torch
import os
import logging
logger = logging.getLogger(__name__)
def tensor_shape_to_string(tensor):
'''
convert a tensor shape to string description
'''
int_list = list(tensor.shape)
str_list = [str(num) for num in int_list]
str_shape = "x".join(str_list)
return str_shape
def save_prompt_token_ids(prompt_input_ids, args):
'''
save prompt_token_id
Args:
prompt_input_ids: prompt input ids passed to llm.generate
args: arguments from main
'''
if args.dump_prompt_token_ids is not True:
return
output_dir = os.path.join(args.output_dir, "prompt_input_ids")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
data_len = len(prompt_input_ids)
for data_index in range(data_len):
tensor = prompt_input_ids[data_index]
str_shape = tensor_shape_to_string(tensor)
file_path = os.path.join(output_dir, f"prompt_input_ids_{data_index}_{str_shape}.pt")
torch.save(tensor, file_path)
logger.info(f"Saved input_ids[{data_index}] to {file_path}")
def save_input_ids(input_ids, args):
'''
save input_ids
Args:
input_ids: input ids captured at the layer-0 qkv projection while the model is running
args: arguments from main
'''
id_len = len(input_ids)
if args.dump_input_ids is not True or id_len == 0:
return
output_dir = os.path.join(args.output_dir, "input_ids")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for data_index in range(id_len):
tensor = input_ids[data_index]
str_shape = tensor_shape_to_string(tensor)
file_path = os.path.join(output_dir, f"input_ids_{data_index}_{str_shape}.pt")
torch.save(tensor, file_path)
logger.info(f"Saved input_ids[{data_index}] to {file_path}")
def save_act_range(act_range, args):
'''
save act_range
Args:
act_range: save act_range collected when model running
args: arguments from main
'''
if args.dump_act_range is not True:
return
output_dir = os.path.join(args.output_dir, "act_range")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for layer_name, layer_scale in act_range.items():
for tensor_key, tensor_value in layer_scale.items():
if isinstance(tensor_value, torch.Tensor):
str_shape = tensor_shape_to_string(tensor_value)
file_name = f'{layer_name}_{tensor_key}_{str_shape}.pt'
file_path = os.path.join(output_dir, file_name)
torch.save(tensor_value, file_path)
logger.info(f"Saved act_range[{layer_name}][{tensor_key}] to {file_path}")
def save_weights(weights, args):
'''
save hugging face weights
Args:
weights: hugging face weights merged with llm model named parameters
args: arguments from main
'''
if args.dump_weights is not True:
return
output_dir = os.path.join(args.output_dir, "weights")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for tensor_key, tensor_value in weights.items():
str_shape = tensor_shape_to_string(tensor_value)
file_name = f'{tensor_key}_{str_shape}.pt'
file_path = os.path.join(output_dir, file_name)
torch.save(tensor_value, file_path)
logger.info(f"Saved weights[{tensor_key}] to {file_path}")
def save_generate_weights(weights, args):
'''
save quantized weights
Args:
weights: quantized weights of smoothquant or weightonly
args: arguments from main
'''
if args.dump_generate_weights is not True:
return
output_dir = os.path.join(args.output_dir, "generate_weights")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for tensor_key, tensor_value in weights.items():
str_shape = tensor_shape_to_string(tensor_value)
file_name = f'{tensor_key}_{str_shape}.pt'
file_path = os.path.join(output_dir, file_name)
torch.save(tensor_value, file_path)
logger.info(f"Saved generate weights[{tensor_key}] to {file_path}")
def dump_save_x_y(name, x, y, index):
'''
dump x, y during inference
output_dir needs to be modified by yourself
'''
output_dir = "output_dir"
x_output_dir = os.path.join(output_dir, "x_tensor")
y_output_dir = os.path.join(output_dir, "y_tensor")
if not os.path.exists(x_output_dir):
os.makedirs(x_output_dir)
if not os.path.exists(y_output_dir):
os.makedirs(y_output_dir)
x_file_name = os.path.join(x_output_dir, f"{name}_x_{index}.pt")
y_file_name = os.path.join(y_output_dir, f"{name}_y_{index}.pt")
if isinstance(x, tuple):
x = x[0]
if not os.path.exists(x_file_name):
torch.save(x.cpu(), x_file_name)
if not os.path.exists(y_file_name):
torch.save(y.cpu(), y_file_name)


@@ -0,0 +1,140 @@
import torch
def make_context(
tokenizer,
query,
history,
system,
max_input_length,
max_window_size: int = 6144,
chat_format: str = "chatml",
):
'''
tokenize one text context to tokenized id
args:
tokenizer: model tokenizer
query: current text context
history: history text context
system: system prompt
max_input_length: max input length of tokenized id
chat_format: chat format, only accept chatml and raw
'''
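# ChatML layout: each turn is wrapped as <|im_start|>{role}\n{content}<|im_end|>;
# history turns are prepended newest-first until max_window_size would be exceeded,
# and the final token list is truncated from the front to max_input_length.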
if history is None:
history = []
if chat_format == "chatml":
im_start, im_end = "<|im_start|>", "<|im_end|>"
im_start_tokens = [tokenizer.im_start_id]
im_end_tokens = [tokenizer.im_end_id]
nl_tokens = tokenizer.encode("\n")
def _tokenize_str(role, content):
'''
tokenize a role/content string pair
'''
return (f"{role}\n{content}", tokenizer.encode(
role,
allowed_special=set(),
) + nl_tokens + tokenizer.encode(
content,
allowed_special=set(),
))
system_text, system_tokens_part = _tokenize_str("system", system)
system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
raw_text = ""
context_tokens = []
for turn_query, turn_response in reversed(history):
query_text, query_tokens_part = _tokenize_str("user", turn_query)
query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
prev_chat = (f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}")
current_context_size = (len(system_tokens) + len(next_context_tokens) + len(context_tokens))
if current_context_size < max_window_size:
context_tokens = next_context_tokens + context_tokens
raw_text = prev_chat + raw_text
else:
break
context_tokens = system_tokens + context_tokens
raw_text = f"{im_start}{system_text}{im_end}" + raw_text
context_tokens += (nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens +
im_start_tokens + tokenizer.encode("assistant") + nl_tokens)
raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
elif chat_format == "raw":
raw_text = query
context_tokens = tokenizer.encode(raw_text)
else:
raise NotImplementedError(f"Unknown chat format {chat_format!r}")
# truncate to max_input_length, truncate from the front
return raw_text, context_tokens[-max_input_length:]
def prepare_inputs(batch_input_texts,
tokenizer,
model_name,
model_version,
test_token_num,
eval_task='summarize',
add_special_tokens=True):
'''
tokenize batch input texts into tokenized id.
args:
batch_input_texts: batch input text, also named batched prompt
tokenizer: model tokenizer
model_name: model name
model_version: model version
test_token_num: maximum number of input tokens per prompt
eval_task: eval task
add_special_tokens: whether to add_special_tokens, default True
'''
batch_size = len(batch_input_texts)
append_str = ' TL;DR: ' if eval_task == 'summarize' else ''
batch_input_ids = []
for i in range(batch_size):
curr_text = batch_input_texts[i] + append_str
curr_text = curr_text.strip().replace(" n't", "n't")
# The below lines are used to be compatible with the original code
if 'GLM' in model_name and model_version in ['chatglm2', 'chatglm3']:
input_ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0)
input_ids = input_ids[:test_token_num]
elif 'qwen' in model_name.lower() and model_version == 'qwen':
# use make_content to generate prompt
system_prompt = "You are a useful assistant, please directly output the corresponding " + \
"summary according to the article entered by the user."
_, input_id_list = make_context(
tokenizer=tokenizer,
query=curr_text,
history=[],
system=system_prompt,
max_input_length=test_token_num,
)
input_ids = torch.tensor(input_id_list)
else:
if 'qwen' in model_name.lower() and 'qwen2' in model_version:
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": curr_text
}]
curr_text = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer.encode(curr_text,
return_tensors='pt',
add_special_tokens=add_special_tokens,
truncation=True,
max_length=test_token_num).squeeze(0)
batch_input_ids.append(input_ids)
return batch_input_ids


@@ -0,0 +1,206 @@
import re
# model_type, qkv_list, gate_up_list, is_gate_up
smooth_model_config = {
"mllama": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"llama": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"qwen2_vl": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None,
"skip_patterns": [r"^visual\.*"]
},
"qwen2": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"qwen": {
"qkv_list": ["c_attn"],
"gate_up_list": ["w2", "w1"],
"is_gate_up": True,
"moe_list": None
},
"baichuan": {
"qkv_list": ["W_pack"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"chatglm": {
"qkv_list": ["query_key_value"],
"gate_up_list": ["dense_h_to_4h"],
"is_gate_up": True,
"moe_list": None
},
"gpt_neox": {
"qkv_list": ["query_key_value"],
"gate_up_list": [],
"is_gate_up": True,
"moe_list": None
},
"mixtral": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["w1", "w3"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
"down_list": ["block_sparse_moe.w2", "w2"],
"is_merged": True
}
},
"qwen2_moe": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
"down_list": ["mlp.w2", "down_proj"],
"is_merged": True
}
},
"deepseek_v2": {
"qkv_list": ["q_proj", "q_b_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
"down_list": ["mlp.w2", "down_proj"],
"is_merged": True
},
"skip_patterns": [r".*\.kv_b_proj\..*",]
},
"falcon": {
"qkv_list": ["query_key_value"],
"gate_up_list": ["dense_h_to_4h"],
"is_gate_up": True,
"moe_list": None
},
"bloom": {
"qkv_list": ["query_key_value"],
"gate_up_list": ["dense_h_to_4h"],
"is_gate_up": False,
"moe_list": None
},
"internlm2": {
"qkv_list": ["wqkv"],
"gate_up_list": ["gate_up_proj"],
"is_gate_up": True,
"moe_list": None
},
"hunyuan": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
"down_list": ["mlp.w2", "down_proj"],
"is_merged": True
}
},
"phi3": {
"qkv_list": ["qkv_proj"],
"gate_up_list": ["gate_up_proj"],
"is_gate_up": True,
"moe_list": None
},
}
def get_layer_weight_bias_name(model_type, layer_name):
'''
Specially handle the case where layer_name and the weight/bias name differ,
or the case where the weight/bias name is not {layer_name}.weight/bias, such as:
if model_type == "chatglm" and "output_layer" in layer_name:
layer_name = "lm_head"
weight_name = f"{layer_name}_weight"
bias_name = f"{layer_name}_bias"
Since vllm 0.5.3, vllm has followed this rule, so no special layer needs to be modified.
'''
weight_name = None
bias_name = None
# layers which need to be modified can be listed here
if model_type == "hunyuan" and "lm_head" in layer_name:
layer_name = "model.embed_tokens"
weight_name = "model.embed_tokens.weight"
bias_name = "model.embed_tokens.bias"
if weight_name is None:
weight_name = f"{layer_name}.weight"
if bias_name is None:
bias_name = f"{layer_name}.bias"
return layer_name, weight_name, bias_name
def modify_layer_weight_bias_name(model_type, named_parameters):
'''
handle the special cases where the vllm layer name is not the same as the hf layer name
'''
# Mapping for model type specific adjustments
mapping = {
"chatglm": {
"transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
},
}
if model_type in mapping:
for old_key, new_key in mapping[model_type].items():
if old_key in named_parameters:
named_parameters[new_key] = named_parameters.pop(old_key)
def extract_numbers(string):
'''
extract the numbers from a string and return the last one (or 0 if there are none)
'''
# use a regular expression to find all numeric parts in the string
matches = re.findall(r'\d+', string)
# convert all matched numeric parts to integers
numbers = [int(match) for match in matches]
return numbers[-1] if len(numbers) > 0 else 0
def get_qkv_distribution(model_type, model_version, hf_config):
'''
Get qkv distribution: n3sh or 3nsh
n3sh: [head_num, 3, head_size, hidden_size]
3nsh: [3, head_num, head_size, hidden_size]
vllm's default qkv distribution is 3nsh, so n3sh model info needs to be provided here; the tool will convert 3nsh to n3sh
to match the hugging face qkv distribution.
This only applies to a packed qkv layer whose distribution is n3sh.
'''
is_n3sh = False
head_num = 0
kv_head_num = 0
if (model_type == "chatglm" and extract_numbers(model_version) == 0) or model_type in ["bloom", "gpt_neox"]:
is_n3sh = True
head_num = hf_config.num_attention_heads
kv_head_num = head_num
if model_type == "falcon":
is_n3sh = True
head_num = hf_config.num_attention_heads
if hf_config.new_decoder_architecture:
kv_head_num = hf_config.num_kv_heads
elif hf_config.multi_query:
kv_head_num = 1
else:
kv_head_num = head_num
return is_n3sh, head_num, kv_head_num
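
For reference, a hypothetical sketch of how a new llama-style model could be registered in smooth_model_config; the model_type key and skip pattern below are illustrative assumptions, not values from this commit:

smooth_model_config["my_new_model"] = {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": None,
    # optional: parameters matching these regexes are left unquantized
    "skip_patterns": [r"^vision_tower\..*"],
}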


@@ -0,0 +1,418 @@
import argparse
import torch
from datasets import load_dataset
import logging
import csv
import os
from vllm import LLM, SamplingParams
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
from input_context import prepare_inputs
from dump_smooth import save_prompt_token_ids, save_input_ids, save_act_range, save_weights, save_generate_weights
from model_special import smooth_model_config
logger = logging.getLogger(__name__)
def load_prompts_from_csv(args):
'''
load prompts from csv file
'''
if args.prompt_file is not None:
prompt_file = args.prompt_file
else:
current_dir = os.path.dirname(__file__)
prompt_file = os.path.join(current_dir, 'summarize_1024_prompts.csv')
# load data from the CSV file as a list
loaded_prompts = []
# read data from the column-oriented CSV file and convert it to a list
with open(prompt_file, 'r', newline='') as file:
reader = csv.reader(file)
loaded_prompts = list(zip(*reader))[0]
loaded_prompts = list(loaded_prompts)
num_samples = min(args.num_samples, len(loaded_prompts))
prompts = loaded_prompts[0:num_samples]
return prompts
def save_summarize_1024_prompts_as_csv(prompts):
'''
save the summarize 1024 prompts
'''
# save the list data column-wise as a CSV file
# transpose the list
transposed_prompts = [prompts]
with open('summarize_1024_prompts.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(zip(*transposed_prompts))
def generate_prompts(args: argparse.Namespace):
'''
Generate prompts based on the evaluation task and arguments.
'''
eval_task_config = {
"code_completion": {
"dataset_name": "openai_humaneval",
"dataset_revision": None,
"dataset_input_key": "prompt",
"dataset_split": "test"
},
"summarize": {
"dataset_name": "ccdv/cnn_dailymail",
"dataset_revision": "3.0.0",
"dataset_input_key": "article",
"dataset_split": "train"
},
"summarize_long": {
"dataset_name": "tau/zero_scrolls",
"dataset_revision": "squality",
"dataset_input_key": "input",
"dataset_split": "validation"
},
"summarize_hg": {
"dataset_name": "cnn_dailymail",
"dataset_revision": "3.0.0",
"dataset_input_key": "article",
"dataset_split": "validation"
},
"text_generation": {
"dataset_name": "lambada",
"dataset_revision": None,
"dataset_input_key": "text",
"dataset_split": "validation"
}
}
if args.eval_task in eval_task_config:
config = eval_task_config[args.eval_task]
dataset_name = config["dataset_name"]
dataset_revision = config["dataset_revision"]
dataset_input_key = config["dataset_input_key"]
dataset_split = config["dataset_split"]
else:
assert args.dataset_name is not None, f"dataset_name is None when eval_task == custom"
assert args.dataset_input_key is not None, f"dataset_input_key is None when eval_task == custom"
assert args.dataset_split is not None, f"dataset_split is None when eval_task == custom"
dataset_name = args.dataset_name
dataset_revision = args.dataset_revision
dataset_input_key = args.dataset_input_key
dataset_split = args.dataset_split
if args.prompt_file is not None or (args.eval_task == "summarize" and args.num_samples <= 1024):
prompts = load_prompts_from_csv(args)
num_samples = min(args.num_samples, len(prompts))
else:
dataset = load_dataset(dataset_name,
dataset_revision,
cache_dir=args.dataset_cache_dir,
split=dataset_split,
trust_remote_code=True)
num_samples = min(args.num_samples, len(dataset))
prompts = dataset[0:num_samples][dataset_input_key]
# save_summarize_1024_prompts_as_csv(prompts)
prompt_token_ids = []
if args.has_prompt_token_id:
batch_input_ids = prepare_inputs(prompts,
args.tokenizer,
args.model_name,
args.model_version,
args.max_input_length,
eval_task=args.eval_task,
add_special_tokens=args.add_special_tokens)
save_prompt_token_ids(batch_input_ids, args)
for i in range(num_samples):
prompt_token_ids.append(batch_input_ids[i].tolist())
if len(prompts) == 0:
prompts = None
else:
prompts = [s[:args.max_input_length] for s in prompts]
if len(prompt_token_ids) == 0:
prompt_token_ids = None
return prompts, prompt_token_ids
@torch.no_grad()
def get_smooth_cal_weight(name, weight, name_parameters, act_range, model_type):
'''
get cal_weight for the smoothing process, to handle the case where q/k/v or gate/up layers are merged in vllm
args:
name: weight name
weight: weight value
name_parameters: named parameters
act_range: layer act range info of name
model_type: model type
'''
if act_range["is_qkv"] is True:
name_parts = name.split(".")
self_attn_layer_name = ".".join(name_parts[:-2])
qkv_list = smooth_model_config[model_type]["qkv_list"]
q_weight_name = f"{self_attn_layer_name}.{qkv_list[0]}.weight"
k_weight_name = f"{self_attn_layer_name}.{qkv_list[1]}.weight"
v_weight_name = f"{self_attn_layer_name}.{qkv_list[2]}.weight"
q_weight = name_parameters[q_weight_name]
k_weight = name_parameters[k_weight_name]
v_weight = name_parameters[v_weight_name]
cal_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
elif act_range["is_merge"] is True:
name_parts = name.split(".")
mlp_layer_name = ".".join(name_parts[:-2])
gate_up_list = smooth_model_config[model_type]["gate_up_list"]
gate_weight_name = f"{mlp_layer_name}.{gate_up_list[0]}.weight"
up_weight_name = f"{mlp_layer_name}.{gate_up_list[1]}.weight"
gate_weight = name_parameters[gate_weight_name]
up_weight = name_parameters[up_weight_name]
cal_weight = torch.cat([gate_weight, up_weight], dim=0)
else:
cal_weight = weight
return cal_weight
@torch.no_grad()
def cal_smoother(weight, act_range_x, alpha=0.5):
'''
calculate smoother value
args:
weight: smoother weight
act_range_x: activation max value of per channel
alpha: smooth factor, default 0.5
'''
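# SmoothQuant smoothing factor per input channel j:
#   s_j = max|X_j|^alpha / max|W_j|^(1 - alpha)
# Activations are later divided by s and weights multiplied by s, so the matmul
# result is unchanged while activation outliers are migrated into the weights.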
assert weight.shape[-1] == act_range_x.numel()
weight_scales = weight.view(-1, weight.shape[-1])
weight_scales = weight_scales.abs().max(dim=0)[0]
weight_scales = weight_scales.to(float).clamp(min=1e-6)
smoother = (act_range_x.to(weight_scales.device).to(float).pow(alpha) /
weight_scales.pow(1 - alpha)).clamp(min=1e-6)
return smoother
@torch.no_grad()
def cal_qweight_scales(sweight, smooth_act_range_x, per_token, per_channel):
'''
calculate quantized weight and scales
args:
sweight: weight after the smoother has been applied (weight * smoother)
smooth_act_range_x: activation max value which has been divided by the smoother value
per_token: bool, whether the activation scales are computed dynamically per token at runtime
per_channel: bool, whether the weight scales are computed per output channel
'''
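# Symmetric int8 quantization: each scale maps an observed abs-max to 127.
# scale_x_quant_orig_t is the per-tensor activation scale, scale_w_quant_orig_c the
# per-output-channel weight scale; the weight is divided by its scale and clipped to
# int8, and scale_to_int (= 127 / activation max) re-quantizes activations at runtime.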
scale_x_quant_orig_t = smooth_act_range_x.max() / 127.0
smooth_act_range_w = sweight.abs().max(dim=-1)[0]
smooth_act_range_w = smooth_act_range_w.to(float).clamp(min=1e-6)
scale_w_quant_orig_c = smooth_act_range_w / 127.0
scale_w_quant_orig_t = smooth_act_range_w.max() / 127
if per_channel:
qweight = (sweight / scale_w_quant_orig_c[..., None])
else:
qweight = (sweight / scale_w_quant_orig_t)
qweight = qweight.clip(-128, 127).to(torch.int8)
scale_to_int = 1 / scale_x_quant_orig_t
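# per_token=True: activations are quantized dynamically at runtime, so only the
# weight dequant scale (per channel or per tensor) is exported below.
# per_token=False: the static activation scale is folded into per_channel_scale,
# and scale_to_int (= 1 / activation scale) is used to quantize activations.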
if per_token:
if per_channel:
per_channel_scale = scale_w_quant_orig_c
else:
per_channel_scale = scale_w_quant_orig_t
else:
if per_channel:
per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_c
hidden_size = smooth_act_range_x.numel()
scale_to_int = scale_to_int.repeat(hidden_size)
else:
per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_t
per_channel_scale = per_channel_scale.squeeze()
if per_channel_scale.numel() == 1 and per_channel_scale.dim() == 0:
per_channel_scale = per_channel_scale.unsqueeze(0)
if scale_to_int.numel() == 1 and scale_to_int.dim() == 0:
scale_to_int = scale_to_int.unsqueeze(0)
sinfo = [
scale_w_quant_orig_t.item(), scale_x_quant_orig_t.item(),
scale_w_quant_orig_t.item() / scale_x_quant_orig_t.item()
]
return qweight, per_channel_scale, scale_to_int, sinfo
def check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int):
'''
check whether nan/inf appears in qweight, per_channel_scale, smooth, qzeros, scale_to_int
'''
if torch.isinf(qweight).any() or torch.isnan(qweight).any():
logger.error(f"name:{name} qweight has inf or nan")
if torch.isinf(per_channel_scale).any() or torch.isnan(per_channel_scale).any():
logger.error(f"name:{name} per_channel_scale has inf or nan")
if torch.isinf(smooth).any() or torch.isnan(smooth).any():
logger.error(f"name:{name} smooth has inf or nan")
if torch.isinf(scale_to_int).any() or torch.isnan(scale_to_int).any():
logger.error(f"name:{name} scale_to_int has inf or nan")
if qzeros is not None and (torch.isinf(qzeros).any() or torch.isnan(qzeros).any()):
logger.error(f"name:{name} qzeros has inf or nan")
@torch.no_grad()
def cal_smooth_weight(name, act_range_x, weight, smooth_value, has_qzeros, per_token, per_channel, cal_weight):
'''
calculate qweight, scales, smooth, qzeros
args:
name: weight name
act_range_x: per-channel activation max value
weight: weight to be quantized
smooth_value: smooth value
has_qzeros: whether to generate the qzeros weight
per_token: bool, whether the activation scales are computed dynamically per token
per_channel: bool, whether the weight scales are computed per output channel
cal_weight: calibration weight used to compute the smoother
'''
smoother = cal_smoother(cal_weight, act_range_x, smooth_value)
smooth_act_range_x = act_range_x / smoother
sweight = weight * (smoother.view(1, -1))
qweight, per_channel_scale, scale_to_int, sinfo = cal_qweight_scales(sweight, smooth_act_range_x, per_token,
per_channel)
qweight = qweight.reshape(weight.shape)
smooth = 1 / smoother
smooth = smooth.squeeze()
if has_qzeros:
qzeros = torch.zeros_like(per_channel_scale, dtype=torch.int32)
else:
qzeros = None
# check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int)
return qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo
@torch.no_grad()
def generate_smooth_weight(act_range, name_parameters, args):
'''
generate smooth weight
args:
act_range: act_range collected while running the model
name_parameters: hugging face model named parameters
args: argument from main
'''
smooth_weight = {}
smooth_info = {}
has_qzeros = args.has_qzeros
smooth_value = args.smooth_value
smooth_info["title"] = ["max_scale_w, max_scale_x, max_scale_w/max_scale_x"]
for name, param in name_parameters.items():
if should_skip(args.model_type, name):
logger.info(f"skip {name}")
smooth_weight[name] = param
continue
if name.endswith("bias"):
smooth_weight[name] = param
continue
name_parts = name.split(".")
layer_name = ".".join(name_parts[:-1])
if layer_name in act_range:
act_range_x = act_range[layer_name]['x']
cal_weight = get_smooth_cal_weight(name, param, name_parameters, act_range[layer_name], args.model_type)
qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo = cal_smooth_weight(
name, act_range_x, param, smooth_value, has_qzeros, args.per_token, args.per_channel, cal_weight)
per_channel_scale = per_channel_scale.to(args.torch_scales_smooth_dtype)
smooth = smooth.to(args.torch_scales_smooth_dtype)
scale_to_int = scale_to_int.to(args.torch_scales_smooth_dtype)
smooth_weight[f'{layer_name}.qweight'] = qweight
smooth_weight[f'{layer_name}.per_channel_scale'] = per_channel_scale
if args.per_token is True:
smooth_weight[f'{layer_name}.smooth'] = smooth
else:
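# static quantization: fold the smoothing factor into the activation quant
# scale, so applying scale_to_int to raw activations smooths and quantizes them in one step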
scale_to_int = scale_to_int * smooth
smooth_weight[f'{layer_name}.scale_to_int'] = scale_to_int
if has_qzeros:
smooth_weight[f'{layer_name}.qzeros'] = qzeros
smooth_info[name] = sinfo
else:
smooth_weight[name] = param
return smooth_weight, smooth_info
def generate_weights_of_smoothquant(llm: LLM, args: argparse.Namespace):
'''
generate smoothquant weights
args:
llm: LLM instance
args: argument from main
'''
prompts, prompt_token_ids = generate_prompts(args)
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=args.output_len,
repetition_penalty=args.repetition_penalty,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k)
tp_size = args.tp_size
llm.llm_engine.model_executor._run_workers("setup_smooth_hook", args.dump_input_ids)
llm.generate(prompts, sampling_params, prompt_token_ids=prompt_token_ids, use_tqdm=True)
logger.info("llm generate finished")
llm.llm_engine.model_executor._run_workers("remove_hooks")
act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")
vllm_cleanup(llm)
del prompts
del prompt_token_ids
cleanup()
logger.info("get act_range and named_parameters from llm finished")
merged_act_range, merged_named_parameters, input_id_list = convert_to_merged(act_range, named_parameters, tp_size,
args)
save_input_ids(input_id_list, args)
save_act_range(merged_act_range, args)
save_weights(merged_named_parameters, args)
del act_range
del named_parameters
cleanup()
logger.info("get merged_act_range and merged_named_parameters finished")
smooth_weight, smooth_info = generate_smooth_weight(merged_act_range, merged_named_parameters, args)
save_generate_weights(smooth_weight, args)
del merged_act_range
del merged_named_parameters
cleanup()
logger.info("get smooth_weight finished")
return smooth_weight, smooth_info

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,713 @@
from collections import defaultdict, OrderedDict
import torch
from pathlib import Path
from typing import Optional
import re
import os
import shutil
import logging
import json
from transformers import AutoTokenizer, T5Tokenizer
import gc
from datetime import datetime
from vllm.platforms import current_platform
from model_special import (smooth_model_config, get_layer_weight_bias_name, get_qkv_distribution,
modify_layer_weight_bias_name)
logger = logging.getLogger(__name__)
_str_to_torch_dtype_dict = dict(
bfloat16=torch.bfloat16,
float16=torch.float16,
float32=torch.float32,
int64=torch.int64,
int32=torch.int32,
int8=torch.int8,
bool=torch.bool,
fp8=torch.float8_e4m3fn,
)
def str_dtype_to_torch(dtype):
'''
convert str dtype to torch dtype
'''
ret = _str_to_torch_dtype_dict.get(dtype)
dtype = ret if ret is not None else torch.float16
return dtype
_torch_dtype_to_str_dict = {
torch.bfloat16:"bfloat16",
torch.float16:"float16",
torch.float32:"float32",
torch.int64:"int64",
torch.int32:"int32",
torch.int8:"int8",
torch.bool:"bool",
torch.float8_e4m3fn:"fp8",
}
def torch_dtype_to_str(dtype):
'''
convert torch dtype to str dtype
'''
ret = _torch_dtype_to_str_dict.get(dtype)
dtype = ret if ret is not None else "float16"
return dtype
def extract_model_path(name_or_path):
'''
extract model_version and model_family from the name_or_path read from config.json
'''
patterns = [
r"/(.*)(-[0-9]+[mMbB]{1})(-*.*)",
r"/(.*-[0-9]+)(-*.*)",
r"(.*)(-[0-9]+[mMbB]{1})(-*.*)",
r"(.*-[0-9]+)(-*.*)",
r"([^-]+)(-*.*)",
]
model_version = None
for pattern in patterns:
match = re.search(pattern, name_or_path)
if match:
model_version = match.group(1)
break
if model_version is None:
model_version = name_or_path
model_version = model_version.lower()
match = re.search(r"([a-zA-Z]+)(.*)", model_version)
if match:
model_family = match.group(1)
else:
model_family = model_version
return model_version, model_family
def read_model_name(model_dir: str, model_version: Optional[str] = None, model_type: Optional[str] = None):
'''
get model_arch, model_version, model_family and model_type from config.json and from the model_version/model_type passed in
args:
model_dir: model directory
model_version: passed from main, default None
model_type: pass from main, default None
'''
with open(Path(model_dir) / "config.json", 'r') as f:
config = json.load(f)
model_arch = config.get('architectures', None)
name_or_path = config.get('_name_or_path', None)
if model_type is None:
model_type = config.get('model_type', None)
if model_type:
model_type = model_type.lower()
model_family = None
if model_version is None and name_or_path:
model_version, model_family = extract_model_path(name_or_path)
if model_version is None:
model_version = model_type
if model_version:
model_version = model_version.lower()
if model_version and model_family is None:
match = re.search(r"([a-zA-Z]+)(.*)", model_version)
if match:
model_family = match.group(1)
else:
model_family = model_version
if isinstance(model_arch, (list, tuple)) and len(model_arch) > 0:
model_arch = model_arch[0]
assert model_arch, "read model architectures failed"
assert model_version, "read model version failed, please set args.version manually"
assert model_family, "read model family failed, please set args.version manually"
return model_arch, model_version, model_family, model_type
def load_tokenizer(tokenizer_dir: Optional[str] = None,
vocab_file: Optional[str] = None,
model_name: str = 'GPTForCausalLM',
model_version: Optional[str] = None,
tokenizer_type: Optional[str] = None):
'''
load tokenizer of model
args:
tokenizer_dir: tokenizer directory
vocab_file: vocabulary file, default None
model_name: model name
model_version: model version
tokenizer_type: Tokenizer type to be loaded.
'''
if vocab_file is None:
use_fast = True
if tokenizer_type == "llama":
use_fast = False
# Should set both padding_side and truncation_side to be 'left'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
legacy=False,
padding_side='left',
truncation_side='right',
trust_remote_code=True,
tokenizer_type=tokenizer_type,
use_fast=use_fast)
elif model_name == 'GemmaForCausalLM':
from transformers import GemmaTokenizer
# Initialize tokenizer from vocab file.
tokenizer = GemmaTokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
else:
# For gpt-next, directly load from tokenizer.model
tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
if model_name == 'QWenForCausalLM':
with open(Path(tokenizer_dir) / "generation_config.json") as f:
gen_config = json.load(f)
chat_format = gen_config['chat_format']
assert chat_format in ('raw','chatml'), f"unknown chat format: {chat_format}"
pad_id = gen_config['pad_token_id']
end_id = gen_config['eos_token_id']
elif model_name in ('ChatGLMForCausalLM', 'glm'):
pad_id = tokenizer.pad_token_id
end_id = tokenizer.eop_token_id
else:
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
pad_id = tokenizer.pad_token_id
end_id = tokenizer.eos_token_id
try:
tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
logger.warning(f"set pad_token with exception:{e}")
return tokenizer, pad_id, end_id
def merge_qkv_weight(named_parameters, weight_name, tp_size, q_proj_size, num_kv_head_replicas):
'''
merge the tensor-parallel qkv weight into non parallel q_weight, k_weight and v_weight.
merging the qkv bias follows the same logic
args:
named_parameters: parallel named parameters
weight_name: qkv layer weight name
tp_size: tensor parallel size
q_proj_size: query projection size
num_kv_head_replicas: number kv head replicas
'''
qkv_proj_size = named_parameters[0][weight_name].shape[0]
kv_proj_size = (qkv_proj_size - q_proj_size) // 2
split_size = [q_proj_size, kv_proj_size, kv_proj_size]
q_weight_list = []
k_weight_list = []
v_weight_list = []
for rank in range(0, tp_size):
weight = named_parameters[rank][weight_name]
split_weight = torch.split(weight, split_size, dim=0)
q_weight_list.append(split_weight[0])
if rank % num_kv_head_replicas == 0:
k_weight_list.append(split_weight[1])
v_weight_list.append(split_weight[2])
q_weight = torch.cat(q_weight_list, dim=0)
k_weight = torch.cat(k_weight_list, dim=0)
v_weight = torch.cat(v_weight_list, dim=0)
return q_weight, k_weight, v_weight
def merge_merged_weight(named_parameters, weight_name, tp_size, dim=0):
'''
merge the tensor-parallel merged (gate_up) linear layer weight into gate_weight and up_weight.
merging the merged bias follows the same logic.
args:
named_parameters: parallel named parameters
weight_name: merged (gate_up) layer weight name
tp_size: tensor parallel size
'''
up_weight_list = []
gate_weight_list = []
for rank in range(0, tp_size):
weight = named_parameters[rank][weight_name]
chunk_weights = torch.chunk(weight, 2, dim=dim)
gate_weight_list.append(chunk_weights[0])
up_weight_list.append(chunk_weights[1])
gate_weight = torch.cat(gate_weight_list, dim=dim)
up_weight = torch.cat(up_weight_list, dim=dim)
return gate_weight, up_weight
def convert_packed_qkv(q_weight, k_weight, v_weight, dim, args):
'''
convert packed qkv weight or bias
args:
q_weight: q weight or bias
k_weight: k weight or bias
v_weight: v weight or bias
dim: convert dim
args: argument
'''
packed_qkv = torch.cat([q_weight, k_weight, v_weight], dim=dim)
is_n3sh, head_num, kv_head_num = get_qkv_distribution(args.model_type, args.model_version, args.hf_config)
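# n3sh layout: instead of packing as [all q | all k | all v], the heads are
# interleaved per kv group as (num_query_heads_per_kv_head q heads, 1 k head, 1 v head)
# repeated for every kv head.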
if is_n3sh is True:
packed_qkv_shape = packed_qkv.shape
num_query_heads_per_kv_head = head_num // kv_head_num
q_shape = q_weight.shape
k_shape = k_weight.shape
v_shape = v_weight.shape
q = q_weight.view(q_shape[:dim] + (kv_head_num, num_query_heads_per_kv_head, -1) + q_shape[dim + 1:])
k = k_weight.view(k_shape[:dim] + (kv_head_num, 1, -1) + k_shape[dim + 1:])
v = v_weight.view(v_shape[:dim] + (kv_head_num, 1, -1) + v_shape[dim + 1:])
tensor_n3sh = torch.cat([q, k, v], dim=dim+1)
packed_qkv = tensor_n3sh.reshape(packed_qkv_shape)
return packed_qkv
def convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
layer_range, merged_act_range, tp_size, args):
'''
convert parallel qkv named parameters to non parallel qkv named parameters
args:
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
args: argument
'''
layer_name_parts = layer_name.split(".")
self_attn_layer_name = ".".join(layer_name_parts[:-1])
qkv_name = layer_name_parts[-1]
q_weight, k_weight, v_weight = merge_qkv_weight(named_parameters, weight_name, tp_size, layer_range["q_proj_size"],
layer_range["num_kv_head_replicas"])
qkv_list = smooth_model_config[args.model_type]["qkv_list"]
qkv_list_len = len(qkv_list)
if qkv_list_len == 3:
q_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
k_layer_name = f"{self_attn_layer_name}.{qkv_list[1]}"
v_layer_name = f"{self_attn_layer_name}.{qkv_list[2]}"
elif qkv_list_len == 1:
qkv_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
if qkv_list_len == 3:
merged_act_range[q_layer_name]["x"] = layer_range["x"]
merged_act_range[k_layer_name]["x"] = layer_range["x"]
merged_act_range[v_layer_name]["x"] = layer_range["x"]
merged_act_range[q_layer_name]["is_qkv"] = True
merged_act_range[k_layer_name]["is_qkv"] = True
merged_act_range[v_layer_name]["is_qkv"] = True
merged_named_parameters[f"{q_layer_name}.weight"] = q_weight
merged_named_parameters[f"{k_layer_name}.weight"] = k_weight
merged_named_parameters[f"{v_layer_name}.weight"] = v_weight
elif qkv_list_len == 1:
merged_act_range[qkv_layer_name]["x"] = layer_range["x"]
qkv_weight = convert_packed_qkv(q_weight, k_weight, v_weight, 0, args)
merged_named_parameters[f"{qkv_layer_name}.weight"] = qkv_weight
if bias_name in named_parameters[0]:
q_bias, k_bias, v_bias = merge_qkv_weight(named_parameters, bias_name, tp_size, layer_range["q_proj_size"],
layer_range["num_kv_head_replicas"])
if qkv_list_len == 3:
merged_named_parameters[f"{q_layer_name}.bias"] = q_bias
merged_named_parameters[f"{k_layer_name}.bias"] = k_bias
merged_named_parameters[f"{v_layer_name}.bias"] = v_bias
elif qkv_list_len == 1:
qkv_bias = convert_packed_qkv(q_bias, k_bias, v_bias, 0, args)
merged_named_parameters[f"{qkv_layer_name}.bias"] = qkv_bias
return qkv_name
def convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
layer_range, merged_act_range, tp_size, model_type):
'''
convert parallel merged named parameters to non parallel merged named parameters
args:
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
model_type: model type
'''
layer_name_parts = layer_name.split(".")
mlp_layer_name = ".".join(layer_name_parts[:-1])
gate_weight, up_weight = merge_merged_weight(named_parameters, weight_name, tp_size)
gate_up_name = layer_name_parts[-1]
gate_up_list = smooth_model_config[model_type]["gate_up_list"]
gate_up_list_len = len(gate_up_list)
is_gate_up = smooth_model_config[model_type]["is_gate_up"]
if gate_up_list_len == 2:
gate_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
up_layer_name = f"{mlp_layer_name}.{gate_up_list[1]}"
elif gate_up_list_len == 1:
gate_up_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
if gate_up_list_len == 2:
merged_act_range[gate_layer_name]["x"] = layer_range["x"]
merged_act_range[up_layer_name]["x"] = layer_range["x"]
merged_act_range[gate_layer_name]["is_merge"] = True
merged_act_range[up_layer_name]["is_merge"] = True
merged_named_parameters[f"{gate_layer_name}.weight"] = gate_weight
merged_named_parameters[f"{up_layer_name}.weight"] = up_weight
elif gate_up_list_len == 1:
merged_act_range[gate_up_layer_name]["x"] = layer_range["x"]
merged_gate_up_weight_list = [gate_weight, up_weight] if is_gate_up is True else [up_weight, gate_weight]
merged_named_parameters[f"{gate_up_layer_name}.weight"] = torch.cat(merged_gate_up_weight_list, dim=0)
if bias_name in named_parameters[0]:
gate_bias, up_bias = merge_merged_weight(named_parameters, bias_name, tp_size)
if gate_up_list_len == 2:
merged_named_parameters[f"{gate_layer_name}.bias"] = gate_bias
merged_named_parameters[f"{up_layer_name}.bias"] = up_bias
elif gate_up_list_len == 1:
merged_gate_up_bias_list = [gate_bias, up_bias] if is_gate_up is True else [up_bias, gate_bias]
merged_named_parameters[f"{gate_up_layer_name}.bias"] = torch.cat(merged_gate_up_bias_list, dim=0)
return gate_up_name
def convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size):
'''
convert column parallel named parameters to non parallel named parameters
args:
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
'''
if layer_range['is_linear']:
merged_act_range[layer_name]["x"] = layer_range["x"]
merged_named_parameters[weight_name] = torch.cat(
[named_parameters[tp_id][weight_name] for tp_id in range(0, tp_size)], dim=0)
if bias_name in named_parameters[0]:
merged_named_parameters[bias_name] = torch.cat(
[named_parameters[tp_id][bias_name] for tp_id in range(0, tp_size)], dim=0)
def convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size):
'''
convert row parallel named parameters to non parallel named parameters
args:
act_layer_name: act layer name
act_range: parallel act_range
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
'''
if layer_range['is_linear']:
if isinstance(layer_range['x'], torch.Tensor):
merged_act_range[layer_name]['x'] = torch.cat(
[act_range[tp_id][act_layer_name]['x'] for tp_id in range(0, tp_size)], dim=0)
else:
merged_act_range[layer_name]['x'] = None
merged_named_parameters[weight_name] = torch.cat(
[named_parameters[tp_id][weight_name] for tp_id in range(0, tp_size)], dim=1)
if bias_name in named_parameters[0]:
merged_named_parameters[bias_name] = named_parameters[0][bias_name]
def convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size, args):
'''
convert parallel layer named parameters to non parallel layer named parameters
args:
act_layer_name: act layer name
act_range: parallel act_range
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
args: argument from main
'''
qkv_name = "qkv_proj"
gate_up_name = "gate_up_proj"
if layer_range['split'] == 'col': # col
# merge weight
if layer_range["is_qkv"]:
qkv_name = convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size,
args)
elif layer_range["is_merge"]:
gate_up_name = convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range,
tp_size, args.model_type)
else:
convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size)
else: # row
convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size)
return qkv_name, gate_up_name
def collect_moe_experts_act_range_of_layer(merged_act_range, mlp_part_name, moe_list):
'''
collect moe experts act range in the same layer
'''
experts_of_gate_up_layer = {}
experts_of_down_layer = {}
gate_up_list = moe_list["gate_up_list"]
gate_up_list_len = len(gate_up_list)
down_list = moe_list["down_list"]
gate_up_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[1]}"
gate_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[2]}" if gate_up_list_len > 2 else None
down_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{down_list[1]}"
for key, value in merged_act_range.items():
if re.search(gate_up_layer_pattern, key) or (gate_layer_pattern is not None
and re.search(gate_layer_pattern, key)):
experts_of_gate_up_layer[key] = value
if re.search(down_layer_pattern, key):
experts_of_down_layer[key] = value
return experts_of_gate_up_layer, experts_of_down_layer
def convert_moe_expert_activation_fused(experts_of_layer, merged_act_range):
'''
fuse the moe expert act range in the same layer, and assign it to those experts
'''
unfused_activation = []
for key, value in experts_of_layer.items():
if isinstance(value["x"], torch.Tensor):
unfused_activation.append(value['x'])
assert len(unfused_activation) > 0, f"unfused_activation len is zero, this is unsupported"
activation = torch.stack(unfused_activation, dim=0)
fused_activation = torch.max(activation, dim=0)[0]
for key, value in experts_of_layer.items():
if value["x"] is None or isinstance(value["x"], torch.Tensor):
value['x'] = fused_activation
def convert_moe_layer_activation_fused(merged_act_range, model_type):
'''
loop over each layer, fuse the moe expert act range within the layer, and assign it to those experts
'''
moe_list = smooth_model_config[model_type]["moe_list"]
if moe_list is None:
return
mlp_name = moe_list["gate_up_list"][0].split(".")[0]
layer = 0
while True:
mlp_part_name = rf"\.{layer}\.{mlp_name}"
experts_of_gate_up_layer, experts_of_down_layer = collect_moe_experts_act_range_of_layer(
merged_act_range, mlp_part_name, moe_list)
# if experts_of_layer is empty, layer has reached the number of moe layers, so the loop is finished
if len(experts_of_gate_up_layer) < 1 or len(experts_of_down_layer) < 1:
logger.info(f"the number of moe layers is {layer}")
break
convert_moe_expert_activation_fused(experts_of_gate_up_layer, merged_act_range)
convert_moe_expert_activation_fused(experts_of_down_layer, merged_act_range)
layer += 1
def should_include(key, parameters, exclude_names):
'''
the key should not already be present in parameters and should not match any name in exclude_names
args:
parameters: named parameters
exclude_names: list of excluded names
'''
return key not in parameters and not any(exclude_name in key for exclude_name in exclude_names)
def valid_act_range(act_layer_name, layer_range):
'''
validate act_range, mainly filtering inf, nan or zero values in the x field
args:
act_layer_name: act layer name
layer_range: act layer value
'''
act_range_x = layer_range["x"]
if act_range_x is not None and isinstance(act_range_x, torch.Tensor):
mask = torch.isinf(act_range_x) | torch.isnan(act_range_x) | (act_range_x == 0)
if torch.any(mask).item():
act_range_x[mask] = 1e-6
logger.warning(f"act_range_x in layer:{act_layer_name} has nan, inf or zero values, force to 1e-6")
def convert_to_merged(act_range, named_parameters, tp_size, args):
'''
convert parallel act_range and named parameters to non parallel format.
args:
act_range: parallel act_range
named_parameters: parallel named parameters
tp_size: tensor parallel size
args: argument
'''
model_type = args.model_type
merged_act_range = defaultdict(lambda: {"x": None, "is_qkv": False, "is_merge": False,})
merged_named_parameters = {}
input_id_list = []
exclude_names = set()
for act_layer_name, layer_range in act_range[0].items():
valid_act_range(act_layer_name, layer_range)
layer_name, weight_name, bias_name = get_layer_weight_bias_name(model_type, act_layer_name)
# when tie_word_embeddings is True, lm_head uses the embedding weight
if args.tie_word_embeddings is True and "lm_head" in layer_name:
continue
qkv_name, gate_up_name = convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name,
named_parameters, merged_named_parameters, layer_range,
merged_act_range, tp_size, args)
exclude_names.update({qkv_name, gate_up_name})
if layer_range['split'] == 'col' and layer_range["is_qkv"] and len(layer_range["input_id"]) > 0:
input_id_list = layer_range["input_id"]
if args.use_smoothquant and args.disable_fused_quantize_expert is False:
convert_moe_layer_activation_fused(merged_act_range, model_type)
merged_named_parameters.update({
key: value
for key, value in named_parameters[0].items()
if should_include(key, merged_named_parameters, exclude_names)
})
modify_layer_weight_bias_name(model_type, merged_named_parameters)
sorted_named_parameters = OrderedDict(sorted(merged_named_parameters.items(), key=lambda item: item[0]))
sorted_merged_act_range = OrderedDict(sorted(merged_act_range.items(), key=lambda item: item[0]))
return sorted_merged_act_range, sorted_named_parameters, input_id_list
def copy_files_except_extensions(input_dir, output_dir, extensions):
'''
copy files from input_dir to output_dir, skipping files whose extension is in extensions and keeping the sub-directory structure the same
args:
input_dir: input directory
output_dir: output directory
extensions: file extensions to skip when copying
'''
# walk the input directory and its sub-directories
for root, dirs, files in os.walk(input_dir):
# compute the path relative to input_dir
rel_path = os.path.relpath(root, input_dir)
if len(rel_path) > 1 and rel_path.startswith('.'):
continue
# build the destination directory path
dst_dir = os.path.join(output_dir, rel_path)
# make sure the destination directory exists
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
for file in files:
if not any(file.endswith(ext) for ext in extensions) and not file.startswith('.'):
# build the full source and destination file paths
src_file = os.path.join(root, file)
dst_file = os.path.join(dst_dir, file)
# copy the file
shutil.copy2(src_file, dst_file)
logger.info(f'Copied {src_file} to {dst_file}')
def cleanup():
'''
cleanup memory resource
'''
gc.collect()
if not current_platform.is_cpu():
torch.cuda.empty_cache()
def vllm_cleanup(llm):
"""Release occupied resources and reset parallel_state"""
del llm
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
destroy_model_parallel()
destroy_distributed_environment()
import contextlib
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
import ray
if ray.is_initialized():
ray.shutdown()
logger.info('llm and distributed env is cleanup')
def generate_datetime():
'''
generate current datetime
'''
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
return formatted_datetime
def get_hf_config_sliding_window(hf_text_config) -> Optional[int]:
"""Get the sliding window size, or None if disabled."""
# Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in
# addition to sliding window size. We check if that field is present
# and if it's False, return None.
if (hasattr(hf_text_config, "use_sliding_window")
and not hf_text_config.use_sliding_window):
return None
return getattr(hf_text_config, "sliding_window", None)
def get_skip_patterns(model_type):
"""Get the skip patterns from model config."""
config = smooth_model_config[model_type]
return config["skip_patterns"] if "skip_patterns" in config else []
def should_skip(model_type, weight_name):
"""judge if the weight should be skipped."""
skip_patterns = get_skip_patterns(model_type)
for pattern in skip_patterns:
if re.match(pattern, weight_name):
return True
return False

View File

@@ -0,0 +1,152 @@
import argparse
import torch
from torch import Tensor
import numpy as np
import logging
from vllm import LLM
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
from dump_smooth import save_weights, save_generate_weights
logger = logging.getLogger(__name__)
def merge_adjacent_low_4bit(tensor: Tensor):
"""
Pack every two adjacent elements of an int8 tensor by merging their low 4 bits into a single int8 value,
and return the packed tensor.
args:
- tensor: a torch.int8 tensor whose last dimension has an even length.
returns:
- a new tensor in which each element packs the low 4 bits of two adjacent original elements.
example:
a = torch.tensor([5, 7, 12, 3], dtype=torch.int8)  # each pair of elements will be packed
merged_tensor = merge_adjacent_low_4bit(a)
print(f"packed tensor: {merged_tensor} (as list: {merged_tensor.tolist()})")
"""
# make sure the input tensor is int8 and its last dimension has an even length
assert tensor.dtype == torch.int8, "the input tensor must be of type int8"
assert tensor.shape[-1] % 2 == 0, "the last dimension of the input tensor must have an even length"
even = np.bitwise_and(tensor[..., 0::2], 0x0F, dtype=np.int8)
odd = np.bitwise_and(tensor[..., 1::2], 0x0F, dtype=np.int8)
merged_tensor = np.bitwise_or(np.left_shift(odd, 4), even)
# merged_tensor is the new packed tensor
return merged_tensor
def cal_weightonly_weight(weight, weight_bits, qmin, qmax, has_qzeros, eps: float = 1e-8):
'''
return quantized_weight, scales, qzeros
args:
weight: weight to be quantized
weight_bits: quantized bitwidth
qmin: minimum value in quantized range
qmax: maximum value in quantized range
has_qzeros: whether to generate qzeros weight
eps: lower bound that keeps values away from zero to avoid floating point errors
'''
assert weight.numel() != 0, "weight should not be empty tensor"
assert weight.dim() == 2 or weight.dim() == 3, "Invalid dim. The dim of weight should be 2 or 3"
assert weight.dtype in [torch.float32, torch.float16, torch.bfloat16
], "Invalid datatype. Weight must be torch.float32 or torch.float16 or torch.bfloat16"
weight_scale = weight.float().abs().clamp(min=eps).max(dim=-1).values / qmax
unpacked_weight = (torch.round((weight / weight_scale[..., None]).float())).clip(min=qmin, max=qmax).to(torch.int8)
scale_quant_orig_c = weight_scale.squeeze()
if weight_bits == 4:
quantized_weight = merge_adjacent_low_4bit(unpacked_weight)
else:
quantized_weight = unpacked_weight
if has_qzeros:
qzeros = torch.zeros_like(scale_quant_orig_c, dtype=torch.int32)
else:
qzeros = None
return quantized_weight, scale_quant_orig_c, qzeros
def generate_weightonly_weight(act_range, name_parameters, args):
'''
quantize hugging face weights into weight-only quantized weights
args:
act_range: non parallel act_range
name_parameters: non parallel hugging face named parameters
args: arguments from main
'''
weightonly_weight = {}
has_qzeros = args.has_qzeros
weight_bits = 8 if args.weight_only_precision == 'int8' else 4
qmin = float(-2**(weight_bits - 1))
qmax = float(2**(weight_bits - 1) - 1)
for name, param in name_parameters.items():
if should_skip(args.model_type, name):
logger.info(f"skip {name}")
weightonly_weight[name] = param
continue
if name.endswith("bias"):
weightonly_weight[name] = param
continue
name_parts = name.split(".")
layer_name = ".".join(name_parts[:-1])
if layer_name in act_range:
qweight, scales, qzeros = cal_weightonly_weight(param, weight_bits, qmin, qmax, has_qzeros)
scales = scales.to(args.torch_scales_smooth_dtype)
weightonly_weight[f'{layer_name}.qweight'] = qweight
weightonly_weight[f'{layer_name}.scales'] = scales
if has_qzeros:
weightonly_weight[f'{layer_name}.qzeros'] = qzeros
else:
weightonly_weight[name] = param
return weightonly_weight
def generate_weights_of_weight_only(llm: LLM, args: argparse.Namespace):
'''
generate weightonly weights
args:
llm: LLM instance
args: argument from main
'''
tp_size = args.tp_size
llm.llm_engine.model_executor._run_workers("setup_smooth_hook")
llm.llm_engine.model_executor._run_workers("remove_hooks")
act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")
vllm_cleanup(llm)
cleanup()
logger.info("get act_range and named_parameters from llm finished")
merged_act_range, merged_named_parameters, _ = convert_to_merged(act_range, named_parameters, tp_size, args)
save_weights(merged_named_parameters, args)
del act_range
del named_parameters
cleanup()
logger.info("get merged_act_range and merged_named_parameters finished")
weightonly_weight = generate_weightonly_weight(merged_act_range, merged_named_parameters, args)
save_generate_weights(weightonly_weight, args)
del merged_act_range
del merged_named_parameters
cleanup()
logger.info("get weightonly_weight finished")
return weightonly_weight

View File

@@ -0,0 +1,312 @@
#!/usr/bin/env python3
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Modified version of: https://chromium.googlesource.com/chromium/tools/depot_tools.git/+/refs/heads/main/post_build_ninja_summary.py
"""Summarize the last ninja build, invoked with ninja's -C syntax.
> python3 tools/report_build_time_ninja.py -C build/..
Typical output looks like this:
```
Longest build steps for .cpp.o:
1.0 weighted s to build ...torch_bindings.cpp.o (12.4 s elapsed time)
2.0 weighted s to build ..._attn_c.dir/csrc... (23.5 s elapsed time)
2.6 weighted s to build ...torch_bindings.cpp.o (31.5 s elapsed time)
3.2 weighted s to build ...torch_bindings.cpp.o (38.5 s elapsed time)
Longest build steps for .so (linking):
0.1 weighted s to build _moe_C.abi3.so (1.0 s elapsed time)
0.5 weighted s to build ...flash_attn_c.abi3.so (1.1 s elapsed time)
6.2 weighted s to build _C.abi3.so (6.2 s elapsed time)
Longest build steps for .cu.o:
15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.6 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.7 s elapsed time)
15.5 weighted s to build ...machete_mm_... (185.6 s elapsed time)
15.5 weighted s to build ...machete_mm_... (185.9 s elapsed time)
15.5 weighted s to build ...machete_mm_... (186.2 s elapsed time)
37.4 weighted s to build ...scaled_mm_c3x.cu... (449.0 s elapsed time)
43.9 weighted s to build ...scaled_mm_c2x.cu... (527.4 s elapsed time)
344.8 weighted s to build ...attention_...cu.o (1087.2 s elapsed time)
1110.0 s weighted time (10120.4 s elapsed time sum, 9.1x parallelism)
134 build steps completed, average of 0.12/s
```
"""
import argparse
import errno
import fnmatch
import os
import sys
from collections import defaultdict
# The number of long build times to report:
long_count = 10
# The number of long times by extension to report
long_ext_count = 10
class Target:
"""Represents a single line read for a .ninja_log file."""
def __init__(self, start, end):
"""Creates a target object by passing in the start/end times in seconds
as a float."""
self.start = start
self.end = end
# A list of targets, appended to by the owner of this object.
self.targets = []
self.weighted_duration = 0.0
def Duration(self):
"""Returns the task duration in seconds as a float."""
return self.end - self.start
def SetWeightedDuration(self, weighted_duration):
"""Sets the duration, in seconds, passed in as a float."""
self.weighted_duration = weighted_duration
def WeightedDuration(self):
"""Returns the task's weighted duration in seconds as a float.
Weighted_duration takes the elapsed time of the task and divides it
by how many other tasks were running at the same time. Thus, it
represents the approximate impact of this task on the total build time,
with serialized or serializing steps typically ending up with much
longer weighted durations.
weighted_duration should always be the same or shorter than duration.
"""
# Allow for modest floating-point errors
epsilon = 0.000002
if (self.weighted_duration > self.Duration() + epsilon):
print('{} > {}?'.format(self.weighted_duration, self.Duration()))
assert (self.weighted_duration <= self.Duration() + epsilon)
return self.weighted_duration
def DescribeTargets(self):
"""Returns a printable string that summarizes the targets."""
# Some build steps generate dozens of outputs - handle them sanely.
# The max_length was chosen so that it can fit most of the long
# single-target names, while minimizing word wrapping.
result = ', '.join(self.targets)
max_length = 65
if len(result) > max_length:
result = result[:max_length] + '...'
return result
# Copied with some modifications from ninjatracing
def ReadTargets(log, show_all):
"""Reads all targets from .ninja_log file |log_file|, sorted by duration.
The result is a list of Target objects."""
header = log.readline()
assert header == '# ninja log v5\n', \
'unrecognized ninja log version {!r}'.format(header)
targets_dict = {}
last_end_seen = 0.0
for line in log:
parts = line.strip().split('\t')
if len(parts) != 5:
# If ninja.exe is rudely halted then the .ninja_log file may be
# corrupt. Silently continue.
continue
start, end, _, name, cmdhash = parts # Ignore restat.
# Convert from integral milliseconds to float seconds.
start = int(start) / 1000.0
end = int(end) / 1000.0
if not show_all and end < last_end_seen:
# An earlier time stamp means that this step is the first in a new
# build, possibly an incremental build. Throw away the previous
# data so that this new build will be displayed independently.
# This has to be done by comparing end times because records are
# written to the .ninja_log file when commands complete, so end
# times are guaranteed to be in order, but start times are not.
targets_dict = {}
target = None
if cmdhash in targets_dict:
target = targets_dict[cmdhash]
if not show_all and (target.start != start or target.end != end):
# If several builds in a row just run one or two build steps
# then the end times may not go backwards so the last build may
# not be detected as such. However in many cases there will be a
# build step repeated in the two builds and the changed
# start/stop points for that command, identified by the hash,
# can be used to detect and reset the target dictionary.
targets_dict = {}
target = None
if not target:
targets_dict[cmdhash] = target = Target(start, end)
last_end_seen = end
target.targets.append(name)
return list(targets_dict.values())
def GetExtension(target, extra_patterns):
"""Return the file extension that best represents a target.
For targets that generate multiple outputs it is important to return a
consistent 'canonical' extension. Ultimately the goal is to group build steps
by type."""
for output in target.targets:
if extra_patterns:
for fn_pattern in extra_patterns.split(';'):
if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
return fn_pattern
# Not a true extension, but a good grouping.
if output.endswith('type_mappings'):
extension = 'type_mappings'
break
# Capture two extensions if present. For example: file.javac.jar should
# be distinguished from file.interface.jar.
root, ext1 = os.path.splitext(output)
_, ext2 = os.path.splitext(root)
extension = ext2 + ext1 # Preserve the order in the file name.
if len(extension) == 0:
extension = '(no extension found)'
if ext1 in ['.pdb', '.dll', '.exe']:
extension = 'PEFile (linking)'
# Make sure that .dll and .exe are grouped together and that the
# .dll.lib files don't cause these to be listed as libraries
break
if ext1 in ['.so', '.TOC']:
extension = '.so (linking)'
# Attempt to identify linking, avoid identifying as '.TOC'
break
# Make sure .obj files don't get categorized as mojo files
if ext1 in ['.obj', '.o']:
break
# Jars are the canonical output of java targets.
if ext1 == '.jar':
break
# Normalize all mojo related outputs to 'mojo'.
if output.count('.mojom') > 0:
extension = 'mojo'
break
return extension
def SummarizeEntries(entries, extra_step_types):
"""Print a summary of the passed in list of Target objects."""
# Create a list that is in order by time stamp and has entries for the
# beginning and ending of each build step (one time stamp may have multiple
# entries due to multiple steps starting/stopping at exactly the same time).
# Iterate through this list, keeping track of which tasks are running at all
# times. At each time step calculate a running total for weighted time so
# that when each task ends its own weighted time can easily be calculated.
task_start_stop_times = []
earliest = -1
latest = 0
total_cpu_time = 0
for target in entries:
if earliest < 0 or target.start < earliest:
earliest = target.start
if target.end > latest:
latest = target.end
total_cpu_time += target.Duration()
task_start_stop_times.append((target.start, 'start', target))
task_start_stop_times.append((target.end, 'stop', target))
length = latest - earliest
weighted_total = 0.0
# Sort by the time/type records and ignore |target|
task_start_stop_times.sort(key=lambda times: times[:2])
# Now we have all task start/stop times sorted by when they happen. If a
# task starts and stops on the same time stamp then the start will come
# first because of the alphabet, which is important for making this work
# correctly.
# Track the tasks which are currently running.
running_tasks = {}
# Record the time we have processed up to so we know how to calculate time
# deltas.
last_time = task_start_stop_times[0][0]
# Track the accumulated weighted time so that it can efficiently be added
# to individual tasks.
last_weighted_time = 0.0
# Scan all start/stop events.
for event in task_start_stop_times:
time, action_name, target = event
# Accumulate weighted time up to now.
num_running = len(running_tasks)
if num_running > 0:
# Update the total weighted time up to this moment.
last_weighted_time += (time - last_time) / float(num_running)
if action_name == 'start':
# Record the total weighted task time when this task starts.
running_tasks[target] = last_weighted_time
if action_name == 'stop':
# Record the change in the total weighted task time while this task
# ran.
weighted_duration = last_weighted_time - running_tasks[target]
target.SetWeightedDuration(weighted_duration)
weighted_total += weighted_duration
del running_tasks[target]
last_time = time
assert (len(running_tasks) == 0)
# Warn if the sum of weighted times is off by more than half a second.
if abs(length - weighted_total) > 500:
print('Warning: Possible corrupt ninja log, results may be '
'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
length, weighted_total))
entries_by_ext = defaultdict(list)
for target in entries:
extension = GetExtension(target, extra_step_types)
entries_by_ext[extension].append(target)
for key, values in entries_by_ext.items():
print(' Longest build steps for {}:'.format(key))
values.sort(key=lambda x: x.WeightedDuration())
for target in values[-long_count:]:
print(
' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
format(target.WeightedDuration(), target.DescribeTargets(),
target.Duration()))
print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
'parallelism)'.format(length, total_cpu_time,
total_cpu_time * 1.0 / length))
print(' %d build steps completed, average of %1.2f/s' %
(len(entries), len(entries) / (length)))
def main():
log_file = '.ninja_log'
parser = argparse.ArgumentParser()
parser.add_argument('-C', dest='build_directory', help='Build directory.')
parser.add_argument(
'-s',
'--step-types',
help='semicolon separated fnmatch patterns for build-step grouping')
parser.add_argument('--log-file',
help="specific ninja log file to analyze.")
args, _extra_args = parser.parse_known_args()
if args.build_directory:
log_file = os.path.join(args.build_directory, log_file)
if args.log_file:
log_file = args.log_file
if args.step_types:
# Make room for the extra build types.
global long_ext_count
long_ext_count += len(args.step_types.split(';'))
try:
with open(log_file) as log:
entries = ReadTargets(log, False)
SummarizeEntries(entries, args.step_types)
except OSError:
print('Log file {!r} not found, no build summary created.'.format(
log_file))
return errno.ENOENT
if __name__ == '__main__':
sys.exit(main())

22
vllm-v0.6.2/tools/shellcheck.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
set -e
scversion="stable"
if [ -d "shellcheck-${scversion}" ]; then
export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
fi
if ! [ -x "$(command -v shellcheck)" ]; then
if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then
echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing"
exit 1
fi
# automatic local install if linux x86_64
wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
fi
# TODO - fix warnings in .buildkite/run-amd-test.sh
find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'

View File

@@ -0,0 +1,23 @@
### 1. Auto-tuning tool for max_num_seqs in non-paged mode
On the MLU370X8 platform, performance in unpaged mode can be improved by tuning `max_num_seqs`. `tune_max_num_seqs.py` searches for the best `max_num_seqs` value automatically.
- Usage example
Search for the `max_num_seqs` value that maximizes throughput under a fixed configuration; the remaining arguments are kept consistent with `benchmark_latency.py`/`benchmark_throughput.py`.
```bash
python tools/utils/tune_max_num_seqs.py --backend vllm --input-len 1024 --output-len 1024 --model /Path/to/Llama-2-70b-chat-hf/ -tp 1 --max-model-len 4096 --dtype float16 --num-prompts 10
```
Running the command above finds the optimal `max_num_seqs` value, which is then passed in as a parameter when constructing the LLM object, as sketched below.
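A minimal sketch of passing the tuned value when constructing the LLM object; the model path and the tuned number below are placeholders, not values produced by this document:
```python
from vllm import LLM

# Hypothetical values: replace with your own model path and the max_num_seqs
# value reported by tune_max_num_seqs.py.
llm = LLM(
    model="/Path/to/Llama-2-70b-chat-hf/",
    tensor_parallel_size=1,
    max_model_len=4096,
    dtype="float16",
    max_num_seqs=192,  # tuned value from the search above
)
```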
### 2. vLLM scheduling analysis helper
First set the environment variable to enable scheduler profiling: export VLLM_SCHEDULER_PROFILE=true
For offline tests, the data is saved automatically when the test finishes and the information of the requests that have run is printed.
For online tests, obtain the scheduling data as follows:
1. Start the server
2. Run the client-side test
3. As soon as the client test finishes, run python3 tools/utils/post_scheduler_view_action.py --host [server IP] --port [server port] --action save to ask the server to save the data
4. The server prints the information of the requests that have run
5. To run another client test against the same server, first run python3 tools/utils/post_scheduler_view_action.py --host [server IP] --port [server port] --action init to reset the server, then repeat steps 2, 3 and 4

View File

@@ -0,0 +1,27 @@
import argparse
import requests
""" Post a request to server, let server init/save scheduler view. """
def post_http_request(api_url: str, action: str) -> requests.Response:
headers = {"User-Agent": "Test Client"}
pload = {
"model": action,
"prompt": "",
"n": 1,
"temperature": 0.0,
"max_tokens": 16,
"stream": True,
}
response = requests.post(api_url, headers=headers, json=pload, stream=True)
return response
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=6000)
parser.add_argument("--action", type=str, default="save", choices=['init', 'save'])
args = parser.parse_args()
api_url = f"http://{args.host}:{args.port}/v1/completions"
post_http_request(api_url, f"{args.action}_scheduler_view")

View File

@@ -0,0 +1,181 @@
"""Autotune max_num_seqs paramter."""
# pylint: skip-file
import argparse
import random
from typing import Dict, Any
from tqdm import tqdm
def run_vllm(config: Dict[str, Any]) -> float:
"""Initialize and run an instance of a language model (LLM) using the
`vllm` library."""
print(f'Evaluate with max_num_seqs: {config["max_num_seqs"]}')
from vllm import LLM
llm = LLM(**config)
print(f'The num of gpu blocks is: {llm.llm_engine.cache_config.num_gpu_blocks}')
return llm.llm_engine.cache_config.num_gpu_blocks
def main(args: argparse.Namespace):
"""The entry function to tune max_num_seqs."""
print(args)
random.seed(args.seed)
config = {
'model': args.model,
'tokenizer': args.tokenizer,
'quantization': args.quantization,
'tensor_parallel_size': args.tensor_parallel_size,
'seed': args.seed,
'trust_remote_code': args.trust_remote_code,
'dtype': args.dtype,
'max_model_len': args.max_model_len,
'enforce_eager': args.enforce_eager,
'kv_cache_dtype': args.kv_cache_dtype,
'quantization_param_path': args.quantization_param_path,
'device': args.device,
'enable_prefix_caching': args.enable_prefix_caching,
'enable_chunked_prefill': args.enable_chunked_prefill,
'max_num_batched_tokens': args.max_num_batched_tokens,
'gpu_memory_utilization': args.gpu_memory_utilization,
'download_dir': args.download_dir,
'block_size': args.block_size
}
import multiprocessing
def worker_wrapper(config, output_queue):
"""Here we get the num_gpu_blocks by instantiate a llm object."""
result = run_vllm(config)
output_queue.put(result)
def get_num_gpu_blocks(cache, num_seqs) -> int:
"""Get the number of GPU blocks with parameter num_seqs."""
if num_seqs in cache:
return cache[num_seqs]
# Here since we cannot manually release the resources held by Ray and NCCL,
# we evaluate a set of parameters by launching a separate process.
config['max_num_seqs'] = num_seqs
output_queue = multiprocessing.Queue()
process = multiprocessing.Process(target=worker_wrapper,
args=(config, output_queue))
process.start()
process.join()
result = output_queue.get()
cache[num_seqs] = result
return result
def find_optimal_max_num_seqs(init=256) -> int:
"""Search th optimal max_num_seqs which maximizes
min(max_num_seqs, num_gpu_blocks)."""
# Use cache to avoid repeated evaluations.
cache = {}
# Initialize the search range.
num_blocks = get_num_gpu_blocks(cache, init)
left, right = min(num_blocks, init), max(num_blocks, init)
# Binary search.
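# Assumption: num_gpu_blocks is non-increasing as max_num_seqs grows, so
# min(max_num_seqs, num_gpu_blocks) is maximized near the point where they cross.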
while 0 < left < right:
mid = (left + right) // 2
num_blocks = get_num_gpu_blocks(cache, mid)
if num_blocks == mid:
return mid
if num_blocks > mid:
left = mid + 1
else:
right = mid - 1
left = max(min(mid, num_blocks), left)
right = min(max(mid, num_blocks), right)
left, right = max(1, left), max(1, right)
final_left = min(left, get_num_gpu_blocks(cache, left))
final_right = min(right, get_num_gpu_blocks(cache, right))
return right if final_right > final_left else left
max_num_seqs = find_optimal_max_num_seqs()
print(f'The optimal max_num_seqs is {max_num_seqs}.')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Tune max_num_seqs.")
parser.add_argument("--backend", type=str, choices=["vllm"], default="vllm")
parser.add_argument("--dataset", type=str, default=None,
help="Path to the dataset.")
parser.add_argument("--input-len", type=int, default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len", type=int, default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization', '-q',
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n", type=int, default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts", type=int, default=1000,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size", type=int, default=None,
help="Maximum batch size for HF backend.")
parser.add_argument("--block-size", type=int, default=-1)
parser.add_argument('--trust-remote-code', action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len', type=int, default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype', type=str, default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization', type=float, default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager", action="store_true",
help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype", type=str, choices=["auto", "fp8"], default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
parser.add_argument(
'--quantization-param-path', type=str, default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device", type=str, default="cuda", choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
parser.add_argument(
"--enable-prefix-caching", action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill", action='store_true',
help="enable chunked prefill for vLLM backend.")
parser.add_argument('--max-num-batched-tokens', type=int, default=None,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--download-dir', type=str, default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
cli_args = parser.parse_args()
if cli_args.tokenizer is None:
cli_args.tokenizer = cli_args.model
if cli_args.dataset is None:
assert cli_args.input_len is not None
assert cli_args.output_len is not None
else:
assert cli_args.input_len is None
main(cli_args)