forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
13
vllm-v0.6.2/tools/actionlint.sh
Executable file
@@ -0,0 +1,13 @@
#!/bin/bash

if command -v actionlint &> /dev/null; then
    actionlint "$@"
    exit 0
elif [ -x ./actionlint ]; then
    ./actionlint "$@"
    exit 0
fi

# download a binary to the current directory - v1.7.3
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
./actionlint "$@"
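
# Usage sketch (hypothetical invocation; with no paths given, actionlint
# discovers .github/workflows on its own):
#   tools/actionlint.sh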
9
vllm-v0.6.2/tools/build.property
Normal file
@@ -0,0 +1,9 @@
TORCH_MLU_OPS_VERSION=1.3.2+pt25
CATCH_VERSION=1.24.1+torch2.5.0
CNCL_VERSION=1.24.1-1
CNNL_VERSION=1.28.4-1
CNNLEXTRA_VERSION=1.12.3-1
CNTOOLKIT_VERSION=3.15.7-1
MLUOPS_VERSION=1.4.1-1
TRITON_VERSION=3.0.0+mlu1.3.1
XFORMERS_VERSION=0.0.24+mlu0.5.0.pt2.5
14
vllm-v0.6.2/tools/check_repo.sh
Normal file
@@ -0,0 +1,14 @@
#!/bin/bash
# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time)

if ! git diff --quiet; then
    echo "Repo is dirty" >&2

    exit 1
fi

if ! git describe --tags; then
    echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2

    exit 1
fi
8
vllm-v0.6.2/tools/config_env.sh
Executable file
@@ -0,0 +1,8 @@
export CN_NOTIFIER_POOL_MAX=1000
export CN_TASKTOPO_RESIDENT=0
export CNCL_STANDALONE_ENABLE=1
export CNCL_TWOSHOT_ENABLE=1
export CNPERF_DEBUG_DISABLE_CHILD_PROCESS=1
export PYTORCH_CNDEV_BASED_MLU_CHECK=1
export RAY_ROTATION_BACKUP_COUNT=10
export RAY_ROTATION_MAX_BYTES=102400
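
# Usage sketch (assumption, based on the variable names: Cambricon MLU/CNCL
# runtime knobs plus Ray log-rotation limits): source this file before
# launching vLLM, e.g. `source tools/config_env.sh`.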
31
vllm-v0.6.2/tools/mypy.sh
Executable file
@@ -0,0 +1,31 @@
#!/bin/bash

CI=${1:-0}
PYTHON_VERSION=${2:-3.9}
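
# Usage sketch (positional args as above; values are examples):
#   tools/mypy.sh        # local run (less strict)
#   tools/mypy.sh 1 3.9  # CI mode: exit on the first failing check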

if [ "$CI" -eq 1 ]; then
    set -e
fi

run_mypy() {
    echo "Running mypy on $1"
    if [ "$CI" -eq 1 ] && [ -z "$1" ]; then
        mypy --python-version "${PYTHON_VERSION}" "$@"
        return
    fi
    mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@"
}

run_mypy # Note that this is less strict than CI
run_mypy tests
run_mypy vllm/attention
run_mypy vllm/compilation
run_mypy vllm/distributed
run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy vllm/plugins
run_mypy vllm/prompt_adapter
run_mypy vllm/spec_decode
run_mypy vllm/worker
77
vllm-v0.6.2/tools/profiler/print_layerwise_table.py
Normal file
@@ -0,0 +1,77 @@
import argparse
import json
from typing import Dict

from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry
from vllm.profiler.utils import TablePrinter, indent_string


def flatten_entries(entry_cls, profile_dict: Dict):
    entries_and_depth = []

    def get_entries(node, curr_depth=0):
        entries_and_depth.append((entry_cls(**node["entry"]), curr_depth))

        for child in node["children"]:
            get_entries(
                child,
                curr_depth=curr_depth + 1,
            )

    for root in profile_dict:
        get_entries(root)

    return entries_and_depth
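

# Minimal illustration (hypothetical data) of the tree shape that
# flatten_entries consumes; real traces carry the full SummaryStatsEntry /
# ModelStatsEntry fields inside "entry":
#
#   tree = [{"entry": {...}, "children": [{"entry": {...}, "children": []}]}]
#   flatten_entries(SummaryStatsEntry, tree)
#   -> [(<root entry>, depth 0), (<child entry>, depth 1)]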


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--json-trace",
                        type=str,
                        required=True,
                        help="json trace file output by "
                        "examples/offline_profile.py")
    parser.add_argument("--phase",
                        type=str,
                        choices=["prefill", "decode_1"],
                        required=True,
                        help="The phase to print the table for.")
    parser.add_argument("--table",
                        type=str,
                        choices=["summary", "model"],
                        default="summary",
                        help="Which table to print, the summary table or the "
                        "layerwise model table")

    args = parser.parse_args()

    with open(args.json_trace) as f:
        profile_data = json.load(f)

    if args.table == "summary":
        entries_and_depths = flatten_entries(
            SummaryStatsEntry, profile_data[args.phase]["summary_stats"])
        column_widths = dict(name=80,
                             cuda_time_us=12,
                             pct_cuda_time=12,
                             invocations=15)
    elif args.table == "model":
        entries_and_depths = flatten_entries(
            ModelStatsEntry, profile_data[args.phase]["model_stats"])
        column_widths = dict(name=60,
                             cpu_time_us=12,
                             cuda_time_us=12,
                             pct_cuda_time=12,
                             trace=60)

    # indent entry names based on the depth
    entries = []
    for entry, depth in entries_and_depths:
        entry.name = indent_string(
            entry.name,
            indent=depth,
            indent_style=lambda indent: "|" + "-" * indent + " ")
        entries.append(entry)

    TablePrinter(type(entries[0]), column_widths).print_table(entries)
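
# Invocation sketch (paths are placeholders):
#   python tools/profiler/print_layerwise_table.py \
#       --json-trace profile.json --phase prefill --table summary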
522
vllm-v0.6.2/tools/profiler/visualize_layerwise_profile.py
Normal file
@@ -0,0 +1,522 @@
import argparse
import copy
import json
import math
import os
from pathlib import Path
from typing import Any, List, Optional, Tuple

import matplotlib.pyplot as plt
import pandas as pd

## JSON parsing utils ####


def largest_dist_from_leaf(node: dict, depth: int = 0):
    if len(node["children"]) == 0:
        return depth
    return max([
        largest_dist_from_leaf(child, depth=depth + 1)
        for child in node["children"]
    ])


def get_entries_at_depth(depth: int,
                         entries_and_traces: List[Tuple[Any, Any]],
                         node: dict,
                         curr_depth: int = 0,
                         trace=()):
    # assert that the query is at kernel or module level
    assert depth == -1 or depth == -2

    if curr_depth == 0 and largest_dist_from_leaf(node) <= (abs(depth) - 1):
        # The tree is not tall enough!
        entries_and_traces.append((node["entry"], trace))
        return

    if largest_dist_from_leaf(node) == (abs(depth) - 1):
        entries_and_traces.append((node["entry"], trace))

    trace = (node["entry"]["name"], ) + trace
    for child in node["children"]:
        get_entries_at_depth(depth,
                             entries_and_traces,
                             child,
                             curr_depth=curr_depth + 1,
                             trace=trace)
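
# Depth semantics (matching the --level flag below): depth == -1 collects
# leaf entries (kernels); depth == -2 collects the level just above the
# leaves (modules). Entries whose subtree is shorter than requested are
# collected where the walk starts instead.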


def fold_nodes(root: dict, nodes_to_fold: List[str]):

    stack: List[dict] = [root]
    while len(stack) != 0:
        node = stack.pop()
        if node['entry']['name'] in nodes_to_fold:
            node["children"] = []
            continue
        for child in node["children"]:
            stack.append(child)
    return root


## Operation name cleanup utils ####


def trim_string_back(string: str, width: int) -> str:
    if len(string) > width:
        offset = len(string) - width + 3
        string = string[:-offset]
        if len(string) > 3:
            string = string + "..."
    return string
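
# e.g. trim_string_back("abcdefghij", 8) -> "abcde..." (kept within width 8)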


def shorten_plot_legend_strings(legend, max_char_len: int):
    for t in legend.get_texts():
        t.set_text(
            trim_string_back(abbreviate_known_names(t.get_text()),
                             max_char_len))


def abbreviate_known_names(name: str) -> str:
    abbreviations = {
        "MergedColumnParallelLinear": "MCPLinear",
        "QKVParallelLinear": "QKVPLinear",
        "RowParallelLinear": "RPLinear",
        "weight=": "w=",
        "bfloat16": "bf16",
        "float16": "f16",
    }
    for key, value in abbreviations.items():
        name = name.replace(key, value)
    return name


def attempt_to_make_names_unique(entries_and_traces):
    names, non_unique_names = (set(), set())

    def all_the_same(items) -> bool:
        return all(i == items[0] for i in items)

    for entry, _ in entries_and_traces:
        if entry["name"] in names:
            non_unique_names.add(entry["name"])
        else:
            names.add(entry["name"])

    for name in non_unique_names:
        entries_and_traces_with_name = [(entry, trace)
                                        for entry, trace in entries_and_traces
                                        if entry["name"] == name]

        zipped_traces = list(
            zip(*[trace for _, trace in entries_and_traces_with_name]))
        first_trace_difference = next(
            (i for i, trace_eles in enumerate(zipped_traces)
             if not all_the_same(trace_eles)), None)

        if first_trace_difference is None:
            # can't create a unique name; leave the names as they are,
            # they will get aggregated by the pivot_table call
            continue

        for entry, trace in entries_and_traces_with_name:
            entry["name"] = " <- ".join((entry["name"], ) +
                                        trace[:first_trace_difference + 1])
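
# Disambiguation sketch: two entries both named "RPLinear" whose call traces
# first differ at "layers.0" vs "layers.1" become
# "RPLinear <- layers.0" and "RPLinear <- layers.1".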


## Operation grouping utils ####
'''
Group operations in the given dataframe by some high-level ops like,
- gemms
- attention
- rms_norm
etc.
'''


def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:

    def is_rms_norm(op_name: str):
        if "rms_norm_kernel" in op_name:
            return True

    def is_attention_block(op_name: str):
        if "flash_fwd" in op_name or \
           "reshape_and_cache_flash_kernel" in op_name:
            return True

    def is_quant(op_name: str):
        if "scaled_fp8_quant" in op_name or \
           "scaled_int8_quant" in op_name:
            return True

    def is_gemm_op(op_name: str):
        if is_quant(op_name):
            return False
        if "xmma_gemm" in op_name or \
           "gemv2T_kernel" in op_name or \
           "splitKreduce" in op_name or \
           "void cutlass::Kernel" in op_name or \
           "void cutlass::device_kernel" in op_name or \
           "s16816gemm" in op_name:
            return True

    def is_elementwise_op(op_name: str):
        return "elementwise_kernel" in op_name

    def is_mem_op(op_name: str):
        return "memcpy" in op_name.lower() or \
               "memset" in op_name.lower()

    def is_vocab_embedding_op(op_name: str):
        return "vocabparallelembed" in op_name.lower()

    # nccl ops
    def is_nccl_op(op_name: str):
        return "nccl" in op_name.lower()

    def is_nccl_all_reduce(op_name: str):
        return is_nccl_op(op_name) and \
            ("all_reduce" in op_name.lower() or
             "allreduce" in op_name.lower())

    def is_nccl_gather(op_name: str):
        return is_nccl_op(op_name) and \
            "gather" in op_name.lower()

    def is_nccl_broadcast(op_name: str):
        return is_nccl_op(op_name) and \
            "broadcast" in op_name.lower()

    # Reduce ops types
    def is_cross_device_reduce_1stage(op_name: str):
        return "cross_device_reduce_1stage" in op_name

    def is_cross_device_reduce_2stage(op_name: str):
        return "cross_device_reduce_2stage" in op_name

    def is_custom_ar_all_reduce(op_name: str):
        return "_C_custom_ar::all_reduce" in op_name

    def is_reduce_kernel(op_name: str):
        return "reduce_kernel" in op_name

    headers = list(trace_df)
    ops = copy.deepcopy(headers)

    attention_ops = list(filter(lambda x: is_attention_block(x), ops))
    ops = list(filter(lambda x: x not in attention_ops, ops))

    quant_ops = list(filter(lambda x: is_quant(x), ops))
    ops = list(filter(lambda x: x not in quant_ops, ops))

    gemm_ops = list(filter(lambda x: is_gemm_op(x), ops))
    ops = list(filter(lambda x: x not in gemm_ops, ops))

    rms_norm_ops = list(filter(lambda x: is_rms_norm(x), ops))
    ops = list(filter(lambda x: x not in rms_norm_ops, ops))

    vocab_embed_ops = list(filter(lambda x: is_vocab_embedding_op(x), ops))
    ops = list(filter(lambda x: x not in vocab_embed_ops, ops))

    mem_ops = list(filter(lambda x: is_mem_op(x), ops))
    ops = list(filter(lambda x: x not in mem_ops, ops))

    elementwise_ops = list(filter(lambda x: is_elementwise_op(x), ops))
    ops = list(filter(lambda x: x not in elementwise_ops, ops))

    nccl_all_reduce_ops = list(filter(lambda x: is_nccl_all_reduce(x), ops))
    ops = list(filter(lambda x: x not in nccl_all_reduce_ops, ops))

    nccl_gather_ops = list(filter(lambda x: is_nccl_gather(x), ops))
    ops = list(filter(lambda x: x not in nccl_gather_ops, ops))

    nccl_broadcast_ops = list(filter(lambda x: is_nccl_broadcast(x), ops))
    ops = list(filter(lambda x: x not in nccl_broadcast_ops, ops))

    nccl_other_ops = list(filter(lambda x: is_nccl_op(x), ops))
    ops = list(filter(lambda x: x not in nccl_other_ops, ops))

    cross_device_reduce_1stage_ops = list(
        filter(lambda x: is_cross_device_reduce_1stage(x), ops))
    ops = list(filter(lambda x: x not in cross_device_reduce_1stage_ops, ops))

    cross_device_reduce_2stage_ops = list(
        filter(lambda x: is_cross_device_reduce_2stage(x), ops))
    ops = list(filter(lambda x: x not in cross_device_reduce_2stage_ops, ops))

    custom_ar_all_reduce_ops = list(
        filter(lambda x: is_custom_ar_all_reduce(x), ops))
    ops = list(filter(lambda x: x not in custom_ar_all_reduce_ops, ops))

    reduce_kernel_ops = list(filter(lambda x: is_reduce_kernel(x), ops))
    ops = list(filter(lambda x: x not in reduce_kernel_ops, ops))

    if len(attention_ops):
        trace_df['attention'] = trace_df[attention_ops].agg("sum", axis=1)
    if len(quant_ops):
        trace_df['quant_ops'] = trace_df[quant_ops].agg("sum", axis=1)
    if len(gemm_ops):
        trace_df['gemm_ops'] = trace_df[gemm_ops].agg("sum", axis=1)
    if len(rms_norm_ops):
        trace_df['rms_norm_ops'] = trace_df[rms_norm_ops].agg("sum", axis=1)
    if len(vocab_embed_ops):
        trace_df['vocab_embed_ops'] = trace_df[vocab_embed_ops].agg("sum",
                                                                    axis=1)
    if len(mem_ops):
        trace_df['mem_ops'] = trace_df[mem_ops].agg("sum", axis=1)
    if len(elementwise_ops):
        trace_df['elementwise_ops'] = trace_df[elementwise_ops].agg("sum",
                                                                    axis=1)

    if len(nccl_all_reduce_ops):
        trace_df['nccl_all_reduce_ops'] = trace_df[nccl_all_reduce_ops].agg(
            "sum", axis=1)
    if len(nccl_gather_ops):
        trace_df['nccl_gather_ops'] = trace_df[nccl_gather_ops].agg("sum",
                                                                    axis=1)
    if len(nccl_broadcast_ops):
        trace_df['nccl_broadcast_ops'] = trace_df[nccl_broadcast_ops].agg(
            "sum", axis=1)
    if len(nccl_other_ops):
        trace_df['nccl_other_ops'] = trace_df[nccl_other_ops].agg("sum",
                                                                  axis=1)

    if len(cross_device_reduce_1stage_ops):
        trace_df['cross_device_reduce_1stage_ops'] = trace_df[
            cross_device_reduce_1stage_ops].agg("sum", axis=1)
    if len(cross_device_reduce_2stage_ops):
        trace_df['cross_device_reduce_2stage_ops'] = trace_df[
            cross_device_reduce_2stage_ops].agg("sum", axis=1)
    if len(custom_ar_all_reduce_ops):
        trace_df['custom_ar_all_reduce_ops'] = trace_df[
            custom_ar_all_reduce_ops].agg("sum", axis=1)
    if len(reduce_kernel_ops):
        trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg(
            "sum", axis=1)

    trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops +
                  vocab_embed_ops + mem_ops + elementwise_ops +
                  nccl_all_reduce_ops + nccl_gather_ops + nccl_broadcast_ops +
                  nccl_other_ops + cross_device_reduce_1stage_ops +
                  cross_device_reduce_2stage_ops + custom_ar_all_reduce_ops +
                  reduce_kernel_ops,
                  axis=1,
                  inplace=True)
    return trace_df
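
# Classification is a cascade: each predicate claims a set of columns and
# removes them from `ops`, so a kernel is counted once under the first
# matching bucket (e.g. quant kernels are explicitly excluded from the gemm
# bucket above).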


## Data plotting utils ####


def plot_trace_df(traces_df: pd.DataFrame,
                  plot_metric: str,
                  plot_title: str,
                  output: Optional[Path] = None):

    phases = traces_df['phase'].unique()
    traces_df = traces_df.pivot_table(index="phase",
                                      columns="name",
                                      values=plot_metric,
                                      aggfunc="sum")

    traces_df = group_trace_by_operations(traces_df)

    # Make the figure
    fig, ax = plt.subplots(1, figsize=(5, 8), sharex=True)

    # Draw the stacked bars
    ops = list(traces_df)
    bottom = [0] * len(phases)
    for op in ops:
        values = [traces_df[op][phase] for phase in phases]
        values = list(map(lambda x: 0.0 if math.isnan(x) else x, values))
        ax.bar(phases, values, label=op, bottom=bottom)
        bottom = [bottom[j] + values[j] for j in range(len(phases))]

    # Write the values as text on the bars
    for bar in ax.patches:
        if bar.get_height() != 0:
            ax.text(bar.get_x() + bar.get_width() / 2,
                    bar.get_height() / 2 + bar.get_y(),
                    f"{round(bar.get_height(), 2)}",
                    ha='center',
                    color='w',
                    weight='bold',
                    size=5)

    # Setup legend
    handles, labels = plt.gca().get_legend_handles_labels()
    legend = fig.legend(handles,
                        labels,
                        loc='center left',
                        bbox_to_anchor=(1, 1))
    shorten_plot_legend_strings(legend, 50)

    # Setup labels and title
    plt.setp(ax.get_xticklabels(), rotation=90)
    ax.set_ylabel(plot_metric)
    plt.suptitle(plot_title)

    plt.savefig(output, bbox_inches='tight')
    print("Created: ", output)


def main(
        json_trace: Path,
        output_directory: Path,
        depth: int,  # Fetch/Plot operations at this depth of the Json tree
        plot_metric: str,
        make_names_unique: bool,
        top_k: int,
        json_nodes_to_fold: List[str]):

    def prepare_data(profile_json: dict, step_keys: List[str]) -> pd.DataFrame:

        def get_entries_and_traces(key: str):
            entries_and_traces: List[Tuple[Any, Any]] = []
            for root in profile_json[key]["summary_stats"]:
                # Fold nodes in the traces as per user request, i.e. simply
                # make the requested nodes leaf-nodes.
                root = fold_nodes(root, json_nodes_to_fold)
                get_entries_at_depth(depth, entries_and_traces, root)
            return entries_and_traces

        def keep_only_top_entries(df: pd.DataFrame,
                                  metric: str,
                                  top_k: int = 9) -> pd.DataFrame:
            df.loc[df.nsmallest(len(df) - top_k + 1, metric).index,
                   ["name"]] = "others"
            return df

        # Get data for each key
        traces = list(map(lambda x: get_entries_and_traces(x), step_keys))

        # Attempt some cleanup
        if make_names_unique:
            for trace in traces:
                attempt_to_make_names_unique(trace)

        # To pandas dataframe
        trace_dfs = list(
            map(lambda t: pd.DataFrame([entry for entry, _ in t]).fillna(0),
                traces))

        # Respect top_k
        if top_k:
            trace_dfs = list(
                map(
                    lambda trace_df: keep_only_top_entries(
                        trace_df, "cuda_time_us", top_k), trace_dfs))

        # Fill in information about the step-keys
        for trace_df, step_key in zip(trace_dfs, step_keys):
            trace_df['phase'] = step_key

        # Combine all data frames so they can be put in a single plot
        traces_df = pd.concat(trace_dfs)

        # Add a derived metric `cuda_time_ms`
        traces_df["cuda_time_ms"] = traces_df["cuda_time_us"] / 1000
        traces_df = traces_df.fillna(0)

        return traces_df

    def make_plot_title_suffix(profile_json: dict) -> str:
        context = profile_json["context"]
        sparsity = context.get('sparsity', None)
        return (f"{context['model']}\n"
                f"Batch={context['batch_size']}, "
                f"PromptLen={context['prompt_len']}, "
                f"OutputLen={context['output_len']},"
                f"NumGpus={context['tensor_parallel_size']}"
                f"{', Sparsity ' + sparsity if sparsity else ''}")

    profile_json = None
    with open(json_trace) as f:
        profile_json = json.load(f)
    assert profile_json is not None

    # Get all `llm.generate.step()` profiles
    step_traces = list(profile_json.keys())
    assert (step_traces[0] == 'context')
    step_traces = step_traces[1:]  # have only prefill and decodes
    prefills = list(filter(lambda x: "prefill" in x, step_traces))
    all_decodes = list(filter(lambda x: "decode" in x, step_traces))
    assert len(prefills) + len(all_decodes) == len(step_traces)
    assert len(prefills) == 1

    decodes = all_decodes[::args.step_plot_interval]
    if decodes[-1] != all_decodes[-1]:
        # Always have the last decode
        decodes.append(all_decodes[-1])

    prefill_traces = prepare_data(profile_json, prefills)
    decode_traces = prepare_data(profile_json, decodes)

    plot_title_suffix = make_plot_title_suffix(profile_json)

    plot_trace_df(prefill_traces, plot_metric, "prefill " + plot_title_suffix,
                  output_directory / Path("prefill.png"))
    plot_trace_df(decode_traces, plot_metric, "decodes " + plot_title_suffix,
                  output_directory / Path("decode_steps.png"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--json-trace",
        type=str,
        required=True,
        help="json trace file output by examples/offline_profile.py")
    parser.add_argument("--output-directory",
                        type=str,
                        required=False,
                        help="Directory to output plots")
    parser.add_argument("--level",
                        type=str,
                        default="module",
                        choices=["module", "kernel"])
    parser.add_argument("--top-k",
                        type=int,
                        default=12,
                        help="Only graph the top `top_k` entries by time.")
    parser.add_argument("--fold-json-node",
                        nargs='+',
                        default=['Sampler', 'LogitsProcessor'],
                        help='Do not plot the children of these nodes. Let \
                            the node represent the aggregate of all its \
                            children')
    parser.add_argument("--plot-metric",
                        type=str,
                        default="cuda_time_ms",
                        help='Metric to plot. Some options are cuda_time_ms, \
                            pct_cuda_time')
    parser.add_argument(
        "--step-plot-interval",
        type=int,
        default=4,
        help="For every `step_plot_interval` steps, plot 1 step")

    args = parser.parse_args()

    # Prepare/Extract relevant args
    make_names_unique = False
    if args.level == "module":
        depth = -2
        make_names_unique = True
    elif args.level == "kernel":
        depth = -1
    else:
        raise Exception(f"Unexpected level value ({args.level})")

    output_directory = args.output_directory if args.output_directory else Path(
        args.json_trace).parent

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    main(Path(args.json_trace), output_directory, depth, args.plot_metric,
         make_names_unique, args.top_k, args.fold_json_node)
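
# Invocation sketch (paths are placeholders):
#   python tools/profiler/visualize_layerwise_profile.py \
#       --json-trace profile.json --output-directory plots --level module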
0
vllm-v0.6.2/tools/quant_tools/__init__.py
Normal file
419
vllm-v0.6.2/tools/quant_tools/convert_checkpoint.py
Normal file
@@ -0,0 +1,419 @@
import argparse
import os

import sys
import time
import safetensors.torch
import logging
import json
from huggingface_hub import split_torch_state_dict_into_shards, constants

from vllm import LLM
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.config import _get_and_verify_max_len
import transformers
from transformers.modeling_utils import SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

from smooth_quant import generate_weights_of_smoothquant
from weight_only import generate_weights_of_weight_only
from utils_internal import (read_model_name, load_tokenizer, torch_dtype_to_str, str_dtype_to_torch,
                            copy_files_except_extensions, generate_datetime, get_hf_config_sliding_window)
from utils_internal import get_skip_patterns, should_skip
from model_special import smooth_model_config
from vllm.engine.arg_utils import EngineArgs

sys.path.append(os.getcwd())

logger = logging.getLogger("smooth_convert")


def load_skip_params_from_hf(args):
    '''
    load parameters from transformers that do not need to be quantized.
    '''
    model_type = args.model_type
    if not get_skip_patterns(model_type):
        return {}
    try:
        model = getattr(transformers, args.model_name, None)
        if model is None:
            model = AutoModelForCausalLM
        model = model.from_pretrained(
            args.hf_model_dir,
            trust_remote_code=True,
            torch_dtype=args.torch_dtype,
            device_map="cpu")
    except Exception as e:
        logger.fatal(f"Unsupported model {args.model_name}, error message: {e}")
        sys.exit(1)

    params_map = {}
    hf_params = dict(model.named_parameters())
    for name, param in hf_params.items():
        if should_skip(model_type, name):
            logger.info(f"load parameters from transformers, name: {name}")
            params_map[name] = param
    return params_map


def save_quantized_weights_to_safetensors(quantized_weights, args):
    '''
    save quantized_weights in safetensors format
    '''
    # Store the state_dict to file.
    max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
    state_dict_split = split_torch_state_dict_into_shards(quantized_weights,
                                                          filename_pattern=constants.SAFETENSORS_WEIGHTS_FILE_PATTERN,
                                                          max_shard_size=max_shard_size)
    # Save the model
    for shard_name, tensors in state_dict_split.filename_to_tensors.items():
        shard = {tensor: quantized_weights[tensor] for tensor in tensors}
        safetensors.torch.save_file(shard, os.path.join(args.output_dir, shard_name), metadata={"format": "pt"})

    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = os.path.join(args.output_dir, SAFE_WEIGHTS_INDEX_NAME)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
        logger.info(
            f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be "
            f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where "
            f"each parameter has been saved in the index located at {save_index_file}."
        )
    else:
        logger.info(f"Model weights saved in {os.path.join(args.output_dir, SAFE_WEIGHTS_NAME)}")
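
# Sharding note: max_shard_size may be a plain digit string (bytes, per the
# isdigit() check above) or a size string like "10GB"; the huggingface_hub
# helper accepts both forms and decides whether an index file is needed.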


def main(args):
    '''
    main quantization logic
    '''
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=args.log_level,
        force=True,
    )

    tik = time.time()

    skip_params = load_skip_params_from_hf(args)
    # Create an LLM.
    max_model_len = max(args.max_input_length + args.output_len, 2048)
    args.max_model_len = min(max_model_len, args.hf_max_model_len)

    max_num_batched_tokens = max(max(args.max_input_length * args.batch_size, max_model_len), 2048)
    args.max_num_batched_tokens = min(max_num_batched_tokens, args.hf_max_model_len)
    llm = LLM(model=args.hf_model_dir,
              tokenizer=args.tokenizer_dir,
              tensor_parallel_size=args.tp_size,
              distributed_executor_backend='ray',
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              trust_remote_code=True,
              block_size=args.block_size,
              max_model_len=args.max_model_len,
              max_num_batched_tokens=args.max_num_batched_tokens,
              max_num_seqs=args.max_num_seqs,
              cpu_offload_gb=args.cpu_offload_gb)
    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))

    logger.info(f'Load vLLM model takes: {t}')

    quantize_config = {}
    if args.use_weight_only:
        st_prefix = f"weight_{args.weight_only_precision}"
        quantized_weights = generate_weights_of_weight_only(llm, args)
        quantize_config['bits'] = 8 if args.weight_only_precision == "int8" else 4
        quantize_config['quant_method'] = "weightonly"
        quantize_config['quant_mode'] = "WeightOnly"

    if args.use_smoothquant:
        st_prefix = f"smoothquant_{args.smooth_value}"
        quantized_weights, smooth_info = generate_weights_of_smoothquant(llm, args)
        quantize_config['bits'] = 8
        quantize_config['quant_method'] = "smoothquant"
        quantize_config['quant_mode'] = "SmoothQuant"
        quantize_config['input_quant_method'] = "per_token" if args.per_token else "per_tensor"
        quantize_config['smooth_value'] = args.smooth_value
        with open(os.path.join(args.output_dir, 'smooth_info.json'), 'w') as f:
            json.dump(smooth_info, f, indent=4)

    # Should first copy other files from hf_model_dir, and then save weight, tokenizer, config, quant_config and so on
    extensions = ['.bin', '.safetensors', ".pt", ".index.json"]
    copy_files_except_extensions(args.hf_model_dir, args.output_dir, extensions)
    logger.info('copying files except excluded extensions succeeded')

    for name, param in skip_params.items():
        assert name in quantized_weights
        quantized_weights[name] = param
    save_quantized_weights_to_safetensors(quantized_weights, args)
    logger.info('saving quantized_weights to safetensors succeeded')

    with open(os.path.join(args.output_dir, 'quantize_config.json'), 'w') as f:
        json.dump(quantize_config, f, indent=4)

    from transformers.utils import CONFIG_NAME
    with open(os.path.join(args.hf_model_dir, CONFIG_NAME), 'r') as f:
        config = json.load(f)
    config['quantization_config'] = quantize_config
    config['generate_datetime'] = generate_datetime()
    config['torch_dtype'] = args.dtype
    with open(os.path.join(args.output_dir, CONFIG_NAME), 'w') as f:
        json.dump(config, f, indent=4)

    logger.info(f'quantization of {args.hf_model_dir} finished')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--tokenizer_dir',
                        default=None,
                        help='tokenizer path; defaults to hf_model_dir if left unspecified')
    parser.add_argument(
        '--enforce_eager',
        action="store_true",
        default=True,
        help='Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model '
        'in eager mode. If False, we will use CUDA graph and eager execution in hybrid.')
    parser.add_argument('--dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, use the unquantized weight torch_dtype from config.json, else use the specified dtype")
    parser.add_argument('--scales_smooth_dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, scales and smooth weights use args.dtype, else use the specified dtype")
    parser.add_argument(
        '--eval_task',
        type=str,
        default='summarize',
        choices=['summarize', 'summarize_long', 'code_completion', 'summarize_hg', 'text_generation', 'custom'],
        help='''eval task to decide which dataset is selected. When set to custom, you must set the options
        dataset_name, dataset_revision, dataset_input_key, dataset_split to specify which dataset to use''')
    parser.add_argument("--dataset_cache_dir",
                        type=str,
                        default=None,
                        help="cache dir to load the hugging face dataset")
    parser.add_argument("--dataset_name", type=str, default=None, help="custom dataset name")
    parser.add_argument("--dataset_revision", type=str, default=None, help="custom dataset version")
    parser.add_argument("--dataset_input_key", type=str, default=None, help="custom dataset field")
    parser.add_argument("--dataset_split", type=str, default=None, help="custom dataset split")
    parser.add_argument('--log_level', type=int, default=logging.INFO)
    parser.add_argument('--num_samples', type=int, default=512, help='number of prompt samples')
    parser.add_argument('--output_len',
                        type=int,
                        default=100,
                        help="Number of output tokens to generate for each prompt")
    parser.add_argument('--max_input_length',
                        type=int,
                        default=512,
                        help='max input length of the prompt')
    parser.add_argument('--block_size', type=int, default=-1, help='Token block size for contiguous chunks of tokens.')
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_p', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=-1)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--max_num_seqs',
                        type=int,
                        default=EngineArgs.max_num_seqs,
                        help='Maximum number of sequences per iteration.')
    parser.add_argument('--output_dir',
                        type=str,
                        default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="10GB",
        help=("The maximum size for a checkpoint before being sharded. Checkpoint shards will then each be of a size "
              "lower than this size. If expressed as a string, it needs to be digits followed by a unit (like `5MB`)"),
    )
    parser.add_argument('--tp_size', type=int, default=1, help='N-way tensor parallelism size')
    parser.add_argument('--pp_size', type=int, default=1, help='N-way pipeline parallelism size; only 1 is currently supported')
    parser.add_argument('--use_smoothquant',
                        default=False,
                        action="store_true",
                        help='Apply smoothquant to generate weight')
    parser.add_argument("--smooth_value",
                        type=float,
                        default=0.5,
                        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
                        " to Smoothquant the model, and output int8 weights."
                        " A good first try is 0.5. Must be in [0, 1]")
    parser.add_argument('--per_channel',
                        action="store_true",
                        default=False,
                        help='By default, we use a single static scaling factor for the GEMM\'s result. '
                        'per_channel instead uses a different static scaling factor for each channel. '
                        'The latter is usually more accurate, but a little slower.')
    parser.add_argument(
        '--per_token',
        action="store_true",
        default=False,
        help='By default, we use a single static scaling factor to scale activations in the int8 range. '
        'per_token chooses at run time, and for each token, a custom scaling factor. '
        'The latter is usually more accurate, but a little slower.')
    parser.add_argument('--use_weight_only',
                        default=False,
                        action="store_true",
                        help='Quantize weights for the various GEMMs to INT4/INT8. '
                        'See --weight_only_precision to set the precision')
    parser.add_argument('--weight_only_precision',
                        const='int8',
                        type=str,
                        nargs='?',
                        default='int8',
                        choices=['int8', 'int4'],
                        help='Define the precision for the weights when using weight-only quantization. '
                        'You must also use --use_weight_only for this argument to have an impact.')
    parser.add_argument(
        '--has_qzeros',
        action="store_true",
        default=False,
        help='whether to add qzeros weight to vllm_mlu weight',
    )
    parser.add_argument('--model_version',
                        type=str,
                        default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type',
                        type=str,
                        default=None,
                        help="Set model type to replace parsing from model_type in hf config. "
                        "If unset and parsing also yields None, model_version is used.")
    parser.add_argument('--no_add_special_tokens',
                        dest='add_special_tokens',
                        default=True,
                        action='store_false',
                        help="Whether or not to add special tokens")
    parser.add_argument(
        '--has_prompt_token_id',
        action="store_true",
        default=False,
        help='whether to give llm.generate prompt_token_id',
    )
    parser.add_argument(
        '--disable_fused_quantize_expert',
        action="store_true",
        default=False,
        help='''disable fused activation-to-quantize for unfused moe usage.
        For fused_moe smoothquant, input_smooth has shape (hidden_size), act_smooth has shape (inner_size),
        and not every expert is necessarily routed to, so we assume by default that all experts share the
        same act_smooth. You can use this option to drop that assumption.'''
    )
    parser.add_argument('--prompt_file',
                        type=str,
                        default=None,
                        help="custom prompt file with one string prompt per line; "
                        "see summarize_1024_prompts.csv for the format")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=-1,
        help="batch size, used to limit max_num_batched_tokens; -1 means batch_size equals num_samples"
    )
    parser.add_argument(
        '--cpu_offload_gb',
        type=float,
        default=0.0,
        help='''The size (GiB) of CPU memory to use for offloading the model weights.
        This virtually increases the GPU memory space you can use to hold the model weights,
        at the cost of CPU-GPU data transfer for every forward pass.'''
    )
    parser.add_argument(
        '--dump_prompt_token_ids',
        action="store_true",
        default=False,
        help='dump prompt_token_ids used by llm.generate',
    )
    parser.add_argument(
        '--dump_input_ids',
        action="store_true",
        default=False,
        help='dump the token ids consumed by qkv while the llm runs',
    )
    parser.add_argument(
        '--dump_act_range',
        action="store_true",
        default=False,
        help='dump the act range, i.e. the max hidden-dim value of input, output, weight',
    )
    parser.add_argument(
        '--dump_weights',
        action="store_true",
        default=False,
        help='dump weights of the converted model',
    )
    parser.add_argument(
        '--dump_generate_weights',
        action="store_true",
        default=False,
        help='dump generated weights of the converted model',
    )

    args = parser.parse_args()

    assert args.hf_model_dir, "Please set model_dir by --model_dir or --hf_model_dir"
    assert args.pp_size == 1, "Pipeline parallelism is not supported."

    if args.tokenizer_dir is None:
        args.tokenizer_dir = args.hf_model_dir

    if args.has_prompt_token_id is False:
        args.dump_prompt_token_ids = False

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
        args.hf_model_dir, args.model_version, args.model_type)
    assert args.model_type in smooth_model_config, f'''{args.model_type} is not supported yet;
    please add its information in model_special.py yourself'''

    args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
    hf_text_config = get_hf_text_config(args.hf_config)
    args.tie_word_embeddings = getattr(hf_text_config, "tie_word_embeddings", False)
    sliding_window_len = get_hf_config_sliding_window(hf_text_config)
    disable_sliding_window = sliding_window_len is None
    if args.model_type == 'qwen2_vl':
        # workaround for qwen2_vl since _get_and_verify_max_len does not support MRoPE yet;
        # remove this when it is supported.
        args.hf_max_model_len = 32768
    else:
        if args.model_type == 'hunyuan' or args.model_type == 'deepseek_v2':
            disable_sliding_window = False
        args.hf_max_model_len = _get_and_verify_max_len(hf_text_config, None, disable_sliding_window, sliding_window_len)

    if args.batch_size < 1:
        args.batch_size = args.num_samples

    args.batch_size = min(args.batch_size, args.num_samples)
    if args.dtype == "auto":
        args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)

    if args.scales_smooth_dtype == "auto":
        args.scales_smooth_dtype = args.dtype

    args.torch_dtype = str_dtype_to_torch(args.dtype)
    args.torch_scales_smooth_dtype = str_dtype_to_torch(args.scales_smooth_dtype)
    args.hf_config.torch_dtype = args.torch_dtype

    args.tokenizer, args.pad_id, args.end_id = load_tokenizer(
        tokenizer_dir=args.tokenizer_dir,
        model_name=args.model_name,
        model_version=args.model_version,
    )

    tik = time.time()
    main(args)

    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    logger.info(f'Total time of converting checkpoints: {t}')
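
# Invocation sketch (paths/values are placeholders):
#   python tools/quant_tools/convert_checkpoint.py \
#       --hf_model_dir /path/to/hf_model --output_dir /path/to/out \
#       --use_smoothquant --smooth_value 0.5 --tp_size 1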
69
vllm-v0.6.2/tools/quant_tools/dump_hf_weight.py
Normal file
@@ -0,0 +1,69 @@
import os
import argparse
from transformers import (AutoModel, AutoModelForCausalLM,
                          AutoModelForSeq2SeqLM, GenerationConfig)

from vllm.transformers_utils.config import get_config
from utils_internal import (read_model_name, torch_dtype_to_str, str_dtype_to_torch)
from dump_smooth import save_weights


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--hf_model_dir', type=str, default=None)
    parser.add_argument('--output_dir',
                        type=str,
                        default="output_dir",
                        help="The path to save the quantized checkpoint")
    parser.add_argument('--model_version',
                        type=str,
                        default=None,
                        help="Set model version to replace parsing from _name_or_path in hf config.")
    parser.add_argument('--model_type',
                        type=str,
                        default=None,
                        help="Set model type to replace parsing from model_type in hf config. "
                        "If unset and parsing also yields None, model_version is used.")
    parser.add_argument('--dtype',
                        type=str,
                        choices=['auto', 'float32', 'float16', 'bfloat16'],
                        default='auto',
                        help="if auto, use the unquantized weight torch_dtype from config.json, else use the specified dtype")
    parser.add_argument(
        '--dump_weights',
        action="store_true",
        default=True,
        help='dump weights of the converted model',
    )

    args = parser.parse_args()

    assert args.hf_model_dir, "Please set model_dir by --model_dir or --hf_model_dir"

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
        args.hf_model_dir, args.model_version, args.model_type)

    args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)

    if args.dtype == "auto":
        args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)

    args.torch_dtype = str_dtype_to_torch(args.dtype)
    args.hf_config.torch_dtype = args.torch_dtype

    if args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'glm':
        auto_model_cls = AutoModelForSeq2SeqLM
    elif args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'chatglm':
        auto_model_cls = AutoModel
    else:
        auto_model_cls = AutoModelForCausalLM
    model = auto_model_cls.from_pretrained(
        args.hf_model_dir,
        trust_remote_code=True,
        torch_dtype=args.torch_dtype)

    named_parameters = dict(model.named_parameters())
    save_weights(named_parameters, args)
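
# Invocation sketch (path is a placeholder):
#   python tools/quant_tools/dump_hf_weight.py \
#       --hf_model_dir /path/to/hf_model --output_dir dump_out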
145
vllm-v0.6.2/tools/quant_tools/dump_smooth.py
Normal file
@@ -0,0 +1,145 @@
import torch
import os
import logging

logger = logging.getLogger(__name__)


def tensor_shape_to_string(tensor):
    '''
    convert a tensor shape to a string description
    '''
    int_list = list(tensor.shape)
    str_list = [str(num) for num in int_list]
    str_shape = "x".join(str_list)
    return str_shape
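
# e.g. tensor_shape_to_string(torch.zeros(2, 3)) -> "2x3"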


def save_prompt_token_ids(prompt_input_ids, args):
    '''
    save prompt_token_ids
    Args:
        prompt_input_ids: prompt input_ids assigned to llm.generate
        args: arguments from main
    '''
    if args.dump_prompt_token_ids is not True:
        return
    output_dir = os.path.join(args.output_dir, "prompt_input_ids")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    data_len = len(prompt_input_ids)
    for data_index in range(data_len):
        tensor = prompt_input_ids[data_index]
        str_shape = tensor_shape_to_string(tensor)
        file_path = os.path.join(output_dir, f"prompt_input_ids_{data_index}_{str_shape}.pt")
        torch.save(tensor, file_path)
        logger.info(f"Saved input_ids[{data_index}] to {file_path}")


def save_input_ids(input_ids, args):
    '''
    save input_ids
    Args:
        input_ids: inputs to qkv at layer 0
        args: arguments from main
    '''
    id_len = len(input_ids)
    if args.dump_input_ids is not True or id_len == 0:
        return
    output_dir = os.path.join(args.output_dir, "input_ids")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for data_index in range(id_len):
        tensor = input_ids[data_index]
        str_shape = tensor_shape_to_string(tensor)
        file_path = os.path.join(output_dir, f"input_ids_{data_index}_{str_shape}.pt")
        torch.save(tensor, file_path)
        logger.info(f"Saved input_ids[{data_index}] to {file_path}")


def save_act_range(act_range, args):
    '''
    save act_range
    Args:
        act_range: activation ranges collected while the model runs
        args: arguments from main
    '''
    if args.dump_act_range is not True:
        return
    output_dir = os.path.join(args.output_dir, "act_range")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for layer_name, layer_scale in act_range.items():
        for tensor_key, tensor_value in layer_scale.items():
            if isinstance(tensor_value, torch.Tensor):
                str_shape = tensor_shape_to_string(tensor_value)
                file_name = f'{layer_name}_{tensor_key}_{str_shape}.pt'
                file_path = os.path.join(output_dir, file_name)
                torch.save(tensor_value, file_path)
                logger.info(f"Saved act_range[{layer_name}][{tensor_key}] to {file_path}")


def save_weights(weights, args):
    '''
    save Hugging Face weights
    Args:
        weights: Hugging Face weights merged with the llm model's named parameters
        args: arguments from main
    '''
    if args.dump_weights is not True:
        return
    output_dir = os.path.join(args.output_dir, "weights")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for tensor_key, tensor_value in weights.items():
        str_shape = tensor_shape_to_string(tensor_value)
        file_name = f'{tensor_key}_{str_shape}.pt'
        file_path = os.path.join(output_dir, file_name)
        torch.save(tensor_value, file_path)
        logger.info(f"Saved weights[{tensor_key}] to {file_path}")


def save_generate_weights(weights, args):
    '''
    save quantized weights
    Args:
        weights: quantized weights from smoothquant or weight-only
        args: arguments from main
    '''
    if args.dump_generate_weights is not True:
        return
    output_dir = os.path.join(args.output_dir, "generate_weights")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for tensor_key, tensor_value in weights.items():
        str_shape = tensor_shape_to_string(tensor_value)
        file_name = f'{tensor_key}_{str_shape}.pt'
        file_path = os.path.join(output_dir, file_name)
        torch.save(tensor_value, file_path)
        logger.info(f"Saved generate weights[{tensor_key}] to {file_path}")


def dump_save_x_y(name, x, y, index):
    '''
    dump x, y during inference
    adjust output_dir yourself as needed
    '''
    output_dir = "output_dir"
    x_output_dir = os.path.join(output_dir, "x_tensor")
    y_output_dir = os.path.join(output_dir, "y_tensor")
    if not os.path.exists(x_output_dir):
        os.makedirs(x_output_dir)
    if not os.path.exists(y_output_dir):
        os.makedirs(y_output_dir)

    x_file_name = os.path.join(x_output_dir, f"{name}_x_{index}.pt")
    y_file_name = os.path.join(y_output_dir, f"{name}_y_{index}.pt")
    if isinstance(x, tuple):
        x = x[0]
    if not os.path.exists(x_file_name):
        torch.save(x.cpu(), x_file_name)
    if not os.path.exists(y_file_name):
        torch.save(y.cpu(), y_file_name)
140
vllm-v0.6.2/tools/quant_tools/input_context.py
Normal file
@@ -0,0 +1,140 @@
import torch


def make_context(
    tokenizer,
    query,
    history,
    system,
    max_input_length,
    max_window_size: int = 6144,
    chat_format: str = "chatml",
):
    '''
    tokenize one text context into token ids
    args:
        tokenizer: model tokenizer
        query: current text context
        history: history text context
        system: system prompt
        max_input_length: max length of the tokenized ids
        chat_format: chat format; only "chatml" and "raw" are accepted
    '''
    if history is None:
        history = []

    if chat_format == "chatml":
        im_start, im_end = "<|im_start|>", "<|im_end|>"
        im_start_tokens = [tokenizer.im_start_id]
        im_end_tokens = [tokenizer.im_end_id]
        nl_tokens = tokenizer.encode("\n")

        def _tokenize_str(role, content):
            '''
            tokenize a string
            '''
            return (f"{role}\n{content}", tokenizer.encode(
                role,
                allowed_special=set(),
            ) + nl_tokens + tokenizer.encode(
                content,
                allowed_special=set(),
            ))

        system_text, system_tokens_part = _tokenize_str("system", system)
        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
        raw_text = ""
        context_tokens = []

        for turn_query, turn_response in reversed(history):
            query_text, query_tokens_part = _tokenize_str("user", turn_query)
            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens

            response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
            prev_chat = (f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}")

            current_context_size = (len(system_tokens) + len(next_context_tokens) + len(context_tokens))
            if current_context_size < max_window_size:
                context_tokens = next_context_tokens + context_tokens
                raw_text = prev_chat + raw_text
            else:
                break

        context_tokens = system_tokens + context_tokens
        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
        context_tokens += (nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens +
                           im_start_tokens + tokenizer.encode("assistant") + nl_tokens)
        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"

    elif chat_format == "raw":
        raw_text = query
        context_tokens = tokenizer.encode(raw_text)
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")
    # truncate to max_input_length, truncate from the front
    return raw_text, context_tokens[-max_input_length:]
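

# Resulting chatml layout (sketch; history turns are prepended similarly):
#   <|im_start|>system\n{system}<|im_end|>
#   \n<|im_start|>user\n{query}<|im_end|>
#   \n<|im_start|>assistant\n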


def prepare_inputs(batch_input_texts,
                   tokenizer,
                   model_name,
                   model_version,
                   test_token_num,
                   eval_task='summarize',
                   add_special_tokens=True):
    '''
    tokenize batch input texts into token ids.
    args:
        batch_input_texts: batch input text, also known as the batched prompt
        tokenizer: model tokenizer
        model_name: model name
        model_version: model version
        test_token_num: maximum number of prompt tokens kept per input
        eval_task: eval task
        add_special_tokens: whether to add special tokens, default True
    '''
    batch_size = len(batch_input_texts)
    append_str = ' TL;DR: ' if eval_task == 'summarize' else ''
    batch_input_ids = []
    for i in range(batch_size):
        curr_text = batch_input_texts[i] + append_str
        curr_text = curr_text.strip().replace(" n't", "n't")

        # The lines below keep compatibility with the original code
        if 'GLM' in model_name and model_version in ['chatglm2', 'chatglm3']:
            input_ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0)
            input_ids = input_ids[:test_token_num]
        elif 'qwen' in model_name.lower() and model_version == 'qwen':
            # use make_context to generate the prompt
            system_prompt = "You are a useful assistant, please directly output the corresponding " + \
                            "summary according to the article entered by the user."
            _, input_id_list = make_context(
                tokenizer=tokenizer,
                query=curr_text,
                history=[],
                system=system_prompt,
                max_input_length=test_token_num,
            )
            input_ids = torch.tensor(input_id_list)
        else:
            if 'qwen' in model_name.lower() and 'qwen2' in model_version:
                messages = [{
                    "role": "system",
                    "content": "You are a helpful assistant."
                }, {
                    "role": "user",
                    "content": curr_text
                }]
                curr_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True)

            input_ids = tokenizer.encode(curr_text,
                                         return_tensors='pt',
                                         add_special_tokens=add_special_tokens,
                                         truncation=True,
                                         max_length=test_token_num).squeeze(0)

        batch_input_ids.append(input_ids)
    return batch_input_ids
206
vllm-v0.6.2/tools/quant_tools/model_special.py
Executable file
@@ -0,0 +1,206 @@
import re

# model_type -> qkv_list, gate_up_list, is_gate_up, moe_list
# (plus optional skip_patterns)
smooth_model_config = {
    "mllama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "llama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen2_vl": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None,
        "skip_patterns": [r"^visual\.*"]
    },
    "qwen2": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen": {
        "qkv_list": ["c_attn"],
        "gate_up_list": ["w2", "w1"],
        "is_gate_up": True,
        "moe_list": None
    },
    "baichuan": {
        "qkv_list": ["W_pack"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "chatglm": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "gpt_neox": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": [],
        "is_gate_up": True,
        "moe_list": None
    },
    "mixtral": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["w1", "w3"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
            "down_list": ["block_sparse_moe.w2", "w2"],
            "is_merged": True
        }
    },
    "qwen2_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "deepseek_v2": {
        "qkv_list": ["q_proj", "q_b_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        },
        "skip_patterns": [r".*\.kv_b_proj\..*",]
    },
    "falcon": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "bloom": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": False,
        "moe_list": None
    },
    "internlm2": {
        "qkv_list": ["wqkv"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "hunyuan": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "phi3": {
        "qkv_list": ["qkv_proj"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
}
|
||||
|
||||
|
||||
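# Illustrative lookup only (not part of the original file): how a consumer of
# smooth_model_config resolves the merged-layer naming for a model type.
#
#   cfg = smooth_model_config["qwen2"]
#   cfg["qkv_list"]      # -> ["q_proj", "k_proj", "v_proj"]: separate q/k/v projections
#   cfg["gate_up_list"]  # -> ["gate_proj", "up_proj"]: MLP halves merged by vLLM
#   cfg["is_gate_up"]    # -> True: gate comes first in the merged gate_up weight
#   cfg.get("moe_list")  # -> None: dense qwen2 has no per-expert MoE weights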
def get_layer_weight_bias_name(model_type, layer_name):
    '''
    Specially adjust the condition where layer_name and the weight/bias name differ,
    or where the weight/bias name is not {layer_name}.weight/bias, such as:
        if model_type == "chatglm" and "output_layer" in layer_name:
            layer_name = "lm_head"
            weight_name = f"{layer_name}_weight"
            bias_name = f"{layer_name}_bias"
    Since vllm 0.5.3, vllm has obeyed this rule, so no special layer needs to be modified.
    '''
    weight_name = None
    bias_name = None

    # layers which need to be modified can be listed here
    if model_type == "hunyuan" and "lm_head" in layer_name:
        layer_name = "model.embed_tokens"
        weight_name = "model.embed_tokens.weight"
        bias_name = "model.embed_tokens.bias"

    if weight_name is None:
        weight_name = f"{layer_name}.weight"
    if bias_name is None:
        bias_name = f"{layer_name}.bias"

    return layer_name, weight_name, bias_name


def modify_layer_weight_bias_name(model_type, named_parameters):
    '''
    modify the special condition where a vllm layer name is not the same as the HF layer name
    '''
    # Mapping for model-type-specific adjustments
    mapping = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }

    if model_type in mapping:
        for old_key, new_key in mapping[model_type].items():
            if old_key in named_parameters:
                named_parameters[new_key] = named_parameters.pop(old_key)


def extract_numbers(string):
    '''
    extract the last number contained in a string (0 if none)
    '''
    # Use a regular expression to find all numeric parts in the string
    matches = re.findall(r'\d+', string)

    # Convert all matched numeric parts to integers
    numbers = [int(match) for match in matches]

    return numbers[-1] if len(numbers) > 0 else 0
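# Illustrative behaviour only (not part of the original file):
#
#   extract_numbers("chatglm3")  # -> 3 (the last number in the string)
#   extract_numbers("falcon")    # -> 0 (no digits present)
#
# get_qkv_distribution below uses this to treat "chatglm" (no digits, i.e. v1)
# differently from "chatglm2"/"chatglm3".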
def get_qkv_distribution(model_type, model_version, hf_config):
    '''
    Get the qkv distribution: n3sh or 3nsh
    n3sh: [head_num, 3, head_size, hidden_size]
    3nsh: [3, head_num, head_size, hidden_size]
    vllm's default qkv distribution is 3nsh, so the n3sh model info must be provided here;
    the tools will convert 3nsh to n3sh to match the hugging face qkv distribution.
    This is only for packed qkv layers whose distribution is n3sh.
    '''
    is_n3sh = False
    head_num = 0
    kv_head_num = 0
    if (model_type == "chatglm" and extract_numbers(model_version) == 0) or model_type in ["bloom", "gpt_neox"]:
        is_n3sh = True
        head_num = hf_config.num_attention_heads
        kv_head_num = head_num
    if model_type == "falcon":
        is_n3sh = True
        head_num = hf_config.num_attention_heads
        if hf_config.new_decoder_architecture:
            kv_head_num = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_head_num = 1
        else:
            kv_head_num = head_num

    return is_n3sh, head_num, kv_head_num
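# Layout sketch (illustrative, not part of the original file). For a packed qkv
# weight with head_num = 4, head size s and hidden size h, the row blocks are:
#
#   3nsh (vllm default):    [q0 q1 q2 q3 | k0 k1 k2 k3 | v0 v1 v2 v3]
#   n3sh (HF, e.g. bloom):  [q0 k0 v0 | q1 k1 v1 | q2 k2 v2 | q3 k3 v3]
#
# convert_packed_qkv in utils_internal.py performs the 3nsh -> n3sh interleave
# for the model types this function flags with is_n3sh = True.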
418
vllm-v0.6.2/tools/quant_tools/smooth_quant.py
Normal file
@@ -0,0 +1,418 @@
import argparse
import torch
from datasets import load_dataset
import logging
import csv
import os

from vllm import LLM, SamplingParams

from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip

from input_context import prepare_inputs

from dump_smooth import save_prompt_token_ids, save_input_ids, save_act_range, save_weights, save_generate_weights

from model_special import smooth_model_config


logger = logging.getLogger(__name__)


def load_prompts_from_csv(args):
    '''
    load prompts from a csv file
    '''
    if args.prompt_file is not None:
        prompt_file = args.prompt_file
    else:
        current_dir = os.path.dirname(__file__)
        prompt_file = os.path.join(current_dir, 'summarize_1024_prompts.csv')

    # Load the data from the CSV file as a list
    loaded_prompts = []

    # Read the column-oriented CSV file and convert it to list form
    with open(prompt_file, 'r', newline='') as file:
        reader = csv.reader(file)
        loaded_prompts = list(zip(*reader))[0]

    loaded_prompts = list(loaded_prompts)
    num_samples = min(args.num_samples, len(loaded_prompts))

    prompts = loaded_prompts[0:num_samples]

    return prompts


def save_summarize_1024_prompts_as_csv(prompts):
    '''
    save the summarize 1024 prompts
    '''
    # Save the list data to a column-oriented CSV file
    # Transpose the list
    transposed_prompts = [prompts]
    with open('summarize_1024_prompts.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(zip(*transposed_prompts))
def generate_prompts(args: argparse.Namespace):
    '''
    Generate prompts based on the evaluation task and arguments.
    '''

    eval_task_config = {
        "code_completion": {
            "dataset_name": "openai_humaneval",
            "dataset_revision": None,
            "dataset_input_key": "prompt",
            "dataset_split": "test"
        },
        "summarize": {
            "dataset_name": "ccdv/cnn_dailymail",
            "dataset_revision": "3.0.0",
            "dataset_input_key": "article",
            "dataset_split": "train"
        },
        "summarize_long": {
            "dataset_name": "tau/zero_scrolls",
            "dataset_revision": "squality",
            "dataset_input_key": "input",
            "dataset_split": "validation"
        },
        "summarize_hg": {
            "dataset_name": "cnn_dailymail",
            "dataset_revision": "3.0.0",
            "dataset_input_key": "article",
            "dataset_split": "validation"
        },
        "text_generation": {
            "dataset_name": "lambada",
            "dataset_revision": None,
            "dataset_input_key": "text",
            "dataset_split": "validation"
        }
    }

    if args.eval_task in eval_task_config:
        config = eval_task_config[args.eval_task]
        dataset_name = config["dataset_name"]
        dataset_revision = config["dataset_revision"]
        dataset_input_key = config["dataset_input_key"]
        dataset_split = config["dataset_split"]
    else:
        assert args.dataset_name is not None, "dataset_name is None when eval_task == custom"
        assert args.dataset_input_key is not None, "dataset_input_key is None when eval_task == custom"
        assert args.dataset_split is not None, "dataset_split is None when eval_task == custom"

        dataset_name = args.dataset_name
        dataset_revision = args.dataset_revision
        dataset_input_key = args.dataset_input_key
        dataset_split = args.dataset_split

    if args.prompt_file is not None or (args.eval_task == "summarize" and args.num_samples <= 1024):
        prompts = load_prompts_from_csv(args)
        num_samples = min(args.num_samples, len(prompts))
    else:
        dataset = load_dataset(dataset_name,
                               dataset_revision,
                               cache_dir=args.dataset_cache_dir,
                               split=dataset_split,
                               trust_remote_code=True)
        num_samples = min(args.num_samples, len(dataset))
        prompts = dataset[0:num_samples][dataset_input_key]
        # save_summarize_1024_prompts_as_csv(prompts)

    prompt_token_ids = []
    if args.has_prompt_token_id:
        batch_input_ids = prepare_inputs(prompts,
                                         args.tokenizer,
                                         args.model_name,
                                         args.model_version,
                                         args.max_input_length,
                                         eval_task=args.eval_task,
                                         add_special_tokens=args.add_special_tokens)
        save_prompt_token_ids(batch_input_ids, args)
        for i in range(num_samples):
            prompt_token_ids.append(batch_input_ids[i].tolist())

    if len(prompts) == 0:
        prompts = None
    else:
        prompts = [s[:args.max_input_length] for s in prompts]

    if len(prompt_token_ids) == 0:
        prompt_token_ids = None

    return prompts, prompt_token_ids
@torch.no_grad()
def get_smooth_cal_weight(name, weight, name_parameters, act_range, model_type):
    '''
    get the cal_weight for the smoothing step, handling the case where q/k/v and
    gate/up layers are merged in vllm
    args:
        name: weight name
        weight: weight value
        name_parameters: named parameters
        act_range: act range info of the layer given by name
        model_type: model type
    '''
    if act_range["is_qkv"] is True:
        name_parts = name.split(".")
        self_attn_layer_name = ".".join(name_parts[:-2])
        qkv_list = smooth_model_config[model_type]["qkv_list"]
        q_weight_name = f"{self_attn_layer_name}.{qkv_list[0]}.weight"
        k_weight_name = f"{self_attn_layer_name}.{qkv_list[1]}.weight"
        v_weight_name = f"{self_attn_layer_name}.{qkv_list[2]}.weight"
        q_weight = name_parameters[q_weight_name]
        k_weight = name_parameters[k_weight_name]
        v_weight = name_parameters[v_weight_name]
        cal_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
    elif act_range["is_merge"] is True:
        name_parts = name.split(".")
        mlp_layer_name = ".".join(name_parts[:-2])
        gate_up_list = smooth_model_config[model_type]["gate_up_list"]
        gate_weight_name = f"{mlp_layer_name}.{gate_up_list[0]}.weight"
        up_weight_name = f"{mlp_layer_name}.{gate_up_list[1]}.weight"
        gate_weight = name_parameters[gate_weight_name]
        up_weight = name_parameters[up_weight_name]
        cal_weight = torch.cat([gate_weight, up_weight], dim=0)
    else:
        cal_weight = weight

    return cal_weight
@torch.no_grad()
def cal_smoother(weight, act_range_x, alpha=0.5):
    '''
    calculate the smoother value
    args:
        weight: smoother weight
        act_range_x: per-channel activation max values
        alpha: smoothing factor, default 0.5
    '''
    assert weight.shape[-1] == act_range_x.numel()
    weight_scales = weight.view(-1, weight.shape[-1])
    weight_scales = weight_scales.abs().max(dim=0)[0]
    weight_scales = weight_scales.to(float).clamp(min=1e-6)
    smoother = (act_range_x.to(weight_scales.device).to(float).pow(alpha) /
                weight_scales.pow(1 - alpha)).clamp(min=1e-6)

    return smoother
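# What cal_smoother computes, written out (illustrative, not part of the
# original file): the standard SmoothQuant per-channel factor
#
#     s_j = max|X_j|^alpha / max|W_j|^(1 - alpha)
#
# With alpha = 0.5, a channel whose activation max is 16.0 and whose weight
# column max is 0.25 gets s = 16.0**0.5 / 0.25**0.5 = 8.0; activations are then
# divided by s and the weight column multiplied by s, shifting quantization
# difficulty from activations into weights.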
@torch.no_grad()
def cal_qweight_scales(sweight, smooth_act_range_x, per_token, per_channel):
    '''
    calculate the quantized weight and scales
    args:
        sweight: weight which has been divided by the smoother value
        smooth_act_range_x: activation max values which have been divided by the smoother value
        per_token: bool, whether to calculate the weight and scales dynamically
        per_channel: bool, whether to calculate the weight and scales per channel
    '''
    scale_x_quant_orig_t = smooth_act_range_x.max() / 127.0
    smooth_act_range_w = sweight.abs().max(dim=-1)[0]
    smooth_act_range_w = smooth_act_range_w.to(float).clamp(min=1e-6)
    scale_w_quant_orig_c = smooth_act_range_w / 127.0
    scale_w_quant_orig_t = smooth_act_range_w.max() / 127

    if per_channel:
        qweight = (sweight / scale_w_quant_orig_c[..., None])
    else:
        qweight = (sweight / scale_w_quant_orig_t)

    qweight = qweight.clip(-128, 127).to(torch.int8)

    scale_to_int = 1 / scale_x_quant_orig_t

    if per_token:
        if per_channel:
            per_channel_scale = scale_w_quant_orig_c
        else:
            per_channel_scale = scale_w_quant_orig_t
    else:
        if per_channel:
            per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_c
            hidden_size = smooth_act_range_x.numel()
            scale_to_int = scale_to_int.repeat(hidden_size)
        else:
            per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_t

    per_channel_scale = per_channel_scale.squeeze()
    if per_channel_scale.numel() == 1 and per_channel_scale.dim() == 0:
        per_channel_scale = per_channel_scale.unsqueeze(0)

    if scale_to_int.numel() == 1 and scale_to_int.dim() == 0:
        scale_to_int = scale_to_int.unsqueeze(0)

    sinfo = [
        scale_w_quant_orig_t.item(), scale_x_quant_orig_t.item(),
        scale_w_quant_orig_t.item() / scale_x_quant_orig_t.item()
    ]
    return qweight, per_channel_scale, scale_to_int, sinfo
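# Illustrative round trip (not part of the original file): per-channel int8
# quantization as used above. For a weight row whose absolute max is 0.254,
# scale_w = 0.254 / 127 = 0.002; a value 0.1 quantizes to round(0.1 / 0.002) = 50
# and dequantizes back to 50 * 0.002 = 0.1, so the int8 grid step is one scale unit.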
def check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int):
    '''
    check whether nan/inf appears in qweight, per_channel_scale, smooth, qzeros, scale_to_int
    '''
    if torch.isinf(qweight).any() or torch.isnan(qweight).any():
        logger.error(f"name:{name} qweight has inf or nan")
    if torch.isinf(per_channel_scale).any() or torch.isnan(per_channel_scale).any():
        logger.error(f"name:{name} per_channel_scale has inf or nan")
    if torch.isinf(smooth).any() or torch.isnan(smooth).any():
        logger.error(f"name:{name} smooth has inf or nan")
    if torch.isinf(scale_to_int).any() or torch.isnan(scale_to_int).any():
        logger.error(f"name:{name} scale_to_int has inf or nan")
    if qzeros is not None and (torch.isinf(qzeros).any() or torch.isnan(qzeros).any()):
        logger.error(f"name:{name} qzeros has inf or nan")
@torch.no_grad()
def cal_smooth_weight(name, act_range_x, weight, smooth_value, has_qzeros, per_token, per_channel, cal_weight):
    '''
    calculate qweight, scales, smooth, qzeros
    args:
        name: weight name
        act_range_x: per-channel activation max values
        weight: weight to be quantized
        smooth_value: smoothing factor
        has_qzeros: whether to generate the qzeros weight
        per_token: bool, whether to calculate the weight and scales dynamically
        per_channel: bool, whether to calculate the weight and scales per channel
        cal_weight: weight used to calculate the smoother (may be a merged q/k/v or gate/up weight)
    '''
    smoother = cal_smoother(cal_weight, act_range_x, smooth_value)
    smooth_act_range_x = act_range_x / smoother
    sweight = weight * (smoother.view(1, -1))
    qweight, per_channel_scale, scale_to_int, sinfo = cal_qweight_scales(sweight, smooth_act_range_x, per_token,
                                                                         per_channel)
    qweight = qweight.reshape(weight.shape)
    smooth = 1 / smoother
    smooth = smooth.squeeze()
    if has_qzeros:
        qzeros = torch.zeros_like(per_channel_scale, dtype=torch.int32)
    else:
        qzeros = None

    # check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int)

    return qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo
@torch.no_grad()
def generate_smooth_weight(act_range, name_parameters, args):
    '''
    generate smooth weights
    args:
        act_range: act_range collected while running the model
        name_parameters: hugging face model named parameters
        args: argument from main
    '''
    smooth_weight = {}
    smooth_info = {}
    has_qzeros = args.has_qzeros
    smooth_value = args.smooth_value

    smooth_info["title"] = ["max_scale_w, max_scale_x, max_scale_w/max_scale_x"]

    for name, param in name_parameters.items():
        if should_skip(args.model_type, name):
            logger.info(f"skip {name}")
            smooth_weight[name] = param
            continue
        if name.endswith("bias"):
            smooth_weight[name] = param
            continue
        name_parts = name.split(".")
        layer_name = ".".join(name_parts[:-1])
        if layer_name in act_range:
            act_range_x = act_range[layer_name]['x']
            cal_weight = get_smooth_cal_weight(name, param, name_parameters, act_range[layer_name], args.model_type)
            qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo = cal_smooth_weight(
                name, act_range_x, param, smooth_value, has_qzeros, args.per_token, args.per_channel, cal_weight)

            per_channel_scale = per_channel_scale.to(args.torch_scales_smooth_dtype)
            smooth = smooth.to(args.torch_scales_smooth_dtype)
            scale_to_int = scale_to_int.to(args.torch_scales_smooth_dtype)

            smooth_weight[f'{layer_name}.qweight'] = qweight
            smooth_weight[f'{layer_name}.per_channel_scale'] = per_channel_scale

            if args.per_token is True:
                smooth_weight[f'{layer_name}.smooth'] = smooth
            else:
                scale_to_int = scale_to_int * smooth
                smooth_weight[f'{layer_name}.scale_to_int'] = scale_to_int

            if has_qzeros:
                smooth_weight[f'{layer_name}.qzeros'] = qzeros

            smooth_info[name] = sinfo
        else:
            smooth_weight[name] = param

    return smooth_weight, smooth_info
def generate_weights_of_smoothquant(llm: LLM, args: argparse.Namespace):
    '''
    generate smoothquant weights
    args:
        llm: LLM instance
        args: argument from main
    '''
    prompts, prompt_token_ids = generate_prompts(args)

    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=args.output_len,
                                     repetition_penalty=args.repetition_penalty,
                                     temperature=args.temperature,
                                     top_p=args.top_p,
                                     top_k=args.top_k)

    tp_size = args.tp_size

    llm.llm_engine.model_executor._run_workers("setup_smooth_hook", args.dump_input_ids)

    llm.generate(prompts, sampling_params, prompt_token_ids=prompt_token_ids, use_tqdm=True)

    logger.info("llm generate finished")

    llm.llm_engine.model_executor._run_workers("remove_hooks")
    act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
    named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")

    vllm_cleanup(llm)
    del prompts
    del prompt_token_ids
    cleanup()

    logger.info("get act_range and named_parameters from llm finished")

    merged_act_range, merged_named_parameters, input_id_list = convert_to_merged(act_range, named_parameters, tp_size,
                                                                                 args)

    save_input_ids(input_id_list, args)
    save_act_range(merged_act_range, args)
    save_weights(merged_named_parameters, args)

    del act_range
    del named_parameters
    cleanup()

    logger.info("get merged_act_range and merged_named_parameters finished")

    smooth_weight, smooth_info = generate_smooth_weight(merged_act_range, merged_named_parameters, args)
    save_generate_weights(smooth_weight, args)

    del merged_act_range
    del merged_named_parameters
    cleanup()

    logger.info("get smooth_weight finished")

    return smooth_weight, smooth_info
1024
vllm-v0.6.2/tools/quant_tools/summarize_1024_prompts.csv
Normal file
File diff suppressed because one or more lines are too long
713
vllm-v0.6.2/tools/quant_tools/utils_internal.py
Executable file
@@ -0,0 +1,713 @@
from collections import defaultdict, OrderedDict
import torch
from pathlib import Path
from typing import Optional
import re
import os
import shutil
import logging
import json
from transformers import AutoTokenizer, T5Tokenizer
import gc
from datetime import datetime
from vllm.platforms import current_platform

from model_special import (smooth_model_config, get_layer_weight_bias_name, get_qkv_distribution,
                           modify_layer_weight_bias_name)

logger = logging.getLogger(__name__)


_str_to_torch_dtype_dict = dict(
    bfloat16=torch.bfloat16,
    float16=torch.float16,
    float32=torch.float32,
    int64=torch.int64,
    int32=torch.int32,
    int8=torch.int8,
    bool=torch.bool,
    fp8=torch.float8_e4m3fn,
)
def str_dtype_to_torch(dtype):
    '''
    convert a str dtype to a torch dtype
    '''
    ret = _str_to_torch_dtype_dict.get(dtype)
    dtype = ret if ret is not None else torch.float16
    return dtype


_torch_dtype_to_str_dict = {
    torch.bfloat16: "bfloat16",
    torch.float16: "float16",
    torch.float32: "float32",
    torch.int64: "int64",
    torch.int32: "int32",
    torch.int8: "int8",
    torch.bool: "bool",
    torch.float8_e4m3fn: "fp8",
}


def torch_dtype_to_str(dtype):
    '''
    convert a torch dtype to a str dtype
    '''
    ret = _torch_dtype_to_str_dict.get(dtype)
    dtype = ret if ret is not None else "float16"
    return dtype
def extract_model_path(name_or_path):
    '''
    extract model_version and model_family from the _name_or_path field of config.json
    '''
    patterns = [
        r"/(.*)(-[0-9]+[mMbB]{1})(-*.*)",
        r"/(.*-[0-9]+)(-*.*)",
        r"(.*)(-[0-9]+[mMbB]{1})(-*.*)",
        r"(.*-[0-9]+)(-*.*)",
        r"([^-]+)(-*.*)",
    ]
    model_version = None
    for pattern in patterns:
        match = re.search(pattern, name_or_path)
        if match:
            model_version = match.group(1)
            break

    if model_version is None:
        model_version = name_or_path

    model_version = model_version.lower()
    match = re.search(r"([a-zA-Z]+)(.*)", model_version)
    if match:
        model_family = match.group(1)
    else:
        model_family = model_version

    return model_version, model_family
def read_model_name(model_dir: str, model_version: Optional[str] = None, model_type: Optional[str] = None):
    '''
    get model_arch, model_version, model_family and model_type from config.json and the
    passed-in model_version and model_type
    args:
        model_dir: model directory
        model_version: passed from main, default None
        model_type: passed from main, default None
    '''
    with open(Path(model_dir) / "config.json", 'r') as f:
        config = json.load(f)

    model_arch = config.get('architectures', None)
    name_or_path = config.get('_name_or_path', None)
    if model_type is None:
        model_type = config.get('model_type', None)
    if model_type:
        model_type = model_type.lower()
    model_family = None

    if model_version is None and name_or_path:
        model_version, model_family = extract_model_path(name_or_path)

    if model_version is None:
        model_version = model_type

    if model_version:
        model_version = model_version.lower()

    if model_version and model_family is None:
        match = re.search(r"([a-zA-Z]+)(.*)", model_version)
        if match:
            model_family = match.group(1)
        else:
            model_family = model_version

    if isinstance(model_arch, (list, tuple)) and len(model_arch) > 0:
        model_arch = model_arch[0]

    assert model_arch, "read model architectures failed"
    assert model_version, "read model version failed, please set args.version manually"
    assert model_family, "read model family failed, please set args.version manually"

    return model_arch, model_version, model_family, model_type
def load_tokenizer(tokenizer_dir: Optional[str] = None,
                   vocab_file: Optional[str] = None,
                   model_name: str = 'GPTForCausalLM',
                   model_version: Optional[str] = None,
                   tokenizer_type: Optional[str] = None):
    '''
    load the tokenizer of a model
    args:
        tokenizer_dir: tokenizer directory
        vocab_file: vocabulary file, default None
        model_name: model name
        model_version: model version
        tokenizer_type: tokenizer type to be loaded.
    '''
    if vocab_file is None:
        use_fast = True
        if tokenizer_type == "llama":
            use_fast = False
        # Should set both padding_side and truncation_side to be 'left'
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                  legacy=False,
                                                  padding_side='left',
                                                  truncation_side='right',
                                                  trust_remote_code=True,
                                                  tokenizer_type=tokenizer_type,
                                                  use_fast=use_fast)
    elif model_name == 'GemmaForCausalLM':
        from transformers import GemmaTokenizer

        # Initialize tokenizer from vocab file.
        tokenizer = GemmaTokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
    else:
        # For gpt-next, directly load from tokenizer.model
        tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)

    if model_name == 'QWenForCausalLM':
        with open(Path(tokenizer_dir) / "generation_config.json") as f:
            gen_config = json.load(f)
        chat_format = gen_config['chat_format']
        assert chat_format in ('raw', 'chatml'), f"unknown chat format: {chat_format}"
        pad_id = gen_config['pad_token_id']
        end_id = gen_config['eos_token_id']
    elif model_name in ('ChatGLMForCausalLM', 'glm'):
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eop_token_id
    else:
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eos_token_id

    try:
        tokenizer.pad_token = tokenizer.eos_token
    except Exception as e:
        logger.warning(f"set pad_token raised an exception: {e}")

    return tokenizer, pad_id, end_id
def merge_qkv_weight(named_parameters, weight_name, tp_size, q_proj_size, num_kv_head_replicas):
    '''
    merge tensor-parallel qkv weights into non-parallel q_weight, k_weight, v_weight.
    merging the qkv weight and bias follows the same logic
    args:
        named_parameters: parallel named parameters
        weight_name: qkv layer weight name
        tp_size: tensor parallel size
        q_proj_size: query projection size
        num_kv_head_replicas: number of kv head replicas
    '''
    qkv_proj_size = named_parameters[0][weight_name].shape[0]
    kv_proj_size = (qkv_proj_size - q_proj_size) // 2
    split_size = [q_proj_size, kv_proj_size, kv_proj_size]

    q_weight_list = []
    k_weight_list = []
    v_weight_list = []

    for rank in range(0, tp_size):
        weight = named_parameters[rank][weight_name]
        split_weight = torch.split(weight, split_size, dim=0)
        q_weight_list.append(split_weight[0])
        if rank % num_kv_head_replicas == 0:
            k_weight_list.append(split_weight[1])
            v_weight_list.append(split_weight[2])

    q_weight = torch.cat(q_weight_list, dim=0)
    k_weight = torch.cat(k_weight_list, dim=0)
    v_weight = torch.cat(v_weight_list, dim=0)

    return q_weight, k_weight, v_weight
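# Shape sketch (illustrative, not part of the original file). With tp_size = 2,
# 32 query heads, 8 kv heads, head size 128 and hidden size 4096, each rank holds
# a qkv_proj slice of shape [(16 + 4 + 4) * 128, 4096] = [3072, 4096] and
# q_proj_size = 16 * 128 = 2048. merge_qkv_weight splits each slice into
# [2048, 4096] / [512, 4096] / [512, 4096] pieces and concatenates them across
# ranks into full q [4096, 4096], k [1024, 4096] and v [1024, 4096] weights.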
def merge_merged_weight(named_parameters, weight_name, tp_size, dim=0):
    '''
    merge tensor-parallel merged linear layer weights into gate_weight and up_weight.
    merging the merged weight and bias follows the same logic.
    args:
        named_parameters: parallel named parameters
        weight_name: merged gate/up layer weight name
        tp_size: tensor parallel size
    '''
    up_weight_list = []
    gate_weight_list = []

    for rank in range(0, tp_size):
        weight = named_parameters[rank][weight_name]
        chunk_weights = torch.chunk(weight, 2, dim=dim)
        up_weight_list.append(chunk_weights[0])
        gate_weight_list.append(chunk_weights[1])

    gate_weight = torch.cat(up_weight_list, dim=dim)
    up_weight = torch.cat(gate_weight_list, dim=dim)

    return gate_weight, up_weight
def convert_packed_qkv(q_weight, k_weight, v_weight, dim, args):
    '''
    convert a packed qkv weight or bias
    args:
        q_weight: q weight or bias
        k_weight: k weight or bias
        v_weight: v weight or bias
        dim: conversion dim
        args: argument
    '''
    packed_qkv = torch.cat([q_weight, k_weight, v_weight], dim=dim)
    is_n3sh, head_num, kv_head_num = get_qkv_distribution(args.model_type, args.model_version, args.hf_config)
    if is_n3sh is True:
        packed_qkv_shape = packed_qkv.shape
        num_query_heads_per_kv_head = head_num // kv_head_num
        q_shape = q_weight.shape
        k_shape = k_weight.shape
        v_shape = v_weight.shape
        q = q_weight.view(q_shape[:dim] + (kv_head_num, num_query_heads_per_kv_head, -1) + q_shape[dim + 1:])
        k = k_weight.view(k_shape[:dim] + (kv_head_num, 1, -1) + k_shape[dim + 1:])
        v = v_weight.view(v_shape[:dim] + (kv_head_num, 1, -1) + v_shape[dim + 1:])
        tensor_n3sh = torch.cat([q, k, v], dim=dim + 1)
        packed_qkv = tensor_n3sh.reshape(packed_qkv_shape)

    return packed_qkv
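# Interleave sketch (illustrative, not part of the original file). For a model
# with 2 heads of size 1 (so q, k and v each have 2 rows), the n3sh conversion
# turns the row order [q0, q1, k0, k1, v0, v1] into [q0, k0, v0, q1, k1, v1]:
# each head's q/k/v rows become adjacent, matching the HF packed layout.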
def convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
                                 layer_range, merged_act_range, tp_size, args):
    '''
    convert parallel qkv named parameters to non-parallel qkv named parameters
    args:
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non-parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non-parallel act range
        tp_size: tensor parallel size
        args: argument
    '''
    layer_name_parts = layer_name.split(".")
    self_attn_layer_name = ".".join(layer_name_parts[:-1])
    qkv_name = layer_name_parts[-1]
    q_weight, k_weight, v_weight = merge_qkv_weight(named_parameters, weight_name, tp_size, layer_range["q_proj_size"],
                                                    layer_range["num_kv_head_replicas"])
    qkv_list = smooth_model_config[args.model_type]["qkv_list"]
    qkv_list_len = len(qkv_list)
    if qkv_list_len == 3:
        q_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
        k_layer_name = f"{self_attn_layer_name}.{qkv_list[1]}"
        v_layer_name = f"{self_attn_layer_name}.{qkv_list[2]}"
    elif qkv_list_len == 1:
        qkv_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"

    if qkv_list_len == 3:
        merged_act_range[q_layer_name]["x"] = layer_range["x"]
        merged_act_range[k_layer_name]["x"] = layer_range["x"]
        merged_act_range[v_layer_name]["x"] = layer_range["x"]
        merged_act_range[q_layer_name]["is_qkv"] = True
        merged_act_range[k_layer_name]["is_qkv"] = True
        merged_act_range[v_layer_name]["is_qkv"] = True

        merged_named_parameters[f"{q_layer_name}.weight"] = q_weight
        merged_named_parameters[f"{k_layer_name}.weight"] = k_weight
        merged_named_parameters[f"{v_layer_name}.weight"] = v_weight
    elif qkv_list_len == 1:
        merged_act_range[qkv_layer_name]["x"] = layer_range["x"]
        qkv_weight = convert_packed_qkv(q_weight, k_weight, v_weight, 0, args)
        merged_named_parameters[f"{qkv_layer_name}.weight"] = qkv_weight

    if bias_name in named_parameters[0]:
        q_bias, k_bias, v_bias = merge_qkv_weight(named_parameters, bias_name, tp_size, layer_range["q_proj_size"],
                                                  layer_range["num_kv_head_replicas"])
        if qkv_list_len == 3:
            merged_named_parameters[f"{q_layer_name}.bias"] = q_bias
            merged_named_parameters[f"{k_layer_name}.bias"] = k_bias
            merged_named_parameters[f"{v_layer_name}.bias"] = v_bias
        elif qkv_list_len == 1:
            qkv_bias = convert_packed_qkv(q_bias, k_bias, v_bias, 0, args)
            merged_named_parameters[f"{qkv_layer_name}.bias"] = qkv_bias

    return qkv_name
def convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
                                    layer_range, merged_act_range, tp_size, model_type):
    '''
    convert parallel merged named parameters to non-parallel merged named parameters
    args:
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non-parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non-parallel act range
        tp_size: tensor parallel size
        model_type: model type
    '''
    layer_name_parts = layer_name.split(".")
    mlp_layer_name = ".".join(layer_name_parts[:-1])
    gate_weight, up_weight = merge_merged_weight(named_parameters, weight_name, tp_size)
    gate_up_name = layer_name_parts[-1]
    gate_up_list = smooth_model_config[model_type]["gate_up_list"]
    gate_up_list_len = len(gate_up_list)
    is_gate_up = smooth_model_config[model_type]["is_gate_up"]
    if gate_up_list_len == 2:
        gate_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
        up_layer_name = f"{mlp_layer_name}.{gate_up_list[1]}"
    elif gate_up_list_len == 1:
        gate_up_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"

    if gate_up_list_len == 2:
        merged_act_range[gate_layer_name]["x"] = layer_range["x"]
        merged_act_range[up_layer_name]["x"] = layer_range["x"]
        merged_act_range[gate_layer_name]["is_merge"] = True
        merged_act_range[up_layer_name]["is_merge"] = True

        merged_named_parameters[f"{gate_layer_name}.weight"] = gate_weight
        merged_named_parameters[f"{up_layer_name}.weight"] = up_weight
    elif gate_up_list_len == 1:
        merged_act_range[gate_up_layer_name]["x"] = layer_range["x"]
        merged_gate_up_weight_list = [gate_weight, up_weight] if is_gate_up is True else [up_weight, gate_weight]
        merged_named_parameters[f"{gate_up_layer_name}.weight"] = torch.cat(merged_gate_up_weight_list, dim=0)

    if bias_name in named_parameters[0]:
        gate_bias, up_bias = merge_merged_weight(named_parameters, bias_name, tp_size)
        if gate_up_list_len == 2:
            merged_named_parameters[f"{gate_layer_name}.bias"] = gate_bias
            merged_named_parameters[f"{up_layer_name}.bias"] = up_bias
        elif gate_up_list_len == 1:
            merged_gate_up_bias_list = [gate_bias, up_bias] if is_gate_up is True else [up_bias, gate_bias]
            merged_named_parameters[f"{gate_up_layer_name}.bias"] = torch.cat(merged_gate_up_bias_list, dim=0)

    return gate_up_name
def convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
                                            merged_named_parameters, layer_range, merged_act_range, tp_size):
    '''
    convert column-parallel named parameters to non-parallel named parameters
    args:
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non-parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non-parallel act range
        tp_size: tensor parallel size
    '''
    if layer_range['is_linear']:
        merged_act_range[layer_name]["x"] = layer_range["x"]

    merged_named_parameters[weight_name] = torch.cat(
        [named_parameters[tp_id][weight_name] for tp_id in range(0, tp_size)], dim=0)
    if bias_name in named_parameters[0]:
        merged_named_parameters[bias_name] = torch.cat(
            [named_parameters[tp_id][bias_name] for tp_id in range(0, tp_size)], dim=0)
def convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                          merged_named_parameters, layer_range, merged_act_range, tp_size):
    '''
    convert row-parallel named parameters to non-parallel named parameters
    args:
        act_layer_name: act layer name
        act_range: parallel act_range
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non-parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non-parallel act range
        tp_size: tensor parallel size
    '''
    if layer_range['is_linear']:
        if isinstance(layer_range['x'], torch.Tensor):
            merged_act_range[layer_name]['x'] = torch.cat(
                [act_range[tp_id][act_layer_name]['x'] for tp_id in range(0, tp_size)], dim=0)
        else:
            merged_act_range[layer_name]['x'] = None

    merged_named_parameters[weight_name] = torch.cat(
        [named_parameters[tp_id][weight_name] for tp_id in range(0, tp_size)], dim=1)
    if bias_name in named_parameters[0]:
        merged_named_parameters[bias_name] = named_parameters[0][bias_name]
def convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                            merged_named_parameters, layer_range, merged_act_range, tp_size, args):
    '''
    convert parallel layer named parameters to non-parallel layer named parameters
    args:
        act_layer_name: act layer name
        act_range: parallel act_range
        layer_name: layer name
        weight_name: weight name
        bias_name: bias name
        named_parameters: parallel hugging face named parameters
        merged_named_parameters: non-parallel hugging face named parameters
        layer_range: parallel layer range info
        merged_act_range: non-parallel act range
        tp_size: tensor parallel size
        args: argument
    '''
    qkv_name = "qkv_proj"
    gate_up_name = "gate_up_proj"

    if layer_range['split'] == 'col':  # col
        # merge weight
        if layer_range["is_qkv"]:
            qkv_name = convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters,
                                                    merged_named_parameters, layer_range, merged_act_range, tp_size,
                                                    args)

        elif layer_range["is_merge"]:
            gate_up_name = convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters,
                                                           merged_named_parameters, layer_range, merged_act_range,
                                                           tp_size, args.model_type)
        else:
            convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
                                                    merged_named_parameters, layer_range, merged_act_range, tp_size)
    else:  # row
        convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
                              merged_named_parameters, layer_range, merged_act_range, tp_size)

    return qkv_name, gate_up_name
def collect_moe_experts_act_range_of_layer(merged_act_range, mlp_part_name, moe_list):
    '''
    collect the moe experts' act range within the same layer
    '''
    experts_of_gate_up_layer = {}
    experts_of_down_layer = {}

    gate_up_list = moe_list["gate_up_list"]
    gate_up_list_len = len(gate_up_list)
    down_list = moe_list["down_list"]
    gate_up_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[1]}"
    gate_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[2]}" if gate_up_list_len > 2 else None
    down_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{down_list[1]}"
    for key, value in merged_act_range.items():
        if re.search(gate_up_layer_pattern, key) or (gate_layer_pattern is not None
                                                     and re.search(gate_layer_pattern, key)):
            experts_of_gate_up_layer[key] = value
        if re.search(down_layer_pattern, key):
            experts_of_down_layer[key] = value

    return experts_of_gate_up_layer, experts_of_down_layer
def convert_moe_expert_activation_fused(experts_of_layer, merged_act_range):
    '''
    fuse the moe experts' act range within the same layer and assign it back to those experts
    '''
    unfused_activation = []
    for key, value in experts_of_layer.items():
        if isinstance(value["x"], torch.Tensor):
            unfused_activation.append(value['x'])

    assert len(unfused_activation) > 0, "unfused_activation length is zero, which is unsupported"

    activation = torch.stack(unfused_activation, dim=0)
    fused_activation = torch.max(activation, dim=0)[0]

    for key, value in experts_of_layer.items():
        if value["x"] is None or isinstance(value["x"], torch.Tensor):
            value['x'] = fused_activation
def convert_moe_layer_activation_fused(merged_act_range, model_type):
    '''
    loop over each layer, fuse the moe experts' act range within the same layer,
    and assign it back to those experts
    '''
    moe_list = smooth_model_config[model_type]["moe_list"]
    if moe_list is None:
        return

    mlp_name = moe_list["gate_up_list"][0].split(".")[0]
    layer = 0

    while True:
        mlp_part_name = rf"\.{layer}\.{mlp_name}"
        experts_of_gate_up_layer, experts_of_down_layer = collect_moe_experts_act_range_of_layer(
            merged_act_range, mlp_part_name, moe_list)
        # if experts_of_layer is empty, layer has reached the number of layers, so the loop is finished
        if len(experts_of_gate_up_layer) < 1 or len(experts_of_down_layer) < 1:
            logger.info(f"the layer num is {layer}")
            break
        convert_moe_expert_activation_fused(experts_of_gate_up_layer, merged_act_range)
        convert_moe_expert_activation_fused(experts_of_down_layer, merged_act_range)
        layer += 1
def should_include(key, parameters, exclude_names):
    '''
    key should not already be in parameters, nor match any entry in exclude_names
    args:
        parameters: named parameters
        exclude_names: list of excluded names
    '''
    return key not in parameters and not any(exclude_name in key for exclude_name in exclude_names)
def valid_act_range(act_layer_name, layer_range):
    '''
    validate act_range, mainly filtering inf, nan or zero values in the x field
    args:
        act_layer_name: act layer name
        layer_range: act layer value
    '''
    act_range_x = layer_range["x"]
    if act_range_x is not None and isinstance(act_range_x, torch.Tensor):
        mask = torch.isinf(act_range_x) | torch.isnan(act_range_x) | (act_range_x == 0)
        if torch.any(mask).item():
            act_range_x[mask] = 1e-6
            logger.warning(f"act_range_x in layer:{act_layer_name} has nan, inf or zero values, forced to 1e-6")
def convert_to_merged(act_range, named_parameters, tp_size, args):
    '''
    convert parallel act_range and named parameters to non-parallel format.
    args:
        act_range: parallel act_range
        named_parameters: parallel named parameters
        tp_size: tensor parallel size
        args: argument
    '''
    model_type = args.model_type
    merged_act_range = defaultdict(lambda: {"x": None, "is_qkv": False, "is_merge": False})
    merged_named_parameters = {}
    input_id_list = []

    exclude_names = set()

    for act_layer_name, layer_range in act_range[0].items():
        valid_act_range(act_layer_name, layer_range)
        layer_name, weight_name, bias_name = get_layer_weight_bias_name(model_type, act_layer_name)
        # when tie_word_embeddings is True, lm_head uses the embedding weight
        if args.tie_word_embeddings is True and "lm_head" in layer_name:
            continue
        qkv_name, gate_up_name = convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name,
                                                         named_parameters, merged_named_parameters, layer_range,
                                                         merged_act_range, tp_size, args)
        exclude_names.update({qkv_name, gate_up_name})

        if layer_range['split'] == 'col' and layer_range["is_qkv"] and len(layer_range["input_id"]) > 0:
            input_id_list = layer_range["input_id"]

    if args.use_smoothquant and args.disable_fused_quantize_expert is False:
        convert_moe_layer_activation_fused(merged_act_range, model_type)

    merged_named_parameters.update({
        key: value
        for key, value in named_parameters[0].items()
        if should_include(key, merged_named_parameters, exclude_names)
    })

    modify_layer_weight_bias_name(model_type, merged_named_parameters)

    sorted_named_parameters = OrderedDict(sorted(merged_named_parameters.items(), key=lambda item: item[0]))
    sorted_merged_act_range = OrderedDict(sorted(merged_act_range.items(), key=lambda item: item[0]))

    return sorted_merged_act_range, sorted_named_parameters, input_id_list
def copy_files_except_extensions(input_dir, output_dir, extensions):
    '''
    copy files from input_dir to output_dir, skipping files whose extension is in
    extensions, and keep the sub-directory layout the same
    args:
        input_dir: input directory
        output_dir: output directory
        extensions: file extensions to skip when copying
    '''
    # Walk the input directory and its subdirectories
    for root, dirs, files in os.walk(input_dir):
        # Compute the relative path
        rel_path = os.path.relpath(root, input_dir)
        if len(rel_path) > 1 and rel_path.startswith('.'):
            continue
        # Build the destination directory path
        dst_dir = os.path.join(output_dir, rel_path)
        # Make sure the destination directory exists
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        for file in files:
            if not any(file.endswith(ext) for ext in extensions) and not file.startswith('.'):
                # Build the full source and destination file paths
                src_file = os.path.join(root, file)
                dst_file = os.path.join(dst_dir, file)
                # Copy the file
                shutil.copy2(src_file, dst_file)
                logger.info(f'Copied {src_file} to {dst_file}')
def cleanup():
    '''
    clean up memory resources
    '''
    gc.collect()
    if not current_platform.is_cpu():
        torch.cuda.empty_cache()


def vllm_cleanup(llm):
    """Release occupied resources and reset parallel_state"""
    del llm
    from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
    destroy_model_parallel()
    destroy_distributed_environment()
    import contextlib
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    import ray
    if ray.is_initialized():
        ray.shutdown()
    logger.info('llm and distributed env are cleaned up')
def generate_datetime():
    '''
    generate the current datetime
    '''
    current_datetime = datetime.now()
    formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")

    return formatted_datetime
def get_hf_config_sliding_window(hf_text_config) -> Optional[int]:
    """Get the sliding window size, or None if disabled."""

    # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in
    # addition to sliding window size. We check if that field is present
    # and if it's False, return None.
    if (hasattr(hf_text_config, "use_sliding_window")
            and not hf_text_config.use_sliding_window):
        return None
    return getattr(hf_text_config, "sliding_window", None)


def get_skip_patterns(model_type):
    """Get the skip patterns from the model config."""
    config = smooth_model_config[model_type]
    return config["skip_patterns"] if "skip_patterns" in config else []


def should_skip(model_type, weight_name):
    """Judge whether the weight should be skipped."""
    skip_patterns = get_skip_patterns(model_type)
    for pattern in skip_patterns:
        if re.match(pattern, weight_name):
            return True
    return False
152
vllm-v0.6.2/tools/quant_tools/weight_only.py
Normal file
@@ -0,0 +1,152 @@
import argparse
import torch
from torch import Tensor
import numpy as np
import logging

from vllm import LLM

from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
from dump_smooth import save_weights, save_generate_weights

logger = logging.getLogger(__name__)
def merge_adjacent_low_4bit(tensor: Tensor):
    """
    Pack a tensor of int8 data by merging the low 4 bits of each pair of adjacent
    elements into a new int8 value, and return the packed tensor.

    Args:
        tensor: a torch.int8 tensor whose last dimension has even length.

    Returns:
        A new tensor where each element merges the low 4 bits of two adjacent
        original elements.

    Example:
        a = torch.tensor([5, 7, 12, 3], dtype=torch.int8)  # each pair of elements is merged
        merged_tensor = merge_adjacent_low_4bit(a)
        print(f"merged tensor: {merged_tensor} (as list: {merged_tensor.tolist()})")
    """

    # Make sure the input tensor is int8 and its last dimension has even length
    assert tensor.dtype == torch.int8, "the input tensor must be int8"
    assert tensor.shape[-1] % 2 == 0, "the last dimension of the input tensor must have even length"

    even = np.bitwise_and(tensor[..., 0::2], 0x0F, dtype=np.int8)
    odd = np.bitwise_and(tensor[..., 1::2], 0x0F, dtype=np.int8)
    merged_tensor = np.bitwise_or(np.left_shift(odd, 4), even)

    # The result is the packed tensor
    return merged_tensor
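# Bit-level sketch (illustrative, not part of the original file). For the pair
# (5, 7): the low nibbles are 0101 and 0111, so the packed byte is
# (7 << 4) | 5 = 0111_0101 = 117. Two int4 values thus occupy one int8 slot,
# halving weight storage for the int4 weight-only path below.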
def cal_weightonly_weight(weight, weight_bits, qmin, qmax, has_qzeros, eps: float = 1e-8):
    '''
    return quantized_weight, scales, qzeros
    args:
        weight: weight to be quantized
        weight_bits: quantization bitwidth
        qmin: minimum value of the quantized range
        qmax: maximum value of the quantized range
        has_qzeros: whether to generate the qzeros weight
        eps: lower bound for near-zero float values to avoid floating-point error
    '''
    assert weight.numel() != 0, "weight should not be an empty tensor"
    assert weight.dim() == 2 or weight.dim() == 3, "Invalid dim. The dim of weight should be 2 or 3"
    assert weight.dtype in [torch.float32, torch.float16, torch.bfloat16
                            ], "Invalid datatype. Weight must be torch.float32, torch.float16 or torch.bfloat16"

    weight_scale = weight.float().abs().clamp(min=eps).max(dim=-1).values / qmax
    unpacked_weight = (torch.round((weight / weight_scale[..., None]).float())).clip(min=qmin, max=qmax).to(torch.int8)
    scale_quant_orig_c = weight_scale.squeeze()

    if weight_bits == 4:
        quantized_weight = merge_adjacent_low_4bit(unpacked_weight)
    else:
        quantized_weight = unpacked_weight

    if has_qzeros:
        qzeros = torch.zeros_like(scale_quant_orig_c, dtype=torch.int32)
    else:
        qzeros = None

    return quantized_weight, scale_quant_orig_c, qzeros
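# Illustrative usage only (not part of the original file): int8 weight-only
# quantization of a single row, with qmin/qmax derived as in
# generate_weightonly_weight below.
#
#   w = torch.tensor([[0.5, -1.0, 0.25, 1.0]], dtype=torch.float16)
#   qweight, scales, qzeros = cal_weightonly_weight(w, 8, -128.0, 127.0, False)
#   # scales = 1.0 / 127, so qweight == [[64, -127, 32, 127]] (values rounded)
#   # and dequantization is simply qweight.float() * scales.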
def generate_weightonly_weight(act_range, name_parameters, args):
    '''
    quantize hugging face weights into weight-only quantized weights
    args:
        act_range: non-parallel act_range
        name_parameters: non-parallel hugging face named parameters
        args: arguments from main
    '''
    weightonly_weight = {}
    has_qzeros = args.has_qzeros
    weight_bits = 8 if args.weight_only_precision == 'int8' else 4
    qmin = float(-2**(weight_bits - 1))
    qmax = float(2**(weight_bits - 1) - 1)

    for name, param in name_parameters.items():
        if should_skip(args.model_type, name):
            logger.info(f"skip {name}")
            weightonly_weight[name] = param
            continue
        if name.endswith("bias"):
            weightonly_weight[name] = param
            continue
        name_parts = name.split(".")
        layer_name = ".".join(name_parts[:-1])
        if layer_name in act_range:
            qweight, scales, qzeros = cal_weightonly_weight(param, weight_bits, qmin, qmax, has_qzeros)
            scales = scales.to(args.torch_scales_smooth_dtype)
            weightonly_weight[f'{layer_name}.qweight'] = qweight
            weightonly_weight[f'{layer_name}.scales'] = scales
            if has_qzeros:
                weightonly_weight[f'{layer_name}.qzeros'] = qzeros
        else:
            weightonly_weight[name] = param

    return weightonly_weight


def generate_weights_of_weight_only(llm: LLM, args: argparse.Namespace):
    '''
    Generate weight-only quantized weights.
    args:
        llm: LLM instance
        args: arguments from main
    '''
    tp_size = args.tp_size

    llm.llm_engine.model_executor._run_workers("setup_smooth_hook")

    llm.llm_engine.model_executor._run_workers("remove_hooks")
    act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
    named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")

    vllm_cleanup(llm)
    cleanup()

    logger.info("get act_range and named_parameters from llm finished")

    merged_act_range, merged_named_parameters, _ = convert_to_merged(act_range, named_parameters, tp_size, args)
    save_weights(merged_named_parameters, args)

    del act_range
    del named_parameters
    cleanup()

    logger.info("get merged_act_range and merged_named_parameters finished")

    weightonly_weight = generate_weightonly_weight(merged_act_range, merged_named_parameters, args)
    save_generate_weights(weightonly_weight, args)

    del merged_act_range
    del merged_named_parameters
    cleanup()

    logger.info("get weightonly_weight finished")

    return weightonly_weight
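

# A hypothetical driver (the names below are assumptions for illustration,
# not taken from this file): build the LLM first, then hand it to the
# routine above.
#
#     llm = LLM(model=args.model, tensor_parallel_size=args.tp_size)
#     weightonly_weight = generate_weights_of_weight_only(llm, args)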
312
vllm-v0.6.2/tools/report_build_time_ninja.py
Normal file
@@ -0,0 +1,312 @@
#!/usr/bin/env python3
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Modified version of: https://chromium.googlesource.com/chromium/tools/depot_tools.git/+/refs/heads/main/post_build_ninja_summary.py
"""Summarize the last ninja build, invoked with ninja's -C syntax.

> python3 tools/report_build_time_ninja.py -C build/..

Typical output looks like this:
```
Longest build steps for .cpp.o:
       1.0 weighted s to build ...torch_bindings.cpp.o (12.4 s elapsed time)
       2.0 weighted s to build ..._attn_c.dir/csrc... (23.5 s elapsed time)
       2.6 weighted s to build ...torch_bindings.cpp.o (31.5 s elapsed time)
       3.2 weighted s to build ...torch_bindings.cpp.o (38.5 s elapsed time)
Longest build steps for .so (linking):
       0.1 weighted s to build _moe_C.abi3.so (1.0 s elapsed time)
       0.5 weighted s to build ...flash_attn_c.abi3.so (1.1 s elapsed time)
       6.2 weighted s to build _C.abi3.so (6.2 s elapsed time)
Longest build steps for .cu.o:
      15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
      15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
      15.3 weighted s to build ...machete_mm_... (183.6 s elapsed time)
      15.3 weighted s to build ...machete_mm_... (183.7 s elapsed time)
      15.5 weighted s to build ...machete_mm_... (185.6 s elapsed time)
      15.5 weighted s to build ...machete_mm_... (185.9 s elapsed time)
      15.5 weighted s to build ...machete_mm_... (186.2 s elapsed time)
      37.4 weighted s to build ...scaled_mm_c3x.cu... (449.0 s elapsed time)
      43.9 weighted s to build ...scaled_mm_c2x.cu... (527.4 s elapsed time)
     344.8 weighted s to build ...attention_...cu.o (1087.2 s elapsed time)
1110.0 s weighted time (10120.4 s elapsed time sum, 9.1x parallelism)
134 build steps completed, average of 0.12/s
```
"""

import argparse
import errno
import fnmatch
import os
import sys
from collections import defaultdict

# The number of long build times to report:
long_count = 10
# The number of long times by extension to report:
long_ext_count = 10


class Target:
    """Represents a single line read from a .ninja_log file."""

    def __init__(self, start, end):
        """Creates a target object by passing in the start/end times in seconds
        as a float."""
        self.start = start
        self.end = end
        # A list of targets, appended to by the owner of this object.
        self.targets = []
        self.weighted_duration = 0.0

    def Duration(self):
        """Returns the task duration in seconds as a float."""
        return self.end - self.start

    def SetWeightedDuration(self, weighted_duration):
        """Sets the duration, in seconds, passed in as a float."""
        self.weighted_duration = weighted_duration

    def WeightedDuration(self):
        """Returns the task's weighted duration in seconds as a float.

        Weighted_duration takes the elapsed time of the task and divides it
        by how many other tasks were running at the same time. Thus, it
        represents the approximate impact of this task on the total build time,
        with serialized or serializing steps typically ending up with much
        longer weighted durations.

        weighted_duration should always be the same or shorter than duration.
        """
        # Allow for modest floating-point errors
        epsilon = 0.000002
        if (self.weighted_duration > self.Duration() + epsilon):
            print('{} > {}?'.format(self.weighted_duration, self.Duration()))
        assert (self.weighted_duration <= self.Duration() + epsilon)
        return self.weighted_duration
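
    # Example for WeightedDuration: if two compile steps each take 10 s and
    # fully overlap, each is charged 10 / 2 = 5 s of weighted time, so the
    # weighted durations of all steps sum to the wall-clock build length.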

    def DescribeTargets(self):
        """Returns a printable string that summarizes the targets."""
        # Some build steps generate dozens of outputs - handle them sanely.
        # The max_length was chosen so that it can fit most of the long
        # single-target names, while minimizing word wrapping.
        result = ', '.join(self.targets)
        max_length = 65
        if len(result) > max_length:
            result = result[:max_length] + '...'
        return result


# Copied with some modifications from ninjatracing
def ReadTargets(log, show_all):
    """Reads all targets from .ninja_log file |log_file|, sorted by duration.

    The result is a list of Target objects."""
    header = log.readline()
    assert header == '# ninja log v5\n', \
        'unrecognized ninja log version {!r}'.format(header)
    targets_dict = {}
    last_end_seen = 0.0
    for line in log:
        parts = line.strip().split('\t')
        if len(parts) != 5:
            # If ninja.exe is rudely halted then the .ninja_log file may be
            # corrupt. Silently continue.
            continue
        start, end, _, name, cmdhash = parts  # Ignore restat.
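        # Each v5 log line is tab-separated; e.g. (made-up values)
        # "1000\t2500\t0\tobj/foo.o\t4a5b6c7d" is a step that ran from
        # 1.0 s to 2.5 s building obj/foo.o.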
        # Convert from integral milliseconds to float seconds.
        start = int(start) / 1000.0
        end = int(end) / 1000.0
        if not show_all and end < last_end_seen:
            # An earlier time stamp means that this step is the first in a new
            # build, possibly an incremental build. Throw away the previous
            # data so that this new build will be displayed independently.
            # This has to be done by comparing end times because records are
            # written to the .ninja_log file when commands complete, so end
            # times are guaranteed to be in order, but start times are not.
            targets_dict = {}
        target = None
        if cmdhash in targets_dict:
            target = targets_dict[cmdhash]
            if not show_all and (target.start != start or target.end != end):
                # If several builds in a row just run one or two build steps
                # then the end times may not go backwards so the last build may
                # not be detected as such. However in many cases there will be a
                # build step repeated in the two builds and the changed
                # start/stop points for that command, identified by the hash,
                # can be used to detect and reset the target dictionary.
                targets_dict = {}
                target = None
        if not target:
            targets_dict[cmdhash] = target = Target(start, end)
        last_end_seen = end
        target.targets.append(name)
    return list(targets_dict.values())


def GetExtension(target, extra_patterns):
    """Return the file extension that best represents a target.

    For targets that generate multiple outputs it is important to return a
    consistent 'canonical' extension. Ultimately the goal is to group build steps
    by type."""
    for output in target.targets:
        if extra_patterns:
            for fn_pattern in extra_patterns.split(';'):
                if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
                    return fn_pattern
        # Not a true extension, but a good grouping.
        if output.endswith('type_mappings'):
            extension = 'type_mappings'
            break

        # Capture two extensions if present. For example: file.javac.jar should
        # be distinguished from file.interface.jar.
        root, ext1 = os.path.splitext(output)
        _, ext2 = os.path.splitext(root)
        extension = ext2 + ext1  # Preserve the order in the file name.

        if len(extension) == 0:
            extension = '(no extension found)'

        if ext1 in ['.pdb', '.dll', '.exe']:
            extension = 'PEFile (linking)'
            # Make sure that .dll and .exe are grouped together and that the
            # .dll.lib files don't cause these to be listed as libraries
            break
        if ext1 in ['.so', '.TOC']:
            extension = '.so (linking)'
            # Attempt to identify linking, avoid identifying as '.TOC'
            break
        # Make sure .obj files don't get categorized as mojo files
        if ext1 in ['.obj', '.o']:
            break
        # Jars are the canonical output of java targets.
        if ext1 == '.jar':
            break
        # Normalize all mojo related outputs to 'mojo'.
        if output.count('.mojom') > 0:
            extension = 'mojo'
            break
    return extension


def SummarizeEntries(entries, extra_step_types):
    """Print a summary of the passed in list of Target objects."""

    # Create a list that is in order by time stamp and has entries for the
    # beginning and ending of each build step (one time stamp may have multiple
    # entries due to multiple steps starting/stopping at exactly the same time).
    # Iterate through this list, keeping track of which tasks are running at all
    # times. At each time step calculate a running total for weighted time so
    # that when each task ends its own weighted time can easily be calculated.
    task_start_stop_times = []

    earliest = -1
    latest = 0
    total_cpu_time = 0
    for target in entries:
        if earliest < 0 or target.start < earliest:
            earliest = target.start
        if target.end > latest:
            latest = target.end
        total_cpu_time += target.Duration()
        task_start_stop_times.append((target.start, 'start', target))
        task_start_stop_times.append((target.end, 'stop', target))
    length = latest - earliest
    weighted_total = 0.0

    # Sort by the time/type records and ignore |target|
    task_start_stop_times.sort(key=lambda times: times[:2])
    # Now we have all task start/stop times sorted by when they happen. If a
    # task starts and stops on the same time stamp then the start will come
    # first because of the alphabet, which is important for making this work
    # correctly.
    # Track the tasks which are currently running.
    running_tasks = {}
    # Record the time we have processed up to so we know how to calculate time
    # deltas.
    last_time = task_start_stop_times[0][0]
    # Track the accumulated weighted time so that it can efficiently be added
    # to individual tasks.
    last_weighted_time = 0.0
    # Scan all start/stop events.
    for event in task_start_stop_times:
        time, action_name, target = event
        # Accumulate weighted time up to now.
        num_running = len(running_tasks)
        if num_running > 0:
            # Update the total weighted time up to this moment.
            last_weighted_time += (time - last_time) / float(num_running)
        if action_name == 'start':
            # Record the total weighted task time when this task starts.
            running_tasks[target] = last_weighted_time
        if action_name == 'stop':
            # Record the change in the total weighted task time while this task
            # ran.
            weighted_duration = last_weighted_time - running_tasks[target]
            target.SetWeightedDuration(weighted_duration)
            weighted_total += weighted_duration
            del running_tasks[target]
        last_time = time
    assert (len(running_tasks) == 0)

    # Warn if the sum of weighted times is off by more than half a second
    # (all times here are in float seconds).
    if abs(length - weighted_total) > 0.5:
        print('Warning: Possible corrupt ninja log, results may be '
              'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
                  length, weighted_total))

    entries_by_ext = defaultdict(list)
    for target in entries:
        extension = GetExtension(target, extra_step_types)
        entries_by_ext[extension].append(target)

    for key, values in entries_by_ext.items():
        print('    Longest build steps for {}:'.format(key))
        values.sort(key=lambda x: x.WeightedDuration())
        for target in values[-long_count:]:
            print('      {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
                  format(target.WeightedDuration(), target.DescribeTargets(),
                         target.Duration()))

    print('  {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
          'parallelism)'.format(length, total_cpu_time,
                                total_cpu_time * 1.0 / length))
    print('  %d build steps completed, average of %1.2f/s' %
          (len(entries), len(entries) / length))


def main():
    log_file = '.ninja_log'
    parser = argparse.ArgumentParser()
    parser.add_argument('-C', dest='build_directory', help='Build directory.')
    parser.add_argument(
        '-s',
        '--step-types',
        help='semicolon separated fnmatch patterns for build-step grouping')
    parser.add_argument('--log-file',
                        help='specific ninja log file to analyze.')
    args, _extra_args = parser.parse_known_args()
    if args.build_directory:
        log_file = os.path.join(args.build_directory, log_file)
    if args.log_file:
        log_file = args.log_file
    if args.step_types:
        # Make room for the extra build types.
        global long_ext_count
        long_ext_count += len(args.step_types.split(';'))

    try:
        with open(log_file) as log:
            entries = ReadTargets(log, False)
            SummarizeEntries(entries, args.step_types)
    except OSError:
        print('Log file {!r} not found, no build summary created.'.format(
            log_file))
        return errno.ENOENT


if __name__ == '__main__':
    sys.exit(main())
22
vllm-v0.6.2/tools/shellcheck.sh
Executable file
@@ -0,0 +1,22 @@
#!/bin/bash
set -e

scversion="stable"

if [ -d "shellcheck-${scversion}" ]; then
  export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
fi

if ! [ -x "$(command -v shellcheck)" ]; then
  if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then
    echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing"
    exit 1
  fi

  # automatic local install if linux x86_64
  wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
  export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
fi

# TODO - fix warnings in .buildkite/run-amd-test.sh
find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'
23
vllm-v0.6.2/tools/utils/README.md
Normal file
@@ -0,0 +1,23 @@
### 1. Auto-tuning tool for max_num_seqs in non-paged mode

On the MLU370X8 platform, performance in unpaged mode can be improved by adjusting `max_num_seqs`. `tune_max_num_seqs.py` searches for the best `max_num_seqs` value via automatic parameter tuning.
- Usage example

Search, under a fixed configuration, for the `max_num_seqs` value that maximizes throughput; the remaining arguments are kept consistent with `benchmark_latency.py`/`benchmark_throughput.py`.
```bash
python tools/utils/tune_max_num_seqs.py --backend vllm --input-len 1024 --output-len 1024 --model /Path/to/Llama-2-70b-chat-hf/ -tp 1 --max-model-len 4096 --dtype float16 --num-prompts 10
```
Running the command above yields the optimal `max_num_seqs` value, which is then passed in as a parameter when constructing the LLM object, as in the sketch below.
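
A minimal sketch of passing the tuned value when building the engine (the model path and the value 192 below are placeholders, not output of the tool):

```python
from vllm import LLM

# Suppose the tuner reported 192 as the optimal max_num_seqs for this setup.
llm = LLM(model="/Path/to/Llama-2-70b-chat-hf/",
          tensor_parallel_size=1,
          max_model_len=4096,
          dtype="float16",
          max_num_seqs=192)
```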
### 2. vLLM scheduling analysis helper tool

First, enable scheduler profiling by setting the environment variable: `export VLLM_SCHEDULER_PROFILE=true`

For offline tests, the data is saved automatically when the test finishes, and information about the requests that have been run is printed.

For online tests, the steps to collect scheduling data are:

1. Start the server.
2. Run the client-side test.
3. As soon as the client test finishes, run `python3 tools/utils/post_scheduler_view_action.py --host [server IP address] --port [server port] --action save` to ask the server to save the data.
4. The server prints information about the requests that have been run.
5. To run the client test again against the existing server, first run `python3 tools/utils/post_scheduler_view_action.py --host [server IP address] --port [server port] --action init` to reset the server, then repeat steps 2, 3 and 4.
27
vllm-v0.6.2/tools/utils/post_scheduler_view_action.py
Normal file
@@ -0,0 +1,27 @@
"""Post a request to the server so that it inits/saves the scheduler view."""
import argparse

import requests


def post_http_request(api_url: str, action: str) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    pload = {
        "model": action,
        "prompt": "",
        "n": 1,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": True,
    }
    response = requests.post(api_url, headers=headers, json=pload, stream=True)
    return response


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=6000)
    parser.add_argument("--action", type=str, default="save", choices=['init', 'save'])
    args = parser.parse_args()
    api_url = f"http://{args.host}:{args.port}/v1/completions"

    post_http_request(api_url, f"{args.action}_scheduler_view")
181
vllm-v0.6.2/tools/utils/tune_max_num_seqs.py
Normal file
@@ -0,0 +1,181 @@
"""Autotune max_num_seqs paramter."""
|
||||
# pylint: skip-file
|
||||
import argparse
|
||||
import random
|
||||
from typing import Dict, Any
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def run_vllm(config: Dict[str, Any]) -> float:
|
||||
"""Initialize and run an instance of a language model (LLM) using the
|
||||
`vllm` library."""
|
||||
print(f'Evaluate with max_num_seqs: {config["max_num_seqs"]}')
|
||||
from vllm import LLM
|
||||
llm = LLM(**config)
|
||||
print(f'The num of gpu blocks is: {llm.llm_engine.cache_config.num_gpu_blocks}')
|
||||
return llm.llm_engine.cache_config.num_gpu_blocks
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
"""The entry function to tune max_num_seqs."""
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
config = {
|
||||
'model': args.model,
|
||||
'tokenizer': args.tokenizer,
|
||||
'quantization': args.quantization,
|
||||
'tensor_parallel_size': args.tensor_parallel_size,
|
||||
'seed': args.seed,
|
||||
'trust_remote_code': args.trust_remote_code,
|
||||
'dtype': args.dtype,
|
||||
'max_model_len': args.max_model_len,
|
||||
'enforce_eager': args.enforce_eager,
|
||||
'kv_cache_dtype': args.kv_cache_dtype,
|
||||
'quantization_param_path': args.quantization_param_path,
|
||||
'device': args.device,
|
||||
'enable_prefix_caching': args.enable_prefix_caching,
|
||||
'enable_chunked_prefill': args.enable_chunked_prefill,
|
||||
'max_num_batched_tokens': args.max_num_batched_tokens,
|
||||
'gpu_memory_utilization': args.gpu_memory_utilization,
|
||||
'download_dir': args.download_dir,
|
||||
'block_size': args.block_size
|
||||
}
|
||||
|
||||
import multiprocessing
|
||||
def worker_wrapper(config, output_queue):
|
||||
"""Here we get the num_gpu_blocks by instantiate a llm object."""
|
||||
result = run_vllm(config)
|
||||
output_queue.put(result)
|
||||
|
||||
|
||||
def get_num_gpu_blocks(cache, num_seqs) -> int:
|
||||
"""Get the number of GPU blocks with parameter num_seqs."""
|
||||
if num_seqs in cache:
|
||||
return cache[num_seqs]
|
||||
# Here since we cannot manually release the resources hold by Ray and NCCL,
|
||||
# we evaluate a set of parameters by launching a separate process.
|
||||
config['max_num_seqs'] = num_seqs
|
||||
output_queue = multiprocessing.Queue()
|
||||
process = multiprocessing.Process(target=worker_wrapper,
|
||||
args=(config, output_queue))
|
||||
process.start()
|
||||
process.join()
|
||||
result = output_queue.get()
|
||||
cache[num_seqs] = result
|
||||
return result
|
||||
|
||||
|
||||
def find_optimal_max_num_seqs(init=256) -> int:
|
||||
"""Search th optimal max_num_seqs which maximizes
|
||||
min(max_num_seqs, num_gpu_blocks)."""
|
||||
# Use cache to avoid repeated evaluations.
|
||||
cache = {}
|
||||
|
||||
# Initialization seach range.
|
||||
num_blocks = get_num_gpu_blocks(cache, init)
|
||||
left, right = min(num_blocks, init), max(num_blocks, init)
|
||||
|
||||
# Binary search.
|
||||
while 0 < left < right:
|
||||
mid = (left + right) // 2
|
||||
num_blocks = get_num_gpu_blocks(cache, mid)
|
||||
|
||||
if num_blocks == mid:
|
||||
return mid
|
||||
if num_blocks > mid:
|
||||
left = mid + 1
|
||||
else:
|
||||
right = mid - 1
|
||||
left = max(min(mid, num_blocks), left)
|
||||
right = min(max(mid, num_blocks), right)
|
||||
|
||||
left, right = max(1, left), max(1, right)
|
||||
final_left = min(left, get_num_gpu_blocks(cache, left))
|
||||
final_right = min(right, get_num_gpu_blocks(cache, right))
|
||||
return right if final_right > final_left else left
|
||||
|
||||
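
    # For example: with init=256, if get_num_gpu_blocks(256) came back as 100
    # (illustrative numbers only), the range narrows to [100, 256] and the
    # binary search converges on the point where max_num_seqs roughly equals
    # the number of available GPU blocks, maximizing min(seqs, blocks).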
    max_num_seqs = find_optimal_max_num_seqs()
    print(f'The optimal max_num_seqs is {max_num_seqs}.')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Tune max_num_seqs.")
    parser.add_argument("--backend", type=str, choices=["vllm"], default="vllm")
    parser.add_argument("--dataset", type=str, default=None,
                        help="Path to the dataset.")
    parser.add_argument("--input-len", type=int, default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len", type=int, default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--model", type=str, default="facebook/opt-125m")
    parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument('--quantization', '-q',
                        choices=['awq', 'gptq', 'squeezellm', None],
                        default=None)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n", type=int, default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument("--num-prompts", type=int, default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--hf-max-batch-size", type=int, default=None,
                        help="Maximum batch size for HF backend.")

    parser.add_argument("--block-size", type=int, default=-1)
    parser.add_argument('--trust-remote-code', action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument(
        '--max-model-len', type=int, default=None,
        help='Maximum length of a sequence (including prompt and output). '
        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype', type=str, default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    parser.add_argument('--gpu-memory-utilization', type=float, default=0.9,
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1. '
                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument("--enforce-eager", action="store_true",
                        help="enforce eager execution")
    parser.add_argument(
        "--kv-cache-dtype", type=str, choices=["auto", "fp8"], default="auto",
        help=
        'Data type for kv cache storage. If "auto", will use model data type.')
    parser.add_argument(
        '--quantization-param-path', type=str, default=None,
        help='Path to the JSON file containing the KV cache scaling factors. '
        'This should generally be supplied, when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
    parser.add_argument(
        "--device", type=str, default="cuda", choices=["cuda"],
        help='device type for vLLM execution, supporting CUDA only currently.')
    parser.add_argument(
        "--enable-prefix-caching", action='store_true',
        help="enable automatic prefix caching for vLLM backend.")
    parser.add_argument("--enable-chunked-prefill", action='store_true',
                        help="enable chunked prefill for vLLM backend.")
    parser.add_argument('--max-num-batched-tokens', type=int, default=None,
                        help='maximum number of batched tokens per '
                        'iteration')
    parser.add_argument('--download-dir', type=str, default=None,
                        help='directory to download and load the weights, '
                        'default to the default cache dir of huggingface')
    cli_args = parser.parse_args()
    if cli_args.tokenizer is None:
        cli_args.tokenizer = cli_args.model
    if cli_args.dataset is None:
        assert cli_args.input_len is not None
        assert cli_args.output_len is not None
    else:
        assert cli_args.input_len is None

    main(cli_args)