add qwen3

Author: Chranos
Date: 2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

vllm-v0.6.2/tools/actionlint.sh Executable file

@@ -0,0 +1,13 @@
#!/bin/bash
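# Prefer a system-wide actionlint if available; otherwise fall back to a local
# ./actionlint binary, and as a last resort download a pinned build (v1.7.3)
# into the current directory and run it.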
if command -v actionlint &> /dev/null; then
actionlint "$@"
exit 0
elif [ -x ./actionlint ]; then
./actionlint "$@"
exit 0
fi
# download a binary to the current directory - v1.7.3
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
./actionlint "$@"


@@ -0,0 +1,9 @@
TORCH_MLU_OPS_VERSION=1.3.2+pt25
CATCH_VERSION=1.24.1+torch2.5.0
CNCL_VERSION=1.24.1-1
CNNL_VERSION=1.28.4-1
CNNLEXTRA_VERSION=1.12.3-1
CNTOOLKIT_VERSION=3.15.7-1
MLUOPS_VERSION=1.4.1-1
TRITON_VERSION=3.0.0+mlu1.3.1
XFORMERS_VERSION=0.0.24+mlu0.5.0.pt2.5


@@ -0,0 +1,14 @@
#!/bin/bash
# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time)
if ! git diff --quiet; then
echo "Repo is dirty" >&2
exit 1
fi
if ! git describe --tags; then
echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2
exit 1
fi


@@ -0,0 +1,8 @@
export CN_NOTIFIER_POOL_MAX=1000
export CN_TASKTOPO_RESIDENT=0
export CNCL_STANDALONE_ENABLE=1
export CNCL_TWOSHOT_ENABLE=1
export CNPERF_DEBUG_DISABLE_CHILD_PROCESS=1
export PYTORCH_CNDEV_BASED_MLU_CHECK=1
export RAY_ROTATION_BACKUP_COUNT=10
export RAY_ROTATION_MAX_BYTES=102400

vllm-v0.6.2/tools/mypy.sh Executable file

@@ -0,0 +1,31 @@
#!/bin/bash
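# Usage: tools/mypy.sh [CI] [PYTHON_VERSION]
# Defaults: CI=0, PYTHON_VERSION=3.9. With CI=1 the script stops on the first
# error (set -e) and the initial run_mypy call performs the full, stricter pass.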
CI=${1:-0}
PYTHON_VERSION=${2:-3.9}
if [ "$CI" -eq 1 ]; then
set -e
fi
run_mypy() {
echo "Running mypy on $1"
if [ "$CI" -eq 1 ] && [ -z "$1" ]; then
mypy --python-version "${PYTHON_VERSION}" "$@"
return
fi
mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@"
}
run_mypy # Note that this is less strict than CI
run_mypy tests
run_mypy vllm/attention
run_mypy vllm/compilation
run_mypy vllm/distributed
run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy vllm/plugins
run_mypy vllm/prompt_adapter
run_mypy vllm/spec_decode
run_mypy vllm/worker


@@ -0,0 +1,77 @@
import argparse
import json
from typing import Dict
from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry
from vllm.profiler.utils import TablePrinter, indent_string
def flatten_entries(entry_cls, profile_dict: Dict):
entries_and_depth = []
def get_entries(node, curr_depth=0):
entries_and_depth.append((entry_cls(**node["entry"]), curr_depth))
for child in node["children"]:
get_entries(
child,
curr_depth=curr_depth + 1,
)
for root in profile_dict:
get_entries(root)
return entries_and_depth
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--json-trace",
type=str,
required=True,
help="json trace file output by "
"examples/offline_profile.py")
parser.add_argument("--phase",
type=str,
choices=["prefill", "decode_1"],
required=True,
help="The phase to print the table for.")
parser.add_argument("--table",
type=str,
choices=["summary", "model"],
default="summary",
help="Which table to print, the summary table or the "
"layerwise model table")
args = parser.parse_args()
with open(args.json_trace) as f:
profile_data = json.load(f)
if args.table == "summary":
entries_and_depths = flatten_entries(
SummaryStatsEntry, profile_data[args.phase]["summary_stats"])
column_widths = dict(name=80,
cuda_time_us=12,
pct_cuda_time=12,
invocations=15)
elif args.table == "model":
entries_and_depths = flatten_entries(
ModelStatsEntry, profile_data[args.phase]["model_stats"])
column_widths = dict(name=60,
cpu_time_us=12,
cuda_time_us=12,
pct_cuda_time=12,
trace=60)
# indent entry names based on the depth
entries = []
for entry, depth in entries_and_depths:
entry.name = indent_string(
entry.name,
indent=depth,
indent_style=lambda indent: "|" + "-" * indent + " ")
entries.append(entry)
TablePrinter(type(entries[0]), column_widths).print_table(entries)
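
For reference, a minimal invocation sketch of the table printer above; the on-disk script name is assumed, since it is not shown here:

python print_layerwise_table.py \
    --json-trace profile.json \
    --phase prefill \
    --table summary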


@@ -0,0 +1,522 @@
import argparse
import copy
import json
import math
import os
from pathlib import Path
from typing import Any, List, Optional, Tuple
import matplotlib.pyplot as plt
import pandas as pd
## JSON parsing utils ####
def largest_dist_from_leaf(node: dict, depth: int = 0):
if len(node["children"]) == 0:
return depth
return max([
largest_dist_from_leaf(child, depth=depth + 1)
for child in node["children"]
])
def get_entries_at_depth(depth: int,
entries_and_traces: List[Tuple[Any, Any]],
node: dict,
curr_depth: int = 0,
trace=()):
# assert that the query is at kernel or module level
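# depth is measured from the leaves of the JSON tree: -1 collects the leaf
# entries (kernel level), -2 collects the nodes one level above the leaves
# (module level), matching the --level handling in main().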
assert depth == -1 or depth == -2
if curr_depth == 0 and largest_dist_from_leaf(node) <= (abs(depth) - 1):
# The tree is not tall enough!
entries_and_traces.append((node["entry"], trace))
return
if largest_dist_from_leaf(node) == (abs(depth) - 1):
entries_and_traces.append((node["entry"], trace))
trace = (node["entry"]["name"], ) + trace
for child in node["children"]:
get_entries_at_depth(depth,
entries_and_traces,
child,
curr_depth=curr_depth + 1,
trace=trace)
def fold_nodes(root: dict, nodes_to_fold: List[str]):
stack: List[dict] = [root]
while len(stack) != 0:
node = stack.pop()
if node['entry']['name'] in nodes_to_fold:
node["children"] = []
continue
for child in node["children"]:
stack.append(child)
return root
## Operation name cleanup utils ####
def trim_string_back(string: str, width: int) -> str:
if len(string) > width:
offset = len(string) - width + 3
string = string[:-offset]
if len(string) > 3:
string = string + "..."
return string
def shorten_plot_legend_strings(legend, max_char_len: int):
for t in legend.get_texts():
t.set_text(
trim_string_back(abbreviate_known_names(t.get_text()),
max_char_len))
def abbreviate_known_names(name: str) -> str:
abbreviations = {
"MergedColumnParallelLinear": "MCPLinear",
"QKVParallelLinear": "QKVPLinear",
"RowParallelLinear": "RPLinear",
"weight=": "w=",
"bfloat16": "bf16",
"float16": "f16",
}
for key, value in abbreviations.items():
name = name.replace(key, value)
return name
def attempt_to_make_names_unique(entries_and_traces):
names, non_unique_names = (set(), set())
def all_the_same(items) -> bool:
return all(i == items[0] for i in items)
for entry, _ in entries_and_traces:
if entry["name"] in names:
non_unique_names.add(entry["name"])
else:
names.add(entry["name"])
for name in non_unique_names:
entries_and_traces_with_name = [(entry, trace)
for entry, trace in entries_and_traces
if entry["name"] == name]
zipped_traces = list(
zip(*[trace for _, trace in entries_and_traces_with_name]))
first_trace_difference = next(
(i for i, trace_eles in enumerate(zipped_traces)
if not all_the_same(trace_eles)), None)
if first_trace_difference is None:
# can't create a unique name; leave the names as they
# are, they will get aggregated by the pivot_table call
continue
for entry, trace in entries_and_traces_with_name:
entry["name"] = " <- ".join((entry["name"], ) +
trace[:first_trace_difference + 1])
## Operation grouping utils ####
'''
Group operations in the given dataframe by some high-level ops like,
- gemms
- attention
- rms_norm
etc.
'''
def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:
def is_rms_norm(op_name: str):
if "rms_norm_kernel" in op_name:
return True
def is_attention_block(op_name: str):
if "flash_fwd" in op_name or \
"reshape_and_cache_flash_kernel" in op_name:
return True
def is_quant(op_name: str):
if "scaled_fp8_quant" in op_name or \
"scaled_int8_quant" in op_name:
return True
def is_gemm_op(op_name: str):
if is_quant(op_name):
return False
if "xmma_gemm" in op_name or \
"gemv2T_kernel" in op_name or \
"splitKreduce" in op_name or \
"void cutlass::Kernel" in op_name or \
"void cutlass::device_kernel" in op_name or \
"s16816gemm" in op_name:
return True
def is_elementwise_op(op_name: str):
return "elementwise_kernel" in op_name
def is_mem_op(op_name: str):
return "memcpy" in op_name.lower() or \
"memset" in op_name.lower()
def is_vocab_embedding_op(op_name: str):
return "vocabparallelembed" in op_name.lower()
# nccl ops
def is_nccl_op(op_name: str):
return "nccl" in op_name.lower()
def is_nccl_all_reduce(op_name: str):
return is_nccl_op(op_name) and \
("all_reduce" in op_name.lower() or \
"allreduce" in op_name.lower())
def is_nccl_gather(op_name: str):
return is_nccl_op(op_name) and \
"gather" in op_name.lower()
def is_nccl_broadcast(op_name: str):
return is_nccl_op(op_name) and \
"broadcast" in op_name.lower()
# Reduce ops types
def is_cross_device_reduce_1stage(op_name: str):
return "cross_device_reduce_1stage" in op_name
def is_cross_device_reduce_2stage(op_name: str):
return "cross_device_reduce_2stage" in op_name
def is_custom_ar_all_reduce(op_name: str):
return "_C_custom_ar::all_reduce" in op_name
def is_reduce_kernel(op_name: str):
return "reduce_kernel" in op_name
headers = list(trace_df)
ops = copy.deepcopy(headers)
attention_ops = list(filter(lambda x: is_attention_block(x), ops))
ops = list(filter(lambda x: x not in attention_ops, ops))
quant_ops = list(filter(lambda x: is_quant(x), ops))
ops = list(filter(lambda x: x not in quant_ops, ops))
gemm_ops = list(filter(lambda x: is_gemm_op(x), ops))
ops = list(filter(lambda x: x not in gemm_ops, ops))
rms_norm_ops = list(filter(lambda x: is_rms_norm(x), ops))
ops = list(filter(lambda x: x not in rms_norm_ops, ops))
vocab_embed_ops = list(filter(lambda x: is_vocab_embedding_op(x), ops))
ops = list(filter(lambda x: x not in vocab_embed_ops, ops))
mem_ops = list(filter(lambda x: is_mem_op(x), ops))
ops = list(filter(lambda x: x not in mem_ops, ops))
elementwise_ops = list(filter(lambda x: is_elementwise_op(x), ops))
ops = list(filter(lambda x: x not in elementwise_ops, ops))
nccl_all_reduce_ops = list(filter(lambda x: is_nccl_all_reduce(x), ops))
ops = list(filter(lambda x: x not in nccl_all_reduce_ops, ops))
nccl_gather_ops = list(filter(lambda x: is_nccl_gather(x), ops))
ops = list(filter(lambda x: x not in nccl_gather_ops, ops))
nccl_broadcast_ops = list(filter(lambda x: is_nccl_broadcast(x), ops))
ops = list(filter(lambda x: x not in nccl_broadcast_ops, ops))
nccl_other_ops = list(filter(lambda x: is_nccl_op(x), ops))
ops = list(filter(lambda x: x not in nccl_other_ops, ops))
cross_device_reduce_1stage_ops = list(
filter(lambda x: is_cross_device_reduce_1stage(x), ops))
ops = list(filter(lambda x: x not in cross_device_reduce_1stage_ops, ops))
cross_device_reduce_2stage_ops = list(
filter(lambda x: is_cross_device_reduce_2stage(x), ops))
ops = list(filter(lambda x: x not in cross_device_reduce_2stage_ops, ops))
custom_ar_all_reduce_ops = list(
filter(lambda x: is_custom_ar_all_reduce(x), ops))
ops = list(filter(lambda x: x not in custom_ar_all_reduce_ops, ops))
reduce_kernel_ops = list(filter(lambda x: is_reduce_kernel(x), ops))
ops = list(filter(lambda x: x not in reduce_kernel_ops, ops))
if len(attention_ops):
trace_df['attention'] = trace_df[attention_ops].agg("sum", axis=1)
if len(quant_ops):
trace_df['quant_ops'] = trace_df[quant_ops].agg("sum", axis=1)
if len(gemm_ops):
trace_df['gemm_ops'] = trace_df[gemm_ops].agg("sum", axis=1)
if len(rms_norm_ops):
trace_df['rms_norm_ops'] = trace_df[rms_norm_ops].agg("sum", axis=1)
if len(vocab_embed_ops):
trace_df['vocab_embed_ops'] = trace_df[vocab_embed_ops].agg("sum",
axis=1)
if len(mem_ops):
trace_df['mem_ops'] = trace_df[mem_ops].agg("sum", axis=1)
if len(elementwise_ops):
trace_df['elementwise_ops'] = trace_df[elementwise_ops].agg("sum",
axis=1)
if len(nccl_all_reduce_ops):
trace_df['nccl_all_reduce_ops'] = trace_df[nccl_all_reduce_ops].agg(
"sum", axis=1)
if len(nccl_gather_ops):
trace_df['nccl_gather_ops'] = trace_df[nccl_gather_ops].agg("sum",
axis=1)
if len(nccl_broadcast_ops):
trace_df['nccl_broadcast_ops'] = trace_df[nccl_broadcast_ops].agg(
"sum", axis=1)
if len(nccl_other_ops):
trace_df['nccl_other_ops'] = trace_df[nccl_other_ops].agg("sum",
axis=1)
if len(cross_device_reduce_1stage_ops):
trace_df['cross_device_reduce_1stage_ops'] = trace_df[
cross_device_reduce_1stage_ops].agg("sum", axis=1)
if len(cross_device_reduce_2stage_ops):
trace_df['cross_device_reduce_2stage_ops'] = trace_df[
cross_device_reduce_2stage_ops].agg("sum", axis=1)
if len(custom_ar_all_reduce_ops):
trace_df['custom_ar_all_reduce_ops'] = trace_df[
custom_ar_all_reduce_ops].agg("sum", axis=1)
if len(reduce_kernel_ops):
trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg("sum",
axis=1)
trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops +
vocab_embed_ops + mem_ops + elementwise_ops +
nccl_all_reduce_ops + nccl_gather_ops + nccl_broadcast_ops +
nccl_other_ops + cross_device_reduce_1stage_ops +
cross_device_reduce_2stage_ops + custom_ar_all_reduce_ops +
reduce_kernel_ops,
axis=1,
inplace=True)
return trace_df
## Data plotting utils ####
def plot_trace_df(traces_df: pd.DataFrame,
plot_metric: str,
plot_title: str,
output: Optional[Path] = None):
phases = traces_df['phase'].unique()
traces_df = traces_df.pivot_table(index="phase",
columns="name",
values=plot_metric,
aggfunc="sum")
traces_df = group_trace_by_operations(traces_df)
# Make the figure
fig, ax = plt.subplots(1, figsize=(5, 8), sharex=True)
# Draw the stacked bars
ops = list(traces_df)
bottom = [0] * len(phases)
for op in ops:
values = [traces_df[op][phase] for phase in phases]
values = list(map(lambda x: 0.0 if math.isnan(x) else x, values))
ax.bar(phases, values, label=op, bottom=bottom)
bottom = [bottom[j] + values[j] for j in range(len(phases))]
# Write the values as text on the bars
for bar in ax.patches:
if bar.get_height() != 0:
ax.text(bar.get_x() + bar.get_width() / 2,
bar.get_height() / 2 + bar.get_y(),
f"{round(bar.get_height(), 2)}",
ha='center',
color='w',
weight='bold',
size=5)
# Setup legend
handles, labels = plt.gca().get_legend_handles_labels()
legend = fig.legend(handles,
labels,
loc='center left',
bbox_to_anchor=(1, 1))
shorten_plot_legend_strings(legend, 50)
# Setup labels and title
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_ylabel(plot_metric)
plt.suptitle(plot_title)
plt.savefig(output, bbox_inches='tight')
print("Created: ", output)
def main(
json_trace: Path,
output_directory: Path,
depth: int, # Fetch/Plot operations at this depth of the Json tree
plot_metric: str,
make_names_unique: bool,
top_k: int,
json_nodes_to_fold: List[str]):
def prepare_data(profile_json: dict, step_keys: List[str]) -> pd.DataFrame:
def get_entries_and_traces(key: str):
entries_and_traces: List[Tuple[Any, Any]] = []
for root in profile_json[key]["summary_stats"]:
# Fold nodes in the traces as per user request. i.e. simply
# make the requested nodes leaf-nodes.
root = fold_nodes(root, json_nodes_to_fold)
get_entries_at_depth(depth, entries_and_traces, root)
return entries_and_traces
def keep_only_top_entries(df: pd.DataFrame,
metric: str,
top_k: int = 9) -> pd.DataFrame:
df.loc[df.nsmallest(len(df) - top_k + 1, metric).index,
["name"]] = "others"
return df
# Get data for each key
traces = list(map(lambda x: get_entries_and_traces(x), step_keys))
# Attempt some cleanup
if make_names_unique:
for trace in traces:
attempt_to_make_names_unique(trace)
# To pandas dataframe
trace_dfs = list(
map(lambda t: pd.DataFrame([entry for entry, _ in t]).fillna(0),
traces))
# Respect top_k
if top_k:
trace_dfs = list(
map(
lambda trace_df: keep_only_top_entries(
trace_df, "cuda_time_us", top_k), trace_dfs))
# Fill in information about the step-keys
for trace_df, step_key in zip(trace_dfs, step_keys):
trace_df['phase'] = step_key
# Combine all data frames so they can be put in a single plot
traces_df = pd.concat(trace_dfs)
# Add a derived metric `cuda_time_ms`
traces_df["cuda_time_ms"] = traces_df["cuda_time_us"] / 1000
traces_df = traces_df.fillna(0)
return traces_df
def make_plot_title_suffix(profile_json: dict) -> str:
context = profile_json["context"]
sparsity = context.get('sparsity', None)
return (f"{context['model']}\n"
f"Batch={context['batch_size']}, "
f"PromptLen={context['prompt_len']}, "
f"OutputLen={context['output_len']},"
f"NumGpus={context['tensor_parallel_size']}"
f"{', Sparsity ' + sparsity if sparsity else ''}")
profile_json = None
with open(json_trace) as f:
profile_json = json.load(f)
assert profile_json is not None
# Get all `llm.generate.step()` profile
step_traces = list(profile_json.keys())
assert (step_traces[0] == 'context')
step_traces = step_traces[1:] # have only prefill and decodes
prefills = list(filter(lambda x: "prefill" in x, step_traces))
all_decodes = list(filter(lambda x: "decode" in x, step_traces))
assert len(prefills) + len(all_decodes) == len(step_traces)
assert len(prefills) == 1
decodes = all_decodes[::args.step_plot_interval]
if decodes[-1] != all_decodes[-1]:
# Always have the last decode
decodes.append(all_decodes[-1])
prefill_traces = prepare_data(profile_json, prefills)
decode_traces = prepare_data(profile_json, decodes)
plot_title_suffix = make_plot_title_suffix(profile_json)
plot_trace_df(prefill_traces, plot_metric, "prefill " + plot_title_suffix,
output_directory / Path("prefill.png"))
plot_trace_df(decode_traces, plot_metric, "decodes " + plot_title_suffix,
output_directory / Path("decode_steps.png"))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--json-trace",
type=str,
required=True,
help="json trace file output by examples/offline_profile.py")
parser.add_argument("--output-directory",
type=str,
required=False,
help="Directory to output plots")
parser.add_argument("--level",
type=str,
default="module",
choices=["module", "kernel"])
parser.add_argument("--top-k",
type=int,
default=12,
help="Only graph the top `top_k` entries by time.")
parser.add_argument("--fold-json-node",
nargs='+',
default=['Sampler', 'LogitsProcessor'],
help='Do not plot the children of these nodes; let \
each node represent the aggregate of all its \
children')
parser.add_argument("--plot-metric",
type=str,
default="cuda_time_ms",
help='Metric to plot. Some options are cuda_time_ms, \
pct_cuda_time')
parser.add_argument(
"--step-plot-interval",
type=int,
default=4,
help="For every `step_plot_interval` steps, plot 1 step")
args = parser.parse_args()
# Prepare/Extract relevant args
make_names_unique = False
if args.level == "module":
depth = -2
make_names_unique = True
elif args.level == "kernel":
depth = -1
else:
raise Exception(f"Unexpected level value ({args.level})")
output_directory = args.output_directory if args.output_directory else Path(
args.json_trace).parent
if not os.path.exists(output_directory):
os.makedirs(output_directory)
main(Path(args.json_trace), output_directory, depth, args.plot_metric,
make_names_unique, args.top_k, args.fold_json_node)
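
For reference, a hypothetical invocation of the visualization script above (the script filename is an assumption, since it is not shown here); it would write prefill.png and decode_steps.png under the output directory:

python visualize_layerwise_profile.py \
    --json-trace profile.json \
    --output-directory ./plots \
    --level module \
    --top-k 10 \
    --plot-metric cuda_time_ms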


@@ -0,0 +1,419 @@
import argparse
import os
import sys
import time
import safetensors
import logging
import json
from huggingface_hub import split_torch_state_dict_into_shards, constants
from vllm import LLM
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.config import _get_and_verify_max_len
import transformers
from transformers.modeling_utils import SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from smooth_quant import generate_weights_of_smoothquant
from weight_only import generate_weights_of_weight_only
from utils_internal import (read_model_name, load_tokenizer, torch_dtype_to_str, str_dtype_to_torch,
copy_files_except_extensions, generate_datetime, get_hf_config_sliding_window)
from utils_internal import get_skip_patterns, should_skip
from model_special import smooth_model_config
from vllm.engine.arg_utils import EngineArgs
sys.path.append(os.getcwd())
logger = logging.getLogger("smooth_convert")
def load_skip_params_from_hf(args):
'''
load parameters from transformers that do not need to be quantized.
'''
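# Parameters matching the model's skip_patterns (see model_special.py, e.g. the
# qwen2_vl visual tower or the deepseek_v2 kv_b_proj) are loaded from the HF
# checkpoint unchanged and later copied verbatim into the quantized state dict.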
model_type = args.model_type
if not get_skip_patterns(model_type):
return {}
try:
model = getattr(transformers, args.model_name, None)
if model is None:
model = AutoModelForCausalLM
model = model.from_pretrained(
args.hf_model_dir,
trust_remote_code=True,
torch_dtype=args.torch_dtype,
device_map="cpu")
except Exception as e:
logger.fatal(f"Unsupported model {args.model_name}, error message: {e}")
sys.exit(1)
params_map = {}
hf_params = dict(model.named_parameters())
for name, param in hf_params.items():
if should_skip(model_type, name):
logger.info(f"load parameters from transformers, name: {name}")
params_map[name] = param
return params_map
def save_quantized_weights_to_safetensors(quantized_weights, args):
'''
save quantized_weights to safetensors format
'''
# Store the state_dict to file.
max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
state_dict_split = split_torch_state_dict_into_shards(quantized_weights,
filename_pattern=constants.SAFETENSORS_WEIGHTS_FILE_PATTERN,
max_shard_size=max_shard_size)
# Save the model
for shard_name, tensors in state_dict_split.filename_to_tensors.items():
shard = {tensor: quantized_weights[tensor] for tensor in tensors}
safetensors.torch.save_file(shard, os.path.join(args.output_dir, shard_name), metadata={"format": "pt"})
if state_dict_split.is_sharded:
index = {
"metadata": state_dict_split.metadata,
"weight_map": state_dict_split.tensor_to_filename,
}
save_index_file = os.path.join(args.output_dir, SAFE_WEIGHTS_INDEX_NAME)
with open(save_index_file, "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
f.write(content)
logger.info(
f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be "
f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where "
f"each parameters has been saved in the index located at {save_index_file}."
)
else:
logger.info(f"Model weights saved in {os.path.join(args.output_dir, SAFE_WEIGHTS_NAME)}")
def main(args):
'''
main quantization logic
'''
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=args.log_level,
force=True,
)
tik = time.time()
skip_params = load_skip_params_from_hf(args)
# Create an LLM.
max_model_len = max(args.max_input_length + args.output_len, 2048)
args.max_model_len = min(max_model_len, args.hf_max_model_len)
max_num_batched_tokens = max(max(args.max_input_length * args.batch_size, max_model_len), 2048)
args.max_num_batched_tokens = min(max_num_batched_tokens, args.hf_max_model_len)
llm = LLM(model=args.hf_model_dir,
tokenizer=args.tokenizer_dir,
tensor_parallel_size=args.tp_size,
distributed_executor_backend='ray',
dtype=args.dtype,
enforce_eager=args.enforce_eager,
trust_remote_code=True,
block_size=args.block_size,
max_model_len=args.max_model_len,
max_num_batched_tokens=args.max_num_batched_tokens,
max_num_seqs=args.max_num_seqs,
cpu_offload_gb=args.cpu_offload_gb)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
logger.info(f'Load vLLM model takes: {t}')
quantize_config = {}
if args.use_weight_only:
st_prefix = f"weight_{args.weight_only_precision}"
quantized_weights = generate_weights_of_weight_only(llm, args)
quantize_config['bits'] = 8 if args.weight_only_precision == "int8" else 4
quantize_config['quant_method'] = "weightonly"
quantize_config['quant_mode'] = "WeightOnly"
if args.use_smoothquant:
st_prefix = f"smoothquant_{args.smooth_value}"
quantized_weights, smooth_info = generate_weights_of_smoothquant(llm, args)
quantize_config['bits'] = 8
quantize_config['quant_method'] = "smoothquant"
quantize_config['quant_mode'] = "SmoothQuant"
quantize_config['input_quant_method'] = "per_token" if args.per_token else "per_tensor"
quantize_config['smooth_value'] = args.smooth_value
with open(os.path.join(args.output_dir, 'smooth_info.json'), 'w') as f:
json.dump(smooth_info, f, indent=4)
# Should first copy other files from hf_model_dir, and then save weight, tokenizer, config, quant_config and so on
extensions = ['.bin', '.safetensors', ".pt", ".index.json"]
copy_files_except_extensions(args.hf_model_dir, args.output_dir, extensions)
logger.info('copied files (except excluded extensions) successfully')
for name, param in skip_params.items():
assert name in quantized_weights
quantized_weights[name] = param
save_quantized_weights_to_safetensors(quantized_weights, args)
logger.info('saved quantized_weights to safetensors successfully')
with open(os.path.join(args.output_dir, 'quantize_config.json'), 'w') as f:
json.dump(quantize_config, f, indent=4)
from transformers.utils import CONFIG_NAME
with open(os.path.join(args.hf_model_dir, CONFIG_NAME), 'r') as f:
config = json.load(f)
config['quantization_config'] = quantize_config
config['generate_datetime'] = generate_datetime()
config['torch_dtype'] = args.dtype
with open(os.path.join(args.output_dir, CONFIG_NAME), 'w') as f:
json.dump(config, f, indent=4)
logger.info(f'quantized {args.hf_model_dir} finished')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--hf_model_dir', type=str, default=None)
parser.add_argument('--tokenizer_dir',
default=None,
help='tokenizer path; defaults to hf_model_dir if left unspecified')
parser.add_argument(
'--enforce_eager',
action="store_true",
default=True,
help='Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model '
'in eager mode. If False, we will use CUDA graph and eager execution in hybrid.')
parser.add_argument('--dtype',
type=str,
choices=['auto', 'float32', 'float16', 'bfloat16'],
default='auto',
help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
parser.add_argument('--scales_smooth_dtype',
type=str,
choices=['auto', 'float32', 'float16', 'bfloat16'],
default='auto',
help="if auto, scales and smooth weights use args.dtype, else use the setted dtype")
parser.add_argument(
'--eval_task',
type=str,
default='summarize',
choices=['summarize', 'summarize_long', 'code_completion', 'summarize_hg', 'text_generation', 'custom'],
help='''eval task to decide which dataset is selected. When set to custom, you must set these options
dataset_name, dataset_revision, dataset_input_key, dataset_split to specify which dataset to use''')
parser.add_argument("--dataset_cache_dir",
type=str,
default=None,
help="cache dir to load the hugging face dataset")
parser.add_argument("--dataset_name", type=str, default=None, help="custom dataset name")
parser.add_argument("--dataset_revision", type=str, default=None, help="custom dataset version")
parser.add_argument("--dataset_input_key", type=str, default=None, help="custom dataset field")
parser.add_argument("--dataset_split", type=str, default=None, help="custom dataset split")
parser.add_argument('--log_level', type=int, default=logging.INFO)
parser.add_argument('--num_samples', type=int, default=512, help='number of prompt samples')
parser.add_argument('--output_len',
type=int,
default=100,
help="Number of output sequences to return for the given prompt")
parser.add_argument('--max_input_length',
type=int,
default=512,
help='max input length of the prompt')
parser.add_argument('--block_size', type=int, default=-1, help='Token block size for contiguous chunks of tokens.')
parser.add_argument('--temperature', type=float, default=1.0)
parser.add_argument('--top_p', type=float, default=1.0)
parser.add_argument('--top_k', type=int, default=-1)
parser.add_argument('--repetition_penalty', type=float, default=1.0)
parser.add_argument('--max_num_seqs',
type=int,
default=EngineArgs.max_num_seqs,
help='Maximum number of sequences per iteration.')
parser.add_argument('--output_dir',
type=str,
default="output_dir",
help="The path to save the quantized checkpoint")
parser.add_argument(
"--max_shard_size",
type=str,
default="10GB",
help=("The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
"lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`)"),
)
parser.add_argument('--tp_size', type=int, default=1, help='N-way tensor parallelism size')
parser.add_argument('--pp_size', type=int, default=1, help='N-way pipeline parallelism size; currently only 1 is supported')
parser.add_argument('--use_smoothquant',
default=False,
action="store_true",
help='Apply smoothquant to generate weight')
parser.add_argument("--smooth_value",
type=float,
default=0.5,
help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
" to Smoothquant the model, and output int8 weights."
" A good first try is 0.5. Must be in [0, 1]")
parser.add_argument('--per_channel',
action="store_true",
default=False,
help='By default, we use a single static scaling factor for the GEMM\'s result. '
'per_channel instead uses a different static scaling factor for each channel. '
'The latter is usually more accurate, but a little slower.')
parser.add_argument(
'--per_token',
action="store_true",
default=False,
help='By default, we use a single static scaling factor to scale activations in the int8 range. '
'per_token chooses at run time, and for each token, a custom scaling factor. '
'The latter is usually more accurate, but a little slower.')
parser.add_argument('--use_weight_only',
default=False,
action="store_true",
help='Quantize weights for the various GEMMs to INT4/INT8. '
'See --weight_only_precision to set the precision')
parser.add_argument('--weight_only_precision',
const='int8',
type=str,
nargs='?',
default='int8',
choices=['int8', 'int4'],
help='Define the precision for the weights when using weight-only quantization. '
'You must also use --use_weight_only for that argument to have an impact.')
parser.add_argument(
'--has_qzeros',
action="store_true",
default=False,
help='whether to add qzeros weight to vllm_mlu weight',
)
parser.add_argument('--model_version',
type=str,
default=None,
help="Set model version to replace parsing from _name_or_path in hf config.")
parser.add_argument('--model_type',
type=str,
default=None,
help="Set model type to replace parsing from model_type in hf config."
"if set is None and parsed also None, then set as model_version")
parser.add_argument('--no_add_special_tokens',
dest='add_special_tokens',
default=True,
action='store_false',
help="Whether or not to add special tokens")
parser.add_argument(
'--has_prompt_token_id',
action="store_true",
default=False,
help='whether to give llm.generate prompt_token_id',
)
parser.add_argument(
'--disable_fused_quantize_expert',
action="store_true",
default=False,
help='''disable fused activation quantization, for unfused moe usage.
For fused_moe smoothquant, input_smooth has shape (hidden_size), act_smooth has shape (inner_size),
and not every expert is necessarily routed to, so by default we assume all experts use the same act_smooth.
You can use this option to disable that assumption.'''
)
parser.add_argument('--prompt_file',
type=str,
default=None,
help="custom prompt file, should has format that each line is one string prompt,"
"you can refer the format of summarize_1024_prompts.csv")
parser.add_argument(
'--batch_size',
type=int,
default=-1,
help="batch size, used to limit max_num_batched_tokens, -1 means batch_size equals to num_samples"
)
parser.add_argument(
'--cpu_offload_gb',
type=float,
default=0.0,
help='''The size (GiB) of CPU memory to use for offloading the model weights.
This virtually increases the GPU memory space you can use to hold the model weights,
at the cost of CPU-GPU data transfer for every forward pass.'''
)
parser.add_argument(
'--dump_prompt_token_ids',
action="store_true",
default=False,
help='dump prompt_token_ids used by llm.generate ',
)
parser.add_argument(
'--dump_input_ids',
action="store_true",
default=False,
help='dump the token ids seen by the vllm qkv projection while the llm is running',
)
parser.add_argument(
'--dump_act_range',
action="store_true",
default=False,
help='dump act range, which is the max hidden-dim value of input, output, and weight',
)
parser.add_argument(
'--dump_weights',
action="store_true",
default=False,
help='dump weights of the converted model',
)
parser.add_argument(
'--dump_generate_weights',
action="store_true",
default=False,
help='dump generate weights of the converted model',
)
args = parser.parse_args()
assert args.hf_model_dir, "Please set the model directory via --hf_model_dir"
assert args.pp_size == 1, "Pipeline parallelism is not supported."
if args.tokenizer_dir is None:
args.tokenizer_dir = args.hf_model_dir
if args.has_prompt_token_id is False:
args.dump_prompt_token_ids = False
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
args.hf_model_dir, args.model_version, args.model_type)
assert args.model_type in smooth_model_config, f'''{args.model_type} is not supported,
please add its information in model_special.py yourself'''
args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
hf_text_config = get_hf_text_config(args.hf_config)
args.tie_word_embeddings = getattr(hf_text_config, "tie_word_embeddings", False)
sliding_window_len = get_hf_config_sliding_window(hf_text_config)
disable_sliding_window = sliding_window_len is None
if args.model_type == 'qwen2_vl':
# workaround for qwen2_vl since _get_and_verify_max_len does not support MRoPE;
# remove this when it is supported.
args.hf_max_model_len = 32768
else:
if args.model_type == 'hunyuan' or args.model_type == 'deepseek_v2':
disable_sliding_window=False
args.hf_max_model_len = _get_and_verify_max_len(hf_text_config, None, disable_sliding_window, sliding_window_len)
if args.batch_size < 1:
args.batch_size = args.num_samples
args.batch_size = min(args.batch_size, args.num_samples)
if args.dtype == "auto":
args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
if args.scales_smooth_dtype == "auto":
args.scales_smooth_dtype = args.dtype
args.torch_dtype = str_dtype_to_torch(args.dtype)
args.torch_scales_smooth_dtype = str_dtype_to_torch(args.scales_smooth_dtype)
args.hf_config.torch_dtype = args.torch_dtype
args.tokenizer, args.pad_id, args.end_id = load_tokenizer(
tokenizer_dir=args.tokenizer_dir,
model_name=args.model_name,
model_version=args.model_version,
)
tik = time.time()
main(args)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
logger.info(f'Total time of converting checkpoints: {t}')
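
As a rough usage sketch only (the script filename is assumed from the logger name above, and the paths are placeholders):

python smooth_convert.py \
    --hf_model_dir /path/to/hf_model \
    --output_dir ./quantized_model \
    --use_smoothquant --smooth_value 0.5 \
    --dtype float16 --tp_size 1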


@@ -0,0 +1,69 @@
import os
import argparse
from transformers import (AutoModel, AutoModelForCausalLM,
AutoModelForSeq2SeqLM, GenerationConfig)
from vllm.transformers_utils.config import get_config
from utils_internal import (read_model_name, torch_dtype_to_str, str_dtype_to_torch)
from dump_smooth import save_weights
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--hf_model_dir', type=str, default=None)
parser.add_argument('--output_dir',
type=str,
default="output_dir",
help="The path to save the quantized checkpoint")
parser.add_argument('--model_version',
type=str,
default=None,
help="Set model version to replace parsing from _name_or_path in hf config.")
parser.add_argument('--model_type',
type=str,
default=None,
help="Set model type to replace parsing from model_type in hf config."
"if set is None and parsed also None, then set as model_version")
parser.add_argument('--dtype',
type=str,
choices=['auto', 'float32', 'float16', 'bfloat16'],
default='auto',
help="if auto, use unquantized weight torch_dtype in config.json, else use setted dtype")
parser.add_argument(
'--dump_weights',
action="store_true",
default=True,
help='dump weights of the converted model',
)
args = parser.parse_args()
assert args.hf_model_dir, "Please set the model directory via --hf_model_dir"
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
args.model_name, args.model_version, args.model_family, args.model_type = read_model_name(
args.hf_model_dir, args.model_version, args.model_type)
args.hf_config = get_config(args.hf_model_dir, trust_remote_code=True)
if args.dtype == "auto":
args.dtype = torch_dtype_to_str(args.hf_config.torch_dtype)
args.torch_dtype = str_dtype_to_torch(args.dtype)
args.hf_config.torch_dtype = args.torch_dtype
if args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'glm':
auto_model_cls = AutoModelForSeq2SeqLM
elif args.model_name == 'ChatGLMForCausalLM' and args.model_version == 'chatglm':
auto_model_cls = AutoModel
else:
auto_model_cls = AutoModelForCausalLM
model = auto_model_cls.from_pretrained(
args.hf_model_dir,
trust_remote_code=True,
torch_dtype=args.torch_dtype)
named_parameters = dict(model.named_parameters())
save_weights(named_parameters, args)


@@ -0,0 +1,145 @@
import torch
import os
import logging
logger = logging.getLogger(__name__)
def tensor_shape_to_string(tensor):
'''
convert a tensor shape to string description
'''
int_list = list(tensor.shape)
str_list = [str(num) for num in int_list]
str_shape = "x".join(str_list)
return str_shape
def save_prompt_token_ids(prompt_input_ids, args):
'''
save prompt_token_id
Args:
prompt_input_ids: prompt input ids passed to llm.generate
args: arguments from main
'''
if args.dump_prompt_token_ids is not True:
return
output_dir = os.path.join(args.output_dir, "prompt_input_ids")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
data_len = len(prompt_input_ids)
for data_index in range(data_len):
tensor = prompt_input_ids[data_index]
str_shape = tensor_shape_to_string(tensor)
file_path = os.path.join(output_dir, f"prompt_input_ids_{data_index}_{str_shape}.pt")
torch.save(tensor, file_path)
logger.info(f"Saved input_ids[{data_index}] to {file_path}")
def save_input_ids(input_ids, args):
'''
save input_ids
Args:
input_ids: input ids captured at the layer-0 qkv projection while the model is running
args: arguments from main
'''
id_len = len(input_ids)
if args.dump_input_ids is not True or id_len == 0:
return
output_dir = os.path.join(args.output_dir, "input_ids")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for data_index in range(id_len):
tensor = input_ids[data_index]
str_shape = tensor_shape_to_string(tensor)
file_path = os.path.join(output_dir, f"input_ids_{data_index}_{str_shape}.pt")
torch.save(tensor, file_path)
logger.info(f"Saved input_ids[{data_index}] to {file_path}")
def save_act_range(act_range, args):
'''
save act_range
Args:
act_range: save act_range collected when model running
args: arguments from main
'''
if args.dump_act_range is not True:
return
output_dir = os.path.join(args.output_dir, "act_range")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for layer_name, layer_scale in act_range.items():
for tensor_key, tensor_value in layer_scale.items():
if isinstance(tensor_value, torch.Tensor):
str_shape = tensor_shape_to_string(tensor_value)
file_name = f'{layer_name}_{tensor_key}_{str_shape}.pt'
file_path = os.path.join(output_dir, file_name)
torch.save(tensor_value, file_path)
logger.info(f"Saved act_range[{layer_name}][{tensor_key}] to {file_path}")
def save_weights(weights, args):
'''
save hugging face weights
Args:
weights: hugging face weights merged with llm model named parameters
args: arguments from main
'''
if args.dump_weights is not True:
return
output_dir = os.path.join(args.output_dir, "weights")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for tensor_key, tensor_value in weights.items():
str_shape = tensor_shape_to_string(tensor_value)
file_name = f'{tensor_key}_{str_shape}.pt'
file_path = os.path.join(output_dir, file_name)
torch.save(tensor_value, file_path)
logger.info(f"Saved weights[{tensor_key}] to {file_path}")
def save_generate_weights(weights, args):
'''
save quantized weights
Args:
weights: quantized weights of smoothquant or weightonly
args: arguments from main
'''
if args.dump_generate_weights is not True:
return
output_dir = os.path.join(args.output_dir, "generate_weights")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for tensor_key, tensor_value in weights.items():
str_shape = tensor_shape_to_string(tensor_value)
file_name = f'{tensor_key}_{str_shape}.pt'
file_path = os.path.join(output_dir, file_name)
torch.save(tensor_value, file_path)
logger.info(f"Saved generate weights[{tensor_key}] to {file_path}")
def dump_save_x_y(name, x, y, index):
'''
dump x, y during inference
output_dir needs to be modified by yourself
'''
output_dir = "output_dir"
x_output_dir = os.path.join(output_dir, "x_tensor")
y_output_dir = os.path.join(output_dir, "y_tensor")
if not os.path.exists(x_output_dir):
os.makedirs(x_output_dir)
if not os.path.exists(y_output_dir):
os.makedirs(y_output_dir)
x_file_name = os.path.join(x_output_dir, f"{name}_x_{index}.pt")
y_file_name = os.path.join(y_output_dir, f"{name}_y_{index}.pt")
if isinstance(x, tuple):
x = x[0]
if not os.path.exists(x_file_name):
torch.save(x.cpu(), x_file_name)
if not os.path.exists(y_file_name):
torch.save(y.cpu(), y_file_name)


@@ -0,0 +1,140 @@
import torch
def make_context(
tokenizer,
query,
history,
system,
max_input_length,
max_window_size: int = 6144,
chat_format: str = "chatml",
):
'''
tokenize one text context to tokenized id
args:
tokenizer: model tokenizer
query: current text context
history: history text context
system: system prompt
max_input_length: max input length of tokenized id
chat_format: chat format, only accept chatml and raw
'''
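# ChatML layout: each turn is wrapped as <|im_start|>{role}\n{content}<|im_end|>;
# history turns are prepended newest-first until max_window_size would be exceeded,
# and the final token list is truncated from the front to max_input_length.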
if history is None:
history = []
if chat_format == "chatml":
im_start, im_end = "<|im_start|>", "<|im_end|>"
im_start_tokens = [tokenizer.im_start_id]
im_end_tokens = [tokenizer.im_end_id]
nl_tokens = tokenizer.encode("\n")
def _tokenize_str(role, content):
'''
tokenize a role/content string pair
'''
return (f"{role}\n{content}", tokenizer.encode(
role,
allowed_special=set(),
) + nl_tokens + tokenizer.encode(
content,
allowed_special=set(),
))
system_text, system_tokens_part = _tokenize_str("system", system)
system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
raw_text = ""
context_tokens = []
for turn_query, turn_response in reversed(history):
query_text, query_tokens_part = _tokenize_str("user", turn_query)
query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
prev_chat = (f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}")
current_context_size = (len(system_tokens) + len(next_context_tokens) + len(context_tokens))
if current_context_size < max_window_size:
context_tokens = next_context_tokens + context_tokens
raw_text = prev_chat + raw_text
else:
break
context_tokens = system_tokens + context_tokens
raw_text = f"{im_start}{system_text}{im_end}" + raw_text
context_tokens += (nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens +
im_start_tokens + tokenizer.encode("assistant") + nl_tokens)
raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
elif chat_format == "raw":
raw_text = query
context_tokens = tokenizer.encode(raw_text)
else:
raise NotImplementedError(f"Unknown chat format {chat_format!r}")
# truncate to max_input_length, truncate from the front
return raw_text, context_tokens[-max_input_length:]
def prepare_inputs(batch_input_texts,
tokenizer,
model_name,
model_version,
test_token_num,
eval_task='summarize',
add_special_tokens=True):
'''
tokenize batch input texts into tokenized id.
args:
batch_input_texts: batch input text, also named batched prompt
tokenizer: model tokenizer
model_name: model name
model_version: model version
test_token_num: maximum number of input tokens per prompt
eval_task: eval task
add_special_tokens: whether to add_special_tokens, default True
'''
batch_size = len(batch_input_texts)
append_str = ' TL;DR: ' if eval_task == 'summarize' else ''
batch_input_ids = []
for i in range(batch_size):
curr_text = batch_input_texts[i] + append_str
curr_text = curr_text.strip().replace(" n't", "n't")
# The below lines are used to be compatible with the original code
if 'GLM' in model_name and model_version in ['chatglm2', 'chatglm3']:
input_ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0)
input_ids = input_ids[:test_token_num]
elif 'qwen' in model_name.lower() and model_version == 'qwen':
# use make_content to generate prompt
system_prompt = "You are a useful assistant, please directly output the corresponding " + \
"summary according to the article entered by the user."
_, input_id_list = make_context(
tokenizer=tokenizer,
query=curr_text,
history=[],
system=system_prompt,
max_input_length=test_token_num,
)
input_ids = torch.tensor(input_id_list)
else:
if 'qwen' in model_name.lower() and 'qwen2' in model_version:
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": curr_text
}]
curr_text = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer.encode(curr_text,
return_tensors='pt',
add_special_tokens=add_special_tokens,
truncation=True,
max_length=test_token_num).squeeze(0)
batch_input_ids.append(input_ids)
return batch_input_ids


@@ -0,0 +1,206 @@
import re
# model_type, qkv_list, gate_up_list, is_gate_up
smooth_model_config = {
"mllama": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"llama": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"qwen2_vl": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None,
"skip_patterns": [r"^visual\.*"]
},
"qwen2": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"qwen": {
"qkv_list": ["c_attn"],
"gate_up_list": ["w2", "w1"],
"is_gate_up": True,
"moe_list": None
},
"baichuan": {
"qkv_list": ["W_pack"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": None
},
"chatglm": {
"qkv_list": ["query_key_value"],
"gate_up_list": ["dense_h_to_4h"],
"is_gate_up": True,
"moe_list": None
},
"gpt_neox": {
"qkv_list": ["query_key_value"],
"gate_up_list": [],
"is_gate_up": True,
"moe_list": None
},
"mixtral": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["w1", "w3"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
"down_list": ["block_sparse_moe.w2", "w2"],
"is_merged": True
}
},
"qwen2_moe": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
"down_list": ["mlp.w2", "down_proj"],
"is_merged": True
}
},
"deepseek_v2": {
"qkv_list": ["q_proj", "q_b_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
"down_list": ["mlp.w2", "down_proj"],
"is_merged": True
},
"skip_patterns": [r".*\.kv_b_proj\..*",]
},
"falcon": {
"qkv_list": ["query_key_value"],
"gate_up_list": ["dense_h_to_4h"],
"is_gate_up": True,
"moe_list": None
},
"bloom": {
"qkv_list": ["query_key_value"],
"gate_up_list": ["dense_h_to_4h"],
"is_gate_up": False,
"moe_list": None
},
"internlm2": {
"qkv_list": ["wqkv"],
"gate_up_list": ["gate_up_proj"],
"is_gate_up": True,
"moe_list": None
},
"hunyuan": {
"qkv_list": ["q_proj", "k_proj", "v_proj"],
"gate_up_list": ["gate_proj", "up_proj"],
"is_gate_up": True,
"moe_list": {
"gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
"down_list": ["mlp.w2", "down_proj"],
"is_merged": True
}
},
"phi3": {
"qkv_list": ["qkv_proj"],
"gate_up_list": ["gate_up_proj"],
"is_gate_up": True,
"moe_list": None
},
}
def get_layer_weight_bias_name(model_type, layer_name):
'''
Specially handle the case where layer_name and the weight/bias name differ,
or the case where the weight/bias name is not {layer_name}.weight/bias, such as:
if model_type == "chatglm" and "output_layer" in layer_name:
layer_name = "lm_head"
weight_name = f"{layer_name}_weight"
bias_name = f"{layer_name}_bias"
Since vllm 0.5.3, vllm has followed this rule, so no special layer needs to be modified.
'''
weight_name = None
bias_name = None
# layers which need to be modified can be listed here
if model_type == "hunyuan" and "lm_head" in layer_name:
layer_name = "model.embed_tokens"
weight_name = "model.embed_tokens.weight"
bias_name = "model.embed_tokens.bias"
if weight_name is None:
weight_name = f"{layer_name}.weight"
if bias_name is None:
bias_name = f"{layer_name}.bias"
return layer_name, weight_name, bias_name
def modify_layer_weight_bias_name(model_type, named_parameters):
'''
handle the special cases where the vllm layer name is not the same as the hf layer name
'''
# Mapping for model type specific adjustments
mapping = {
"chatglm": {
"transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
},
}
if model_type in mapping:
for old_key, new_key in mapping[model_type].items():
if old_key in named_parameters:
named_parameters[new_key] = named_parameters.pop(old_key)
def extract_numbers(string):
'''
extract the numbers from a string and return the last one (or 0 if there are none)
'''
# use a regular expression to find all numeric parts in the string
matches = re.findall(r'\d+', string)
# convert all matched numeric parts to integers
numbers = [int(match) for match in matches]
return numbers[-1] if len(numbers) > 0 else 0
def get_qkv_distribution(model_type, model_version, hf_config):
'''
Get qkv distribution: n3sh or 3nsh
n3sh: [head_num, 3, head_size, hidden_size]
3nsh: [3, head_num, head_size, hidden_size]
vllm's default qkv distribution is 3nsh, so n3sh model info needs to be provided here; the tool will convert 3nsh to n3sh
to match the hugging face qkv distribution.
This only applies to a packed qkv layer whose distribution is n3sh.
'''
is_n3sh = False
head_num = 0
kv_head_num = 0
if (model_type == "chatglm" and extract_numbers(model_version) == 0) or model_type in ["bloom", "gpt_neox"]:
is_n3sh = True
head_num = hf_config.num_attention_heads
kv_head_num = head_num
if model_type == "falcon":
is_n3sh = True
head_num = hf_config.num_attention_heads
if hf_config.new_decoder_architecture:
kv_head_num = hf_config.num_kv_heads
elif hf_config.multi_query:
kv_head_num = 1
else:
kv_head_num = head_num
return is_n3sh, head_num, kv_head_num
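
For reference, a hypothetical sketch of how a new llama-style model could be registered in smooth_model_config; the model_type key and skip pattern below are illustrative assumptions, not values from this commit:

smooth_model_config["my_new_model"] = {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": None,
    # optional: parameters matching these regexes are left unquantized
    "skip_patterns": [r"^vision_tower\..*"],
}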


@@ -0,0 +1,418 @@
import argparse
import torch
from datasets import load_dataset
import logging
import csv
import os
from vllm import LLM, SamplingParams
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
from input_context import prepare_inputs
from dump_smooth import save_prompt_token_ids, save_input_ids, save_act_range, save_weights, save_generate_weights
from model_special import smooth_model_config
logger = logging.getLogger(__name__)
def load_prompts_from_csv(args):
'''
load prompts from csv file
'''
if args.prompt_file is not None:
prompt_file = args.prompt_file
else:
current_dir = os.path.dirname(__file__)
prompt_file = os.path.join(current_dir, 'summarize_1024_prompts.csv')
# load data from the CSV file as a list
loaded_prompts = []
# read data from the column-oriented CSV file and convert it to a list
with open(prompt_file, 'r', newline='') as file:
reader = csv.reader(file)
loaded_prompts = list(zip(*reader))[0]
loaded_prompts = list(loaded_prompts)
num_samples = min(args.num_samples, len(loaded_prompts))
prompts = loaded_prompts[0:num_samples]
return prompts
def save_summarize_1024_prompts_as_csv(prompts):
'''
save the summarize 1024 prompts
'''
# save the list data column-wise as a CSV file
# transpose the list
transposed_prompts = [prompts]
with open('summarize_1024_prompts.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(zip(*transposed_prompts))
def generate_prompts(args: argparse.Namespace):
'''
Generate prompts based on the evaluation task and arguments.
'''
eval_task_config = {
"code_completion": {
"dataset_name": "openai_humaneval",
"dataset_revision": None,
"dataset_input_key": "prompt",
"dataset_split": "test"
},
"summarize": {
"dataset_name": "ccdv/cnn_dailymail",
"dataset_revision": "3.0.0",
"dataset_input_key": "article",
"dataset_split": "train"
},
"summarize_long": {
"dataset_name": "tau/zero_scrolls",
"dataset_revision": "squality",
"dataset_input_key": "input",
"dataset_split": "validation"
},
"summarize_hg": {
"dataset_name": "cnn_dailymail",
"dataset_revision": "3.0.0",
"dataset_input_key": "article",
"dataset_split": "validation"
},
"text_generation": {
"dataset_name": "lambada",
"dataset_revision": None,
"dataset_input_key": "text",
"dataset_split": "validation"
}
}
if args.eval_task in eval_task_config:
config = eval_task_config[args.eval_task]
dataset_name = config["dataset_name"]
dataset_revision = config["dataset_revision"]
dataset_input_key = config["dataset_input_key"]
dataset_split = config["dataset_split"]
else:
assert args.dataset_name is not None, f"dataset_name is None when eval_task == custom"
assert args.dataset_input_key is not None, f"dataset_input_key is None when eval_task == custom"
assert args.dataset_split is not None, f"dataset_split is None when eval_task == custom"
dataset_name = args.dataset_name
dataset_revision = args.dataset_revision
dataset_input_key = args.dataset_input_key
dataset_split = args.dataset_split
if args.prompt_file is not None or (args.eval_task == "summarize" and args.num_samples <= 1024):
prompts = load_prompts_from_csv(args)
num_samples = min(args.num_samples, len(prompts))
else:
dataset = load_dataset(dataset_name,
dataset_revision,
cache_dir=args.dataset_cache_dir,
split=dataset_split,
trust_remote_code=True)
num_samples = min(args.num_samples, len(dataset))
prompts = dataset[0:num_samples][dataset_input_key]
# save_summarize_1024_prompts_as_csv(prompts)
prompt_token_ids = []
if args.has_prompt_token_id:
batch_input_ids = prepare_inputs(prompts,
args.tokenizer,
args.model_name,
args.model_version,
args.max_input_length,
eval_task=args.eval_task,
add_special_tokens=args.add_special_tokens)
save_prompt_token_ids(batch_input_ids, args)
for i in range(num_samples):
prompt_token_ids.append(batch_input_ids[i].tolist())
if len(prompts) == 0:
prompts = None
else:
prompts = [s[:args.max_input_length] for s in prompts]
if len(prompt_token_ids) == 0:
prompt_token_ids = None
return prompts, prompt_token_ids
@torch.no_grad()
def get_smooth_cal_weight(name, weight, name_parameters, act_range, model_type):
'''
get cal_weight for the smoothing process, to handle the case where q/k/v or gate/up layers are merged in vllm
args:
name: weight name
weight: weight value
name_parameters: named parameters
act_range: layer act range info of name
model_type: model type
'''
if act_range["is_qkv"] is True:
name_parts = name.split(".")
self_attn_layer_name = ".".join(name_parts[:-2])
qkv_list = smooth_model_config[model_type]["qkv_list"]
q_weight_name = f"{self_attn_layer_name}.{qkv_list[0]}.weight"
k_weight_name = f"{self_attn_layer_name}.{qkv_list[1]}.weight"
v_weight_name = f"{self_attn_layer_name}.{qkv_list[2]}.weight"
q_weight = name_parameters[q_weight_name]
k_weight = name_parameters[k_weight_name]
v_weight = name_parameters[v_weight_name]
cal_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
elif act_range["is_merge"] is True:
name_parts = name.split(".")
mlp_layer_name = ".".join(name_parts[:-2])
gate_up_list = smooth_model_config[model_type]["gate_up_list"]
gate_weight_name = f"{mlp_layer_name}.{gate_up_list[0]}.weight"
up_weight_name = f"{mlp_layer_name}.{gate_up_list[1]}.weight"
gate_weight = name_parameters[gate_weight_name]
up_weight = name_parameters[up_weight_name]
cal_weight = torch.cat([gate_weight, up_weight], dim=0)
else:
cal_weight = weight
return cal_weight
@torch.no_grad()
def cal_smoother(weight, act_range_x, alpha=0.5):
'''
calculate smoother value
args:
weight: smoother weight
act_range_x: activation max value of per channel
alpha: smooth factor, default 0.5
'''
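# SmoothQuant smoothing factor per input channel j:
#   s_j = max|X_j|^alpha / max|W_j|^(1 - alpha)
# Activations are later divided by s and weights multiplied by s, so the matmul
# result is unchanged while activation outliers are migrated into the weights.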
assert weight.shape[-1] == act_range_x.numel()
weight_scales = weight.view(-1, weight.shape[-1])
weight_scales = weight_scales.abs().max(dim=0)[0]
weight_scales = weight_scales.to(float).clamp(min=1e-6)
smoother = (act_range_x.to(weight_scales.device).to(float).pow(alpha) /
weight_scales.pow(1 - alpha)).clamp(min=1e-6)
return smoother
@torch.no_grad()
def cal_qweight_scales(sweight, smooth_act_range_x, per_token, per_channel):
'''
calculate quantized weight and scales
args:
sweight: weight after the smoother has been applied (weight * smoother)
smooth_act_range_x: activation max value which has been divided by the smoother value
per_token: bool, whether the activation scales are computed dynamically per token at runtime
per_channel: bool, whether the weight scales are computed per output channel
'''
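# Symmetric int8 quantization: each scale maps an observed abs-max to 127.
# scale_x_quant_orig_t is the per-tensor activation scale, scale_w_quant_orig_c the
# per-output-channel weight scale; the weight is divided by its scale and clipped to
# int8, and scale_to_int (= 127 / activation max) re-quantizes activations at runtime.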
scale_x_quant_orig_t = smooth_act_range_x.max() / 127.0
smooth_act_range_w = sweight.abs().max(dim=-1)[0]
smooth_act_range_w = smooth_act_range_w.to(float).clamp(min=1e-6)
scale_w_quant_orig_c = smooth_act_range_w / 127.0
scale_w_quant_orig_t = smooth_act_range_w.max() / 127
if per_channel:
qweight = (sweight / scale_w_quant_orig_c[..., None])
else:
qweight = (sweight / scale_w_quant_orig_t)
qweight = qweight.clip(-128, 127).to(torch.int8)
scale_to_int = 1 / scale_x_quant_orig_t
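# per_token=True: activations are quantized dynamically at runtime, so only the
# weight dequant scale (per channel or per tensor) is exported below.
# per_token=False: the static activation scale is folded into per_channel_scale,
# and scale_to_int (= 1 / activation scale) is used to quantize activations.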
if per_token:
if per_channel:
per_channel_scale = scale_w_quant_orig_c
else:
per_channel_scale = scale_w_quant_orig_t
else:
if per_channel:
per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_c
hidden_size = smooth_act_range_x.numel()
scale_to_int = scale_to_int.repeat(hidden_size)
else:
per_channel_scale = scale_x_quant_orig_t * scale_w_quant_orig_t
per_channel_scale = per_channel_scale.squeeze()
if per_channel_scale.numel() == 1 and per_channel_scale.dim() == 0:
per_channel_scale = per_channel_scale.unsqueeze(0)
if scale_to_int.numel() == 1 and scale_to_int.dim() == 0:
scale_to_int = scale_to_int.unsqueeze(0)
sinfo = [
scale_w_quant_orig_t.item(), scale_x_quant_orig_t.item(),
scale_w_quant_orig_t.item() / scale_x_quant_orig_t.item()
]
return qweight, per_channel_scale, scale_to_int, sinfo
def check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int):
'''
check whether nan/inf appears in qweight, per_channel_scale, smooth, qzeros, scale_to_int
'''
if torch.isinf(qweight).any() or torch.isnan(qweight).any():
logger.error(f"name:{name} qweight has inf or nan")
if torch.isinf(per_channel_scale).any() or torch.isnan(per_channel_scale).any():
logger.error(f"name:{name} per_channel_scale has inf or nan")
if torch.isinf(smooth).any() or torch.isnan(smooth).any():
logger.error(f"name:{name} smooth has inf or nan")
if torch.isinf(scale_to_int).any() or torch.isnan(scale_to_int).any():
logger.error(f"name:{name} scale_to_int has inf or nan")
if qzeros is not None and (torch.isinf(qzeros).any() or torch.isnan(qzeros).any()):
logger.error(f"name:{name} qzeros has inf or nan")
@torch.no_grad()
def cal_smooth_weight(name, act_range_x, weight, smooth_value, has_qzeros, per_token, per_channel, cal_weight):
'''
calculate qweight, scales, smooth, qzeros
args:
name: weight name
act_range_x: per-channel activation max value
weight: weight to be quantized
smooth_value: smooth value
has_qzeros: whether to generate the qzeros weight
per_token: bool, whether the activation scales are computed dynamically per token
per_channel: bool, whether the weight scales are computed per output channel
cal_weight: calibration weight used to compute the smoother
'''
smoother = cal_smoother(cal_weight, act_range_x, smooth_value)
smooth_act_range_x = act_range_x / smoother
sweight = weight * (smoother.view(1, -1))
qweight, per_channel_scale, scale_to_int, sinfo = cal_qweight_scales(sweight, smooth_act_range_x, per_token,
per_channel)
qweight = qweight.reshape(weight.shape)
smooth = 1 / smoother
smooth = smooth.squeeze()
if has_qzeros:
qzeros = torch.zeros_like(per_channel_scale, dtype=torch.int32)
else:
qzeros = None
# check_smooth_weight_vaild(name, qweight, per_channel_scale, smooth, qzeros, scale_to_int)
return qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo
@torch.no_grad()
def generate_smooth_weight(act_range, name_parameters, args):
'''
generate smooth weight
args:
act_range: act_range collected while running the model
name_parameters: hugging face model named parameters
args: argument from main
'''
smooth_weight = {}
smooth_info = {}
has_qzeros = args.has_qzeros
smooth_value = args.smooth_value
smooth_info["title"] = ["max_scale_w, max_scale_x, max_scale_w/max_scale_x"]
for name, param in name_parameters.items():
if should_skip(args.model_type, name):
logger.info(f"skip {name}")
smooth_weight[name] = param
continue
if name.endswith("bias"):
smooth_weight[name] = param
continue
name_parts = name.split(".")
layer_name = ".".join(name_parts[:-1])
if layer_name in act_range:
act_range_x = act_range[layer_name]['x']
cal_weight = get_smooth_cal_weight(name, param, name_parameters, act_range[layer_name], args.model_type)
qweight, per_channel_scale, smooth, qzeros, scale_to_int, sinfo = cal_smooth_weight(
name, act_range_x, param, smooth_value, has_qzeros, args.per_token, args.per_channel, cal_weight)
per_channel_scale = per_channel_scale.to(args.torch_scales_smooth_dtype)
smooth = smooth.to(args.torch_scales_smooth_dtype)
scale_to_int = scale_to_int.to(args.torch_scales_smooth_dtype)
smooth_weight[f'{layer_name}.qweight'] = qweight
smooth_weight[f'{layer_name}.per_channel_scale'] = per_channel_scale
if args.per_token is True:
smooth_weight[f'{layer_name}.smooth'] = smooth
else:
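# static quantization: fold the smoothing factor into the activation quant
# scale, so applying scale_to_int to raw activations smooths and quantizes them in one step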
scale_to_int = scale_to_int * smooth
smooth_weight[f'{layer_name}.scale_to_int'] = scale_to_int
if has_qzeros:
smooth_weight[f'{layer_name}.qzeros'] = qzeros
smooth_info[name] = sinfo
else:
smooth_weight[name] = param
return smooth_weight, smooth_info
def generate_weights_of_smoothquant(llm: LLM, args: argparse.Namespace):
'''
generate smoothquant weights
args:
llm: LLM instance
args: argument from main
'''
prompts, prompt_token_ids = generate_prompts(args)
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=args.output_len,
repetition_penalty=args.repetition_penalty,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k)
tp_size = args.tp_size
llm.llm_engine.model_executor._run_workers("setup_smooth_hook", args.dump_input_ids)
llm.generate(prompts, sampling_params, prompt_token_ids=prompt_token_ids, use_tqdm=True)
logger.info("llm generate finished")
llm.llm_engine.model_executor._run_workers("remove_hooks")
act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")
vllm_cleanup(llm)
del prompts
del prompt_token_ids
cleanup()
logger.info("get act_range and named_parameters from llm finished")
merged_act_range, merged_named_parameters, input_id_list = convert_to_merged(act_range, named_parameters, tp_size,
args)
save_input_ids(input_id_list, args)
save_act_range(merged_act_range, args)
save_weights(merged_named_parameters, args)
del act_range
del named_parameters
cleanup()
logger.info("get merged_act_range and merged_named_parameters finished")
smooth_weight, smooth_info = generate_smooth_weight(merged_act_range, merged_named_parameters, args)
save_generate_weights(smooth_weight, args)
del merged_act_range
del merged_named_parameters
cleanup()
logger.info("get smooth_weight finished")
return smooth_weight, smooth_info

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,713 @@
from collections import defaultdict, OrderedDict
import torch
from pathlib import Path
from typing import Optional
import re
import os
import shutil
import logging
import json
from transformers import AutoTokenizer, T5Tokenizer
import gc
from datetime import datetime
from vllm.platforms import current_platform
from model_special import (smooth_model_config, get_layer_weight_bias_name, get_qkv_distribution,
modify_layer_weight_bias_name)
logger = logging.getLogger(__name__)
_str_to_torch_dtype_dict = dict(
bfloat16=torch.bfloat16,
float16=torch.float16,
float32=torch.float32,
int64=torch.int64,
int32=torch.int32,
int8=torch.int8,
bool=torch.bool,
fp8=torch.float8_e4m3fn,
)
def str_dtype_to_torch(dtype):
'''
convert str dtype to torch dtype
'''
ret = _str_to_torch_dtype_dict.get(dtype)
dtype = ret if ret is not None else torch.float16
return dtype
_torch_dtype_to_str_dict = {
torch.bfloat16:"bfloat16",
torch.float16:"float16",
torch.float32:"float32",
torch.int64:"int64",
torch.int32:"int32",
torch.int8:"int8",
torch.bool:"bool",
torch.float8_e4m3fn:"fp8",
}
def torch_dtype_to_str(dtype):
'''
convert torch dtype to str dtype
'''
ret = _torch_dtype_to_str_dict.get(dtype)
dtype = ret if ret is not None else "float16"
return dtype
def extract_model_path(name_or_path):
'''
extract model_version and model_family from the name_or_path read from config.json
'''
patterns = [
r"/(.*)(-[0-9]+[mMbB]{1})(-*.*)",
r"/(.*-[0-9]+)(-*.*)",
r"(.*)(-[0-9]+[mMbB]{1})(-*.*)",
r"(.*-[0-9]+)(-*.*)",
r"([^-]+)(-*.*)",
]
model_version = None
for pattern in patterns:
match = re.search(pattern, name_or_path)
if match:
model_version = match.group(1)
break
if model_version is None:
model_version = name_or_path
model_version = model_version.lower()
match = re.search(r"([a-zA-Z]+)(.*)", model_version)
if match:
model_family = match.group(1)
else:
model_family = model_version
return model_version, model_family
def read_model_name(model_dir: str, model_version: Optional[str] = None, model_type: Optional[str] = None):
'''
get model_arch, model_version, model_family and model_type from config.json and from the model_version/model_type passed in
args:
model_dir: model directory
model_version: passed from main, default None
model_type: pass from main, default None
'''
with open(Path(model_dir) / "config.json", 'r') as f:
config = json.load(f)
model_arch = config.get('architectures', None)
name_or_path = config.get('_name_or_path', None)
if model_type is None:
model_type = config.get('model_type', None)
if model_type:
model_type = model_type.lower()
model_family = None
if model_version is None and name_or_path:
model_version, model_family = extract_model_path(name_or_path)
if model_version is None:
model_version = model_type
if model_version:
model_version = model_version.lower()
if model_version and model_family is None:
match = re.search(r"([a-zA-Z]+)(.*)", model_version)
if match:
model_family = match.group(1)
else:
model_family = model_version
if isinstance(model_arch, (list, tuple)) and len(model_arch) > 0:
model_arch = model_arch[0]
assert model_arch, "read model architectures failed"
assert model_version, "read model version failed, please set args.version manually"
assert model_family, "read model family failed, please set args.version manually"
return model_arch, model_version, model_family, model_type
def load_tokenizer(tokenizer_dir: Optional[str] = None,
vocab_file: Optional[str] = None,
model_name: str = 'GPTForCausalLM',
model_version: Optional[str] = None,
tokenizer_type: Optional[str] = None):
'''
load tokenizer of model
args:
tokenizer_dir: tokenizer directory
vocab_file: vocabulary file, default None
model_name: model name
model_version: model version
tokenizer_type: Tokenizer type to be loaded.
'''
if vocab_file is None:
use_fast = True
if tokenizer_type == "llama":
use_fast = False
# Should set both padding_side and truncation_side to be 'left'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
legacy=False,
padding_side='left',
truncation_side='right',
trust_remote_code=True,
tokenizer_type=tokenizer_type,
use_fast=use_fast)
elif model_name == 'GemmaForCausalLM':
from transformers import GemmaTokenizer
# Initialize tokenizer from vocab file.
tokenizer = GemmaTokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
else:
# For gpt-next, directly load from tokenizer.model
tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', truncation_side='left', legacy=False)
if model_name == 'QWenForCausalLM':
with open(Path(tokenizer_dir) / "generation_config.json") as f:
gen_config = json.load(f)
chat_format = gen_config['chat_format']
assert chat_format in ('raw','chatml'), f"unknown chat format: {chat_format}"
pad_id = gen_config['pad_token_id']
end_id = gen_config['eos_token_id']
elif model_name in ('ChatGLMForCausalLM', 'glm'):
pad_id = tokenizer.pad_token_id
end_id = tokenizer.eop_token_id
else:
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
pad_id = tokenizer.pad_token_id
end_id = tokenizer.eos_token_id
try:
tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
logger.warning(f"set pad_token with exception:{e}")
return tokenizer, pad_id, end_id
def merge_qkv_weight(named_parameters, weight_name, tp_size, q_proj_size, num_kv_head_replicas):
'''
merge the tensor-parallel qkv weight into non parallel q_weight, k_weight and v_weight.
merging the qkv bias follows the same logic
args:
named_parameters: parallel named parameters
weight_name: qkv layer weight name
tp_size: tensor parallel size
q_proj_size: query projection size
num_kv_head_replicas: number kv head replicas
'''
qkv_proj_size = named_parameters[0][weight_name].shape[0]
kv_proj_size = (qkv_proj_size - q_proj_size) // 2
split_size = [q_proj_size, kv_proj_size, kv_proj_size]
q_weight_list = []
k_weight_list = []
v_weight_list = []
for rank in range(0, tp_size):
weight = named_parameters[rank][weight_name]
split_weight = torch.split(weight, split_size, dim=0)
q_weight_list.append(split_weight[0])
if rank % num_kv_head_replicas == 0:
k_weight_list.append(split_weight[1])
v_weight_list.append(split_weight[2])
q_weight = torch.cat(q_weight_list, dim=0)
k_weight = torch.cat(k_weight_list, dim=0)
v_weight = torch.cat(v_weight_list, dim=0)
return q_weight, k_weight, v_weight
def merge_merged_weight(named_parameters, weight_name, tp_size, dim=0):
'''
merge the tensor-parallel merged (gate_up) linear layer weight into gate_weight and up_weight.
merging the merged bias follows the same logic.
args:
named_parameters: parallel named parameters
weight_name: merged (gate_up) layer weight name
tp_size: tensor parallel size
'''
up_weight_list = []
gate_weight_list = []
for rank in range(0, tp_size):
weight = named_parameters[rank][weight_name]
chunk_weights = torch.chunk(weight, 2, dim=dim)
gate_weight_list.append(chunk_weights[0])
up_weight_list.append(chunk_weights[1])
gate_weight = torch.cat(gate_weight_list, dim=dim)
up_weight = torch.cat(up_weight_list, dim=dim)
return gate_weight, up_weight
def convert_packed_qkv(q_weight, k_weight, v_weight, dim, args):
'''
convert packed qkv weight or bias
args:
q_weight: q weight or bias
k_weight: k weight or bias
v_weight: v weight or bias
dim: convert dim
args: argument
'''
packed_qkv = torch.cat([q_weight, k_weight, v_weight], dim=dim)
is_n3sh, head_num, kv_head_num = get_qkv_distribution(args.model_type, args.model_version, args.hf_config)
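# n3sh layout: instead of packing as [all q | all k | all v], the heads are
# interleaved per kv group as (num_query_heads_per_kv_head q heads, 1 k head, 1 v head)
# repeated for every kv head.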
if is_n3sh is True:
packed_qkv_shape = packed_qkv.shape
num_query_heads_per_kv_head = head_num // kv_head_num
q_shape = q_weight.shape
k_shape = k_weight.shape
v_shape = v_weight.shape
q = q_weight.view(q_shape[:dim] + (kv_head_num, num_query_heads_per_kv_head, -1) + q_shape[dim + 1:])
k = k_weight.view(k_shape[:dim] + (kv_head_num, 1, -1) + k_shape[dim + 1:])
v = v_weight.view(v_shape[:dim] + (kv_head_num, 1, -1) + v_shape[dim + 1:])
tensor_n3sh = torch.cat([q, k, v], dim=dim+1)
packed_qkv = tensor_n3sh.reshape(packed_qkv_shape)
return packed_qkv
def convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
layer_range, merged_act_range, tp_size, args):
'''
convert parallel qkv named parameters to non parallel qkv named parameters
args:
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
args: argument
'''
layer_name_parts = layer_name.split(".")
self_attn_layer_name = ".".join(layer_name_parts[:-1])
qkv_name = layer_name_parts[-1]
q_weight, k_weight, v_weight = merge_qkv_weight(named_parameters, weight_name, tp_size, layer_range["q_proj_size"],
layer_range["num_kv_head_replicas"])
qkv_list = smooth_model_config[args.model_type]["qkv_list"]
qkv_list_len = len(qkv_list)
if qkv_list_len == 3:
q_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
k_layer_name = f"{self_attn_layer_name}.{qkv_list[1]}"
v_layer_name = f"{self_attn_layer_name}.{qkv_list[2]}"
elif qkv_list_len == 1:
qkv_layer_name = f"{self_attn_layer_name}.{qkv_list[0]}"
if qkv_list_len == 3:
merged_act_range[q_layer_name]["x"] = layer_range["x"]
merged_act_range[k_layer_name]["x"] = layer_range["x"]
merged_act_range[v_layer_name]["x"] = layer_range["x"]
merged_act_range[q_layer_name]["is_qkv"] = True
merged_act_range[k_layer_name]["is_qkv"] = True
merged_act_range[v_layer_name]["is_qkv"] = True
merged_named_parameters[f"{q_layer_name}.weight"] = q_weight
merged_named_parameters[f"{k_layer_name}.weight"] = k_weight
merged_named_parameters[f"{v_layer_name}.weight"] = v_weight
elif qkv_list_len == 1:
merged_act_range[qkv_layer_name]["x"] = layer_range["x"]
qkv_weight = convert_packed_qkv(q_weight, k_weight, v_weight, 0, args)
merged_named_parameters[f"{qkv_layer_name}.weight"] = qkv_weight
if bias_name in named_parameters[0]:
q_bias, k_bias, v_bias = merge_qkv_weight(named_parameters, bias_name, tp_size, layer_range["q_proj_size"],
layer_range["num_kv_head_replicas"])
if qkv_list_len == 3:
merged_named_parameters[f"{q_layer_name}.bias"] = q_bias
merged_named_parameters[f"{k_layer_name}.bias"] = k_bias
merged_named_parameters[f"{v_layer_name}.bias"] = v_bias
elif qkv_list_len == 1:
qkv_bias = convert_packed_qkv(q_bias, k_bias, v_bias, 0, args)
merged_named_parameters[f"{qkv_layer_name}.bias"] = qkv_bias
return qkv_name
def convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters, merged_named_parameters,
layer_range, merged_act_range, tp_size, model_type):
'''
convert parallel merged named parameters to non parallel merged named parameters
args:
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
model_type: model type
'''
layer_name_parts = layer_name.split(".")
mlp_layer_name = ".".join(layer_name_parts[:-1])
gate_weight, up_weight = merge_merged_weight(named_parameters, weight_name, tp_size)
gate_up_name = layer_name_parts[-1]
gate_up_list = smooth_model_config[model_type]["gate_up_list"]
gate_up_list_len = len(gate_up_list)
is_gate_up = smooth_model_config[model_type]["is_gate_up"]
if gate_up_list_len == 2:
gate_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
up_layer_name = f"{mlp_layer_name}.{gate_up_list[1]}"
elif gate_up_list_len == 1:
gate_up_layer_name = f"{mlp_layer_name}.{gate_up_list[0]}"
if gate_up_list_len == 2:
merged_act_range[gate_layer_name]["x"] = layer_range["x"]
merged_act_range[up_layer_name]["x"] = layer_range["x"]
merged_act_range[gate_layer_name]["is_merge"] = True
merged_act_range[up_layer_name]["is_merge"] = True
merged_named_parameters[f"{gate_layer_name}.weight"] = gate_weight
merged_named_parameters[f"{up_layer_name}.weight"] = up_weight
elif gate_up_list_len == 1:
merged_act_range[gate_up_layer_name]["x"] = layer_range["x"]
merged_gate_up_weight_list = [gate_weight, up_weight] if is_gate_up is True else [up_weight, gate_weight]
merged_named_parameters[f"{gate_up_layer_name}.weight"] = torch.cat(merged_gate_up_weight_list, dim=0)
if bias_name in named_parameters[0]:
gate_bias, up_bias = merge_merged_weight(named_parameters, bias_name, tp_size)
if gate_up_list_len == 2:
merged_named_parameters[f"{gate_layer_name}.bias"] = gate_bias
merged_named_parameters[f"{up_layer_name}.bias"] = up_bias
elif gate_up_list_len == 1:
merged_gate_up_bias_list = [gate_bias, up_bias] if is_gate_up is True else [up_bias, gate_bias]
merged_named_parameters[f"{gate_up_layer_name}.bias"] = torch.cat(merged_gate_up_bias_list, dim=0)
return gate_up_name
def convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size):
'''
convert column parallel named parameters to non parallel named parameters
args:
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
'''
if layer_range['is_linear']:
merged_act_range[layer_name]["x"] = layer_range["x"]
merged_named_parameters[weight_name] = torch.cat(
[named_parameters[tp_id][weight_name] for tp_id in range(0, tp_size)], dim=0)
if bias_name in named_parameters[0]:
merged_named_parameters[bias_name] = torch.cat(
[named_parameters[tp_id][bias_name] for tp_id in range(0, tp_size)], dim=0)
def convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size):
'''
convert row parallel named parameters to non parallel named parameters
args:
act_layer_name: act layer name
act_range: parallel act_range
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
'''
if layer_range['is_linear']:
if isinstance(layer_range['x'], torch.Tensor):
merged_act_range[layer_name]['x'] = torch.cat(
[act_range[tp_id][act_layer_name]['x'] for tp_id in range(0, tp_size)], dim=0)
else:
merged_act_range[layer_name]['x'] = None
merged_named_parameters[weight_name] = torch.cat(
[named_parameters[tp_id][weight_name] for tp_id in range(0, tp_size)], dim=1)
if bias_name in named_parameters[0]:
merged_named_parameters[bias_name] = named_parameters[0][bias_name]
def convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size, args):
'''
convert parallel layer named parameters to non parallel layer named parameters
args:
act_layer_name: act layer name
act_range: parallel act_range
layer_name: layer name
weight_name: weight name
bias_name: bias name
named_parameters: parallel hugging face named parameters
merged_named_parameters: non parallel hugging face named parameters
layer_range: parallel layer range info
merged_act_range: non parallel act range
tp_size: tensor parallel size
args: argument from main
'''
qkv_name = "qkv_proj"
gate_up_name = "gate_up_proj"
if layer_range['split'] == 'col': # col
# merge weight
if layer_range["is_qkv"]:
qkv_name = convert_to_merged_qkv_weight(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size,
args)
elif layer_range["is_merge"]:
gate_up_name = convert_to_merged_merged_weight(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range,
tp_size, args.model_type)
else:
convert_to_col_weight_except_qkv_merged(layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size)
else: # row
convert_to_row_weight(act_layer_name, act_range, layer_name, weight_name, bias_name, named_parameters,
merged_named_parameters, layer_range, merged_act_range, tp_size)
return qkv_name, gate_up_name
def collect_moe_experts_act_range_of_layer(merged_act_range, mlp_part_name, moe_list):
'''
collect moe experts act range in the same layer
'''
experts_of_gate_up_layer = {}
experts_of_down_layer = {}
gate_up_list = moe_list["gate_up_list"]
gate_up_list_len = len(gate_up_list)
down_list = moe_list["down_list"]
gate_up_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[1]}"
gate_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{gate_up_list[2]}" if gate_up_list_len > 2 else None
down_layer_pattern = rf"{mlp_part_name}.experts\.\d+\.{down_list[1]}"
for key, value in merged_act_range.items():
if re.search(gate_up_layer_pattern, key) or (gate_layer_pattern is not None
and re.search(gate_layer_pattern, key)):
experts_of_gate_up_layer[key] = value
if re.search(down_layer_pattern, key):
experts_of_down_layer[key] = value
return experts_of_gate_up_layer, experts_of_down_layer
def convert_moe_expert_activation_fused(experts_of_layer, merged_act_range):
'''
fuse the moe expert act range in the same layer, and assign it to those experts
'''
unfused_activation = []
for key, value in experts_of_layer.items():
if isinstance(value["x"], torch.Tensor):
unfused_activation.append(value['x'])
assert len(unfused_activation) > 0, f"unfused_activation len is zero, this is unsupported"
activation = torch.stack(unfused_activation, dim=0)
fused_activation = torch.max(activation, dim=0)[0]
for key, value in experts_of_layer.items():
if value["x"] is None or isinstance(value["x"], torch.Tensor):
value['x'] = fused_activation
def convert_moe_layer_activation_fused(merged_act_range, model_type):
'''
loop over each layer, fuse the moe expert act range within the layer, and assign it to those experts
'''
moe_list = smooth_model_config[model_type]["moe_list"]
if moe_list is None:
return
mlp_name = moe_list["gate_up_list"][0].split(".")[0]
layer = 0
while True:
mlp_part_name = rf"\.{layer}\.{mlp_name}"
experts_of_gate_up_layer, experts_of_down_layer = collect_moe_experts_act_range_of_layer(
merged_act_range, mlp_part_name, moe_list)
# if experts_of_layer is empty, layer has reached the number of moe layers, so the loop is finished
if len(experts_of_gate_up_layer) < 1 or len(experts_of_down_layer) < 1:
logger.info(f"the number of moe layers is {layer}")
break
convert_moe_expert_activation_fused(experts_of_gate_up_layer, merged_act_range)
convert_moe_expert_activation_fused(experts_of_down_layer, merged_act_range)
layer += 1
def should_include(key, parameters, exclude_names):
'''
the key should not already be present in parameters and should not match any name in exclude_names
args:
parameters: named parameters
exclude_names: list of excluded names
'''
return key not in parameters and not any(exclude_name in key for exclude_name in exclude_names)
def valid_act_range(act_layer_name, layer_range):
'''
validate act_range, mainly filtering inf, nan or zero values in the x field
args:
act_layer_name: act layer name
layer_range: act layer value
'''
act_range_x = layer_range["x"]
if act_range_x is not None and isinstance(act_range_x, torch.Tensor):
mask = torch.isinf(act_range_x) | torch.isnan(act_range_x) | (act_range_x == 0)
if torch.any(mask).item():
act_range_x[mask] = 1e-6
logger.warning(f"act_range_x in layer:{act_layer_name} has nan, inf or zero values, force to 1e-6")
def convert_to_merged(act_range, named_parameters, tp_size, args):
'''
convert parallel act_range and named parameters to non parallel format.
args:
act_range: parallel act_range
named_parameters: parallel named parameters
tp_size: tensor parallel size
args: argument
'''
model_type = args.model_type
merged_act_range = defaultdict(lambda: {"x": None, "is_qkv": False, "is_merge": False,})
merged_named_parameters = {}
input_id_list = []
exclude_names = set()
for act_layer_name, layer_range in act_range[0].items():
valid_act_range(act_layer_name, layer_range)
layer_name, weight_name, bias_name = get_layer_weight_bias_name(model_type, act_layer_name)
# when tie_word_embeddings is True, lm_head uses the embedding weight
if args.tie_word_embeddings is True and "lm_head" in layer_name:
continue
qkv_name, gate_up_name = convert_to_layer_merged(act_layer_name, act_range, layer_name, weight_name, bias_name,
named_parameters, merged_named_parameters, layer_range,
merged_act_range, tp_size, args)
exclude_names.update({qkv_name, gate_up_name})
if layer_range['split'] == 'col' and layer_range["is_qkv"] and len(layer_range["input_id"]) > 0:
input_id_list = layer_range["input_id"]
if args.use_smoothquant and args.disable_fused_quantize_expert is False:
convert_moe_layer_activation_fused(merged_act_range, model_type)
merged_named_parameters.update({
key: value
for key, value in named_parameters[0].items()
if should_include(key, merged_named_parameters, exclude_names)
})
modify_layer_weight_bias_name(model_type, merged_named_parameters)
sorted_named_parameters = OrderedDict(sorted(merged_named_parameters.items(), key=lambda item: item[0]))
sorted_merged_act_range = OrderedDict(sorted(merged_act_range.items(), key=lambda item: item[0]))
return sorted_merged_act_range, sorted_named_parameters, input_id_list
def copy_files_except_extensions(input_dir, output_dir, extensions):
'''
copy files from input_dir to output_dir, skipping files whose extension is in extensions and keeping the sub-directory structure the same
args:
input_dir: input directory
output_dir: output directory
extensions: file extensions to skip when copying
'''
# walk the input directory and its sub-directories
for root, dirs, files in os.walk(input_dir):
# compute the path relative to input_dir
rel_path = os.path.relpath(root, input_dir)
if len(rel_path) > 1 and rel_path.startswith('.'):
continue
# build the destination directory path
dst_dir = os.path.join(output_dir, rel_path)
# make sure the destination directory exists
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
for file in files:
if not any(file.endswith(ext) for ext in extensions) and not file.startswith('.'):
# build the full source and destination file paths
src_file = os.path.join(root, file)
dst_file = os.path.join(dst_dir, file)
# copy the file
shutil.copy2(src_file, dst_file)
logger.info(f'Copied {src_file} to {dst_file}')
def cleanup():
'''
cleanup memory resource
'''
gc.collect()
if not current_platform.is_cpu():
torch.cuda.empty_cache()
def vllm_cleanup(llm):
"""Release occupied resources and reset parallel_state"""
del llm
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
destroy_model_parallel()
destroy_distributed_environment()
import contextlib
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
import ray
if ray.is_initialized():
ray.shutdown()
logger.info('llm and distributed env is cleanup')
def generate_datetime():
'''
generate current datetime
'''
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
return formatted_datetime
def get_hf_config_sliding_window(hf_text_config) -> Optional[int]:
"""Get the sliding window size, or None if disabled."""
# Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in
# addition to sliding window size. We check if that field is present
# and if it's False, return None.
if (hasattr(hf_text_config, "use_sliding_window")
and not hf_text_config.use_sliding_window):
return None
return getattr(hf_text_config, "sliding_window", None)
def get_skip_patterns(model_type):
"""Get the skip patterns from model config."""
config = smooth_model_config[model_type]
return config["skip_patterns"] if "skip_patterns" in config else []
def should_skip(model_type, weight_name):
"""judge if the weight should be skipped."""
skip_patterns = get_skip_patterns(model_type)
for pattern in skip_patterns:
if re.match(pattern, weight_name):
return True
return False

View File

@@ -0,0 +1,152 @@
import argparse
import torch
from torch import Tensor
import numpy as np
import logging
from vllm import LLM
from utils_internal import convert_to_merged, cleanup, vllm_cleanup, should_skip
from dump_smooth import save_weights, save_generate_weights
logger = logging.getLogger(__name__)
def merge_adjacent_low_4bit(tensor: Tensor):
"""
Pack every two adjacent elements of an int8 tensor by merging their low 4 bits into a single int8 value,
and return the packed tensor.
args:
- tensor: a torch.int8 tensor whose last dimension has an even length.
returns:
- a new tensor in which each element packs the low 4 bits of two adjacent original elements.
example:
a = torch.tensor([5, 7, 12, 3], dtype=torch.int8)  # each pair of elements will be packed
merged_tensor = merge_adjacent_low_4bit(a)
print(f"packed tensor: {merged_tensor} (as list: {merged_tensor.tolist()})")
"""
# make sure the input tensor is int8 and its last dimension has an even length
assert tensor.dtype == torch.int8, "the input tensor must be of type int8"
assert tensor.shape[-1] % 2 == 0, "the last dimension of the input tensor must have an even length"
even = np.bitwise_and(tensor[..., 0::2], 0x0F, dtype=np.int8)
odd = np.bitwise_and(tensor[..., 1::2], 0x0F, dtype=np.int8)
merged_tensor = np.bitwise_or(np.left_shift(odd, 4), even)
# merged_tensor is the new packed tensor
return merged_tensor
def cal_weightonly_weight(weight, weight_bits, qmin, qmax, has_qzeros, eps: float = 1e-8):
'''
return quantized_weight, scales, qzeros
args:
weight: weight to be quantized
weight_bits: quantized bitwidth
qmin: minimum value in quantized range
qmax: maximum value in quantized range
has_qzeros: whether to generate qzeros weight
eps: lower bound that keeps values away from zero to avoid floating point errors
'''
assert weight.numel() != 0, "weight should not be empty tensor"
assert weight.dim() == 2 or weight.dim() == 3, "Invalid dim. The dim of weight should be 2 or 3"
assert weight.dtype in [torch.float32, torch.float16, torch.bfloat16
], "Invalid datatype. Weight must be torch.float32 or torch.float16 or torch.bfloat16"
weight_scale = weight.float().abs().clamp(min=eps).max(dim=-1).values / qmax
unpacked_weight = (torch.round((weight / weight_scale[..., None]).float())).clip(min=qmin, max=qmax).to(torch.int8)
scale_quant_orig_c = weight_scale.squeeze()
if weight_bits == 4:
quantized_weight = merge_adjacent_low_4bit(unpacked_weight)
else:
quantized_weight = unpacked_weight
if has_qzeros:
qzeros = torch.zeros_like(scale_quant_orig_c, dtype=torch.int32)
else:
qzeros = None
return quantized_weight, scale_quant_orig_c, qzeros
def generate_weightonly_weight(act_range, name_parameters, args):
'''
quantize hugging face weights into weight-only quantized weights
args:
act_range: non parallel act_range
name_parameters: non parallel hugging face named parameters
args: arguments from main
'''
weightonly_weight = {}
has_qzeros = args.has_qzeros
weight_bits = 8 if args.weight_only_precision == 'int8' else 4
qmin = float(-2**(weight_bits - 1))
qmax = float(2**(weight_bits - 1) - 1)
for name, param in name_parameters.items():
if should_skip(args.model_type, name):
logger.info(f"skip {name}")
weightonly_weight[name] = param
continue
if name.endswith("bias"):
weightonly_weight[name] = param
continue
name_parts = name.split(".")
layer_name = ".".join(name_parts[:-1])
if layer_name in act_range:
qweight, scales, qzeros = cal_weightonly_weight(param, weight_bits, qmin, qmax, has_qzeros)
scales = scales.to(args.torch_scales_smooth_dtype)
weightonly_weight[f'{layer_name}.qweight'] = qweight
weightonly_weight[f'{layer_name}.scales'] = scales
if has_qzeros:
weightonly_weight[f'{layer_name}.qzeros'] = qzeros
else:
weightonly_weight[name] = param
return weightonly_weight
def generate_weights_of_weight_only(llm: LLM, args: argparse.Namespace):
'''
generate weightonly weights
args:
llm: LLM instance
args: argument from main
'''
tp_size = args.tp_size
llm.llm_engine.model_executor._run_workers("setup_smooth_hook")
llm.llm_engine.model_executor._run_workers("remove_hooks")
act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
named_parameters = llm.llm_engine.model_executor._run_workers("get_named_parameters")
vllm_cleanup(llm)
cleanup()
logger.info("get act_range and named_parameters from llm finished")
merged_act_range, merged_named_parameters, _ = convert_to_merged(act_range, named_parameters, tp_size, args)
save_weights(merged_named_parameters, args)
del act_range
del named_parameters
cleanup()
logger.info("get merged_act_range and merged_named_parameters finished")
weightonly_weight = generate_weightonly_weight(merged_act_range, merged_named_parameters, args)
save_generate_weights(weightonly_weight, args)
del merged_act_range
del merged_named_parameters
cleanup()
logger.info("get weightonly_weight finished")
return weightonly_weight

View File

@@ -0,0 +1,312 @@
#!/usr/bin/env python3
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Modified version of: https://chromium.googlesource.com/chromium/tools/depot_tools.git/+/refs/heads/main/post_build_ninja_summary.py
"""Summarize the last ninja build, invoked with ninja's -C syntax.
> python3 tools/report_build_time_ninja.py -C build/..
Typical output looks like this:
```
Longest build steps for .cpp.o:
1.0 weighted s to build ...torch_bindings.cpp.o (12.4 s elapsed time)
2.0 weighted s to build ..._attn_c.dir/csrc... (23.5 s elapsed time)
2.6 weighted s to build ...torch_bindings.cpp.o (31.5 s elapsed time)
3.2 weighted s to build ...torch_bindings.cpp.o (38.5 s elapsed time)
Longest build steps for .so (linking):
0.1 weighted s to build _moe_C.abi3.so (1.0 s elapsed time)
0.5 weighted s to build ...flash_attn_c.abi3.so (1.1 s elapsed time)
6.2 weighted s to build _C.abi3.so (6.2 s elapsed time)
Longest build steps for .cu.o:
15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.6 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.7 s elapsed time)
15.5 weighted s to build ...machete_mm_... (185.6 s elapsed time)
15.5 weighted s to build ...machete_mm_... (185.9 s elapsed time)
15.5 weighted s to build ...machete_mm_... (186.2 s elapsed time)
37.4 weighted s to build ...scaled_mm_c3x.cu... (449.0 s elapsed time)
43.9 weighted s to build ...scaled_mm_c2x.cu... (527.4 s elapsed time)
344.8 weighted s to build ...attention_...cu.o (1087.2 s elapsed time)
1110.0 s weighted time (10120.4 s elapsed time sum, 9.1x parallelism)
134 build steps completed, average of 0.12/s
```
"""
import argparse
import errno
import fnmatch
import os
import sys
from collections import defaultdict
# The number of long build times to report:
long_count = 10
# The number of long times by extension to report
long_ext_count = 10
class Target:
"""Represents a single line read for a .ninja_log file."""
def __init__(self, start, end):
"""Creates a target object by passing in the start/end times in seconds
as a float."""
self.start = start
self.end = end
# A list of targets, appended to by the owner of this object.
self.targets = []
self.weighted_duration = 0.0
def Duration(self):
"""Returns the task duration in seconds as a float."""
return self.end - self.start
def SetWeightedDuration(self, weighted_duration):
"""Sets the duration, in seconds, passed in as a float."""
self.weighted_duration = weighted_duration
def WeightedDuration(self):
"""Returns the task's weighted duration in seconds as a float.
Weighted_duration takes the elapsed time of the task and divides it
by how many other tasks were running at the same time. Thus, it
represents the approximate impact of this task on the total build time,
with serialized or serializing steps typically ending up with much
longer weighted durations.
weighted_duration should always be the same or shorter than duration.
"""
# Allow for modest floating-point errors
epsilon = 0.000002
if (self.weighted_duration > self.Duration() + epsilon):
print('{} > {}?'.format(self.weighted_duration, self.Duration()))
assert (self.weighted_duration <= self.Duration() + epsilon)
return self.weighted_duration
def DescribeTargets(self):
"""Returns a printable string that summarizes the targets."""
# Some build steps generate dozens of outputs - handle them sanely.
# The max_length was chosen so that it can fit most of the long
# single-target names, while minimizing word wrapping.
result = ', '.join(self.targets)
max_length = 65
if len(result) > max_length:
result = result[:max_length] + '...'
return result
# Copied with some modifications from ninjatracing
def ReadTargets(log, show_all):
"""Reads all targets from .ninja_log file |log_file|, sorted by duration.
The result is a list of Target objects."""
header = log.readline()
assert header == '# ninja log v5\n', \
'unrecognized ninja log version {!r}'.format(header)
targets_dict = {}
last_end_seen = 0.0
for line in log:
parts = line.strip().split('\t')
if len(parts) != 5:
# If ninja.exe is rudely halted then the .ninja_log file may be
# corrupt. Silently continue.
continue
start, end, _, name, cmdhash = parts # Ignore restat.
# Convert from integral milliseconds to float seconds.
start = int(start) / 1000.0
end = int(end) / 1000.0
if not show_all and end < last_end_seen:
# An earlier time stamp means that this step is the first in a new
# build, possibly an incremental build. Throw away the previous
# data so that this new build will be displayed independently.
# This has to be done by comparing end times because records are
# written to the .ninja_log file when commands complete, so end
# times are guaranteed to be in order, but start times are not.
targets_dict = {}
target = None
if cmdhash in targets_dict:
target = targets_dict[cmdhash]
if not show_all and (target.start != start or target.end != end):
# If several builds in a row just run one or two build steps
# then the end times may not go backwards so the last build may
# not be detected as such. However in many cases there will be a
# build step repeated in the two builds and the changed
# start/stop points for that command, identified by the hash,
# can be used to detect and reset the target dictionary.
targets_dict = {}
target = None
if not target:
targets_dict[cmdhash] = target = Target(start, end)
last_end_seen = end
target.targets.append(name)
return list(targets_dict.values())
def GetExtension(target, extra_patterns):
"""Return the file extension that best represents a target.
For targets that generate multiple outputs it is important to return a
consistent 'canonical' extension. Ultimately the goal is to group build steps
by type."""
for output in target.targets:
if extra_patterns:
for fn_pattern in extra_patterns.split(';'):
if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
return fn_pattern
# Not a true extension, but a good grouping.
if output.endswith('type_mappings'):
extension = 'type_mappings'
break
# Capture two extensions if present. For example: file.javac.jar should
# be distinguished from file.interface.jar.
root, ext1 = os.path.splitext(output)
_, ext2 = os.path.splitext(root)
extension = ext2 + ext1 # Preserve the order in the file name.
if len(extension) == 0:
extension = '(no extension found)'
if ext1 in ['.pdb', '.dll', '.exe']:
extension = 'PEFile (linking)'
# Make sure that .dll and .exe are grouped together and that the
# .dll.lib files don't cause these to be listed as libraries
break
if ext1 in ['.so', '.TOC']:
extension = '.so (linking)'
# Attempt to identify linking, avoid identifying as '.TOC'
break
# Make sure .obj files don't get categorized as mojo files
if ext1 in ['.obj', '.o']:
break
# Jars are the canonical output of java targets.
if ext1 == '.jar':
break
# Normalize all mojo related outputs to 'mojo'.
if output.count('.mojom') > 0:
extension = 'mojo'
break
return extension
def SummarizeEntries(entries, extra_step_types):
"""Print a summary of the passed in list of Target objects."""
# Create a list that is in order by time stamp and has entries for the
# beginning and ending of each build step (one time stamp may have multiple
# entries due to multiple steps starting/stopping at exactly the same time).
# Iterate through this list, keeping track of which tasks are running at all
# times. At each time step calculate a running total for weighted time so
# that when each task ends its own weighted time can easily be calculated.
task_start_stop_times = []
earliest = -1
latest = 0
total_cpu_time = 0
for target in entries:
if earliest < 0 or target.start < earliest:
earliest = target.start
if target.end > latest:
latest = target.end
total_cpu_time += target.Duration()
task_start_stop_times.append((target.start, 'start', target))
task_start_stop_times.append((target.end, 'stop', target))
length = latest - earliest
weighted_total = 0.0
# Sort by the time/type records and ignore |target|
task_start_stop_times.sort(key=lambda times: times[:2])
# Now we have all task start/stop times sorted by when they happen. If a
# task starts and stops on the same time stamp then the start will come
# first because of the alphabet, which is important for making this work
# correctly.
# Track the tasks which are currently running.
running_tasks = {}
# Record the time we have processed up to so we know how to calculate time
# deltas.
last_time = task_start_stop_times[0][0]
# Track the accumulated weighted time so that it can efficiently be added
# to individual tasks.
last_weighted_time = 0.0
# Scan all start/stop events.
for event in task_start_stop_times:
time, action_name, target = event
# Accumulate weighted time up to now.
num_running = len(running_tasks)
if num_running > 0:
# Update the total weighted time up to this moment.
last_weighted_time += (time - last_time) / float(num_running)
if action_name == 'start':
# Record the total weighted task time when this task starts.
running_tasks[target] = last_weighted_time
if action_name == 'stop':
# Record the change in the total weighted task time while this task
# ran.
weighted_duration = last_weighted_time - running_tasks[target]
target.SetWeightedDuration(weighted_duration)
weighted_total += weighted_duration
del running_tasks[target]
last_time = time
assert (len(running_tasks) == 0)
# Warn if the sum of weighted times is off by more than half a second.
if abs(length - weighted_total) > 500:
print('Warning: Possible corrupt ninja log, results may be '
'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
length, weighted_total))
entries_by_ext = defaultdict(list)
for target in entries:
extension = GetExtension(target, extra_step_types)
entries_by_ext[extension].append(target)
for key, values in entries_by_ext.items():
print(' Longest build steps for {}:'.format(key))
values.sort(key=lambda x: x.WeightedDuration())
for target in values[-long_count:]:
print(
' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
format(target.WeightedDuration(), target.DescribeTargets(),
target.Duration()))
print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
'parallelism)'.format(length, total_cpu_time,
total_cpu_time * 1.0 / length))
print(' %d build steps completed, average of %1.2f/s' %
(len(entries), len(entries) / (length)))
def main():
log_file = '.ninja_log'
parser = argparse.ArgumentParser()
parser.add_argument('-C', dest='build_directory', help='Build directory.')
parser.add_argument(
'-s',
'--step-types',
help='semicolon separated fnmatch patterns for build-step grouping')
parser.add_argument('--log-file',
help="specific ninja log file to analyze.")
args, _extra_args = parser.parse_known_args()
if args.build_directory:
log_file = os.path.join(args.build_directory, log_file)
if args.log_file:
log_file = args.log_file
if args.step_types:
# Make room for the extra build types.
global long_ext_count
long_ext_count += len(args.step_types.split(';'))
try:
with open(log_file) as log:
entries = ReadTargets(log, False)
SummarizeEntries(entries, args.step_types)
except OSError:
print('Log file {!r} not found, no build summary created.'.format(
log_file))
return errno.ENOENT
if __name__ == '__main__':
sys.exit(main())

22
vllm-v0.6.2/tools/shellcheck.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
set -e
scversion="stable"
if [ -d "shellcheck-${scversion}" ]; then
export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
fi
if ! [ -x "$(command -v shellcheck)" ]; then
if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then
echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing"
exit 1
fi
# automatic local install if linux x86_64
wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
fi
# TODO - fix warnings in .buildkite/run-amd-test.sh
find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'

View File

@@ -0,0 +1,23 @@
### 1. Auto-tuning tool for max_num_seqs in non-paged mode
On the MLU370X8 platform, performance in unpaged mode can be improved by tuning `max_num_seqs`. `tune_max_num_seqs.py` searches for the best `max_num_seqs` value automatically.
- Usage example
Search for the `max_num_seqs` value that maximizes throughput under a fixed configuration; the remaining arguments are kept consistent with `benchmark_latency.py`/`benchmark_throughput.py`.
```bash
python tools/utils/tune_max_num_seqs.py --backend vllm --input-len 1024 --output-len 1024 --model /Path/to/Llama-2-70b-chat-hf/ -tp 1 --max-model-len 4096 --dtype float16 --num-prompts 10
```
Running the command above finds the optimal `max_num_seqs` value, which is then passed in as a parameter when constructing the LLM object, as sketched below.
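A minimal sketch of passing the tuned value when constructing the LLM object; the model path and the tuned number below are placeholders, not values produced by this document:
```python
from vllm import LLM

# Hypothetical values: replace with your own model path and the max_num_seqs
# value reported by tune_max_num_seqs.py.
llm = LLM(
    model="/Path/to/Llama-2-70b-chat-hf/",
    tensor_parallel_size=1,
    max_model_len=4096,
    dtype="float16",
    max_num_seqs=192,  # tuned value from the search above
)
```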
### 2. vLLM scheduling analysis helper
First set the environment variable to enable scheduler profiling: export VLLM_SCHEDULER_PROFILE=true
For offline tests, the data is saved automatically when the test finishes and the information of the requests that have run is printed.
For online tests, obtain the scheduling data as follows:
1. Start the server
2. Run the client-side test
3. As soon as the client test finishes, run python3 tools/utils/post_scheduler_view_action.py --host [server IP] --port [server port] --action save to ask the server to save the data
4. The server prints the information of the requests that have run
5. To run another client test against the same server, first run python3 tools/utils/post_scheduler_view_action.py --host [server IP] --port [server port] --action init to reset the server, then repeat steps 2, 3 and 4

View File

@@ -0,0 +1,27 @@
import argparse
import requests
""" Post a request to server, let server init/save scheduler view. """
def post_http_request(api_url: str, action: str) -> requests.Response:
headers = {"User-Agent": "Test Client"}
pload = {
"model": action,
"prompt": "",
"n": 1,
"temperature": 0.0,
"max_tokens": 16,
"stream": True,
}
response = requests.post(api_url, headers=headers, json=pload, stream=True)
return response
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=6000)
parser.add_argument("--action", type=str, default="save", choices=['init', 'save'])
args = parser.parse_args()
api_url = f"http://{args.host}:{args.port}/v1/completions"
post_http_request(api_url, f"{args.action}_scheduler_view")

View File

@@ -0,0 +1,181 @@
"""Autotune max_num_seqs paramter."""
# pylint: skip-file
import argparse
import random
from typing import Dict, Any
from tqdm import tqdm
def run_vllm(config: Dict[str, Any]) -> float:
"""Initialize and run an instance of a language model (LLM) using the
`vllm` library."""
print(f'Evaluate with max_num_seqs: {config["max_num_seqs"]}')
from vllm import LLM
llm = LLM(**config)
print(f'The num of gpu blocks is: {llm.llm_engine.cache_config.num_gpu_blocks}')
return llm.llm_engine.cache_config.num_gpu_blocks
def main(args: argparse.Namespace):
"""The entry function to tune max_num_seqs."""
print(args)
random.seed(args.seed)
config = {
'model': args.model,
'tokenizer': args.tokenizer,
'quantization': args.quantization,
'tensor_parallel_size': args.tensor_parallel_size,
'seed': args.seed,
'trust_remote_code': args.trust_remote_code,
'dtype': args.dtype,
'max_model_len': args.max_model_len,
'enforce_eager': args.enforce_eager,
'kv_cache_dtype': args.kv_cache_dtype,
'quantization_param_path': args.quantization_param_path,
'device': args.device,
'enable_prefix_caching': args.enable_prefix_caching,
'enable_chunked_prefill': args.enable_chunked_prefill,
'max_num_batched_tokens': args.max_num_batched_tokens,
'gpu_memory_utilization': args.gpu_memory_utilization,
'download_dir': args.download_dir,
'block_size': args.block_size
}
import multiprocessing
def worker_wrapper(config, output_queue):
"""Here we get the num_gpu_blocks by instantiate a llm object."""
result = run_vllm(config)
output_queue.put(result)
def get_num_gpu_blocks(cache, num_seqs) -> int:
"""Get the number of GPU blocks with parameter num_seqs."""
if num_seqs in cache:
return cache[num_seqs]
# Here since we cannot manually release the resources held by Ray and NCCL,
# we evaluate a set of parameters by launching a separate process.
config['max_num_seqs'] = num_seqs
output_queue = multiprocessing.Queue()
process = multiprocessing.Process(target=worker_wrapper,
args=(config, output_queue))
process.start()
process.join()
result = output_queue.get()
cache[num_seqs] = result
return result
def find_optimal_max_num_seqs(init=256) -> int:
"""Search th optimal max_num_seqs which maximizes
min(max_num_seqs, num_gpu_blocks)."""
# Use cache to avoid repeated evaluations.
cache = {}
# Initialize the search range.
num_blocks = get_num_gpu_blocks(cache, init)
left, right = min(num_blocks, init), max(num_blocks, init)
# Binary search.
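# Assumption: num_gpu_blocks is non-increasing as max_num_seqs grows, so
# min(max_num_seqs, num_gpu_blocks) is maximized near the point where they cross.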
while 0 < left < right:
mid = (left + right) // 2
num_blocks = get_num_gpu_blocks(cache, mid)
if num_blocks == mid:
return mid
if num_blocks > mid:
left = mid + 1
else:
right = mid - 1
left = max(min(mid, num_blocks), left)
right = min(max(mid, num_blocks), right)
left, right = max(1, left), max(1, right)
final_left = min(left, get_num_gpu_blocks(cache, left))
final_right = min(right, get_num_gpu_blocks(cache, right))
return right if final_right > final_left else left
max_num_seqs = find_optimal_max_num_seqs()
print(f'The optimal max_num_seqs is {max_num_seqs}.')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Tune max_num_seqs.")
parser.add_argument("--backend", type=str, choices=["vllm"], default="vllm")
parser.add_argument("--dataset", type=str, default=None,
help="Path to the dataset.")
parser.add_argument("--input-len", type=int, default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len", type=int, default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization', '-q',
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n", type=int, default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts", type=int, default=1000,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size", type=int, default=None,
help="Maximum batch size for HF backend.")
parser.add_argument("--block-size", type=int, default=-1)
parser.add_argument('--trust-remote-code', action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len', type=int, default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype', type=str, default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization', type=float, default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager", action="store_true",
help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype", type=str, choices=["auto", "fp8"], default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
parser.add_argument(
'--quantization-param-path', type=str, default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device", type=str, default="cuda", choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
parser.add_argument(
"--enable-prefix-caching", action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill", action='store_true',
help="enable chunked prefill for vLLM backend.")
parser.add_argument('--max-num-batched-tokens', type=int, default=None,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--download-dir', type=str, default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
cli_args = parser.parse_args()
if cli_args.tokenizer is None:
cli_args.tokenizer = cli_args.model
if cli_args.dataset is None:
assert cli_args.input_len is not None
assert cli_args.output_len is not None
else:
assert cli_args.input_len is None
main(cli_args)