From f63c1341d9453d4e1c687f795542f6bb28378ce2 Mon Sep 17 00:00:00 2001
From: 1092626063 <1092626063@qq.com>
Date: Fri, 9 Jan 2026 16:07:42 +0800
Subject: [PATCH] [Feature] GLM4.6 support mtp with fullgraph (#5460)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Enable GLM4.6 to run MTP speculative decoding with full-graph capture
(`"cudagraph_mode": "FULL_DECODE_ONLY"`) to improve decode performance.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

```bash
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE=AIV

vllm serve /weight/glm4.6_w8a8_with_float_mtp \
  --data-parallel-size 1 \
  --tensor-parallel-size 16 \
  --seed 1024 \
  --served-model-name glm \
  --max-model-len 35000 \
  --max-num-batched-tokens 16384 \
  --max-num-seqs 16 \
  --trust-remote-code \
  --gpu-memory-utilization 0.9 \
  --speculative-config '{"num_speculative_tokens": 1, "model":"/weight/glm4.6_w8a8_with_float_mtp", "method":"mtp"}' \
  --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
  --async-scheduling
```

Test case:

```bash
vllm bench serve \
  --backend vllm \
  --dataset-name prefix_repetition \
  --prefix-repetition-prefix-len 22400 \
  --prefix-repetition-suffix-len 9600 \
  --prefix-repetition-output-len 1024 \
  --num-prompts 1 \
  --prefix-repetition-num-prefixes 1 \
  --ignore-eos \
  --model glm \
  --tokenizer /weight/glm4.6_w8a8_with_float_mtp \
  --seed 1000 \
  --host 0.0.0.0 \
  --port 8000 \
  --endpoint /v1/completions \
  --max-concurrency 1 \
  --request-rate 1
```

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Signed-off-by: 1092626063 <1092626063@qq.com>
---
 .../e2e/nightly/single_node/models/test_glm4_5.py | 14 +++++++++-----
 vllm_ascend/quantization/quant_config.py          | 10 +++++++++-
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/nightly/single_node/models/test_glm4_5.py b/tests/e2e/nightly/single_node/models/test_glm4_5.py
index 1255ddd0..49809cfb 100644
--- a/tests/e2e/nightly/single_node/models/test_glm4_5.py
+++ b/tests/e2e/nightly/single_node/models/test_glm4_5.py
@@ -29,6 +29,7 @@ MODELS = [
 TENSOR_PARALLELS = [8]
 DATA_PARALLELS = [2]
+FULL_GRAPH = [True, False]
 
 prompts = [
     "San Francisco is a",
@@ -65,11 +66,9 @@ aisbench_cases = [{
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_models(
-    model: str,
-    tp_size: int,
-    dp_size: int,
-) -> None:
+@pytest.mark.parametrize("full_graph", FULL_GRAPH)
+async def test_models(model: str, tp_size: int, dp_size: int,
+                      full_graph: bool) -> None:
     port = get_open_port()
     env_dict = {"HCCL_BUFFSIZE": "1024"}
     server_args = [
@@ -91,6 +90,11 @@ async def test_models(
         "--gpu-memory-utilization", "0.9",
     ]
+    if full_graph:
+        server_args += [
+            "--compilation-config",
+            '{"cudagraph_capture_sizes": [1,2,4,8,16], "cudagraph_mode": "FULL_DECODE_ONLY"}'
+        ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index f6a98241..1d0ddd74 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -173,7 +173,15 @@ class AscendQuantConfig(QuantizationConfig):
                     "are quantized. All shards of fused layers "
                    "to have the same precision.")
         else:
-            is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
+            # NOTE: In GLM4.6, the MTP draft model shares its LM head weights
+            # with the main model. Before `load_weights()` runs, some parameter
+            # names may lack the expected prefix and be visible only with a
+            # ".head" suffix, which would raise a KeyError at load time, so
+            # fall back to "lm_head.weight" in that case.
+            key = prefix + '.weight'
+            if key not in self.quant_description and ".head" in prefix:
+                key = 'lm_head.weight'
+            is_skipped = self.quant_description[key] == "FLOAT"
 
         assert is_skipped is not None
         return is_skipped
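
Reviewer note: the quant_config.py hunk is the functional core of this patch. Below is a minimal, self-contained sketch of the key-fallback lookup it introduces, for illustration only; the `quant_description` entries and the `draft_model.head` prefix are hypothetical stand-ins, not values read from a real GLM4.6 checkpoint.

```python
# Illustrative sketch of the lookup fallback added by this patch.
# The dict contents and layer prefixes below are hypothetical examples.
quant_description = {
    "model.layers.0.mlp.gate_proj.weight": "W8A8",  # quantized layer
    "lm_head.weight": "FLOAT",  # shared LM head kept in float
}

def is_layer_skipped(prefix: str) -> bool:
    key = prefix + ".weight"
    # Before load_weights() runs, the MTP draft head may only be visible as
    # "<name>.head"; its weights are shared with lm_head, so redirect the
    # lookup instead of raising a KeyError.
    if key not in quant_description and ".head" in prefix:
        key = "lm_head.weight"
    return quant_description[key] == "FLOAT"

assert is_layer_skipped("model.layers.0.mlp.gate_proj") is False
assert is_layer_skipped("draft_model.head") is True  # resolved via the fallback
```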