From f63c1341d9453d4e1c687f795542f6bb28378ce2 Mon Sep 17 00:00:00 2001
From: 1092626063 <1092626063@qq.com>
Date: Fri, 9 Jan 2026 16:07:42 +0800
Subject: [PATCH] [Feature] GLM4.6 support mtp with fullgraph (#5460)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Enable GLM4.6 to run MTP speculative decoding with full-graph capture
(`"cudagraph_mode": "FULL_DECODE_ONLY"`) to improve decode performance.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

```bash
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE=AIV

vllm serve /weight/glm4.6_w8a8_with_float_mtp \
  --data-parallel-size 1 \
  --tensor-parallel-size 16 \
  --seed 1024 \
  --served-model-name glm \
  --max-model-len 35000 \
  --max-num-batched-tokens 16384 \
  --max-num-seqs 16 \
  --trust-remote-code \
  --gpu-memory-utilization 0.9 \
  --speculative-config '{"num_speculative_tokens": 1, "model":"/weight/glm4.6_w8a8_with_float_mtp", "method":"mtp"}' \
  --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
  --async-scheduling
```

Test case:

```bash
vllm bench serve \
  --backend vllm \
  --dataset-name prefix_repetition \
  --prefix-repetition-prefix-len 22400 \
  --prefix-repetition-suffix-len 9600 \
  --prefix-repetition-output-len 1024 \
  --num-prompts 1 \
  --prefix-repetition-num-prefixes 1 \
  --ignore-eos \
  --model glm \
  --tokenizer /weight/glm4.6_w8a8_with_float_mtp \
  --seed 1000 \
  --host 0.0.0.0 \
  --port 8000 \
  --endpoint /v1/completions \
  --max-concurrency 1 \
  --request-rate 1
```

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Signed-off-by: 1092626063 <1092626063@qq.com>
---
 .../e2e/nightly/single_node/models/test_glm4_5.py | 14 +++++++++-----
 vllm_ascend/quantization/quant_config.py          | 10 +++++++++-
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/nightly/single_node/models/test_glm4_5.py b/tests/e2e/nightly/single_node/models/test_glm4_5.py
index 1255ddd0..49809cfb 100644
--- a/tests/e2e/nightly/single_node/models/test_glm4_5.py
+++ b/tests/e2e/nightly/single_node/models/test_glm4_5.py
@@ -29,6 +29,7 @@ MODELS = [
 TENSOR_PARALLELS = [8]
 DATA_PARALLELS = [2]
+FULL_GRAPH = [True, False]
 
 prompts = [
     "San Francisco is a",
@@ -65,11 +66,9 @@ aisbench_cases = [{
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_models(
-    model: str,
-    tp_size: int,
-    dp_size: int,
-) -> None:
+@pytest.mark.parametrize("full_graph", FULL_GRAPH)
+async def test_models(model: str, tp_size: int, dp_size: int,
+                      full_graph: bool) -> None:
     port = get_open_port()
     env_dict = {"HCCL_BUFFSIZE": "1024"}
     server_args = [
@@ -91,6 +90,11 @@ async def test_models(
         "--gpu-memory-utilization", "0.9",
     ]
+    if full_graph:
+        server_args += [
+            "--compilation-config",
+            '{"cudagraph_capture_sizes": [1,2,4,8,16], "cudagraph_mode": "FULL_DECODE_ONLY"}'
+        ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index f6a98241..1d0ddd74 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -173,7 +173,15 @@ class AscendQuantConfig(QuantizationConfig):
                     "are quantized. All shards of fused layers "
                    "to have the same precision.")
         else:
-            is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
+            # NOTE: In GLM4.6, the MTP draft model shares its LM head weights
+            # with the main model. Before `load_weights()` runs, some parameter
+            # names may lack the expected prefix and be visible only with a
+            # ".head" suffix, which would raise a KeyError at load time, so
+            # fall back to "lm_head.weight" in that case.
+            key = prefix + '.weight'
+            if key not in self.quant_description and ".head" in prefix:
+                key = 'lm_head.weight'
+            is_skipped = self.quant_description[key] == "FLOAT"
 
         assert is_skipped is not None
         return is_skipped
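
Reviewer note: the quant_config.py hunk is the functional core of this patch. Below is a minimal, self-contained sketch of the key-fallback lookup it introduces, for illustration only; the `quant_description` entries and the `draft_model.head` prefix are hypothetical stand-ins, not values read from a real GLM4.6 checkpoint.

```python
# Illustrative sketch of the lookup fallback added by this patch.
# The dict contents and layer prefixes below are hypothetical examples.
quant_description = {
    "model.layers.0.mlp.gate_proj.weight": "W8A8",  # quantized layer
    "lm_head.weight": "FLOAT",  # shared LM head kept in float
}

def is_layer_skipped(prefix: str) -> bool:
    key = prefix + ".weight"
    # Before load_weights() runs, the MTP draft head may only be visible as
    # "<name>.head"; its weights are shared with lm_head, so redirect the
    # lookup instead of raising a KeyError.
    if key not in quant_description and ".head" in prefix:
        key = "lm_head.weight"
    return quant_description[key] == "FLOAT"

assert is_layer_skipped("model.layers.0.mlp.gate_proj") is False
assert is_layer_skipped("draft_model.head") is True  # resolved via the fallback
```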