[Doc] [v0.18.0]Fix glm4.7 readme v18 (#8460)
### What this PR does / why we need it? update GLM4.7 doc. Fix configuration issues, including: VLLM_ASCEND_ENABLE_FLASHCOMM1, VLLM_ASCEND_BALANCE_SCHEDULING, VLLM_NIXL_ABORT_REQUEST_TIMEOUT, etc. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? doc test --------- Signed-off-by: zjks98 <zhangjiakang4@huawei.com> Signed-off-by: aipaes <82140963+aipaes@users.noreply.github.com> Co-authored-by: zjks98 <zhangjiakang4@huawei.com>
This commit is contained in:
@@ -145,7 +145,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export HCCL_OP_EXPANSION_MODE=AIV
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
|
||||
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
|
||||
|
||||
vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
|
||||
--data-parallel-size 2 \
|
||||
@@ -160,7 +159,7 @@ vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
|
||||
--quantization ascend \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \
|
||||
--compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
|
||||
--additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}'\
|
||||
```
|
||||
@@ -170,6 +169,7 @@ The parameters are explained as follows:
|
||||
|
||||
- `--async-scheduling` Asynchronous scheduling is a technique used to optimize inference efficiency. It allows non-blocking task scheduling to improve concurrency and throughput, especially when processing large-scale models.
|
||||
- `fusion_ops_gmmswigluquant` The performance of the GmmSwigluQuant fusion operator tends to degrade when the total number of NPUs is ≤ 16.
|
||||
- `VLLM_ASCEND_ENABLE_FLASHCOMM1` Because the FD feature of the FIA operator is invalidated by padding data introduced by this feature, we recommend disabling the `flashcomm1` feature for long-sequence (≥16k) and low-concurrency (≤8 batch size) scenarios. For long-sequence and high-concurrency scenarios, you may enable this feature to achieve improved Prefill performance.
|
||||
|
||||
### Multi-node Deployment
|
||||
|
||||
@@ -196,7 +196,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export HCCL_OP_EXPANSION_MODE=AIV
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
|
||||
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
|
||||
|
||||
vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
|
||||
--host 0.0.0.0 \
|
||||
@@ -220,7 +219,7 @@ vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
|
||||
--reasoning-parser glm45 \
|
||||
--tool-call-parser glm47 \
|
||||
--served-model-name glm47 \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \
|
||||
--compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
|
||||
--additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}'
|
||||
```
|
||||
@@ -247,7 +246,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export HCCL_OP_EXPANSION_MODE=AIV
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
|
||||
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
|
||||
|
||||
vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
|
||||
--host 0.0.0.0 \
|
||||
@@ -272,7 +270,7 @@ vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
|
||||
--reasoning-parser glm45 \
|
||||
--tool-call-parser glm47 \
|
||||
--served-model-name glm47 \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \
|
||||
--compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
|
||||
--additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}'
|
||||
```
|
||||
@@ -407,9 +405,7 @@ Before you start, please
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_A3_ENABLE=1
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
|
||||
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=$1
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
|
||||
|
||||
@@ -431,7 +427,7 @@ Before you start, please
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--quantization ascend \
|
||||
--enforce-eager \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \
|
||||
--profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile", "torch_profiler_with_stack": false}' \
|
||||
--additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \
|
||||
--kv-transfer-config \
|
||||
@@ -472,9 +468,7 @@ Before you start, please
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_A3_ENABLE=1
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
|
||||
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=$1
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
|
||||
|
||||
@@ -496,14 +490,14 @@ Before you start, please
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--quantization ascend \
|
||||
--enforce-eager \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \
|
||||
--profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile", "torch_profiler_with_stack": false}' \
|
||||
--additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -536,12 +530,9 @@ Before you start, please
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_A3_ENABLE=1
|
||||
export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=300000
|
||||
export TASK_QUEUE_ENABLE=1
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
|
||||
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
|
||||
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=$1
|
||||
|
||||
@@ -563,12 +554,12 @@ Before you start, please
|
||||
--async-scheduling \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--quantization ascend \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \
|
||||
--profiler-config \
|
||||
'{"profiler": "torch",
|
||||
"torch_profiler_dir": "./vllm_profile",
|
||||
"torch_profiler_with_stack": false}' \
|
||||
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,24,26,28,30,32,64,128,256,512]}' \
|
||||
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,24,26,28,30,32,64,128,256,512]}' \
|
||||
--additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
@@ -607,12 +598,9 @@ Before you start, please
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_A3_ENABLE=1
|
||||
export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=300000
|
||||
export TASK_QUEUE_ENABLE=1
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
|
||||
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
|
||||
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=$1
|
||||
|
||||
@@ -634,7 +622,7 @@ Before you start, please
|
||||
--async-scheduling \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--quantization ascend \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \
|
||||
--profiler-config \
|
||||
'{"profiler": "torch",
|
||||
"torch_profiler_dir": "./vllm_profile",
|
||||
|
||||
Reference in New Issue
Block a user