From 4a254ba59ad80cd439b090aa0f6440ad2f1661ac Mon Sep 17 00:00:00 2001 From: aipaes <82140963+aipaes@users.noreply.github.com> Date: Thu, 23 Apr 2026 14:42:28 +0800 Subject: [PATCH] [Doc] [v0.18.0]Fix glm4.7 readme v18 (#8460) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? update GLM4.7 doc. Fix configuration issues, including:VLLM_ASCEND_ENABLE_FLASHCOMM1、VLLM_ASCEND_BALANCE_SCHEDULING、VLLM_NIXL_ABORT_REQUEST_TIMEOUT etc. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? doc test --------- Signed-off-by: zjks98 Signed-off-by: aipaes <82140963+aipaes@users.noreply.github.com> Co-authored-by: zjks98 --- docs/source/tutorials/models/GLM4.x.md | 34 +++++++++----------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/docs/source/tutorials/models/GLM4.x.md b/docs/source/tutorials/models/GLM4.x.md index e591931d..a167ecec 100644 --- a/docs/source/tutorials/models/GLM4.x.md +++ b/docs/source/tutorials/models/GLM4.x.md @@ -145,7 +145,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_OP_EXPANSION_MODE=AIV export VLLM_ASCEND_BALANCE_SCHEDULING=1 export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 -export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \ --data-parallel-size 2 \ @@ -160,7 +159,7 @@ vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \ --quantization ascend \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \ --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \ --additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}'\ ``` @@ -170,6 +169,7 @@ The parameters are explained as 
follows: - `--async-scheduling` Asynchronous scheduling is a technique used to optimize inference efficiency. It allows non-blocking task scheduling to improve concurrency and throughput, especially when processing large-scale models. - `fusion_ops_gmmswigluquant` The performance of the GmmSwigluQuant fusion operator tends to degrade when the total number of NPUs is ≤ 16. +- `VLLM_ASCEND_ENABLE_FLASHCOMM1` Because the padding data introduced by this feature invalidates the FD feature of the FIA operator, we recommend disabling the `flashcomm1` feature for long-sequence (≥16k) and low-concurrency (≤8 batch size) scenarios. For long-sequence and high-concurrency scenarios, you may enable this feature to achieve improved Prefill performance. ### Multi-node Deployment @@ -196,7 +196,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_OP_EXPANSION_MODE=AIV export VLLM_ASCEND_BALANCE_SCHEDULING=1 export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 -export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \ --host 0.0.0.0 \ @@ -220,7 +219,7 @@ vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \ --reasoning-parser glm45 \ --tool-call-parser glm47 \ --served-model-name glm47 \ - --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \ --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \ --additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' ``` @@ -247,7 +246,6 @@ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_OP_EXPANSION_MODE=AIV export VLLM_ASCEND_BALANCE_SCHEDULING=1 export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 -export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \ --host 0.0.0.0 \ @@ -272,7 +270,7 @@ vllm serve 
Eco-Tech/GLM-4.7-W8A8-floatmtp \ --reasoning-parser glm45 \ --tool-call-parser glm47 \ --served-model-name glm47 \ - --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \ --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \ --additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' ``` @@ -407,9 +405,7 @@ Before you start, please export ASCEND_TRANSPORT_PRINT=1 export ACL_OP_INIT_MODE=1 export ASCEND_A3_ENABLE=1 - export VLLM_ASCEND_BALANCE_SCHEDULING=1 export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 - export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 export ASCEND_RT_VISIBLE_DEVICES=$1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH @@ -431,7 +427,7 @@ Before you start, please --gpu-memory-utilization 0.9 \ --quantization ascend \ --enforce-eager \ - --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \ --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile", "torch_profiler_with_stack": false}' \ --additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \ --kv-transfer-config \ @@ -472,9 +468,7 @@ Before you start, please export ASCEND_TRANSPORT_PRINT=1 export ACL_OP_INIT_MODE=1 export ASCEND_A3_ENABLE=1 - export VLLM_ASCEND_BALANCE_SCHEDULING=1 export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 - export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 export ASCEND_RT_VISIBLE_DEVICES=$1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH @@ -496,14 +490,14 @@ Before you start, please 
--gpu-memory-utilization 0.9 \ --quantization ascend \ --enforce-eager \ - --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \ --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile", "torch_profiler_with_stack": false}' \ --additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \ --kv-transfer-config \ '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", - "kv_port": "30000", - "engine_id": "0", + "kv_port": "30100", + "engine_id": "1", "kv_connector_extra_config": { "prefill": { "dp_size": 2, @@ -536,12 +530,9 @@ Before you start, please export ASCEND_TRANSPORT_PRINT=1 export ACL_OP_INIT_MODE=1 export ASCEND_A3_ENABLE=1 - export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=300000 export TASK_QUEUE_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH - export VLLM_ASCEND_BALANCE_SCHEDULING=1 export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 - export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 export VLLM_ASCEND_ENABLE_FUSED_MC2=1 export ASCEND_RT_VISIBLE_DEVICES=$1 @@ -563,12 +554,12 @@ Before you start, please --async-scheduling \ --gpu-memory-utilization 0.9 \ --quantization ascend \ - --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \ --profiler-config \ '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile", "torch_profiler_with_stack": false}' \ - --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,24,26,28,30,32,64,128,256,512]}' \ + --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", 
"cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,24,26,28,30,32,64,128,256,512]}' \ --additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \ --kv-transfer-config \ '{"kv_connector": "MooncakeConnectorV1", @@ -607,12 +598,9 @@ Before you start, please export ASCEND_TRANSPORT_PRINT=1 export ACL_OP_INIT_MODE=1 export ASCEND_A3_ENABLE=1 - export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=300000 export TASK_QUEUE_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH - export VLLM_ASCEND_BALANCE_SCHEDULING=1 export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 - export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 export VLLM_ASCEND_ENABLE_FUSED_MC2=1 export ASCEND_RT_VISIBLE_DEVICES=$1 @@ -634,7 +622,7 @@ Before you start, please --async-scheduling \ --gpu-memory-utilization 0.9 \ --quantization ascend \ - --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' \ --profiler-config \ '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile",