diff --git a/docs/source/tutorials/models/GLM4.x.md b/docs/source/tutorials/models/GLM4.x.md
index ed331399..e591931d 100644
--- a/docs/source/tutorials/models/GLM4.x.md
+++ b/docs/source/tutorials/models/GLM4.x.md
@@ -146,7 +146,6 @@ export HCCL_OP_EXPANSION_MODE=AIV
export VLLM_ASCEND_BALANCE_SCHEDULING=1
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-export VLLM_ASCEND_ENABLE_FUSED_MC2=1

vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
    --data-parallel-size 2 \
@@ -198,7 +197,6 @@ export HCCL_OP_EXPANSION_MODE=AIV
export VLLM_ASCEND_BALANCE_SCHEDULING=1
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-export VLLM_ASCEND_ENABLE_FUSED_MC2=1

vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
    --host 0.0.0.0 \
@@ -250,7 +248,6 @@ export HCCL_OP_EXPANSION_MODE=AIV
export VLLM_ASCEND_BALANCE_SCHEDULING=1
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-export VLLM_ASCEND_ENABLE_FUSED_MC2=1

vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
    --host 0.0.0.0 \
@@ -413,7 +410,6 @@ Before you start, please
    export VLLM_ASCEND_BALANCE_SCHEDULING=1
    export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
    export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-    export VLLM_ASCEND_ENABLE_FUSED_MC2=1

    export ASCEND_RT_VISIBLE_DEVICES=$1
    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
@@ -479,7 +475,6 @@ Before you start, please
    export VLLM_ASCEND_BALANCE_SCHEDULING=1
    export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
    export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-    export VLLM_ASCEND_ENABLE_FUSED_MC2=1

    export ASCEND_RT_VISIBLE_DEVICES=$1
    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
diff --git a/docs/source/tutorials/models/GLM5.md b/docs/source/tutorials/models/GLM5.md
index f0710d38..0bf9a5ea 100644
--- a/docs/source/tutorials/models/GLM5.md
+++ b/docs/source/tutorials/models/GLM5.md
@@ -770,8 +770,7 @@ Before you start, please
    export ASCEND_RT_VISIBLE_DEVICES=$1
    export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-
-    export VLLM_ASCEND_ENABLE_FUSED_MC2=1
+
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib

    vllm serve /root/.cache/glm5-w8a8 \
@@ -851,8 +850,7 @@ Before you start, please
    export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
    export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-
-    export VLLM_ASCEND_ENABLE_FUSED_MC2=1
+
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib

    vllm serve /root/.cache/glm5-w8a8 \
@@ -1320,7 +1318,7 @@ python load_balance_proxy_server_example.py \
Some configurations for optimization are shown below:

- `VLLM_ASCEND_ENABLE_FLASHCOMM1`: Enable FlashComm optimization to reduce communication and computation overhead on prefill node. With FlashComm enabled, layer_sharding list cannot include o_proj as an element.
-- `VLLM_ASCEND_ENABLE_FUSED_MC2`: Enable following fused operators: dispatch_gmm_combine_decode and dispatch_ffn_combine operator.
+- `VLLM_ASCEND_ENABLE_FUSED_MC2`: Enable the fused operators dispatch_gmm_combine_decode and dispatch_ffn_combine. **Note** that this environment variable can only be enabled on decode nodes.
- `VLLM_ASCEND_ENABLE_MLAPO`: Enable fused operator MlaPreprocessOperation.
Please refer to the following python file for further explanation and restrictions of the environment variables above: [envs.py](https://github.com/vllm-project/vllm-ascend/blob/main/vllm_ascend/envs.py)
diff --git a/docs/source/tutorials/models/MiniMax-M2.md b/docs/source/tutorials/models/MiniMax-M2.md
index fb202177..0b8f4238 100644
--- a/docs/source/tutorials/models/MiniMax-M2.md
+++ b/docs/source/tutorials/models/MiniMax-M2.md
@@ -148,7 +148,6 @@ sysctl kernel.sched_migration_cost_ns=50000
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export TASK_QUEUE_ENABLE=1
-export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ASCEND_BALANCE_SCHEDULING=1
diff --git a/docs/source/tutorials/models/Qwen3-235B-A22B.md b/docs/source/tutorials/models/Qwen3-235B-A22B.md
index 07780448..bc5ea18b 100644
--- a/docs/source/tutorials/models/Qwen3-235B-A22B.md
+++ b/docs/source/tutorials/models/Qwen3-235B-A22B.md
@@ -343,7 +343,6 @@ export HCCL_OP_EXPANSION_MODE="AIV"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export TASK_QUEUE_ENABLE=1

vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \
@@ -389,7 +388,7 @@ Reference test results:

Note:

-1. Setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=1` enables MoE fused operators that reduce time consumption of MoE in both prefill and decode. This is an experimental feature which only supports W8A8 quantization on Atlas A3 servers now. If you encounter any problems when using this feature, you can disable it by setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=0` and update issues in vLLM-Ascend community.
+1. Setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=1` enables MoE fused operators that reduce the time consumption of MoE in decode. This is an experimental feature that currently only supports W8A8 quantization on Atlas A3 servers. If you encounter any problems when using this feature, you can disable it by setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=0` and file an issue in the vLLM-Ascend community. **Note** that this environment variable can only be enabled on decode nodes.
2. Here we disable prefix cache because of random datasets. You can enable prefix cache if requests have long common prefix.

### Three Node A3 -- PD disaggregation
@@ -417,7 +416,6 @@ export HCCL_OP_EXPANSION_MODE="AIV"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
-export VLLM_ASCEND_ENABLE_FUSED_MC2=2
export TASK_QUEUE_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
@@ -618,4 +616,4 @@ Reference test results:

Note:

-1. We recommend to set `export VLLM_ASCEND_ENABLE_FUSED_MC2=2` on this scenario (typically EP32 for Qwen3-235B). This enables a different MoE fusion operator.
+1. We recommend setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=2` in this scenario (typically EP32 for Qwen3-235B). This enables a different MoE fusion operator. **Note** that this environment variable can only be enabled on decode nodes.
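Taken together, the notes above mean that in a PD-disaggregation deployment the flag must be set per node role rather than in a shared environment. The fragment below is a minimal launch-script sketch of that split; `NODE_ROLE` and the serve arguments are illustrative placeholders, not part of the tutorials or of vllm-ascend itself:

```bash
#!/bin/bash
# Hypothetical per-role environment setup. NODE_ROLE is an assumed
# convention ("prefill" or "decode"), not a vllm-ascend variable.
NODE_ROLE=${NODE_ROLE:-decode}

export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export TASK_QUEUE_ENABLE=1

if [ "$NODE_ROLE" = "decode" ]; then
    # Experimental fused MoE operators (W8A8 on Atlas A3 only).
    # Use 2 for large-EP deployments such as EP32 for Qwen3-235B, 1 otherwise.
    export VLLM_ASCEND_ENABLE_FUSED_MC2=2
else
    # The fused MC2 path can only be enabled on decode nodes, so keep it off here.
    export VLLM_ASCEND_ENABLE_FUSED_MC2=0
fi

vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 --host 0.0.0.0 "$@"
```

If anything misbehaves, falling back to `VLLM_ASCEND_ENABLE_FUSED_MC2=0` on every node is the safe default, as the note above suggests.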
diff --git a/docs/source/tutorials/models/Qwen3.5-397B-A17B.md b/docs/source/tutorials/models/Qwen3.5-397B-A17B.md
index c70ff358..fa7fadb4 100644
--- a/docs/source/tutorials/models/Qwen3.5-397B-A17B.md
+++ b/docs/source/tutorials/models/Qwen3.5-397B-A17B.md
@@ -106,7 +106,6 @@ sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl kernel.sched_migration_cost_ns=50000
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
-export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1

vllm serve Eco-Tech/Qwen3.5-397B-A17B-w8a8-mtp \
@@ -303,7 +302,6 @@ To run the vllm-ascend `Prefill-Decode Disaggregation` service, you need to depl
    export VLLM_TORCH_PROFILER_WITH_STACK=0
    export TASK_QUEUE_ENABLE=1
-    export VLLM_ASCEND_ENABLE_FUSED_MC2=1
    export HCCL_OP_EXPANSION_MODE="AIV"

    export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
diff --git a/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml b/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml
index 9391352d..3ba36a4c 100644
--- a/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml
+++ b/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml
@@ -10,7 +10,6 @@ env_common:
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 1
  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
-  VLLM_ASCEND_ENABLE_FUSED_MC2: 2
  TASK_QUEUE_ENABLE: 1
  SERVER_PORT: 8080

@@ -21,6 +20,9 @@ disaggregated_prefill:
deployment:
  -
+    envs:
+      # must be disabled on the prefiller node
+      VLLM_ASCEND_ENABLE_FUSED_MC2: 0
    server_cmd: >
      vllm serve "Qwen/Qwen3-235B-A22B"
      --host 0.0.0.0
@@ -57,6 +59,8 @@
      }'

  -
+    envs:
+      VLLM_ASCEND_ENABLE_FUSED_MC2: 2
    server_cmd: >
      vllm serve "Qwen/Qwen3-235B-A22B"
      --host 0.0.0.0
diff --git a/tests/e2e/nightly/single_node/models/configs/GLM-4.7.yaml b/tests/e2e/nightly/single_node/models/configs/GLM-4.7.yaml
index a99c45ad..343b5a0e 100644
--- a/tests/e2e/nightly/single_node/models/configs/GLM-4.7.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/GLM-4.7.yaml
@@ -12,7 +12,6 @@ _envs: &envs
  VLLM_ASCEND_BALANCE_SCHEDULING: "1"
  VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE: "1"
  VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
-  VLLM_ASCEND_ENABLE_FUSED_MC2: "1"

_server_cmd: &server_cmd
  - "--enable-expert-parallel"
diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml
index bf45fc5a..0a2d0e6f 100644
--- a/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml
@@ -10,7 +10,6 @@ _envs: &envs
  HCCL_BUFFSIZE: "1536"
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
-  VLLM_ASCEND_ENABLE_FUSED_MC2: "1"
  VLLM_ASCEND_ENABLE_NZ: "2"
  VLLM_ASCEND_BALANCE_SCHEDULING: "1"
  SERVER_PORT: "DEFAULT_PORT"
diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3.5-397B-A17B-W8A8-mtp-A3.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3.5-397B-A17B-W8A8-mtp-A3.yaml
index 06264059..cc1be809 100644
--- a/tests/e2e/nightly/single_node/models/configs/Qwen3.5-397B-A17B-W8A8-mtp-A3.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/Qwen3.5-397B-A17B-W8A8-mtp-A3.yaml
@@ -13,7 +13,6 @@ test_cases:
      OMP_NUM_THREADS: "1"
      TASK_QUEUE_ENABLE: "1"
      SERVER_PORT: "DEFAULT_PORT"
-      VLLM_ASCEND_ENABLE_FUSED_MC2: "1"
      VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
    server_cmd:
      - "--tensor-parallel-size"
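The test configs above move `VLLM_ASCEND_ENABLE_FUSED_MC2` out of the shared `env_common` block and into per-deployment `envs`, so only the decoder receives it. A small guard like the one below could catch the flag leaking back into a prefiller environment; it is an illustrative sketch, not part of the test suite:

```bash
# Hypothetical pre-launch guard for a prefiller node: the fused MC2
# operators are decode-only, so refuse to start if the flag is set.
if [ "${VLLM_ASCEND_ENABLE_FUSED_MC2:-0}" != "0" ]; then
    echo "error: VLLM_ASCEND_ENABLE_FUSED_MC2 must be 0 or unset on prefill nodes" >&2
    exit 1
fi
```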