From eab3635850ba351af81d76a7b4b3db46ffb7f697 Mon Sep 17 00:00:00 2001 From: wyu0-0 Date: Thu, 11 Sep 2025 22:15:19 +0800 Subject: [PATCH] [Bugfix] Retrieve num_redundant_experts from eplb_config in torchair qwen3_moe.py (#2857) ### What this PR does / why we need it? This PR addresses a configuration retrieval issue related to EPLB (Expert Parallel Load Balancing) settings in qwen3_moe.py. The key change is adjusting the source of num_redundant_experts to correctly fetch from the eplb_config sub-structure within parallel_config, rather than directly from parallel_config. This aligns with the updated configuration hierarchy for EPLB-related parameters. This change references `vllm_ascend/models/qwen3_moe.py` https://github.com/vllm-project/vllm-ascend/blob/main/vllm_ascend/models/qwen3_moe.py#L255-L257 ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? Ran the bash script below and the test passed: ``` source /sfs_turbo/humpy/B080/cann_b080/ascend-toolkit/set_env.sh source /sfs_turbo/humpy/B080/cann_b080/nnal/atb/set_env.sh #export HCCL_BUFFSIZE=300 # export HCCL_SOCKET_IFNAME="eth0" # export TP_SOCKET_IFNAME="eth0" # export GLOO_SOCKET_IFNAME="eth0" # export HCCL_IF_IP=33.215.118.231 export VLLM_USE_V1=1 export VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ=1 export TASK_QUEUE_ENABLE=1 # export VLLM_VERSION=0.9.1 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_OP_EXPANSION_MODE="AIV" export HCCL_INTRA_PCIE_ENABLE=1 export HCCL_INTRA_ROCE_ENABLE=0 rm -rf ./.torchair_cache/ rm -rf ./dynamo_* rm -rf /root/ascend/log/debug/plog/* python -m vllm.entrypoints.openai.api_server \ --model=/sfs_turbo/tzq/model/Qwen/Qwen3-235B-A22B/ \ --served-model-name auto \ --port 8006 \ -tp 1 \ -dp 16 \ --enable_expert_parallel \ --max-num-seqs 48 \ --max-model-len 32768 \ --gpu-memory-utilization 0.95 \ --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes_init":false,"graph_batch_sizes":[1, 8, 16, 24, 
48]}, "ascend_scheduler_config":{"enabled":false}, "refresh":true}' \ --kv-transfer-config \ '{ "kv_connector": "SharedStorageConnector", "kv_buffer_device": "npu", "kv_role": "kv_consumer", "kv_parallel_size": 2, "kv_port": "20002", "engine_id": "decode-'${NODE_RANK}'", "kv_rank": 1, "kv_connector_extra_config": { "prefill": { "dp_size": 1, "tp_size": 16 }, "decode": { "dp_size": 16, "tp_size": 1 } } }' \ 2>&1 disown ``` - vLLM version: main - vLLM main: https://github.com/vllm-project/vllm/commit/0ae43dbf8cb28a299ae724fc742b0c5bcddea868 Signed-off-by: wyu0-0 --- vllm_ascend/torchair/models/qwen3_moe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/torchair/models/qwen3_moe.py b/vllm_ascend/torchair/models/qwen3_moe.py index dd4a592..eaed918 100644 --- a/vllm_ascend/torchair/models/qwen3_moe.py +++ b/vllm_ascend/torchair/models/qwen3_moe.py @@ -394,7 +394,8 @@ class CustomQwen3MoeModel(Qwen3MoeModel): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config - self.num_redundant_experts = parallel_config.num_redundant_experts + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config