From 4e6dbe0956343d767d738547762be1aa162ef9bf Mon Sep 17 00:00:00 2001
From: LI SHENGYONG <49200266+shenchuxiaofugui@users.noreply.github.com>
Date: Fri, 20 Mar 2026 15:22:55 +0800
Subject: [PATCH] [EPLB][Bugfix] Set parallel_config.enable_eplb to true to
 load redundant experts (#7470)

### What this PR does / why we need it?
PR https://github.com/vllm-project/vllm/pull/37136 broke EPLB because it filters out redundant experts during weight loading. PR https://github.com/vllm-project/vllm/pull/37322 fixed this upstream by using `parallel_config.enable_eplb` to decide whether to skip the weight-loading filter. In vllm-ascend, however, `parallel_config.enable_eplb` is always false, so redundant experts are still filtered out. When EPLB is in use, this patch temporarily sets the flag to true before model loading.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
![Snipaste_2026-03-19_16-13-01](https://github.com/user-attachments/assets/b3a4911e-36b3-4c31-951c-7c091f416d00)

| dataset | version | metric | mode | vllm-api-stream-chat |
| ------- | ------- | ------ | ---- | -------------------- |
| aime2024 | 604a78 | accuracy | gen | 86.67 |

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
---
 tests/e2e/multicard/2-cards/test_qwen3_moe.py | 6 ------
 vllm_ascend/worker/model_runner_v1.py         | 5 ++++-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe.py b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
index 4ce5e33e..385b32e8 100644
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -76,12 +76,6 @@ def test_qwen3_moe_distributed_aiv_tp2():
 
 @pytest.mark.asyncio
 async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
-    from vllm_ascend.utils import vllm_version_is
-
-    if not vllm_version_is("0.17.0"):
-        pytest.skip(
-            "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
-        )
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     port = get_open_port()
     compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 1bf7095e..a49182ce 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -364,6 +364,7 @@ class NPUModelRunner(GPUModelRunner):
 
         eplb_config = self.ascend_config.eplb_config
         self.dynamic_eplb = eplb_config.dynamic_eplb
+        self.eplb_enable = self.dynamic_eplb or (eplb_config.expert_map_path is not None)
         if self.dynamic_eplb:
             self.is_eplb_warmuped = False
             self.policy_type = eplb_config.eplb_policy_type
@@ -2554,7 +2555,9 @@ class NPUModelRunner(GPUModelRunner):
 
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
-            self.model = get_model(vllm_config=self.vllm_config)
+            if self.eplb_enable:
+                self.vllm_config.parallel_config.enable_eplb = True
+            self.model: nn.Module = get_model(vllm_config=self.vllm_config)
             if self.dynamic_eplb:
                 model_register(self.model)
             if self.drafter:
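
For readers outside the vllm-ascend codebase, here is a minimal standalone sketch of the load-time toggle this patch adds. It is an illustration under simplified assumptions: `EplbConfig`, `ParallelConfig`, and `load_model` below are hypothetical stand-ins for `ascend_config.eplb_config`, `vllm_config.parallel_config`, and `NPUModelRunner.load_model`, not the actual classes.

```python
# Hypothetical, simplified sketch of the fix above -- not the real
# vllm-ascend code. EplbConfig, ParallelConfig, and load_model are
# illustrative stand-ins for the actual config objects and runner method.
from dataclasses import dataclass
from typing import Optional


@dataclass
class EplbConfig:
    dynamic_eplb: bool = False             # dynamic expert rebalancing on/off
    expert_map_path: Optional[str] = None  # optional static expert placement map


@dataclass
class ParallelConfig:
    enable_eplb: bool = False  # upstream vLLM flag checked by the weight loader


def load_model(eplb: EplbConfig, parallel: ParallelConfig) -> None:
    # EPLB is in use when dynamic rebalancing is enabled OR a static expert
    # map is supplied; this mirrors the new self.eplb_enable attribute.
    eplb_enable = eplb.dynamic_eplb or eplb.expert_map_path is not None

    if eplb_enable:
        # Flip the upstream flag before loading so the weight loader keeps
        # redundant experts instead of filtering them out.
        parallel.enable_eplb = True

    # get_model(vllm_config=...) would run here and observe the flag.
    print(f"loading model with enable_eplb={parallel.enable_eplb}")


load_model(EplbConfig(dynamic_eplb=True), ParallelConfig())
# -> loading model with enable_eplb=True
```

The design choice this sketch highlights: vllm-ascend derives the flag from its own EPLB config at load time rather than requiring users to set `parallel_config.enable_eplb` themselves, so the upstream weight-loading behavior is reconciled automatically whenever Ascend-side EPLB is active.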