[EPLB][Bugfix] Set parallel_config.enable_eplb to true to load redundant experts (#7470)
### What this PR does / why we need it?
PR https://github.com/vllm-project/vllm/pull/37136 broke EPLB because it filters out redundant experts during weight loading. PR https://github.com/vllm-project/vllm/pull/37322 fixed this by using `parallel_config.enable_eplb` to decide whether to skip the weight-loading filter. In vllm-ascend, however, `parallel_config.enable_eplb` is always false, so when EPLB is used we temporarily set it to true before loading the model, which lets the redundant experts be loaded.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

| dataset  | version | metric   | mode | vllm-api-stream-chat |
| -------- | ------- | -------- | ---- | -------------------- |
| aime2024 | 604a78  | accuracy | gen  | 86.67                |

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
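To make the failure mode concrete, here is a minimal, self-contained sketch of the interaction described above. It is not the real vLLM or vllm-ascend code: `ParallelConfig` is a stand-in dataclass, `filter_expert_weights` is a hypothetical stand-in for the weight-loading filter introduced by vllm#37136 and gated on `parallel_config.enable_eplb` by vllm#37322, and the `"redundant_expert"` naming is illustrative only. Only the runner-side behavior (deriving an EPLB flag from the Ascend config and flipping `enable_eplb` before the model is loaded) mirrors what this PR actually does.

```python
# Minimal sketch of the interaction described above; everything here is a
# stand-in, not the real vLLM / vllm-ascend code.
from dataclasses import dataclass


@dataclass
class ParallelConfig:
    # In vllm-ascend this stays False, because EPLB is driven by the Ascend
    # config rather than by vLLM's own enable_eplb flag.
    enable_eplb: bool = False


def filter_expert_weights(weight_names: list[str],
                          parallel_config: ParallelConfig) -> list[str]:
    """Hypothetical stand-in for the weight-loading filter from vllm#37136:
    after vllm#37322 it is skipped when enable_eplb is True, so the
    redundant-expert weights survive loading."""
    if parallel_config.enable_eplb:
        return weight_names
    return [n for n in weight_names if "redundant_expert" not in n]


def load_model_with_eplb(parallel_config: ParallelConfig, eplb_enable: bool,
                         weight_names: list[str]) -> list[str]:
    # What this PR does in NPUModelRunner.load_model: flip the flag before the
    # model (and its expert weights) is materialized.
    if eplb_enable:
        parallel_config.enable_eplb = True
    return filter_expert_weights(weight_names, parallel_config)


if __name__ == "__main__":
    names = ["experts.w13_weight", "redundant_expert.w13_weight"]
    # Before this PR: the flag stays False and the redundant expert is dropped.
    print(load_model_with_eplb(ParallelConfig(), eplb_enable=False, weight_names=names))
    # After this PR: eplb_enable (derived from ascend_config.eplb_config) keeps it.
    print(load_model_with_eplb(ParallelConfig(), eplb_enable=True, weight_names=names))
```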
@@ -76,12 +76,6 @@ def test_qwen3_moe_distributed_aiv_tp2():
 
 @pytest.mark.asyncio
 async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
-    from vllm_ascend.utils import vllm_version_is
-
-    if not vllm_version_is("0.17.0"):
-        pytest.skip(
-            "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
-        )
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     port = get_open_port()
     compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
@@ -364,6 +364,7 @@ class NPUModelRunner(GPUModelRunner):
 
         eplb_config = self.ascend_config.eplb_config
         self.dynamic_eplb = eplb_config.dynamic_eplb
+        self.eplb_enable = self.dynamic_eplb or (eplb_config.expert_map_path is not None)
         if self.dynamic_eplb:
            self.is_eplb_warmuped = False
            self.policy_type = eplb_config.eplb_policy_type
@@ -2554,7 +2555,9 @@ class NPUModelRunner(GPUModelRunner):
         logger.info("Starting to load model %s...", self.model_config.model)
 
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
-            self.model = get_model(vllm_config=self.vllm_config)
+            if self.eplb_enable:
+                self.vllm_config.parallel_config.enable_eplb = True
+            self.model: nn.Module = get_model(vllm_config=self.vllm_config)
             if self.dynamic_eplb:
                 model_register(self.model)
             if self.drafter:
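For completeness, a hypothetical end-to-end usage sketch of the path this commit fixes: serving the W8A8 Qwen3 MoE model from the restored test with dynamic EPLB turned on, so that `load_model` takes the new `enable_eplb = True` branch. The `additional_config` keys below (`eplb_config`, `dynamic_eplb`, `expert_map_path`) are assumptions that mirror the `ascend_config.eplb_config` fields read in the diff; check the vllm-ascend docs for the exact schema of your version.

```python
# Hypothetical launch sketch (offline API). The additional_config schema is an
# assumption mirroring ascend_config.eplb_config in the diff, not a verified API.
from vllm import LLM, SamplingParams

llm = LLM(
    model="vllm-ascend/Qwen3-30B-A3B-W8A8",  # model used by the restored test
    tensor_parallel_size=2,
    enable_expert_parallel=True,
    additional_config={
        "eplb_config": {
            "dynamic_eplb": True,  # assumed key, mirrors eplb_config.dynamic_eplb
            # "expert_map_path": "/path/to/expert_map.json",  # static-map alternative
        }
    },
)
print(llm.generate(["Hello"], SamplingParams(max_tokens=8))[0].outputs[0].text)
```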