From 14ca1e5cb222ba311c2c0bd56f967f174b694f46 Mon Sep 17 00:00:00 2001
From: offline893 <158537145+offline893@users.noreply.github.com>
Date: Thu, 30 Oct 2025 10:18:07 +0800
Subject: [PATCH] [CI] Fix OOM of deepseek-eplb nightly test. (#3884)

### What this PR does / why we need it?
Fix the OOM failure in the deepseek-eplb nightly test by dropping the
`"init_redundancy_expert": 16` setting from the EPLB nightly test configs and
lowering the per-layer EPLB weight-update log to debug level.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac

---------

Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
---
 tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py     | 2 +-
 tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py | 3 +--
 vllm_ascend/eplb/core/eplb_device_transfer_loader.py       | 2 +-
 vllm_ascend/eplb/eplb_updator.py                           | 1 +
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
index de77145b..26bcfa91 100644
--- a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
@@ -85,7 +85,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
         "--quantization", "ascend", "--gpu-memory-utilization", "0.9",
         "--additional-config", '{"enable_weight_nz_layout":true, '
         '"torch_air_graph_config":{"enabled": true, "enable_multistream_mla": true, "graph_batch_size": [16], "use_cached_graph": true},'
-        '"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, "init_redundancy_expert": 16}'
+        '"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
diff --git a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
index 6814d5d4..52aafa15 100644
--- a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -82,8 +82,7 @@ async def test_models(model: str, tp_size: int) -> None:
         "--quantization", "ascend", "--gpu-memory-utilization", "0.9",
         "--additional-config", '{"enable_weight_nz_layout":true, "dynamic_eplb": true, '
-        '"num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, '
-        '"init_redundancy_expert": 16}'
+        '"num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
diff --git a/vllm_ascend/eplb/core/eplb_device_transfer_loader.py b/vllm_ascend/eplb/core/eplb_device_transfer_loader.py
index 9a8a323f..5c676cdd 100644
--- a/vllm_ascend/eplb/core/eplb_device_transfer_loader.py
+++ b/vllm_ascend/eplb/core/eplb_device_transfer_loader.py
@@ -126,7 +126,7 @@ class D2DExpertWeightLoader:
                 local_expert_to_replace, buffer_tensor_id)

-        logger.info(
+        logger.debug(
             f"[EPLB] finished update expert weight for layer: {self.layer_id}")

         self.recv_expert_list = []
diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py
index e5eee62e..f2a5b695 100644
--- a/vllm_ascend/eplb/eplb_updator.py
+++ b/vllm_ascend/eplb/eplb_updator.py
@@ -77,6 +77,7 @@ class EplbUpdator:
         self.cur_iterations += 1
         if self.cur_iterations == (self.num_iterations_eplb_update + \
                 self.num_wait_worker_iterations + self.num_moe_layers):
+            logger.info("Finish expert parallel load balancing.")
             if self.expert_map_record_path is not None:
                 self.adaptor._export_tensor_to_file(
                     self.shared_dict["expert_maps"],
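
Reviewer note (not part of the commit): a minimal sketch of the trimmed EPLB settings the nightly tests now pass via `--additional-config`, assuming the server accepts them as a single JSON string. `build_eplb_additional_config` is a hypothetical helper introduced only for illustration; the keys and values are taken from the diff above.

```python
import json


def build_eplb_additional_config() -> str:
    """Build the dynamic-EPLB settings used by the nightly tests after this change.

    "init_redundancy_expert" is intentionally omitted: the nightly runs hit OOM
    with it set to 16, which is what this patch fixes.
    """
    config = {
        "enable_weight_nz_layout": True,
        "dynamic_eplb": True,
        "num_iterations_eplb_update": 1000,
        "num_wait_worker_iterations": 200,
    }
    return json.dumps(config)


if __name__ == "__main__":
    # The tests splice this into the server command line, roughly as:
    #   server_args += ["--additional-config", build_eplb_additional_config()]
    print(build_eplb_additional_config())
```

Serializing the dict with `json.dumps` keeps the argument on one line, which avoids the shell-quoting pitfalls of hand-written multi-line JSON in the test args.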