[EPLB][Bugfix] Reduce unnecessary video memory usage (#6020)
### What this PR does / why we need it?
1. Incorporate the warm-up of the EPLB into the profile run.
2. Reuse the same gather buffer.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
qwen3-235b aime baseline
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 86.67 |
With EPLB enabled, the OOM issue does not occur.
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 86.67 |
- vLLM version: v0.13.0
- vLLM main:
2c24bc6996
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -2266,7 +2266,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
is_profile=is_profile)
|
||||
if is_profile and self.dynamic_eplb:
|
||||
self.model.clear_all_moe_loads()
|
||||
if not is_profile and self.dynamic_eplb:
|
||||
if self.dynamic_eplb:
|
||||
self.eplb_updator.take_update_info_from_eplb_process()
|
||||
self.eplb_updator.forward_end()
|
||||
return hidden_states, hidden_states
|
||||
@@ -2293,6 +2293,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
return output
|
||||
|
||||
def profile_run(self) -> None:
|
||||
self.eplb_warmup()
|
||||
mc2_tokens_capacity = get_mc2_tokens_capacity()
|
||||
if self.max_num_tokens > mc2_tokens_capacity and \
|
||||
select_moe_comm_method(mc2_tokens_capacity, self.vllm_config) in {MoECommType.MC2, MoECommType.FUSED_MC2}:
|
||||
|
||||
@@ -366,7 +366,6 @@ class NPUWorker(WorkerBase):
|
||||
|
||||
def compile_or_warm_up_model(self) -> None:
|
||||
# Note: need to adapt for graph mode.
|
||||
self.model_runner.eplb_warmup()
|
||||
warmup_sizes = (self.vllm_config.compilation_config.compile_sizes
|
||||
or []).copy()
|
||||
if not self.model_config.enforce_eager:
|
||||
|
||||
Reference in New Issue
Block a user