From 5a3744c542cc80d70eb6505efe418b727b5136f0 Mon Sep 17 00:00:00 2001 From: LI SHENGYONG <49200266+shenchuxiaofugui@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:10:57 +0800 Subject: [PATCH] [EPLB] The profiling can collect the time required for adjusting the eplb. (#7001) ### What this PR does / why we need it? To analyze the overhead of the dynamic EPLB adjustment framework in detail, this PR wraps the EPLB adjustment steps in `record_function_or_nullcontext` ranges ("EPLB weight D2D" around `forward_before()` and "EPLB update" around `forward_end()`), so that the time they consume is reported in the profiling output. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? ![Snipaste_2026-03-05_11-42-28](https://github.com/user-attachments/assets/41c2b82a-5dfa-4e39-8b50-f4649deed30c) - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 Signed-off-by: shenchuxiaofugui <1311027364@qq.com> --- vllm_ascend/eplb/eplb_updator.py | 8 +++----- vllm_ascend/worker/model_runner_v1.py | 14 ++++++-------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 55fab81f..28fb90d2 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -99,6 +99,9 @@ class EplbUpdator: self.eplb_process.planner_q.put(1) def forward_before(self): + # Batch after eplb process being triggered, get update info provided by eplb process + if self.get_update_info_flag(): + self.update_info_all = self.eplb_process.block_update_q.get() if self.update_expert_weight_flag(): (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop( 0 @@ -117,11 +120,6 @@ class EplbUpdator: self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) - def take_update_info_from_eplb_process(self): - # Batch after eplb process being triggered, get update info provided by eplb process - if self.get_update_info_flag(): - self.update_info_all = self.eplb_process.block_update_q.get() - def 
forward_end(self): if self.wakeup_eplb_worker_flag(): self.compute_and_set_moe_load() diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 748c7e01..ca475dd3 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1144,9 +1144,6 @@ class NPUModelRunner(GPUModelRunner): "it when the requests need prompt logprobs" ) - if self.dynamic_eplb: - self.eplb_updator.forward_before() - num_reqs = self.input_batch.num_reqs req_ids = self.input_batch.req_ids tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] @@ -1255,12 +1252,13 @@ class NPUModelRunner(GPUModelRunner): intermediate_tensors, ) - if self.dynamic_eplb: - self.eplb_updator.take_update_info_from_eplb_process() - # update global cos, sin update_cos_sin(positions) + if self.dynamic_eplb: + with record_function_or_nullcontext("EPLB weight D2D"): + self.eplb_updator.forward_before() + # Set cudagraph mode to none if calc_kv_scales is true. # KV scales calculation involves dynamic operations that are incompatible # with CUDA graph capture. @@ -1507,7 +1505,8 @@ class NPUModelRunner(GPUModelRunner): ) if self.dynamic_eplb: - self.eplb_updator.forward_end() + with record_function_or_nullcontext("EPLB update"): + self.eplb_updator.forward_end() if self.debugger is not None: self.debugger.stop() @@ -2354,7 +2353,6 @@ class NPUModelRunner(GPUModelRunner): if is_profile and self.dynamic_eplb: self.model.clear_all_moe_loads() if self.dynamic_eplb: - self.eplb_updator.take_update_info_from_eplb_process() self.eplb_updator.forward_end() return hidden_states, hidden_states