[EPLB] Profiling can now collect the time required for EPLB adjustment. (#7001)

### What this PR does / why we need it?
To analyze the overhead of the dynamic EPLB adjustment framework in detail, this PR adds the time consumed by the adjustment to the information printed in profiling mode.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
![Snipaste_2026-03-05_11-42-28](https://github.com/user-attachments/assets/41c2b82a-5dfa-4e39-8b50-f4649deed30c)

- vLLM version: v0.16.0
- vLLM main: 15d76f74e2

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2026-03-05 16:10:57 +08:00
parent 43c8da3574
commit 5a3744c542
2 changed files with 9 additions and 13 deletions
--- a/vllm_ascend/eplb/eplb_updator.py
+++ b/vllm_ascend/eplb/eplb_updator.py
@@ -99,6 +99,9 @@ class EplbUpdator:
        self.eplb_process.planner_q.put(1)
    def forward_before(self):
        # Batch after eplb process being triggered, get update info provided by eplb process
        if self.get_update_info_flag():
            self.update_info_all = self.eplb_process.block_update_q.get()
        if self.update_expert_weight_flag():
            (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(
                0
@@ -117,11 +120,6 @@ class EplbUpdator:
            self.reqs = []
            self.eplb_loader.asyn_expert_weight_transfer(self.reqs)
    def take_update_info_from_eplb_process(self):
        """Fetch update info from the EPLB process when the update-info flag is set.

        When ``self.get_update_info_flag()`` is truthy, takes one item from
        ``self.eplb_process.block_update_q`` and stores it in
        ``self.update_info_all``. Otherwise this method does nothing.
        """
        # On the batch after the EPLB process has been triggered, fetch the
        # update info that the EPLB process provides.
        # NOTE(review): ``get()`` presumably blocks until an item is available;
        # confirm against the queue type used for ``block_update_q``.
        if self.get_update_info_flag():
            self.update_info_all = self.eplb_process.block_update_q.get()
    def forward_end(self):
        if self.wakeup_eplb_worker_flag():
            self.compute_and_set_moe_load()
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1144,9 +1144,6 @@ class NPUModelRunner(GPUModelRunner):
                        "it when the requests need prompt logprobs"
                    )
                if self.dynamic_eplb:
                    self.eplb_updator.forward_before()
                num_reqs = self.input_batch.num_reqs
                req_ids = self.input_batch.req_ids
                tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
@@ -1255,12 +1252,13 @@ class NPUModelRunner(GPUModelRunner):
                intermediate_tensors,
            )
            if self.dynamic_eplb:
                self.eplb_updator.take_update_info_from_eplb_process()
            # update global cos, sin
            update_cos_sin(positions)
        if self.dynamic_eplb:
            with record_function_or_nullcontext("EPLB weight D2D"):
                self.eplb_updator.forward_before()
        # Set cudagraph mode to none if calc_kv_scales is true.
        # KV scales calculation involves dynamic operations that are incompatible
        # with CUDA graph capture.
@@ -1507,6 +1505,7 @@ class NPUModelRunner(GPUModelRunner):
        )
        if self.dynamic_eplb:
            with record_function_or_nullcontext("EPLB update"):
                self.eplb_updator.forward_end()
        if self.debugger is not None:
@@ -2354,7 +2353,6 @@ class NPUModelRunner(GPUModelRunner):
            if is_profile and self.dynamic_eplb:
                self.model.clear_all_moe_loads()
            if self.dynamic_eplb:
                self.eplb_updator.take_update_info_from_eplb_process()
                self.eplb_updator.forward_end()
            return hidden_states, hidden_states