[EPLB] Profiling can now collect the time required for EPLB adjustment. (#7001)

### What this PR does / why we need it?
To analyze the overhead of the dynamic EPLB adjustment framework in
detail, this PR adds the time consumed by the adjustment to the
information reported in profiling mode, as the labeled ranges
"EPLB weight D2D" and "EPLB update".

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


![Snipaste_2026-03-05_11-42-28](https://github.com/user-attachments/assets/41c2b82a-5dfa-4e39-8b50-f4649deed30c)

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
LI SHENGYONG
2026-03-05 16:10:57 +08:00
committed by GitHub
parent 43c8da3574
commit 5a3744c542
2 changed files with 9 additions and 13 deletions

View File

@@ -99,6 +99,9 @@ class EplbUpdator:
self.eplb_process.planner_q.put(1)
def forward_before(self):
# Batch after eplb process being triggered, get update info provided by eplb process
if self.get_update_info_flag():
self.update_info_all = self.eplb_process.block_update_q.get()
if self.update_expert_weight_flag():
(expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(
0
@@ -117,11 +120,6 @@ class EplbUpdator:
self.reqs = []
self.eplb_loader.asyn_expert_weight_transfer(self.reqs)
def take_update_info_from_eplb_process(self):
# Batch after eplb process being triggered, get update info provided by eplb process
if self.get_update_info_flag():
self.update_info_all = self.eplb_process.block_update_q.get()
def forward_end(self):
if self.wakeup_eplb_worker_flag():
self.compute_and_set_moe_load()

View File

@@ -1144,9 +1144,6 @@ class NPUModelRunner(GPUModelRunner):
"it when the requests need prompt logprobs"
)
if self.dynamic_eplb:
self.eplb_updator.forward_before()
num_reqs = self.input_batch.num_reqs
req_ids = self.input_batch.req_ids
tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
@@ -1255,12 +1252,13 @@ class NPUModelRunner(GPUModelRunner):
intermediate_tensors,
)
if self.dynamic_eplb:
self.eplb_updator.take_update_info_from_eplb_process()
# update global cos, sin
update_cos_sin(positions)
if self.dynamic_eplb:
with record_function_or_nullcontext("EPLB weight D2D"):
self.eplb_updator.forward_before()
# Set cudagraph mode to none if calc_kv_scales is true.
# KV scales calculation involves dynamic operations that are incompatible
# with CUDA graph capture.
@@ -1507,7 +1505,8 @@ class NPUModelRunner(GPUModelRunner):
)
if self.dynamic_eplb:
self.eplb_updator.forward_end()
with record_function_or_nullcontext("EPLB update"):
self.eplb_updator.forward_end()
if self.debugger is not None:
self.debugger.stop()
@@ -2354,7 +2353,6 @@ class NPUModelRunner(GPUModelRunner):
if is_profile and self.dynamic_eplb:
self.model.clear_all_moe_loads()
if self.dynamic_eplb:
self.eplb_updator.take_update_info_from_eplb_process()
self.eplb_updator.forward_end()
return hidden_states, hidden_states