[EPLB] Profiling now records the time spent on EPLB adjustment. (#7001)
### What this PR does / why we need it?
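To analyze the overhead of the dynamic EPLB adjustment framework in
detail, this PR records the time spent on EPLB adjustment in the
profiling output. The EPLB steps in the model runner are wrapped in
profiler ranges named "EPLB weight D2D" and "EPLB update", so their
cost is visible when profiling mode is enabled.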
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -99,6 +99,9 @@ class EplbUpdator:
|
|||||||
self.eplb_process.planner_q.put(1)
|
self.eplb_process.planner_q.put(1)
|
||||||
|
|
||||||
def forward_before(self):
|
def forward_before(self):
|
||||||
|
# Batch after eplb process being triggered, get update info provided by eplb process
|
||||||
|
if self.get_update_info_flag():
|
||||||
|
self.update_info_all = self.eplb_process.block_update_q.get()
|
||||||
if self.update_expert_weight_flag():
|
if self.update_expert_weight_flag():
|
||||||
(expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(
|
(expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(
|
||||||
0
|
0
|
||||||
@@ -117,11 +120,6 @@ class EplbUpdator:
|
|||||||
self.reqs = []
|
self.reqs = []
|
||||||
self.eplb_loader.asyn_expert_weight_transfer(self.reqs)
|
self.eplb_loader.asyn_expert_weight_transfer(self.reqs)
|
||||||
|
|
||||||
def take_update_info_from_eplb_process(self):
|
|
||||||
# Batch after eplb process being triggered, get update info provided by eplb process
|
|
||||||
if self.get_update_info_flag():
|
|
||||||
self.update_info_all = self.eplb_process.block_update_q.get()
|
|
||||||
|
|
||||||
def forward_end(self):
|
def forward_end(self):
|
||||||
if self.wakeup_eplb_worker_flag():
|
if self.wakeup_eplb_worker_flag():
|
||||||
self.compute_and_set_moe_load()
|
self.compute_and_set_moe_load()
|
||||||
|
|||||||
@@ -1144,9 +1144,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
"it when the requests need prompt logprobs"
|
"it when the requests need prompt logprobs"
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.dynamic_eplb:
|
|
||||||
self.eplb_updator.forward_before()
|
|
||||||
|
|
||||||
num_reqs = self.input_batch.num_reqs
|
num_reqs = self.input_batch.num_reqs
|
||||||
req_ids = self.input_batch.req_ids
|
req_ids = self.input_batch.req_ids
|
||||||
tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
|
tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
|
||||||
@@ -1255,12 +1252,13 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
intermediate_tensors,
|
intermediate_tensors,
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.dynamic_eplb:
|
|
||||||
self.eplb_updator.take_update_info_from_eplb_process()
|
|
||||||
|
|
||||||
# update global cos, sin
|
# update global cos, sin
|
||||||
update_cos_sin(positions)
|
update_cos_sin(positions)
|
||||||
|
|
||||||
|
if self.dynamic_eplb:
|
||||||
|
with record_function_or_nullcontext("EPLB weight D2D"):
|
||||||
|
self.eplb_updator.forward_before()
|
||||||
|
|
||||||
# Set cudagraph mode to none if calc_kv_scales is true.
|
# Set cudagraph mode to none if calc_kv_scales is true.
|
||||||
# KV scales calculation involves dynamic operations that are incompatible
|
# KV scales calculation involves dynamic operations that are incompatible
|
||||||
# with CUDA graph capture.
|
# with CUDA graph capture.
|
||||||
@@ -1507,6 +1505,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if self.dynamic_eplb:
|
if self.dynamic_eplb:
|
||||||
|
with record_function_or_nullcontext("EPLB update"):
|
||||||
self.eplb_updator.forward_end()
|
self.eplb_updator.forward_end()
|
||||||
|
|
||||||
if self.debugger is not None:
|
if self.debugger is not None:
|
||||||
@@ -2354,7 +2353,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
if is_profile and self.dynamic_eplb:
|
if is_profile and self.dynamic_eplb:
|
||||||
self.model.clear_all_moe_loads()
|
self.model.clear_all_moe_loads()
|
||||||
if self.dynamic_eplb:
|
if self.dynamic_eplb:
|
||||||
self.eplb_updator.take_update_info_from_eplb_process()
|
|
||||||
self.eplb_updator.forward_end()
|
self.eplb_updator.forward_end()
|
||||||
return hidden_states, hidden_states
|
return hidden_states, hidden_states
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user