diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 55fab81f..28fb90d2 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -99,6 +99,9 @@ class EplbUpdator: self.eplb_process.planner_q.put(1) def forward_before(self): + # On the batch after the EPLB process has been triggered, fetch the update info it produced + if self.get_update_info_flag(): + self.update_info_all = self.eplb_process.block_update_q.get() if self.update_expert_weight_flag(): (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop( 0 @@ -117,11 +120,6 @@ class EplbUpdator: self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) - def take_update_info_from_eplb_process(self): - # Batch after eplb process being triggered, get update info provided by eplb process - if self.get_update_info_flag(): - self.update_info_all = self.eplb_process.block_update_q.get() - def forward_end(self): if self.wakeup_eplb_worker_flag(): self.compute_and_set_moe_load() diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 748c7e01..ca475dd3 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1144,9 +1144,6 @@ class NPUModelRunner(GPUModelRunner): "it when the requests need prompt logprobs" ) - if self.dynamic_eplb: - self.eplb_updator.forward_before() - num_reqs = self.input_batch.num_reqs req_ids = self.input_batch.req_ids tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] @@ -1255,12 +1252,13 @@ class NPUModelRunner(GPUModelRunner): intermediate_tensors, ) - if self.dynamic_eplb: - self.eplb_updator.take_update_info_from_eplb_process() - # update global cos, sin update_cos_sin(positions) + if self.dynamic_eplb: + with record_function_or_nullcontext("EPLB weight D2D"): + self.eplb_updator.forward_before() + # Set cudagraph mode to none if calc_kv_scales is true.
# KV scales calculation involves dynamic operations that are incompatible # with CUDA graph capture. @@ -1507,7 +1505,8 @@ class NPUModelRunner(GPUModelRunner): ) if self.dynamic_eplb: - self.eplb_updator.forward_end() + with record_function_or_nullcontext("EPLB update"): + self.eplb_updator.forward_end() if self.debugger is not None: self.debugger.stop() @@ -2354,7 +2353,6 @@ class NPUModelRunner(GPUModelRunner): if is_profile and self.dynamic_eplb: self.model.clear_all_moe_loads() if self.dynamic_eplb: - self.eplb_updator.take_update_info_from_eplb_process() self.eplb_updator.forward_end() return hidden_states, hidden_states