diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py
index 55fab81f..28fb90d2 100644
--- a/vllm_ascend/eplb/eplb_updator.py
+++ b/vllm_ascend/eplb/eplb_updator.py
@@ -99,6 +99,9 @@ class EplbUpdator:
         self.eplb_process.planner_q.put(1)
 
     def forward_before(self):
+        # On the batch after the EPLB process has been triggered, fetch the update info it produced.
+        if self.get_update_info_flag():
+            self.update_info_all = self.eplb_process.block_update_q.get()
         if self.update_expert_weight_flag():
             (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(
                 0
@@ -117,11 +120,6 @@ class EplbUpdator:
             self.reqs = []
             self.eplb_loader.asyn_expert_weight_transfer(self.reqs)
 
-    def take_update_info_from_eplb_process(self):
-        # Batch after eplb process being triggered, get update info provided by eplb process
-        if self.get_update_info_flag():
-            self.update_info_all = self.eplb_process.block_update_q.get()
-
     def forward_end(self):
         if self.wakeup_eplb_worker_flag():
             self.compute_and_set_moe_load()
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 748c7e01..ca475dd3 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1144,9 +1144,6 @@ class NPUModelRunner(GPUModelRunner):
                         "it when the requests need prompt logprobs"
                     )
 
-                if self.dynamic_eplb:
-                    self.eplb_updator.forward_before()
-
                 num_reqs = self.input_batch.num_reqs
                 req_ids = self.input_batch.req_ids
                 tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
@@ -1255,12 +1252,13 @@ class NPUModelRunner(GPUModelRunner):
                 intermediate_tensors,
             )
 
-            if self.dynamic_eplb:
-                self.eplb_updator.take_update_info_from_eplb_process()
-
             # update global cos, sin
             update_cos_sin(positions)
 
+        if self.dynamic_eplb:
+            with record_function_or_nullcontext("EPLB weight D2D"):
+                self.eplb_updator.forward_before()
+
         # Set cudagraph mode to none if calc_kv_scales is true.
         # KV scales calculation involves dynamic operations that are incompatible
         # with CUDA graph capture.
@@ -1507,7 +1505,8 @@ class NPUModelRunner(GPUModelRunner):
         )
 
         if self.dynamic_eplb:
-            self.eplb_updator.forward_end()
+            with record_function_or_nullcontext("EPLB update"):
+                self.eplb_updator.forward_end()
 
         if self.debugger is not None:
             self.debugger.stop()
@@ -2354,7 +2353,6 @@ class NPUModelRunner(GPUModelRunner):
             if is_profile and self.dynamic_eplb:
                 self.model.clear_all_moe_loads()
             if self.dynamic_eplb:
-                self.eplb_updator.take_update_info_from_eplb_process()
                 self.eplb_updator.forward_end()
             return hidden_states, hidden_states