[EPLB][Bugfix] Bugfix for ineffective dynamic eplb (#6653)
### What this PR does / why we need it?
#6043 removed the forward_before phase of dynamic EPLB. Because the unit tests only monitor end-to-end precision and no log is printed at the key points, the regression was not intercepted: dynamic EPLB silently stopped taking effect. This PR:
1. Adds the forward_before call back (see the sketch after this list).
2. Deletes unnecessary logs and adds logs at the key points.
3. Adds a warm-up step for policy type 3 (FlashLB).
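
For clarity, a minimal runnable sketch of the hook order this PR restores. `FakeEplbUpdator` and `execute_model_step` are illustrative stand-ins, not the real vllm-ascend classes; only the two method names are taken from the diff below. If `forward_before()` never runs, the planner is never fed load statistics and dynamic EPLB silently no-ops, which is exactly the bug being fixed.

```python
# Illustrative sketch only: FakeEplbUpdator stands in for the real
# EplbUpdator; the two method names come from this PR's diff.

class FakeEplbUpdator:
    def forward_before(self):
        # Real updator: collect expert-load stats and wake the EPLB
        # planner process before the forward pass.
        print("[EPLB] forward_before: collect load stats / wake planner")

    def take_update_info_from_eplb_process(self):
        # Real updator: fetch the planned expert-map update so the
        # D2D weight loader can apply it.
        print("[EPLB] take update info from EPLB process")


def execute_model_step(dynamic_eplb: bool, updator: FakeEplbUpdator) -> None:
    if dynamic_eplb:
        updator.forward_before()  # the call that #6043 dropped
    # ... build inputs and run _preprocess ...
    if dynamic_eplb:
        updator.take_update_info_from_eplb_process()
    # ... run the forward pass ...


execute_model_step(True, FakeEplbUpdator())
```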
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?

#### The conversation is normal.
```text
Okay, the user is asking, "What is deep learning?" I need to explain this in a clear and concise way. Let me start by recalling what I know about deep learning. It's a subset of machine learning, right? So first, I should mention that it's part of machine learning, which itself is a branch of AI. Then, the key aspect of deep learning is the use of neural networks with multiple layers. These are called deep neural networks.

Wait, I should define neural networks first. Maybe start with the basics. A neural network is inspired by the human brain, with layers of nodes (neurons) that process data. But deep learning specifically refers to networks with many layers—hence "deep." So the term "deep" comes from the number of layers.

I should explain how deep learning works. It involves training these networks on large datasets, allowing them to automatically learn features from the data. Unlike traditional machine learning, where you might have to manually extract features, deep learning models can do this automatically. That's a key point. For example, in image recognition, a deep learning model can learn to detect edges, shapes, and then more complex patterns without human intervention.

Applications are important too. The user might want to know where deep learning is used. Common examples include image and speech recognition, natural language processing, autonomous vehicles, and recommendation systems. Maybe mention specific technologies like self-driving cars using computer vision or virtual assistants like Siri or Alexa
```
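
A minimal sketch of this kind of smoke test, assuming a vLLM OpenAI-compatible server is already serving the patched build on localhost:8000; the model name is a placeholder, not something defined by this PR.

```python
# Smoke-test sketch: send one chat request and print the reply.
import json
import urllib.request

payload = {
    "model": "your-model",  # placeholder model name
    "messages": [{"role": "user", "content": "What is deep learning?"}],
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

# If the fix works, generation stays normal and the server log shows
# "[EPLB] finished update expert weight." once all layers are updated.
print(body["choices"][0]["message"]["content"])
```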
- vLLM version: v0.15.0
- vLLM main: 13397841ab
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
```diff
@@ -107,10 +107,3 @@ def test_invalid_state_asyn_update(mock_adaptor):
     loader_obj.update_expert_map_and_weight([])
 
     assert not mock_adaptor.do_update_expert_map.called
-
-
-def test_load_impl_not_implemented(mock_adaptor):
-    loader_obj = loader.D2DExpertWeightLoader()
-    loader_obj.set_adator(mock_adaptor)
-    with pytest.raises(NotImplementedError):
-        loader_obj.load_impl({}, {})
@@ -34,7 +34,7 @@ class D2DExpertWeightLoader:
         self.layer_id = -1  # layer id to be updated
         self.state = ExpertWeightUpdateState.WAITING
         self.recv_expert_list = []
         self.mock_flag = True
+        self.num_layers = 0
 
     def set_adator(self, eplb_adaptor):
         self.eplb_adaptor = eplb_adaptor
@@ -103,12 +103,10 @@ class D2DExpertWeightLoader:
             local_expert_to_replace, buffer_tensor_id = recv_expert_info
             self.eplb_adaptor.do_update_expert_weight(self.layer_id, local_expert_to_replace, buffer_tensor_id)
 
+        logger.debug(f"[EPLB] finished update expert weight for layer: {self.layer_id}")
+        if self.layer_id == self.num_layers - 1:
+            logger.info("[EPLB] finished update expert weight.")
 
         self.recv_expert_list = []
         self.updated_expert_map = None
         self.layer_id = -1
         self.state = ExpertWeightUpdateState.WAITING
-
-    def load_impl(self, old_expert_table, new_expert_table):
-        raise NotImplementedError
@@ -68,7 +68,7 @@ class EplbWorker:
 
         update_info = self.compose_expert_update_info_greedy(new_expert_maps, self.old_expert_maps)
         self.old_expert_maps = new_expert_maps
-        logger.info("EPLB Process compute complete")
+        logger.debug("EPLB Process compute complete")
 
         packed_update_info = self.pack_update_info(update_info)
@@ -274,6 +274,10 @@ class EplbProcess:
         Subprocess entry: bind to specified NPU, loop waiting for planner_q to wake up,
         call do_update, then notify main process update is complete.
         """
+        if self.policy_type == 3:
+            from vllm_ascend.eplb.core.policy.policy_flashlb import warm_up
+
+            warm_up()
         while True:
             try:
                 planner_q.get()
@@ -22,11 +22,12 @@ import vllm.envs as envs
 from vllm.logger import logger
 
 from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
+from vllm_ascend.eplb.core.eplb_device_transfer_loader import D2DExpertWeightLoader
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
 
 
 class EplbUpdator:
-    def __init__(self, eplb_config, loader, eplb_process: EplbProcess, process):
+    def __init__(self, eplb_config, loader: D2DExpertWeightLoader, eplb_process: EplbProcess, process):
         self.eplb_config = eplb_config
         self.init_eplb(self.eplb_config.expert_map_path, process)
         self.eplb_loader = loader
@@ -42,6 +43,7 @@ class EplbUpdator:
         self.device = local_load.device
         shape = (self.world_size, *local_load.shape)
         self._gather_buffer = torch.empty(shape, dtype=local_load.dtype, device=self.device)
+        self.eplb_loader.num_layers = self.adaptor.num_dense_layers + self.adaptor.num_moe_layers
 
     def init_eplb(self, expert_map_path, process):
         self.rank_id = dist.get_rank()
@@ -75,7 +77,6 @@ class EplbUpdator:
         if self.cur_iterations == (
                 self.expert_heat_collection_interval + self.algorithm_execution_interval + self.num_moe_layers
         ):
             logger.info("Finish expert parallel load balancing.")
             if self.expert_map_record_path is not None:
                 self.adaptor._export_tensor_to_file(self.shared_dict["expert_maps"], self.expert_map_record_path)
-
@@ -1099,6 +1099,10 @@ class NPUModelRunner(GPUModelRunner):
                 "logprobs for prompt tokens, tokens, please disable "
                 "it when the requests need prompt logprobs"
             )
 
+        if self.dynamic_eplb:
+            self.eplb_updator.forward_before()
+
         num_reqs = self.input_batch.num_reqs
         req_ids = self.input_batch.req_ids
         tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
@@ -1198,6 +1202,9 @@ class NPUModelRunner(GPUModelRunner):
             ec_connector_output,
         ) = self._preprocess(scheduler_output, num_tokens_padded, intermediate_tensors)
 
+        if self.dynamic_eplb:
+            self.eplb_updator.take_update_info_from_eplb_process()
+
         # update global cos, sin
         update_cos_sin(positions)
@@ -2072,6 +2079,9 @@ class NPUModelRunner(GPUModelRunner):
         assert sum(num_scheduled_tokens_list) == num_tokens
         assert len(num_scheduled_tokens_list) == num_reqs
 
+        if not is_profile and self.dynamic_eplb:
+            self.eplb_updator.forward_before()
+
         num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
         num_tokens_unpadded = int(num_scheduled_tokens.sum())
```