[Bugfix] Revert pr4214 multi-stream expert hotspot collection (#5529)
### What this PR does / why we need it?
PR #4214 collected expert heat (per-expert hotspot statistics) on a separate
stream. That multi-stream processing could lead to memory overwriting and
accuracy issues. After communicating with the submitter of PR #4214, we revert
it here.
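For context, the failure class behind the revert is a cross-stream race: work queued on a side stream can read a buffer while the default stream is still writing it, or the caching allocator can recycle the buffer before the side stream finishes with it. Below is a minimal sketch of the hazard and the usual stream-ordering fix, using CUDA streams as a stand-in for the Ascend NPU stream API; `snapshot_load`, `side_stream`, and `local_load` are hypothetical illustration names, not code from this repo:

```python
import torch

def snapshot_load(local_load: torch.Tensor,
                  side_stream: torch.cuda.Stream) -> torch.Tensor:
    # Racy pattern (what multi-stream collection risks):
    #
    #     with torch.cuda.stream(side_stream):
    #         return local_load.clone()   # may read half-written memory
    #
    # Safe pattern: order the side stream after the default stream, then
    # tell the caching allocator the tensor is still in use on it.
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        snapshot = local_load.clone()
    local_load.record_stream(side_stream)
    return snapshot

if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    load = torch.randn(8, device="cuda")
    print(snapshot_load(load, stream))
```

Reverting sidesteps the ordering problem entirely: the gather runs back on the default stream, trading overlap for correctness.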
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
qwen3-moe, dynamic EPLB

Before revert
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 43.33 |
After revert
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 86.67 |
Baseline (without EPLB)
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 86.67 |
- vLLM version: v0.13.0
- vLLM main: 45c1ca1ca1
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
```diff
@@ -23,8 +23,6 @@ from vllm.logger import logger
 from vllm_ascend.eplb.core.eplb_utils import EPLBParamUtils
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
-from vllm_ascend.eplb.utils import moe_load_async_stream
-from vllm_ascend.utils import npu_stream_switch
 
 
 class EplbUpdator:
@@ -155,22 +153,21 @@ class EplbUpdator:
 
         self._gather_buffer = None
         if dist.is_initialized():
-            with npu_stream_switch(moe_load_async_stream()):
-                self.world_size = dist.get_world_size()
-                self.device = local_load.device
-                if self._gather_buffer is None:
-                    shape = (self.world_size, *local_load.shape)
-                    self._gather_buffer = torch.empty(shape,
-                                                      dtype=local_load.dtype,
-                                                      device=self.device)
+            self.world_size = dist.get_world_size()
+            self.device = local_load.device
+            if self._gather_buffer is None:
+                shape = (self.world_size, *local_load.shape)
+                self._gather_buffer = torch.empty(shape,
+                                                  dtype=local_load.dtype,
+                                                  device=self.device)
 
-                dist.all_gather_into_tensor(self._gather_buffer, local_load)
+            dist.all_gather_into_tensor(self._gather_buffer, local_load)
 
-                moe_load = self._gather_buffer.permute(1, 0, 2)
-                self.shared_dict["moe_load"] = moe_load.cpu()
-                logger.debug(
-                    f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}"
-                )
+            moe_load = self._gather_buffer.permute(1, 0, 2)
+            self.shared_dict["moe_load"] = moe_load.cpu()
+            logger.debug(
+                f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}"
+            )
         else:
             moe_load = local_load.unsqueeze(1)
             self.shared_dict["moe_load"] = moe_load.cpu()
```
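For readers tracing the restored code path: `dist.all_gather_into_tensor` fills a buffer whose leading dimension is the world size, and `permute(1, 0, 2)` then moves ranks to the middle axis before the result is copied to the CPU. A runnable single-process sketch, with `torch.stack` standing in for the collective and the `(num_layers, num_experts)` shape of `local_load` assumed rather than confirmed:

```python
import torch

world_size, num_layers, num_experts = 2, 3, 4

# Per-rank expert load; the (num_layers, num_experts) shape is an assumption.
per_rank = [torch.full((num_layers, num_experts), float(rank))
            for rank in range(world_size)]

# dist.all_gather_into_tensor(buf, local_load) stacks per-rank tensors along
# a new leading dimension; torch.stack reproduces that layout locally.
buf = torch.stack(per_rank)      # (world_size, num_layers, num_experts)

moe_load = buf.permute(1, 0, 2)  # (num_layers, world_size, num_experts)
print(moe_load.shape)            # torch.Size([3, 2, 4])
```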