[EPLB] Reduce the memory used for heat aggregation (#6729)
### What this PR does / why we need it?
If `dist.all_gather` is used directly, 2 × `HCCL_BUFFSIZE` of device memory is consumed, while the heat aggregation itself needs less than 1 MB. A separate small communication domain is therefore created for it.
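
As a sketch of the idea (the helper names below are illustrative, not the PR's API; the actual group comes from `get_dynamic_eplb_group()` in `vllm_ascend.distributed.parallel_state`): gathering the sub-1 MB load tensor over a dedicated process group keeps the default group's large collective buffer out of the picture.

```python
# Illustrative sketch: gather a small per-rank expert-load tensor over a
# dedicated process group instead of the default (large-buffer) one.
import torch
import torch.distributed as dist

def create_eplb_group() -> dist.ProcessGroup:
    # Hypothetical helper. A new group gets its own communicator; on Ascend,
    # HCCL sizes its buffer per communication domain, so a dedicated domain
    # for a <1 MB payload can stay small.
    return dist.new_group(ranks=list(range(dist.get_world_size())))

def gather_moe_load(local_load: torch.Tensor,
                    eplb_group: dist.ProcessGroup) -> torch.Tensor:
    # All-gather the per-rank (num_layers, num_experts) load tensor, then
    # rearrange to (num_layers, world_size, num_experts), matching the
    # layout the updator stores in shared_dict["moe_load"].
    world_size = dist.get_world_size(group=eplb_group)
    gathered = [torch.empty_like(local_load) for _ in range(world_size)]
    dist.all_gather(gathered, local_load, group=eplb_group)
    return torch.stack(gathered).permute(1, 0, 2)
```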
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Original: *(screenshot)*

Current: *(screenshot)*
- vLLM version: v0.15.0
- vLLM main: 9562912cea
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
```diff
@@ -21,6 +21,7 @@ import torch.distributed as dist
 import vllm.envs as envs
 from vllm.logger import logger
 
+from vllm_ascend.distributed.parallel_state import get_dynamic_eplb_group
 from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
 from vllm_ascend.eplb.core.eplb_device_transfer_loader import D2DExpertWeightLoader
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
@@ -34,6 +35,7 @@ class EplbUpdator:
         self.eplb_process = eplb_process
         self.shared_dict = self.eplb_process.shared_dict
         self.moe_imbalance_dict: dict[int, float] = {}
+        self.comm_group = get_dynamic_eplb_group()
 
     def set_adaptor(self, adaptor: VllmEplbAdaptor):
         self.adaptor = adaptor
@@ -41,8 +43,6 @@ class EplbUpdator:
         local_load = self.adaptor.get_rank_expert_workload()
         self.world_size = dist.get_world_size()
         self.device = local_load.device
-        shape = (self.world_size, *local_load.shape)
-        self._gather_buffer = torch.empty(shape, dtype=local_load.dtype, device=self.device)
         self.eplb_loader.num_layers = self.adaptor.num_dense_layers + self.adaptor.num_moe_layers
 
     def init_eplb(self, expert_map_path, process):
@@ -134,9 +134,8 @@ class EplbUpdator:
 
     def compute_and_set_moe_load(self):
         local_load = self.adaptor.get_rank_expert_workload()
-        dist.all_gather_into_tensor(self._gather_buffer, local_load)
+        moe_load = self.comm_group.all_gather(local_load, dim=0).reshape(-1, self.world_size, *local_load.shape[1:])
 
-        moe_load = self._gather_buffer.permute(1, 0, 2)
         self.shared_dict["moe_load"] = moe_load.cpu()
         logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}")
 
@@ -183,17 +182,16 @@ class EplbUpdator:
         self.compute_and_set_moe_load()
 
         src_tensor = torch.empty((1,), device=self.device)
-        self_rank = dist.get_rank()
 
         comm_op_list = []
 
         for dst_rank in range(self.world_size):
-            if dst_rank == self_rank:
+            if dst_rank == self.rank_id:
                 continue
             comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank))
 
         for src_rank in range(self.world_size):
-            if src_rank == self_rank:
+            if src_rank == self.rank_id:
                 continue
             comm_op_list.append(dist.P2POp(dist.irecv, src_tensor, src_rank))
         if comm_op_list:
```
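
The last hunk is cut off at `if comm_op_list:`. A queued list of `dist.P2POp` operations is typically launched in one batch with `dist.batch_isend_irecv`; the sketch below is a hedged guess at that surrounding pattern, not part of the commit.

```python
import torch.distributed as dist

def run_p2p_ops(comm_op_list: list[dist.P2POp]) -> None:
    # Launch every queued isend/irecv in one batch and wait for completion.
    if comm_op_list:
        reqs = dist.batch_isend_irecv(comm_op_list)
        for req in reqs:
            req.wait()
```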