[EPLB] Reduce the memory used for heat aggregation (#6729)

### What this PR does / why we need it?
If `dist.all_gather` is used directly, it consumes 2 × HCCL_BUFFSIZE of memory,
while the heat (hotspot) aggregation data itself requires less than 1 MB.
Therefore, a separate small communication domain is created
for this collective.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Original:

![1](https://github.com/user-attachments/assets/8880b461-c26f-497c-9a05-2ca60cc46aa4)
Current:

![2](https://github.com/user-attachments/assets/c9da32b5-9200-4fa2-aff9-d8c4978ac602)


- vLLM version: v0.15.0
- vLLM main:
9562912cea

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
LI SHENGYONG
2026-02-24 18:02:24 +08:00
committed by GitHub
parent 5c8ab7af39
commit 0331f16a50
3 changed files with 25 additions and 7 deletions

View File

@@ -24,6 +24,8 @@ _SHARD_WEIGHT: GroupCoordinator | None = None
_P_TP: GroupCoordinator | None = None
_DYNAMIC_EPLB: GroupCoordinator | None = None
def init_ascend_model_parallel(
parallel_config: ParallelConfig,
@@ -85,6 +87,12 @@ def init_ascend_model_parallel(
_MC2 = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, group_name="mc2")
if get_ascend_config().eplb_config.dynamic_eplb:
global _DYNAMIC_EPLB
_DYNAMIC_EPLB = init_model_parallel_group(
group_ranks, get_world_group().local_rank, backend, group_name="dynamic_eplb"
)
# Initialize fine-grained TP process groups on Ascend for four components:
# 1. LM Head: output logits projection (`lmhead_tensor_parallel_size`)
# 2. O Proj: attention output projection (`oproj_tensor_parallel_size`)
@@ -265,6 +273,11 @@ def get_fc3_quant_x_group() -> GroupCoordinator:
return _FC3_QUANT_X
def get_dynamic_eplb_group() -> GroupCoordinator:
    """Return the process group used for dynamic-EPLB heat aggregation.

    Returns:
        GroupCoordinator: the group created by ``init_ascend_model_parallel``
            when ``eplb_config.dynamic_eplb`` is enabled.

    Raises:
        AssertionError: if the group has not been initialized yet.
    """
    # Fix copy-paste error: the message previously referred to the
    # "fc3 quant x" group instead of the dynamic EPLB group.
    assert _DYNAMIC_EPLB is not None, "dynamic eplb group is not initialized"
    return _DYNAMIC_EPLB
def destroy_ascend_model_parallel():
global _MC2
if _MC2:
@@ -315,3 +328,8 @@ def destroy_ascend_model_parallel():
if _FC3_QUANT_X:
_FC3_QUANT_X.destroy()
_FC3_QUANT_X = None
global _DYNAMIC_EPLB
if _DYNAMIC_EPLB:
_DYNAMIC_EPLB.destroy()
_DYNAMIC_EPLB = None