[EPLB] Reduce the memory used for heat aggregation (#6729)
### What this PR does / why we need it?
If dist.all_gather is used directly, 2 x HCCL_BUFFSIZE memory will be
consumed, but the actual memory required for hotspot aggregation is less
than 1 MB. Therefore, a separate small communication domain is created
for it.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Original:

Current:

- vLLM version: v0.15.0
- vLLM main:
9562912cea
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -24,6 +24,8 @@ _SHARD_WEIGHT: GroupCoordinator | None = None
|
||||
|
||||
_P_TP: GroupCoordinator | None = None
|
||||
|
||||
_DYNAMIC_EPLB: GroupCoordinator | None = None
|
||||
|
||||
|
||||
def init_ascend_model_parallel(
|
||||
parallel_config: ParallelConfig,
|
||||
@@ -85,6 +87,12 @@ def init_ascend_model_parallel(
|
||||
|
||||
_MC2 = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, group_name="mc2")
|
||||
|
||||
if get_ascend_config().eplb_config.dynamic_eplb:
|
||||
global _DYNAMIC_EPLB
|
||||
_DYNAMIC_EPLB = init_model_parallel_group(
|
||||
group_ranks, get_world_group().local_rank, backend, group_name="dynamic_eplb"
|
||||
)
|
||||
|
||||
# Initialize fine-grained TP process groups on Ascend for four components:
|
||||
# 1. LM Head: output logits projection (`lmhead_tensor_parallel_size`)
|
||||
# 2. O Proj: attention output projection (`oproj_tensor_parallel_size`)
|
||||
@@ -265,6 +273,11 @@ def get_fc3_quant_x_group() -> GroupCoordinator:
|
||||
return _FC3_QUANT_X
|
||||
|
||||
|
||||
def get_dynamic_eplb_group() -> GroupCoordinator:
    """Return the communication group stored in ``_DYNAMIC_EPLB``.

    Returns:
        The module-level ``_DYNAMIC_EPLB`` group coordinator.

    Raises:
        AssertionError: If ``_DYNAMIC_EPLB`` is still ``None``, i.e. the
            group has not been created (or has already been destroyed).
    """
    # The message must name this group; it previously referred to the
    # fc3 quant x group, which pointed debugging at the wrong coordinator.
    assert _DYNAMIC_EPLB is not None, "dynamic eplb group is not initialized"
    return _DYNAMIC_EPLB
|
||||
|
||||
|
||||
def destroy_ascend_model_parallel():
|
||||
global _MC2
|
||||
if _MC2:
|
||||
@@ -315,3 +328,8 @@ def destroy_ascend_model_parallel():
|
||||
if _FC3_QUANT_X:
|
||||
_FC3_QUANT_X.destroy()
|
||||
_FC3_QUANT_X = None
|
||||
|
||||
global _DYNAMIC_EPLB
|
||||
if _DYNAMIC_EPLB:
|
||||
_DYNAMIC_EPLB.destroy()
|
||||
_DYNAMIC_EPLB = None
|
||||
|
||||
Reference in New Issue
Block a user