[EPLB] Reduce the memory used for batch_isend_irecv (#7344)

### What this PR does / why we need it?

#6729 appears to reduce the NPU memory usage of EPLB, but in fact it only moves
the buffer allocation from dist.all_gather_into_tensor to
dist.batch_isend_irecv, so the overall NPU memory usage is not
reduced. This PR eliminates the extra memory usage in this part entirely.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Remaining memory of each rank before the repair.
<img width="649" height="99" alt="image"
src="https://github.com/user-attachments/assets/52a67592-e0e8-4f9a-b194-b84cb848c598"
/>

Remaining memory of each rank after the repair.
<img width="641" height="99" alt="image"
src="https://github.com/user-attachments/assets/0bc2e67c-f328-4dea-98af-d7a459fb4876"
/>

With EPLB disabled.
<img width="543" height="45" alt="image"
src="https://github.com/user-attachments/assets/6dcba19d-4401-44b8-a6d3-c7b35ee983c7"
/>

Memory of weights for each rank.
<img width="648" height="46" alt="image"
src="https://github.com/user-attachments/assets/4db2fd04-98a0-4d26-a026-2e8287102b99"
/>

Estimated memory for EPLB: 15.68 / 48 (layer_num) + 2 * 0.02 = 0.35 GB


- vLLM version: v0.17.0
- vLLM main:
4034c3d32e

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
LI SHENGYONG
2026-03-20 12:25:58 +08:00
committed by GitHub
parent a1f321a556
commit 1e05c4908f
5 changed files with 21 additions and 10 deletions

View File

@@ -31,7 +31,8 @@ def mock_adaptor():
def test_generate_task_and_state_flow(mock_adaptor): def test_generate_task_and_state_flow(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader() with patch("vllm_ascend.eplb.core.eplb_device_transfer_loader.get_dynamic_eplb_group", return_value=None):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor) loader_obj.set_adator(mock_adaptor)
with patch("torch.distributed.P2POp") as mock_p2p, \ with patch("torch.distributed.P2POp") as mock_p2p, \
@@ -52,7 +53,8 @@ def test_generate_task_and_state_flow(mock_adaptor):
def test_asyn_transfer_and_update(mock_adaptor): def test_asyn_transfer_and_update(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader() with patch("vllm_ascend.eplb.core.eplb_device_transfer_loader.get_dynamic_eplb_group", return_value=None):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor) loader_obj.set_adator(mock_adaptor)
loader_obj.comm_op_list = ["fake_op"] loader_obj.comm_op_list = ["fake_op"]
@@ -88,14 +90,16 @@ def test_asyn_transfer_and_update(mock_adaptor):
def test_set_log2phy_map(mock_adaptor): def test_set_log2phy_map(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader() with patch("vllm_ascend.eplb.core.eplb_device_transfer_loader.get_dynamic_eplb_group", return_value=None):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor) loader_obj.set_adator(mock_adaptor)
loader_obj.set_log2phy_map({"a": 1}) loader_obj.set_log2phy_map({"a": 1})
assert loader_obj.updated_log2phy_map == {"a": 1} assert loader_obj.updated_log2phy_map == {"a": 1}
def test_invalid_state_asyn_update(mock_adaptor): def test_invalid_state_asyn_update(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader() with patch("vllm_ascend.eplb.core.eplb_device_transfer_loader.get_dynamic_eplb_group", return_value=None):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor) loader_obj.set_adator(mock_adaptor)
loader_obj.state = loader.ExpertWeightUpdateState.WAITING loader_obj.state = loader.ExpertWeightUpdateState.WAITING

View File

@@ -275,7 +275,7 @@ def get_fc3_quant_x_group() -> GroupCoordinator:
def get_dynamic_eplb_group() -> GroupCoordinator: def get_dynamic_eplb_group() -> GroupCoordinator:
assert _DYNAMIC_EPLB is not None, "fc3 quant x group is not initialized" assert _DYNAMIC_EPLB is not None, "Dynamic eplb group is not initialized"
return _DYNAMIC_EPLB return _DYNAMIC_EPLB

View File

@@ -19,6 +19,8 @@ from enum import Enum
import torch.distributed as dist import torch.distributed as dist
from vllm.logger import logger from vllm.logger import logger
from vllm_ascend.distributed.parallel_state import get_dynamic_eplb_group
class ExpertWeightUpdateState(Enum): class ExpertWeightUpdateState(Enum):
WAITING = 0 # waiting for updated expert_map by EplbWorker WAITING = 0 # waiting for updated expert_map by EplbWorker
@@ -35,6 +37,7 @@ class D2DExpertWeightLoader:
self.state = ExpertWeightUpdateState.WAITING self.state = ExpertWeightUpdateState.WAITING
self.recv_expert_list = [] self.recv_expert_list = []
self.num_layers = 0 self.num_layers = 0
self.comm_group = get_dynamic_eplb_group()
def set_adator(self, eplb_adaptor): def set_adator(self, eplb_adaptor):
self.eplb_adaptor = eplb_adaptor self.eplb_adaptor = eplb_adaptor
@@ -53,12 +56,16 @@ class D2DExpertWeightLoader:
dst_rank, global_expert_id_to_send = send_info dst_rank, global_expert_id_to_send = send_info
local_expert_id = self.eplb_adaptor.expert_map_per_layer_cpu[layer_id][global_expert_id_to_send].item() local_expert_id = self.eplb_adaptor.expert_map_per_layer_cpu[layer_id][global_expert_id_to_send].item()
for src_tensor in self.eplb_adaptor.expert_param_per_layer[layer_id][local_expert_id]: for src_tensor in self.eplb_adaptor.expert_param_per_layer[layer_id][local_expert_id]:
self.comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank)) self.comm_op_list.append(
dist.P2POp(dist.isend, src_tensor, dst_rank, group=self.comm_group.device_group)
)
for buffer_tensor_id, recv_info in enumerate(expert_recv_info): for buffer_tensor_id, recv_info in enumerate(expert_recv_info):
recv_rank, global_expert_id_to_recv = recv_info recv_rank, global_expert_id_to_recv = recv_info
for buffer_tensor in self.eplb_adaptor.buffer_tensor_list[buffer_tensor_id]: for buffer_tensor in self.eplb_adaptor.buffer_tensor_list[buffer_tensor_id]:
self.comm_op_list.append(dist.P2POp(dist.irecv, buffer_tensor, recv_rank)) self.comm_op_list.append(
dist.P2POp(dist.irecv, buffer_tensor, recv_rank, group=self.comm_group.device_group)
)
local_expert_to_replace = self.updated_expert_map[global_expert_id_to_recv].item() local_expert_to_replace = self.updated_expert_map[global_expert_id_to_recv].item()
self.recv_expert_list.append((local_expert_to_replace, buffer_tensor_id)) self.recv_expert_list.append((local_expert_to_replace, buffer_tensor_id))

View File

@@ -155,12 +155,12 @@ class EplbUpdator:
for dst_rank in range(self.world_size): for dst_rank in range(self.world_size):
if dst_rank == self.rank_id: if dst_rank == self.rank_id:
continue continue
comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank)) comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank, group=self.comm_group.device_group))
for src_rank in range(self.world_size): for src_rank in range(self.world_size):
if src_rank == self.rank_id: if src_rank == self.rank_id:
continue continue
comm_op_list.append(dist.P2POp(dist.irecv, src_tensor, src_rank)) comm_op_list.append(dist.P2POp(dist.irecv, src_tensor, src_rank, group=self.comm_group.device_group))
if comm_op_list: if comm_op_list:
reqs = dist.batch_isend_irecv(comm_op_list) reqs = dist.batch_isend_irecv(comm_op_list)

View File

@@ -62,7 +62,7 @@ _CP_CHUNKEDPREFILL_COMM_STREAM = None
_ASCEND_CUSTOMOP_IS_REIGISTERED = False _ASCEND_CUSTOMOP_IS_REIGISTERED = False
_DEFAULT_BUFFER_SIZE = 200 _DEFAULT_BUFFER_SIZE = 200
_MIN_DP_BUFFER_SIZE = 50 _MIN_DP_BUFFER_SIZE = 50
_DYNAMIC_EPLB_BUFFER_SIZE = 1 # num_experts * num_layers * 64 byte _DYNAMIC_EPLB_BUFFER_SIZE = 100
_IS_MOE_MODEL = None _IS_MOE_MODEL = None
_IS_DRAFTER_MOE_MODEL = None _IS_DRAFTER_MOE_MODEL = None
_IS_VL_MODEL = None _IS_VL_MODEL = None