[feat] support customized and separated hccl_buffer_size for process group initialization (#3073)
### What this PR does / why we need it?
Currently, users have to set `HCCL_BUFFSIZE` to 512–1024 to run the mc2 operators (dispatch and combine) when serving MoE models with large `ep_size` and `batch_size`. This environment variable not only determines the VRAM allocated for the mc2 group, but also inflates the VRAM allocation for the dp, tp & ep groups, leading to significant drops in kv_cache and free_memory.

This PR automatically calculates and sets a separate `hccl_buffer_size` for each process group **(except the mc2 group)** when users set `HCCL_BUFFSIZE` for the mc2 group. This significantly reduces the buffer size wasted on the dp, tp & ep groups.

Note that the current mc2 operators can only perform communication space partitioning based on the `HCCL_BUFFSIZE` configuration. Once they support an `hccl_buffer_size` passed via `pg_options` at process group initialization, we will calculate the required buffer size ourselves and users will no longer need to set `HCCL_BUFFSIZE` at all.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
We ran E2E serving with deepseek_r1, which initializes the DP/TP/EP/MC2 process groups, and observed a significant increase in kv_cache and free_memory.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
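To make the saving concrete, here is a minimal sketch of the dp buffer-size formula this PR introduces (see `calculate_dp_buffer_size` in the diff below). The `dp_size` value and the 512 MB global setting are illustrative assumptions, not measurements from the E2E test.

```python
import math

_MIN_DP_BUFFER_SIZE = 50  # MB, floor used by this PR


def dp_buffer_size_mb(dp_size: int) -> int:
    # dp_size + 2 int32 flags (with_prefill, enable_dbo), rounded up to whole MB.
    int32_bytes = 4  # torch.iinfo(torch.int32).bits // 8
    return max(math.ceil((dp_size + 2) * int32_bytes / (1024 * 1024)),
               _MIN_DP_BUFFER_SIZE)


# Hypothetical example: with dp_size = 64 the dp group only needs the 50 MB floor,
# instead of inheriting a global HCCL_BUFFSIZE of 512 MB set for the mc2 group.
print(dp_buffer_size_mb(64))  # -> 50
```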
@@ -29,7 +29,7 @@ class TestPatchDistributed(TestBase):
         self.mock_group_ranks = [[0, 1]]
         self.mock_local_rank = 0
         self.mock_backend = "hccl"
-        self.mock_use_device_comm = True
+        self.mock_use_device_comm = False

         patcher_get_rank = patch("torch.distributed.get_rank", return_value=0)
         patcher_new_group = patch("torch.distributed.new_group",
@@ -39,16 +39,24 @@ class TestPatchDistributed(TestBase):
         patcher_device_comm_cls = patch(
             "vllm.distributed.parallel_state.resolve_obj_by_qualname",
             return_value=MagicMock())
+        patcher_calculate_dp_buffer = patch(
+            "vllm_ascend.utils.calculate_dp_buffer_size", return_value=64)
+        patcher_npu_current_device = patch("torch.npu.current_device",
+                                           return_value=MagicMock())

         self.mock_get_rank = patcher_get_rank.start()
         self.mock_new_group = patcher_new_group.start()
         self.mock_is_cuda_alike = patcher_is_cuda_alike.start()
         self.mock_resolve_obj = patcher_device_comm_cls.start()
+        self.mock_calculate_dp_buffer = patcher_calculate_dp_buffer.start()
+        self.mock_npu_current_device = patcher_npu_current_device.start()

         self.addCleanup(patcher_get_rank.stop)
         self.addCleanup(patcher_new_group.stop)
         self.addCleanup(patcher_is_cuda_alike.stop)
         self.addCleanup(patcher_device_comm_cls.stop)
+        self.addCleanup(patcher_calculate_dp_buffer.stop)
+        self.addCleanup(patcher_npu_current_device.stop)

         self.group_coordinator = GroupCoordinatorPatch(
             group_ranks=self.mock_group_ranks,
@@ -87,6 +87,19 @@
 # ** File: worker/patch_common/patch_distributed.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.distributed.parallel_state.GroupCoordinator`
+#    (1) __init__()
+#     Why:
+#       The original GroupCoordinator initialization lacks pg_options to generate new
+#       process group with customized options.
+#     How:
+#       Inject HCCL options during process group initialization.
+#     Related PR (if no, explain why):
+#       Need a PR to vllm to support a dictionary as input while initializing distributed
+#       environment (e.g., Dict[str, torch.distributed.ProcessGroupHCCL.Options])
+#       https://github.com/vllm-project/vllm/pull/25417
+#     Future Plan:
+#       Remove this patch when vllm merges this PR.
+#    (2) all_to_all()
 #     Why:
 #       vllm doesn't support all_to_all for GroupCoordinator.
 #     How:
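As a reference for the "How" above, the snippet below sketches what the injected per-group option looks like. It is based on the code added later in this commit (`create_hccl_pg_options` in `vllm_ascend/utils.py` and the patched `GroupCoordinator.__init__`); the buffer size and rank list are illustrative assumptions.

```python
import torch
import torch_npu  # Ascend adapter providing the HCCL process group backend

# Per-group HCCL options carrying a dedicated buffer size (in MB).
options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
options.hccl_config = {"hccl_buffer_size": 200}  # illustrative value

# The patched GroupCoordinator.__init__ passes these options when creating the
# device group, so only this group allocates the customized buffer instead of
# inheriting the global HCCL_BUFFSIZE.
group = torch.distributed.new_group([0, 1], backend="hccl",
                                    pg_options=options)
```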
@@ -15,17 +15,82 @@
 # limitations under the License.
 #

-from typing import List, Optional
+from typing import List, Optional, Union

 import torch
 import vllm
-from vllm.distributed.parallel_state import GroupCoordinator
+from torch.distributed import Backend
+from vllm.distributed.parallel_state import (GroupCoordinator,
+                                             _get_unique_name, _register_group)

 from vllm_ascend.distributed.communicator import NPUCommunicator
+from vllm_ascend.utils import create_hccl_pg_options


 class GroupCoordinatorPatch(GroupCoordinator):

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(
+        self,
+        group_ranks: list[list[int]],
+        local_rank: int,
+        torch_distributed_backend: Union[str, Backend],
+        use_device_communicator: bool,  # whether to use device communicator
+        use_message_queue_broadcaster: bool = False,
+        group_name: Optional[str] = None,
+    ):
+        group_name = group_name or "anonymous"
+        self.unique_name = _get_unique_name(group_name)
+        _register_group(self)
+
+        self.rank = torch.distributed.get_rank()
+        self.local_rank = local_rank
+
+        self_device_group = None
+        self_cpu_group = None
+        hccl_pg_options = create_hccl_pg_options(group_name)
+
+        for ranks in group_ranks:
+            device_group = torch.distributed.new_group(
+                ranks,
+                backend=torch_distributed_backend,
+                pg_options=hccl_pg_options)
+
+            # a group with `gloo` backend, to allow direct coordination between
+            # processes through the CPU.
+            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+            if self.rank in ranks:
+                self.ranks = ranks
+                self.world_size = len(ranks)
+                self.rank_in_group = ranks.index(self.rank)
+                self_device_group = device_group
+                self_cpu_group = cpu_group
+
+        assert self_cpu_group is not None
+        assert self_device_group is not None
+
+        self.cpu_group = self_cpu_group
+        self.device_group = self_device_group
+        self.device = torch.npu.current_device()
+
+        self.use_device_communicator = use_device_communicator
+        self.device_communicator = None
+        if use_device_communicator and self.world_size > 1:
+            self.device_communicator = NPUCommunicator(
+                cpu_group=self.cpu_group,
+                device=self.device,
+                device_group=self.device_group,
+                unique_name=self.unique_name,
+            )
+
+        from vllm.distributed.device_communicators.shm_broadcast import \
+            MessageQueue
+        self.mq_broadcaster: Optional[MessageQueue] = None
+        if use_message_queue_broadcaster and self.world_size > 1:
+            self.mq_broadcaster = MessageQueue.create_from_process_group(
+                self.cpu_group, 1 << 22, 6)
+
+        self.use_custom_op_call = False
+        self.use_cpu_custom_send_recv = False

     def all_to_all(self,
                    input_: torch.Tensor,
@@ -41,9 +106,10 @@ class GroupCoordinatorPatch(GroupCoordinator):
         assert -input_.dim() <= gather_dim < input_.dim(), (
             f"Invalid gather dim ({gather_dim}) for input tensor with shape {input_.size()}"
         )
+        assert self.device_communicator is not None, "device_communicator should be initialized when world_size > 1"
         return self.device_communicator.all_to_all(input_, scatter_dim,
                                                    gather_dim, scatter_sizes,
                                                    gather_sizes)


-vllm.distributed.parallel_state.GroupCoordinator = GroupCoordinatorPatch  # Note: check the GroupCoordinator with online serving
+vllm.distributed.parallel_state.GroupCoordinator = GroupCoordinatorPatch
@@ -53,6 +53,8 @@ _SLEEP_MODE_ENABLED = None
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
 _ASCEND_CUSTOMOP_IS_REIGISTERED = False
+_DEFAULT_BUFFER_SIZE = 200
+_MIN_DP_BUFFER_SIZE = 50


 def is_310p():
@@ -648,3 +650,51 @@ def npu_stream_switch(target_stream: torch.npu.Stream,
         return nullcontext()
     assert target_stream is not None
     return torch.npu.stream(target_stream)
+
+
+def create_hccl_pg_options(group_name: str):
+    options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
+    hccl_config = get_hccl_config_for_pg_options(group_name)
+    if hccl_config is not None:
+        options.hccl_config = hccl_config
+    return options
+
+
+def get_hccl_config_for_pg_options(group_name: str) -> Optional[dict]:
+    """
+    Get HCCL process group options for the given communication group name.
+
+    Args:
+        group_name: Name of the communication group
+
+    Returns:
+        HCCL pg_options or None for mc2 group
+    """
+    # FIXME: Current mc2 operators only perform communication space partitioning
+    # based on HCCL_BUFFSIZE configuration. Using pg_options with mc2 group would
+    # result in memory misalignment problems.
+    if group_name and "mc2" in group_name:
+        return None
+    hccl_config_map = {
+        "dp": {
+            "hccl_buffer_size": calculate_dp_buffer_size()
+        },
+    }
+    return hccl_config_map.get(group_name, get_default_buffer_config())
+
+
+def get_default_buffer_config() -> dict:
+    return {"hccl_buffer_size": _DEFAULT_BUFFER_SIZE}
+
+
+def calculate_dp_buffer_size() -> int:
+    """
+    formula of dp buffer size:
+    dp_size + 2 (flags: with_prefill and enable_dbo)
+    """
+    from vllm.config import get_current_vllm_config
+    vllm_config = get_current_vllm_config()
+    dp_size = vllm_config.parallel_config.data_parallel_size
+    int32_size = torch.iinfo(torch.int32).bits // 8
+    dp_buffer_size = math.ceil((dp_size + 2) * int32_size / (1024 * 1024))
+    return max(dp_buffer_size, _MIN_DP_BUFFER_SIZE)
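A quick illustration of how the new helper routes group names. This is a sketch: it assumes the helpers above are importable from `vllm_ascend.utils` as added in this commit, and that a vllm config is resolvable so the dp formula can read `data_parallel_size`.

```python
from vllm_ascend.utils import get_hccl_config_for_pg_options

# mc2 groups intentionally get no pg_options and keep using HCCL_BUFFSIZE.
assert get_hccl_config_for_pg_options("mc2") is None

# Other named groups fall back to the 200 MB default buffer...
assert get_hccl_config_for_pg_options("tp") == {"hccl_buffer_size": 200}

# ...except "dp", whose size is derived from data_parallel_size + 2 flag slots
# and clamped to the 50 MB floor.
print(get_hccl_config_for_pg_options("dp"))
```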