Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -24,7 +24,6 @@ logger = init_logger(__name__)

def start_async_worker(
    state: "EplbState",
    rank_mapping: dict[int, int] | None = None,
    is_profile: bool = False,
) -> threading.Thread:
    eplb_group = get_eplb_group().device_group
@@ -45,7 +44,6 @@ def start_async_worker(
                    eplb_group=eplb_group,
                    cuda_stream=cuda_stream,
                    is_profile=is_profile,
                    rank_mapping=rank_mapping,
                )
            )
        except Exception as exc: # pragma: no cover - diagnostic path
@@ -107,7 +105,6 @@ async def transfer_run_periodically(
    eplb_group: ProcessGroup,
    cuda_stream: torch.cuda.Stream,
    is_profile: bool = False,
    rank_mapping: dict[int, int] | None = None,
) -> None:
    while True:
        await asyncio.to_thread(state.rearrange_event.wait)
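The loop above parks the worker on a threading.Event without blocking the asyncio event loop, by pushing the blocking wait into a thread. A minimal sketch of that pattern, assuming a plain threading.Event shared with the producer side; the names are illustrative, not identifiers from this patch:

import asyncio
import threading

rearrange_event = threading.Event()

async def wait_for_trigger() -> None:
    # Event.wait() blocks; asyncio.to_thread() runs it in a worker thread so
    # other coroutines keep making progress until the producer calls .set().
    await asyncio.to_thread(rearrange_event.wait)
    rearrange_event.clear()

rearrange_event.set()  # pretend the producer has already signalled
asyncio.run(wait_for_trigger())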
@@ -176,7 +173,6 @@ async def transfer_run_periodically(
            ep_group=eplb_group,
            is_profile=is_profile,
            cuda_stream=cuda_stream,
            rank_mapping=rank_mapping,
        )
        event = torch.cuda.Event(blocking=False)
        cuda_stream.record_event(event)
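The hunk above records a CUDA event on the transfer stream once the expert movement has been issued. A minimal sketch of how such an event is typically consumed, assuming a CUDA device is present; the stream names are illustrative and not identifiers from this patch:

import torch

if torch.cuda.is_available():
    transfer_stream = torch.cuda.Stream()
    compute_stream = torch.cuda.current_stream()

    with torch.cuda.stream(transfer_stream):
        # ... asynchronous weight copies would be enqueued here ...
        done = torch.cuda.Event(blocking=False)
        transfer_stream.record_event(done)

    # The consuming stream waits on the event instead of a full device sync,
    # so it only stalls if it reaches the moved weights before the copy ends.
    compute_stream.wait_event(done)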

@@ -40,6 +40,7 @@ from vllm.distributed.parallel_state import (
    get_node_count,
    in_the_same_node_as,
)
from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import MixtureOfExperts
@@ -159,7 +160,7 @@ class EplbModelState:

    NOTE: The expert_load_view now records load for all physical experts
    rather than just local experts. This ensures consistent load statistics
    across different dispatch methods (naive all-to-all, DeepEP, pplx-kernels).
    across different dispatch methods (naive all-to-all, DeepEP).
    The recorded load will be multiplied by dp_size when using naive all-to-all
    due to each DP rank contributing the same token set to the calculation.
    See:
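A toy illustration of the dp_size effect described in the note above, with made-up numbers:

# With naive all-to-all every DP rank records the load of the same global
# token set, so summing the per-rank views scales the true load by dp_size.
dp_size = 4
tokens_routed_to_expert = 100          # actual tokens hitting one logical expert
recorded_load = dp_size * tokens_routed_to_expert
assert recorded_load == 400            # what the summed expert_load_view reports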
@@ -302,6 +303,14 @@ class EplbState:
        """
        CUDA device index for the async EPLB worker thread.
        """
        self.num_valid_physical_experts: int = 0
        """
        Number of valid physical experts.
        This is the number of physical experts that are
        actually mapped to logical experts. In elastic EP,
        newly started EP ranks may not have physical experts
        mapped yet.
        """
        if self.device.type == "cuda":
            self.cuda_device_index = self.device.index
            if self.cuda_device_index is None and torch.cuda.is_available():
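A small sketch of how the count described in the docstring could be derived, assuming unmapped physical slots are marked with -1 (consistent with the -1 padding used elsewhere in this patch); the tensor contents are made up:

import torch

# (num_moe_layers, num_physical_experts); -1 marks a slot with no logical expert yet
physical_to_logical_map = torch.tensor([[0, 1, 0, -1],
                                        [2, 2, 1, -1]])
valid_per_layer = (physical_to_logical_map >= 0).sum(dim=-1)
num_valid_physical_experts = int(valid_per_layer[0])  # 3 in this toy example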
@@ -367,9 +376,6 @@ class EplbState:
        self,
        model: MixtureOfExperts,
        model_config: ModelConfig,
        global_expert_load: torch.Tensor | None = None,
        old_global_expert_indices: torch.Tensor | None = None,
        rank_mapping: dict[int, int] | None = None,
    ):
        """
        Build the initial EPLB state.
@@ -462,75 +468,15 @@ class EplbState:
        )
        self.expert_rearrangement_step_interval = eplb_step_interval

        # Set the policy based on the selected eplb algorithm type.
        policy_type = self.parallel_config.eplb_config.policy
        self.policy = EPLB_POLICIES[policy_type]
        logger.debug("Selected EPLB policy: %s", policy_type)
        if global_expert_load is not None:
            ep_group = get_ep_group().device_group
            assert global_expert_load.shape == (
                model.num_moe_layers,
                model.num_logical_experts,
            )
            assert global_expert_load.dtype == torch.int64

            num_replicas = model.num_physical_experts
            num_groups = model.num_expert_groups
            num_nodes = get_node_count()
            num_gpus = ep_group.size()

            if num_gpus % num_nodes != 0:
                num_nodes = 1
                logger.warning_once(
                    f"num_gpus % num_nodes != 0, "
                    "not using hierarchical rearrangement algorithm.\n"
                    f"{num_gpus=}, {num_nodes=}"
                )

            # Get new expert mappings
            (
                new_physical_to_logical_map,
                new_logical_to_physical_map,
                new_logical_replica_count,
            ) = self.policy.rebalance_experts(
                global_expert_load,
                num_replicas,
                num_groups,
                num_nodes,
                num_gpus,
            )

            max_physical_slots = new_logical_to_physical_map.shape[-1]
            assert max_physical_slots <= logical_to_physical_map.shape[-1]
            new_logical_to_physical_map = torch.nn.functional.pad(
                new_logical_to_physical_map,
                (0, logical_to_physical_map.shape[-1] - max_physical_slots),
                value=-1,
            )
            physical_to_logical_map = new_physical_to_logical_map.to(self.device)
            logical_to_physical_map.copy_(new_logical_to_physical_map)
            logical_replica_count.copy_(new_logical_replica_count)
        else:
            new_physical_to_logical_map = None
            new_logical_to_physical_map = None
            new_logical_replica_count = None
        model.set_eplb_state(
            expert_load_pass,
            logical_to_physical_map,
            logical_replica_count,
        )
        if global_expert_load is not None:
            rearrange_expert_weights_inplace(
                old_global_expert_indices,
                new_physical_to_logical_map,
                model.expert_weights,
                ep_group,
                False,
                rank_mapping,
            )
            self.expert_rearrangement_step = 0

        expert_buffer = [torch.empty_like(w) for w in model.expert_weights[0]]
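The pad call above keeps the logical-to-physical buffer at its original replica width by filling unused slots with -1. A toy sketch of the same operation with made-up shapes:

import torch

new_map = torch.tensor([[[3, 7], [1, -1]]])      # (layers=1, logical=2, replica slots=2)
target_slots = 4                                 # width of the existing buffer
padded = torch.nn.functional.pad(
    new_map, (0, target_slots - new_map.shape[-1]), value=-1
)
# padded[0, 0] == tensor([3, 7, -1, -1]); the extra replica slots stay -1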

@@ -561,11 +507,12 @@ class EplbState:
                recv_dst_rows=np.array([]),
            ),
            cuda_device_index=self.cuda_device_index,
            new_physical_to_logical_map=new_physical_to_logical_map,
            new_logical_to_physical_map=new_logical_to_physical_map,
            new_logical_replica_count=new_logical_replica_count,
            new_physical_to_logical_map=None,
            new_logical_to_physical_map=None,
            new_logical_replica_count=None,
        )
        self.model_states[model_config.compute_hash()] = model_state
        self.num_valid_physical_experts = model.num_physical_experts

    def step(
        self,
@@ -696,8 +643,6 @@ class EplbState:
    def rearrange(
        self,
        is_profile: bool = False,
        execute_shuffle: bool = True,
        global_expert_loads: list[torch.Tensor] | None = None,
        rank_mapping: dict[int, int] | None = None,
    ) -> torch.Tensor | None:
        """
@@ -707,12 +652,6 @@ class EplbState:
            is_profile (bool): If `True`, perform a dummy rearrangement.
                This is used in `profile_run` to reserve enough memory;
                no memory movement will be performed. Default is False.
            execute_shuffle (bool): If `True`, execute the shuffle
                in elastic expert parallel (EEP). Default is True.
            global_expert_loads (list[torch.Tensor] | None): The global expert
                loads when scaling is done in EEP.
                List of expert loads for the main and drafter
                (when spec decode is used) models.
            rank_mapping (dict[int, int] | None): The rank mapping
                when scaling is done in EEP.
        """
@@ -734,67 +673,34 @@ class EplbState:
            "(profile)" if is_profile else "",
        )

        if global_expert_loads is None:
            # Map the physical expert load to global logical experts
            global_expert_load_windows = []
            if not execute_shuffle:
                num_models = torch.tensor(
                    [len(self.model_states)], dtype=torch.int32, device="cpu"
                )
                torch.distributed.broadcast(
                    num_models, group=get_ep_group().cpu_group, group_src=0
                )

            for eplb_model_state in self.model_states.values():
                logical_expert_load_window = torch.zeros(
                    self.expert_load_window_size,
                    eplb_model_state.model.num_moe_layers,
                    eplb_model_state.model.num_logical_experts,
                    dtype=eplb_model_state.expert_load_window.dtype,
                    device=eplb_model_state.expert_load_window.device,
                )
                logical_expert_load_window.scatter_add_(
                    dim=-1,
                    index=eplb_model_state.physical_to_logical_map.unsqueeze(0)
                    .expand_as(eplb_model_state.expert_load_window)
                    .long(),
                    src=eplb_model_state.expert_load_window,
                )

                if not execute_shuffle:
                    metadata = torch.tensor(
                        [
                            eplb_model_state.model.num_moe_layers,
                            eplb_model_state.model.num_logical_experts,
                            eplb_model_state.physical_to_logical_map.shape[1],
                        ],
                        dtype=torch.int32,
                        device="cpu",
                    )
                    torch.distributed.broadcast(
                        metadata, group=get_ep_group().cpu_group, group_src=0
                    )

                global_expert_load_window = logical_expert_load_window.sum(dim=0)
                global_expert_load_windows.append(global_expert_load_window)
            # Perform all-reduce to get the expert load across all ranks for each model
            global_expert_load_windows = self._allreduce_list(
                global_expert_load_windows
        # Map the physical expert load to global logical experts
        global_expert_load_windows = []
        for eplb_model_state in self.model_states.values():
            expert_load_window = eplb_model_state.expert_load_window[
                :, :, : self.num_valid_physical_experts
            ]
            logical_expert_load_window = torch.zeros(
                self.expert_load_window_size,
                eplb_model_state.model.num_moe_layers,
                eplb_model_state.model.num_logical_experts,
                dtype=eplb_model_state.expert_load_window.dtype,
                device=eplb_model_state.expert_load_window.device,
            )
            if not execute_shuffle:
                for eplb_model_state, global_expert_load_window in zip(
                    self.model_states.values(), global_expert_load_windows
                ):
                    # (num_moe_layers, old_num_physical_experts)
                    old_global_expert_indices = eplb_model_state.physical_to_logical_map
                    torch.distributed.broadcast(
                        old_global_expert_indices, group=ep_group, group_src=0
                    )
            if not execute_shuffle:
                return global_expert_load_windows
        else:
            assert execute_shuffle
            global_expert_load_windows = global_expert_loads
            logical_expert_load_window.scatter_add_(
                dim=-1,
                index=eplb_model_state.physical_to_logical_map[
                    :, : self.num_valid_physical_experts
                ]
                .unsqueeze(0)
                .expand_as(expert_load_window)
                .long(),
                src=expert_load_window,
            )

            global_expert_load_window = logical_expert_load_window.sum(dim=0)
            global_expert_load_windows.append(global_expert_load_window)
        # Perform all-reduce to get the expert load across all ranks for each model
        global_expert_load_windows = self._allreduce_list(global_expert_load_windows)

        # TODO(bowen): Treat differently for prefill and decode nodes
        eplb_model_state = next(iter(self.model_states.values()))
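A toy version of the physical-to-logical fold performed above with scatter_add_, reduced to one window and one layer; the load values are made up:

import torch

physical_load = torch.tensor([[5, 3, 2, 4]])         # (layers=1, physical experts=4)
physical_to_logical = torch.tensor([[0, 1, 0, 2]])   # which logical expert each slot replicates

logical_load = torch.zeros(1, 3, dtype=physical_load.dtype)
logical_load.scatter_add_(dim=-1, index=physical_to_logical, src=physical_load)
# logical_load == tensor([[7, 3, 4]]): loads of replicas of the same logical expert are summed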
@@ -806,8 +712,10 @@ class EplbState:
            # NOTE(yongji): scale down, we need to rebalance the experts on
            # remaining GPUs, transfer the experts while we haven't shutdown
            # the GPUs to be released.
            cpu_group = get_ep_group().cpu_group
            num_nodes = _node_count_with_rank_mapping(cpu_group, rank_mapping)
            coordinator = get_ep_group()
            assert isinstance(coordinator, StatelessGroupCoordinator)
            tcp_store_group = coordinator.tcp_store_group
            num_nodes = _node_count_with_rank_mapping(tcp_store_group, rank_mapping)
            num_gpus = sum(new_rank != -1 for new_rank in rank_mapping.values())
            num_replicas = (
                num_replicas // ep_group.size() * num_gpus
@@ -933,7 +841,6 @@ class EplbState:
        if self.async_worker is None:
            self.async_worker = start_async_worker(
                self,
                rank_mapping=rank_mapping,
                is_profile=is_profile,
            )

@@ -1089,83 +996,6 @@ class EplbState:
            model_state.new_logical_to_physical_map = None
            model_state.new_logical_replica_count = None

    @staticmethod
    def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
        """
        Receive the expert load and old placement from the master rank.
        """
        ep_group = get_ep_group()
        num_models = torch.empty(1, dtype=torch.int32, device="cpu")
        torch.distributed.broadcast(num_models, group=ep_group.cpu_group, group_src=0)
        num_models = num_models.item()
        global_expert_loads = []
        old_global_expert_indices_per_model = []
        for _ in range(num_models):
            metadata = torch.empty(3, dtype=torch.int32, device="cpu")
            torch.distributed.broadcast(metadata, group=ep_group.cpu_group, group_src=0)
            num_moe_layers, num_logical_experts, num_old_physical_experts = (
                metadata.tolist()
            )
            global_expert_load = torch.zeros(
                (num_moe_layers, num_logical_experts),
                dtype=torch.int64,
                device=ep_group.device,
            )
            all_reduce(global_expert_load, group=ep_group.device_group)
            old_global_expert_indices = torch.empty(
                (num_moe_layers, num_old_physical_experts),
                dtype=torch.int64,
                device=ep_group.device,
            )
            torch.distributed.broadcast(
                old_global_expert_indices,
                group=ep_group.device_group,
                group_src=0,
            )
            global_expert_loads.append(global_expert_load)
            old_global_expert_indices_per_model.append(old_global_expert_indices)
        return global_expert_loads, old_global_expert_indices_per_model
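Two idioms in recv_state are worth calling out: non-source ranks contribute zero tensors to the all-reduce, so the collective simply delivers the master's load values, and the shapes arrive first as a small metadata broadcast so receivers can allocate matching empty tensors. A minimal sketch of that receive idiom, assuming an already-initialized default process group; it uses the plain src argument for brevity where the patch uses group_src relative to the EP group:

import torch
import torch.distributed as dist

meta = torch.empty(3, dtype=torch.int32)
dist.broadcast(meta, src=0)                 # rank 0 fills it, everyone else receives
layers, logical, old_physical = meta.tolist()

placement = torch.empty((layers, old_physical), dtype=torch.int64)
dist.broadcast(placement, src=0)            # contents are only valid after the collective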

    @classmethod
    def get_eep_state(
        cls, parallel_config: ParallelConfig
    ) -> tuple[
        list[torch.Tensor] | None,
        list[torch.Tensor] | None,
        dict[int, int] | None,
    ]:
        num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu")
        torch.distributed.broadcast(
            num_local_physical_experts,
            group=get_ep_group().cpu_group,
            group_src=0,
        )
        num_local_physical_experts = int(num_local_physical_experts.item())
        new_ep_size = get_ep_group().world_size
        global_expert_loads, old_global_expert_indices_per_model = (
            EplbState.recv_state()
        )

        # The EP configuration has to be the same for all models, and so does the eplb config
        num_logical_experts = global_expert_loads[0].shape[1]
        parallel_config.eplb_config.num_redundant_experts = (
            num_local_physical_experts * new_ep_size - num_logical_experts
        )
        assert (
            old_global_expert_indices_per_model[0].shape[1] % num_local_physical_experts
            == 0
        )
        old_ep_size = (
            old_global_expert_indices_per_model[0].shape[1]
            // num_local_physical_experts
        )
        rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
        return (
            global_expert_loads,
            old_global_expert_indices_per_model,
            rank_mapping,
        )
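Toy numbers (made up) for the arithmetic in get_eep_state:

num_local_physical_experts = 8
new_ep_size = 4                       # EP world size after scaling
num_logical_experts = 24
num_redundant_experts = num_local_physical_experts * new_ep_size - num_logical_experts
assert num_redundant_experts == 8     # 32 physical slots hosting 24 logical experts

old_placement_width = 16              # columns of the old physical-to-logical map
old_ep_size = old_placement_width // num_local_physical_experts
assert old_ep_size == 2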

    def _allreduce_list(self, tensor_list: list[torch.Tensor]) -> list[torch.Tensor]:
        """
        All-reduce a list of tensors.
@@ -1203,6 +1033,60 @@ class EplbState:
            load_pass_list.append(eplb_model_state.expert_load_pass.clone())
        return self._allreduce_list(load_pass_list)

    @classmethod
    def from_mapping(
        cls,
        model: MixtureOfExperts,
        model_config: ModelConfig,
        device: torch.device,
        parallel_config: ParallelConfig,
        expanded_physical_to_logical: torch.Tensor,
        num_valid_physical_experts: int,
    ) -> "EplbState":
        eplb_state = cls(
            parallel_config=parallel_config,
            device=device,
        )
        eplb_state.add_model(
            model=model,
            model_config=model_config,
        )
        eplb_state.num_valid_physical_experts = num_valid_physical_experts
        num_moe_layers = expanded_physical_to_logical.shape[0]
        num_physical_experts = expanded_physical_to_logical.shape[1]
        eplb_model_state = eplb_state.model_states[model_config.compute_hash()]
        eplb_model_state.physical_to_logical_map.copy_(expanded_physical_to_logical)

        logical_to_physical_map = torch.full(
            (
                num_moe_layers,
                model.num_logical_experts,
                eplb_model_state.logical_to_physical_map.shape[2],
            ),
            -1,
            dtype=torch.int64,
        )
        logical_replica_count = torch.zeros(
            (num_moe_layers, model.num_logical_experts),
            dtype=torch.int64,
        )
        expanded_physical_to_logical_numpy = expanded_physical_to_logical.cpu().numpy()
        for layer_idx in range(num_moe_layers):
            for phys_idx in range(num_physical_experts):
                logical_idx = expanded_physical_to_logical_numpy[layer_idx, phys_idx]
                if logical_idx >= 0:
                    replica_idx = logical_replica_count[layer_idx, logical_idx]
                    logical_to_physical_map[layer_idx, logical_idx, replica_idx] = (
                        phys_idx
                    )
                    logical_replica_count[layer_idx, logical_idx] += 1

        logical_to_physical_map = logical_to_physical_map.to(device)
        logical_replica_count = logical_replica_count.to(device)
        eplb_model_state.logical_to_physical_map.copy_(logical_to_physical_map)
        eplb_model_state.logical_replica_count.copy_(logical_replica_count)
        return eplb_state
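A toy inversion of one layer of a physical-to-logical map, mirroring the nested loop above; the map contents are made up:

import torch

physical_to_logical = torch.tensor([0, 1, 0, -1])     # slot 3 is not mapped yet
num_logical, max_slots = 2, 2
logical_to_physical = torch.full((num_logical, max_slots), -1, dtype=torch.int64)
replica_count = torch.zeros(num_logical, dtype=torch.int64)
for phys_idx, logical_idx in enumerate(physical_to_logical.tolist()):
    if logical_idx >= 0:
        logical_to_physical[logical_idx, replica_count[logical_idx]] = phys_idx
        replica_count[logical_idx] += 1
# logical_to_physical == [[0, 2], [1, -1]]; replica_count == [2, 1]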


@dataclass
class EplbLayerState:

@@ -19,6 +19,8 @@ from torch.distributed import (
    get_global_rank,
)

from vllm.distributed.parallel_state import get_ep_group
from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
from vllm.logger import init_logger

logger = init_logger(__name__)
@@ -249,10 +251,18 @@ def move_to_buffer(
            b[dst].copy_(w[src_local], non_blocking=True)

    p2p_ops: list[P2POp] = []
    if isinstance(get_ep_group(), StatelessGroupCoordinator):
        ep_group = get_ep_group()
        is_stateless = True
    else:
        is_stateless = False

    # Pre-compute global ranks mapping
    # Pre-compute global ranks mapping (only needed for non-stateless groups)
    ep_size = ep_group.size()
    rank_to_global = {rank: get_global_rank(ep_group, rank) for rank in range(ep_size)}
    if not is_stateless:
        rank_to_global = {
            rank: get_global_rank(ep_group, rank) for rank in range(ep_size)
        }

    # 2. Post sends
    if send_count > 0:
@@ -284,15 +294,23 @@ def move_to_buffer(
        if recver_pos < len(ranks_to_recv):
            recv_ranks.append(ranks_to_recv[recver_pos])
        for dst in recv_ranks:
            dst_global = rank_to_global[dst]
            p2p_ops += [
                P2POp(
                    torch.distributed.isend,
                    w[src],
                    dst_global,
                )
                for w in expert_weights
            ]
            if is_stateless:
                for w in expert_weights:
                    op = object.__new__(P2POp)
                    op.op = torch.distributed.isend
                    op.tensor = w[src]
                    op.group_peer = dst
                    p2p_ops.append(op)
            else:
                dst_global = rank_to_global[dst]
                p2p_ops += [
                    P2POp(
                        torch.distributed.isend,
                        w[src],
                        dst_global,
                    )
                    for w in expert_weights
                ]
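A minimal sketch of the stateless-group construction used above. object.__new__ allocates a P2POp without running P2POp.__init__ (presumably to skip the validation tied to a regular torch.distributed group, which the stateless coordinator does not provide; that rationale is an assumption), and the fields read later by the batched send/recv are assigned by hand; the tensor and peer rank below are placeholders:

import torch
import torch.distributed
from torch.distributed import P2POp

weight_shard = torch.empty(16)          # placeholder tensor, not from this patch
peer = 0                                # peer expressed as a rank within the EP group

op = object.__new__(P2POp)              # bypasses P2POp.__init__
op.op = torch.distributed.isend
op.tensor = weight_shard
op.group_peer = peer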

    # 3. Post recvs
    if recv_count > 0:
@@ -321,26 +339,40 @@ def move_to_buffer(
                src = ranks_to_send[recver_pos // num_dst_per_sender]
            else:
                src = ranks_to_send[recver_pos - remainder_start]
            src_global = rank_to_global[src]
            p2p_ops += [
                P2POp(
                    torch.distributed.irecv,
                    b[dst],
                    src_global,
                )
                for b in expert_weights_buffers
            ]
            if is_stateless:
                for b in expert_weights_buffers:
                    op = object.__new__(P2POp)
                    op.op = torch.distributed.irecv
                    op.tensor = b[dst]
                    op.group_peer = src
                    p2p_ops.append(op)
            else:
                src_global = rank_to_global[src]
                p2p_ops += [
                    P2POp(
                        torch.distributed.irecv,
                        b[dst],
                        src_global,
                    )
                    for b in expert_weights_buffers
                ]

    # 4. Execute the P2P operations. The real communication happens here.
    if p2p_ops and cuda_stream is not None:
        with torch.cuda.stream(cuda_stream):
            if is_stateless:
                ep_group.device_communicator.batch_isend_irecv(p2p_ops)
            else:
                reqs = batch_isend_irecv(p2p_ops)
                for req in reqs:
                    req.wait()
    elif p2p_ops:
        if is_stateless:
            ep_group.device_communicator.batch_isend_irecv(p2p_ops)
        else:
            reqs = batch_isend_irecv(p2p_ops)
            for req in reqs:
                req.wait()
    elif p2p_ops:
        reqs = batch_isend_irecv(p2p_ops)
        for req in reqs:
            req.wait()
    # wait for the communication to finish
    return (
        is_unchanged,