forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
from . import communication_op
|
||||
from . import parallel_state
|
||||
@@ -0,0 +1,21 @@
|
||||
import torch
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
from .parallel_state import get_tp_group
|
||||
|
||||
def tensor_model_parallel_all_reduce(input_: torch.Tensor, tp_group: Any = None) -> torch.Tensor:
    """All-reduce ``input_`` across the tensor-model-parallel group.

    An explicit ``tp_group`` takes precedence; otherwise the module's
    default TP group (via ``get_tp_group``) is used.
    """
    coordinator = get_tp_group(tp_group)
    return coordinator.all_reduce(input_)
|
||||
|
||||
|
||||
def tensor_model_parallel_all_gather(input_: torch.Tensor,
                                     dim: int = -1, tp_group: Any = None) -> torch.Tensor:
    """All-gather ``input_`` along ``dim`` across the tensor-model-parallel group.

    An explicit ``tp_group`` takes precedence over the module default.
    """
    coordinator = get_tp_group(tp_group)
    return coordinator.all_gather(input_, dim)
|
||||
|
||||
|
||||
def tensor_model_parallel_gather(input_: torch.Tensor,
                                 dst: int = 0,
                                 dim: int = -1, tp_group: Any = None) -> Optional[torch.Tensor]:
    """Gather ``input_`` along ``dim`` onto rank ``dst`` of the TP group.

    Returns the gathered tensor on ``dst`` (other ranks may receive None,
    per the group coordinator's gather contract).
    """
    coordinator = get_tp_group(tp_group)
    return coordinator.gather(input_, dst, dim)
|
||||
@@ -0,0 +1,339 @@
|
||||
import torch
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.distributed.parallel_state import (init_model_parallel_group, get_tensor_model_parallel_world_size,
|
||||
get_tensor_model_parallel_rank, get_world_group, get_pp_group,
|
||||
GroupCoordinator)
|
||||
import vllm.distributed.parallel_state as parallel_state_org
|
||||
from vllm.distributed.parallel_state import model_parallel_is_initialized as model_parallel_is_initialized_org
|
||||
from vllm.distributed.parallel_state import destroy_model_parallel as destroy_model_parallel_org
|
||||
|
||||
def get_tp_group(tp_group: Any = None) -> GroupCoordinator:
    """Return ``tp_group`` when given, else vLLM's global TP coordinator.

    Raises AssertionError when no explicit group is supplied and the
    global tensor-model-parallel group has not been initialized.
    """
    if tp_group is None:
        group = parallel_state_org._TP
        assert group is not None, "tensor model parallel group is not initialized"
        return group
    return tp_group
|
||||
|
||||
# Coordinator for the context-parallel group; populated by
# initialize_model_parallel() and cleared by destroy_model_parallel().
_CP: Optional[GroupCoordinator] = None


def get_cp_group() -> GroupCoordinator:
    """Return the context-parallel group coordinator.

    Raises AssertionError when the group has not been initialized yet.
    """
    assert _CP is not None, "context parallel group is not initialized"
    return _CP


# kept for backward compatibility
get_context_model_parallel_group = get_cp_group
|
||||
|
||||
# Coordinator for the MoE tensor-parallel group; populated by
# initialize_model_parallel() and cleared by destroy_model_parallel().
_MOE_TP: Optional[GroupCoordinator] = None


def get_moe_tp_group() -> GroupCoordinator:
    """Return the MoE tensor-parallel group coordinator.

    Raises AssertionError when the group has not been initialized yet.
    """
    assert _MOE_TP is not None, "moe tensor parallel group is not initialized"
    return _MOE_TP


# kept for backward compatibility
get_moe_tensor_parallel_group = get_moe_tp_group
|
||||
|
||||
# Coordinator for the MoE expert-parallel group; populated by
# initialize_model_parallel() and cleared by destroy_model_parallel().
_MOE_EP: Optional[GroupCoordinator] = None


def get_moe_ep_group() -> GroupCoordinator:
    """Return the MoE expert-parallel group coordinator.

    Raises AssertionError when the group has not been initialized yet.
    """
    assert _MOE_EP is not None, "moe expert parallel group is not initialized"
    return _MOE_EP


# kept for backward compatibility
get_moe_expert_parallel_group = get_moe_ep_group
|
||||
|
||||
|
||||
def initialize_model_parallel(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """
    Initialize model parallel groups.

    Arguments:
        parallel_config: carries all parallel sizes read here —
            tensor_parallel_size, pipeline_parallel_size,
            context_parallel_size, moe_tp_size and moe_ep_size.
        backend: torch.distributed backend name; defaults to the backend
            of the world group.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: get parallel_size from parallel_config and valid world_size
    '''
    tensor_model_parallel_size = parallel_config.tensor_parallel_size
    pipeline_model_parallel_size = parallel_config.pipeline_parallel_size
    context_model_parallel_size = parallel_config.context_parallel_size
    moe_tensor_parallel_size = parallel_config.moe_tp_size
    moe_expert_parallel_size = parallel_config.moe_ep_size

    # TP x PP x CP must exactly tile the world.
    if (world_size !=
            tensor_model_parallel_size * pipeline_model_parallel_size * context_model_parallel_size):
        raise RuntimeError(
            f"world_size ({world_size}) is not equal to "
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
            # Bug fix: a space was missing after this "x", fusing the two
            # size names together in the rendered message.
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size}) x "
            f"context_model_parallel_size ({context_model_parallel_size})")

    # MoE TP x MoE EP must exactly tile the dense TP group.
    if (moe_tensor_parallel_size < 1 or moe_expert_parallel_size < 1 or tensor_model_parallel_size !=
            moe_tensor_parallel_size * moe_expert_parallel_size):
        raise RuntimeError(
            # Bug fix: the message previously interpolated world_size here,
            # misreporting the value actually being validated.
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) is not equal to "
            f"moe_tensor_parallel_size ({moe_tensor_parallel_size}) x "
            f"moe_expert_parallel_size ({moe_expert_parallel_size})")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    # Build the tensor model-parallel groups.
    num_tensor_model_parallel_groups: int = (world_size //
                                             tensor_model_parallel_size)
    assert parallel_state_org._TP is None, ("tensor model parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        ranks = list(
            range(i * tensor_model_parallel_size,
                  (i + 1) * tensor_model_parallel_size))
        group_ranks.append(ranks)

    # message queue broadcaster is only used in tensor model parallel group
    parallel_state_org._TP = init_model_parallel_group(group_ranks,
                                                       get_world_group().local_rank,
                                                       backend,
                                                       use_message_queue_broadcaster=True,
                                                       group_name="tp")

    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = (world_size //
                                               pipeline_model_parallel_size)
    assert parallel_state_org._PP is None, (
        "pipeline model parallel group is already initialized")
    group_ranks = []
    for i in range(num_pipeline_model_parallel_groups):
        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
        group_ranks.append(ranks)
    # pipeline parallel does not need custom allreduce
    parallel_state_org._PP = init_model_parallel_group(group_ranks,
                                                       get_world_group().local_rank,
                                                       backend,
                                                       use_custom_allreduce=False,
                                                       group_name="pp")

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add _CP, _MOE_TP, MOE_EP
    '''
    # Build the context parallel groups.
    num_context_model_parallel_groups: int = (world_size //
                                              context_model_parallel_size)
    global _CP
    assert _CP is None, (
        "context parallel group is already initialized")
    group_ranks = []
    for i in range(num_context_model_parallel_groups):
        # CP peers are spaced tensor_model_parallel_size ranks apart.
        # NOTE(review): this layout looks valid only when
        # pipeline_model_parallel_size == 1; for pp > 1 the iterations with
        # i >= tensor_model_parallel_size appear to revisit earlier ranks —
        # confirm the intended rank layout before running with pp > 1.
        ranks = list(range(i, context_model_parallel_size * tensor_model_parallel_size + i, tensor_model_parallel_size))
        group_ranks.append(ranks)
    # message queue broadcaster is set to be used in context parallel group
    _CP = init_model_parallel_group(group_ranks,
                                    get_world_group().local_rank,
                                    backend,
                                    use_message_queue_broadcaster=True,
                                    group_name="cp")

    # Build the moe tensor parallel groups.
    global _MOE_TP
    assert _MOE_TP is None, ("moe tensor parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        for j in range(moe_expert_parallel_size):
            # Within each dense TP group, MoE-TP peers are strided by the
            # MoE expert-parallel size.
            ranks = list(range(i * tensor_model_parallel_size + j, (i + 1) * tensor_model_parallel_size,
                               moe_expert_parallel_size))
            group_ranks.append(ranks)

    # message queue broadcaster is set to be used in moe tensor parallel group
    _MOE_TP = init_model_parallel_group(group_ranks,
                                        get_world_group().local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_tp")

    # Build the moe expert parallel groups.
    global _MOE_EP
    assert _MOE_EP is None, ("moe expert parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        for j in range(moe_tensor_parallel_size):
            # Bug fix: materialize the range as a list — every other group
            # above passes List[List[int]] to init_model_parallel_group;
            # this one previously appended a bare range object.
            ranks = list(range(i * tensor_model_parallel_size + j * moe_expert_parallel_size,
                               i * tensor_model_parallel_size + (j + 1) * moe_expert_parallel_size))
            group_ranks.append(ranks)

    # message queue broadcaster is set to be used in moe expert parallel group
    _MOE_EP = init_model_parallel_group(group_ranks,
                                        get_world_group().local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_ep")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
|
||||
|
||||
|
||||
def ensure_model_parallel_initialized(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """Helper to initialize model parallel groups if they are not initialized,
    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
    values if the model parallel groups are initialized.
    """
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)
    if not model_parallel_is_initialized():
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace all parallel_size to parallel_config
        '''
        initialize_model_parallel(parallel_config, backend)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        return

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: check parallel_size with prefix parallel_config
    '''
    # Bug fix: ParallelConfig exposes tensor_parallel_size /
    # pipeline_parallel_size (the names read in initialize_model_parallel);
    # the previous tensor_model_parallel_size / pipeline_model_parallel_size
    # attributes do not exist and would raise AttributeError here.
    assert (
        get_tensor_model_parallel_world_size() == parallel_config.tensor_parallel_size
    ), ("tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{parallel_config.tensor_parallel_size=}")
    pp_world_size = get_pp_group().world_size
    assert (pp_world_size == parallel_config.pipeline_parallel_size), (
        "pipeline parallel group already initialized, but of unexpected size: "
        f"{pp_world_size=} vs. "
        f"{parallel_config.pipeline_parallel_size=}")
    cp_world_size = get_cp_group().world_size
    assert (cp_world_size == parallel_config.context_parallel_size), (
        "context parallel group already initialized, but of unexpected size: "
        f"{cp_world_size=} vs. "
        f"{parallel_config.context_parallel_size=}")
    moe_tp_world_size = get_moe_tp_group().world_size
    assert (moe_tp_world_size == parallel_config.moe_tp_size), (
        "moe tensor parallel group already initialized, but of unexpected size: "
        f"{moe_tp_world_size=} vs. "
        f"{parallel_config.moe_tp_size=}")
    moe_ep_world_size = get_moe_ep_group().world_size
    assert (moe_ep_world_size == parallel_config.moe_ep_size), (
        "moe expert parallel group already initialized, but of unexpected size: "
        f"{moe_ep_world_size=} vs. "
        f"{parallel_config.moe_ep_size=}")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
|
||||
|
||||
|
||||
def model_parallel_is_initialized():
    """Check if tensor, pipeline, context, moe parallel groups are initialized."""
    # Bug fix: the upstream helper must be *called* — the bare function
    # object is always truthy, so the previous expression never reflected
    # the upstream TP/PP state. Also drop the duplicated None checks
    # (each group was tested twice against the same variable).
    return (model_parallel_is_initialized_org()
            and _CP is not None
            and _MOE_TP is not None
            and _MOE_EP is not None)
|
||||
|
||||
|
||||
def destroy_model_parallel():
    """Set the groups to none and destroy them.

    Delegates to the upstream destroy first, then tears down the
    MLU-specific context-parallel and MoE groups in order.
    """
    destroy_model_parallel_org()

    module_globals = globals()
    for name in ("_CP", "_MOE_TP", "_MOE_EP"):
        group = module_globals[name]
        if group:
            group.destroy()
        module_globals[name] = None
|
||||
|
||||
|
||||
def get_context_model_parallel_world_size():
    """Return world size for the context parallel group."""
    group = get_cp_group()
    return group.world_size
|
||||
|
||||
|
||||
def get_context_model_parallel_rank():
    """Return my rank for the context parallel group."""
    group = get_cp_group()
    return group.rank_in_group
|
||||
|
||||
|
||||
def get_moe_tensor_parallel_world_size():
    """Return world size for the moe tensor parallel group."""
    group = get_moe_tp_group()
    return group.world_size
|
||||
|
||||
|
||||
def get_moe_tensor_parallel_rank():
    """Return my rank for the moe tensor parallel group."""
    group = get_moe_tp_group()
    return group.rank_in_group
|
||||
|
||||
|
||||
def get_moe_expert_parallel_world_size():
    """Return world size for the moe expert parallel group."""
    group = get_moe_ep_group()
    return group.world_size
|
||||
|
||||
|
||||
def get_moe_expert_parallel_rank():
    """Return my rank for the moe expert parallel group."""
    group = get_moe_ep_group()
    return group.rank_in_group
|
||||
|
||||
|
||||
def get_parallel_world_size_with_group(group):
    """Return world size for the special group.

    Falls back to the default tensor-model-parallel world size when no
    explicit group is supplied.
    """
    if group is None:
        return get_tensor_model_parallel_world_size()
    return group.world_size
|
||||
|
||||
|
||||
def get_parallel_rank_with_group(group):
    """Return my rank for the special group.

    Falls back to the default tensor-model-parallel rank when no explicit
    group is supplied.
    """
    if group is None:
        return get_tensor_model_parallel_rank()
    return group.rank_in_group
|
||||
Reference in New Issue
Block a user