add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
from . import communication_op
from . import parallel_state

View File

@@ -0,0 +1,21 @@
import torch
from typing import Any, Optional

from .parallel_state import get_tp_group


def tensor_model_parallel_all_reduce(input_: torch.Tensor,
                                     tp_group: Any = None) -> torch.Tensor:
    """All-reduce the input tensor across the model parallel group."""
    return get_tp_group(tp_group).all_reduce(input_)


def tensor_model_parallel_all_gather(input_: torch.Tensor,
                                     dim: int = -1,
                                     tp_group: Any = None) -> torch.Tensor:
    """All-gather the input tensor across the model parallel group."""
    return get_tp_group(tp_group).all_gather(input_, dim)


def tensor_model_parallel_gather(input_: torch.Tensor,
                                 dst: int = 0,
                                 dim: int = -1,
                                 tp_group: Any = None) -> Optional[torch.Tensor]:
    """Gather the input tensor across the model parallel group.

    Only the destination rank receives the gathered tensor; other ranks
    get None, hence the Optional return type.
    """
    return get_tp_group(tp_group).gather(input_, dst, dim)
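
A minimal usage sketch (an illustration assumed here, not part of this commit): a row-parallel matmul computes a partial product against each rank's weight shard, then sums the partials with tensor_model_parallel_all_reduce.

import torch
from .communication_op import tensor_model_parallel_all_reduce

def row_parallel_linear(x: torch.Tensor, w_shard: torch.Tensor) -> torch.Tensor:
    # Each TP rank holds one shard of the weight; the matmul yields a
    # partial result that the all-reduce sums across the TP group.
    partial = torch.matmul(x, w_shard)
    return tensor_model_parallel_all_reduce(partial)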

View File

@@ -0,0 +1,339 @@
import torch
from typing import Any, Optional

from vllm.config import ParallelConfig
from vllm.distributed.parallel_state import (GroupCoordinator,
                                             get_pp_group,
                                             get_tensor_model_parallel_rank,
                                             get_tensor_model_parallel_world_size,
                                             get_world_group,
                                             init_model_parallel_group)
import vllm.distributed.parallel_state as parallel_state_org
from vllm.distributed.parallel_state import (
    model_parallel_is_initialized as model_parallel_is_initialized_org)
from vllm.distributed.parallel_state import (
    destroy_model_parallel as destroy_model_parallel_org)


def get_tp_group(tp_group: Any = None) -> GroupCoordinator:
    if tp_group is not None:
        return tp_group
    assert parallel_state_org._TP is not None, (
        "tensor model parallel group is not initialized")
    return parallel_state_org._TP
_CP: Optional[GroupCoordinator] = None


def get_cp_group() -> GroupCoordinator:
    assert _CP is not None, ("context parallel group is not initialized")
    return _CP


# kept for backward compatibility
get_context_model_parallel_group = get_cp_group

_MOE_TP: Optional[GroupCoordinator] = None


def get_moe_tp_group() -> GroupCoordinator:
    assert _MOE_TP is not None, (
        "moe tensor parallel group is not initialized")
    return _MOE_TP


# kept for backward compatibility
get_moe_tensor_parallel_group = get_moe_tp_group

_MOE_EP: Optional[GroupCoordinator] = None


def get_moe_ep_group() -> GroupCoordinator:
    assert _MOE_EP is not None, (
        "moe expert parallel group is not initialized")
    return _MOE_EP


# kept for backward compatibility
get_moe_expert_parallel_group = get_moe_ep_group
def initialize_model_parallel(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """
    Initialize model parallel groups.

    Arguments:
        parallel_config: parallel configuration carrying the tensor,
            pipeline, context, moe tensor and moe expert parallel sizes.
        backend: torch.distributed backend to use; defaults to the backend
            of the world group.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will create 4 tensor
    model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, ranks 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: get the parallel sizes from parallel_config and validate world_size
    '''
    tensor_model_parallel_size = parallel_config.tensor_parallel_size
    pipeline_model_parallel_size = parallel_config.pipeline_parallel_size
    context_model_parallel_size = parallel_config.context_parallel_size
    moe_tensor_parallel_size = parallel_config.moe_tp_size
    moe_expert_parallel_size = parallel_config.moe_ep_size
    if (world_size != tensor_model_parallel_size *
            pipeline_model_parallel_size * context_model_parallel_size):
        raise RuntimeError(
            f"world_size ({world_size}) is not equal to "
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size}) x "
            f"context_model_parallel_size ({context_model_parallel_size})")
    if (moe_tensor_parallel_size < 1 or moe_expert_parallel_size < 1
            or tensor_model_parallel_size !=
            moe_tensor_parallel_size * moe_expert_parallel_size):
        raise RuntimeError(
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) is "
            f"not equal to moe_tensor_parallel_size "
            f"({moe_tensor_parallel_size}) x "
            f"moe_expert_parallel_size ({moe_expert_parallel_size})")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Build the tensor model-parallel groups.
    num_tensor_model_parallel_groups: int = (world_size //
                                             tensor_model_parallel_size)
    assert parallel_state_org._TP is None, (
        "tensor model parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        ranks = list(
            range(i * tensor_model_parallel_size,
                  (i + 1) * tensor_model_parallel_size))
        group_ranks.append(ranks)
    # message queue broadcaster is only used in tensor model parallel group
    parallel_state_org._TP = init_model_parallel_group(
        group_ranks,
        get_world_group().local_rank,
        backend,
        use_message_queue_broadcaster=True,
        group_name="tp")

    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = (world_size //
                                               pipeline_model_parallel_size)
    assert parallel_state_org._PP is None, (
        "pipeline model parallel group is already initialized")
    group_ranks = []
    for i in range(num_pipeline_model_parallel_groups):
        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
        group_ranks.append(ranks)
    # pipeline parallel does not need custom allreduce
    parallel_state_org._PP = init_model_parallel_group(
        group_ranks,
        get_world_group().local_rank,
        backend,
        use_custom_allreduce=False,
        group_name="pp")
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add _CP, _MOE_TP, _MOE_EP
    '''
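    # Illustrative example (an assumption, not from the original comment):
    # with tensor_model_parallel_size=2 and context_model_parallel_size=2 on
    # 4 GPUs, the loop below yields CP groups [g0, g2] and [g1, g3]; members
    # of a CP group are spaced tensor_model_parallel_size ranks apart.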
    # Build the context parallel groups.
    num_context_model_parallel_groups: int = (world_size //
                                              context_model_parallel_size)
    global _CP
    assert _CP is None, ("context parallel group is already initialized")
    group_ranks = []
    for i in range(num_context_model_parallel_groups):
        ranks = list(
            range(i,
                  context_model_parallel_size * tensor_model_parallel_size + i,
                  tensor_model_parallel_size))
        group_ranks.append(ranks)
    # message queue broadcaster is set to be used in context parallel group
    _CP = init_model_parallel_group(group_ranks,
                                    get_world_group().local_rank,
                                    backend,
                                    use_message_queue_broadcaster=True,
                                    group_name="cp")
    # Build the moe tensor parallel groups.
    global _MOE_TP
    assert _MOE_TP is None, (
        "moe tensor parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        for j in range(moe_expert_parallel_size):
            ranks = list(
                range(i * tensor_model_parallel_size + j,
                      (i + 1) * tensor_model_parallel_size,
                      moe_expert_parallel_size))
            group_ranks.append(ranks)
    # message queue broadcaster is set to be used in moe tensor parallel group
    _MOE_TP = init_model_parallel_group(group_ranks,
                                        get_world_group().local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_tp")
    # Build the moe expert parallel groups.
    global _MOE_EP
    assert _MOE_EP is None, (
        "moe expert parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        for j in range(moe_tensor_parallel_size):
            ranks = list(
                range(i * tensor_model_parallel_size +
                      j * moe_expert_parallel_size,
                      i * tensor_model_parallel_size +
                      (j + 1) * moe_expert_parallel_size))
            group_ranks.append(ranks)
    # message queue broadcaster is set to be used in moe expert parallel group
    _MOE_EP = init_model_parallel_group(group_ranks,
                                        get_world_group().local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_ep")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
def ensure_model_parallel_initialized(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """Helper to initialize model parallel groups if they are not initialized,
    or ensure the parallel sizes equal the expected values from
    parallel_config if the model parallel groups are initialized.
    """
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)
    if not model_parallel_is_initialized():
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace the individual parallel sizes with parallel_config
        '''
        initialize_model_parallel(parallel_config, backend)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        return
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: check each parallel size against parallel_config
    '''
    assert (
        get_tensor_model_parallel_world_size() ==
        parallel_config.tensor_parallel_size
    ), ("tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{parallel_config.tensor_parallel_size=}")
    pp_world_size = get_pp_group().world_size
    assert (pp_world_size == parallel_config.pipeline_parallel_size), (
        "pipeline parallel group already initialized, but of unexpected size: "
        f"{pp_world_size=} vs. "
        f"{parallel_config.pipeline_parallel_size=}")
    cp_world_size = get_cp_group().world_size
    assert (cp_world_size == parallel_config.context_parallel_size), (
        "context parallel group already initialized, but of unexpected size: "
        f"{cp_world_size=} vs. "
        f"{parallel_config.context_parallel_size=}")
    moe_tp_world_size = get_moe_tp_group().world_size
    assert (moe_tp_world_size == parallel_config.moe_tp_size), (
        "moe tensor parallel group already initialized, but of unexpected "
        f"size: {moe_tp_world_size=} vs. "
        f"{parallel_config.moe_tp_size=}")
    moe_ep_world_size = get_moe_ep_group().world_size
    assert (moe_ep_world_size == parallel_config.moe_ep_size), (
        "moe expert parallel group already initialized, but of unexpected "
        f"size: {moe_ep_world_size=} vs. "
        f"{parallel_config.moe_ep_size=}")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
def model_parallel_is_initialized():
    """Check if the tensor, pipeline, context and moe parallel groups are
    initialized."""
    return (model_parallel_is_initialized_org() and _CP is not None
            and _MOE_TP is not None and _MOE_EP is not None)
def destroy_model_parallel():
    """Set the groups to None and destroy them."""
    destroy_model_parallel_org()
    global _CP
    if _CP:
        _CP.destroy()
    _CP = None
    global _MOE_TP
    if _MOE_TP:
        _MOE_TP.destroy()
    _MOE_TP = None
    global _MOE_EP
    if _MOE_EP:
        _MOE_EP.destroy()
    _MOE_EP = None
def get_context_model_parallel_world_size():
    """Return world size for the context parallel group."""
    return get_cp_group().world_size


def get_context_model_parallel_rank():
    """Return my rank for the context parallel group."""
    return get_cp_group().rank_in_group


def get_moe_tensor_parallel_world_size():
    """Return world size for the moe tensor parallel group."""
    return get_moe_tp_group().world_size


def get_moe_tensor_parallel_rank():
    """Return my rank for the moe tensor parallel group."""
    return get_moe_tp_group().rank_in_group


def get_moe_expert_parallel_world_size():
    """Return world size for the moe expert parallel group."""
    return get_moe_ep_group().world_size


def get_moe_expert_parallel_rank():
    """Return my rank for the moe expert parallel group."""
    return get_moe_ep_group().rank_in_group


def get_parallel_world_size_with_group(group):
    """Return world size for the given group, falling back to the tensor
    parallel group when group is None."""
    if group is not None:
        return group.world_size
    return get_tensor_model_parallel_world_size()


def get_parallel_rank_with_group(group):
    """Return my rank for the given group, falling back to the tensor
    parallel group when group is None."""
    if group is not None:
        return group.rank_in_group
    return get_tensor_model_parallel_rank()
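
For intuition about the four group layouts built above, here is a standalone sketch (pure Python, no torch.distributed; the sizes are illustrative assumptions, not defaults from this commit) that reproduces the same rank arithmetic:

# world_size = tp * cp with pipeline size 1; tp = moe_tp * moe_ep.
world_size, tp, cp, moe_tp, moe_ep = 8, 4, 2, 2, 2
assert world_size == tp * cp and tp == moe_tp * moe_ep

tp_groups = [list(range(i * tp, (i + 1) * tp))
             for i in range(world_size // tp)]
cp_groups = [list(range(i, cp * tp + i, tp))
             for i in range(world_size // cp)]
moe_tp_groups = [list(range(i * tp + j, (i + 1) * tp, moe_ep))
                 for i in range(world_size // tp) for j in range(moe_ep)]
moe_ep_groups = [list(range(i * tp + j * moe_ep, i * tp + (j + 1) * moe_ep))
                 for i in range(world_size // tp) for j in range(moe_tp)]

print(tp_groups)      # [[0, 1, 2, 3], [4, 5, 6, 7]]
print(cp_groups)      # [[0, 4], [1, 5], [2, 6], [3, 7]]
print(moe_tp_groups)  # [[0, 2], [1, 3], [4, 6], [5, 7]]
print(moe_ep_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]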