add qwen3

Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,17 @@
import logging
from logging import Logger
def init_logger(name: str) -> Logger:
"""Initialize loggers for benchmarks module,
and keep the configuration consistent with the vllm module"""
logger = logging.getLogger(name)
vllm_logger = logging.Logger.manager.loggerDict.get('vllm', None)
if vllm_logger:
logger.setLevel(vllm_logger.level)
logger.propagate = vllm_logger.propagate
logger.handlers = vllm_logger.handlers
return logger
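A minimal usage sketch, assuming a hypothetical benchmark script and module path (the file name is not shown in this diff); init_logger simply mirrors whatever level, propagation flag and handlers the 'vllm' logger already has:

# Illustration only; the import path `benchmarks.logger` is an assumption.
from benchmarks.logger import init_logger

logger = init_logger(__name__)
logger.info("benchmark started")  # emitted through the handlers inherited from the 'vllm' logger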

View File

@@ -0,0 +1,110 @@
import torch
from vllm.config import ParallelConfig, TokenizerPoolConfig
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless
from vllm.platforms import current_platform
from vllm_mlu.mlu_hijack_utils import MluHijackObject
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
from vllm.executor.executor_base import ExecutorBase
logger = init_logger(__name__)
def vllm__config__ParallelConfig___init__(
self,
pipeline_parallel_size: int,
tensor_parallel_size: int,
worker_use_ray: Optional[bool] = None,
max_parallel_loading_workers: Optional[int] = None,
disable_custom_all_reduce: bool = False,
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
ray_workers_use_nsight: bool = False,
placement_group: Optional["PlacementGroup"] = None,
distributed_executor_backend: Optional[Union[
str, Type["ExecutorBase"]]] = None,
) -> None:
self.pipeline_parallel_size = pipeline_parallel_size
self.tensor_parallel_size = tensor_parallel_size
self.distributed_executor_backend = distributed_executor_backend
self.max_parallel_loading_workers = max_parallel_loading_workers
self.disable_custom_all_reduce = disable_custom_all_reduce
self.tokenizer_pool_config = tokenizer_pool_config
self.ray_workers_use_nsight = ray_workers_use_nsight
self.placement_group = placement_group
'''
==========================
Modify by vllm_mlu
==========================
@brief: modify world_size
'''
# Promote the class-level attributes set by the arg_utils hijack onto this instance.
self.context_parallel_size = self.context_parallel_size
self.moe_tp_size = self.moe_tp_size
self.moe_ep_size = self.moe_ep_size
self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
'''
=======================
End of MLU Hijack
=======================
'''
if worker_use_ray:
if self.distributed_executor_backend is None:
self.distributed_executor_backend = "ray"
elif not self.use_ray:
raise ValueError(f"worker-use-ray can't be used with "
f"distributed executor backend "
f"'{self.distributed_executor_backend}'.")
if current_platform.is_tpu() and self.world_size > 1:
if self.distributed_executor_backend is None:
self.distributed_executor_backend = "ray"
if self.distributed_executor_backend != "ray":
raise ValueError(
"TPU backend only supports Ray for distributed inference.")
if current_platform.is_hpu() and self.world_size > 1:
if self.distributed_executor_backend is None:
self.distributed_executor_backend = "ray"
if self.distributed_executor_backend != "ray":
raise ValueError(
"HPU backend only supports Ray for distributed inference.")
if self.distributed_executor_backend is None and self.world_size > 1:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
from vllm.executor import ray_utils
backend = "mp"
ray_found = ray_utils.ray_is_available()
if (current_platform.is_cuda()
and cuda_device_count_stateless() < self.world_size):
if not ray_found:
raise ValueError("Unable to load Ray which is "
"required for multi-node inference, "
"please install Ray with `pip install "
"ray`.") from ray_utils.ray_import_err
backend = "ray"
elif ray_found:
if self.placement_group:
backend = "ray"
else:
from ray import is_initialized as ray_is_initialized
if ray_is_initialized():
from ray.util import get_current_placement_group
if get_current_placement_group():
backend = "ray"
self.distributed_executor_backend = backend
logger.info("Defaulting to use %s for distributed inference",
backend)
self._verify_args()
self.rank: int = 0
MluHijackObject.apply_hijack(ParallelConfig,
ParallelConfig.__init__,
vllm__config__ParallelConfig___init__)
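For reference, every hijack in this commit follows the pattern apply_hijack(cls, original_member, replacement). A minimal sketch of what such a helper could look like is below; the real MluHijackObject in vllm_mlu may differ (for example it may support restoring the original), so treat this purely as an illustration of the monkey-patching idea. The string form of the second argument mirrors the calls later in this commit that pass an attribute name instead of a method:

# Illustrative sketch only -- not the actual vllm_mlu implementation.
class _HijackSketch:
    _originals = []  # remember (cls, name, original) so patches could be undone

    @staticmethod
    def apply_hijack(cls, original, replacement):
        # `original` may be the class member itself or just its attribute name.
        name = original if isinstance(original, str) else original.__name__
        _HijackSketch._originals.append((cls, name, getattr(cls, name, None)))
        setattr(cls, name, replacement)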

View File

@@ -0,0 +1,2 @@
from . import communication_op
from . import parallel_state

View File

@@ -0,0 +1,21 @@
import torch
from typing import Any, Dict, Optional, Union
from .parallel_state import get_tp_group
def tensor_model_parallel_all_reduce(input_: torch.Tensor, tp_group: Any = None) -> torch.Tensor:
"""All-reduce the input tensor across model parallel group."""
return get_tp_group(tp_group).all_reduce(input_)
def tensor_model_parallel_all_gather(input_: torch.Tensor,
dim: int = -1, tp_group: Any = None) -> torch.Tensor:
"""All-gather the input tensor across model parallel group."""
return get_tp_group(tp_group).all_gather(input_, dim)
def tensor_model_parallel_gather(input_: torch.Tensor,
dst: int = 0,
dim: int = -1, tp_group: Any = None) -> Optional[torch.Tensor]:
"""Gather the input tensor across model parallel group."""
return get_tp_group(tp_group).gather(input_, dst, dim)
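These wrappers differ from the stock vLLM helpers only in the optional tp_group argument; when it is None they fall back to the global tensor-parallel group. A hedged usage sketch, assuming the sketch lives next to these modules inside the package, that distributed and model-parallel groups are already initialized, and that "mlu" is the device string on this platform:

import torch
from .parallel_state import get_moe_tp_group
from .communication_op import tensor_model_parallel_all_reduce

x = torch.ones(4, device="mlu")
y = tensor_model_parallel_all_reduce(x)                               # global TP group
z = tensor_model_parallel_all_reduce(x, tp_group=get_moe_tp_group())  # explicit MoE TP group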

View File

@@ -0,0 +1,339 @@
import torch
from typing import Any, Dict, List, Optional, Tuple, Union
from vllm.config import ParallelConfig
from vllm.distributed.parallel_state import (init_model_parallel_group, get_tensor_model_parallel_world_size,
get_tensor_model_parallel_rank, get_world_group, get_pp_group,
GroupCoordinator)
import vllm.distributed.parallel_state as parallel_state_org
from vllm.distributed.parallel_state import model_parallel_is_initialized as model_parallel_is_initialized_org
from vllm.distributed.parallel_state import destroy_model_parallel as destroy_model_parallel_org
def get_tp_group(tp_group: Any = None) -> GroupCoordinator:
if tp_group is not None:
return tp_group
assert parallel_state_org._TP is not None, ("tensor model parallel group is not initialized")
return parallel_state_org._TP
_CP: Optional[GroupCoordinator] = None
def get_cp_group() -> GroupCoordinator:
assert _CP is not None, ("context parallel group is not initialized")
return _CP
# kept for backward compatibility
get_context_model_parallel_group = get_cp_group
_MOE_TP: Optional[GroupCoordinator] = None
def get_moe_tp_group() -> GroupCoordinator:
assert _MOE_TP is not None, ("moe tensor parallel group is not initialized")
return _MOE_TP
# kept for backward compatibility
get_moe_tensor_parallel_group = get_moe_tp_group
_MOE_EP: Optional[GroupCoordinator] = None
def get_moe_ep_group() -> GroupCoordinator:
assert _MOE_EP is not None, ("moe expert parallel group is not initialized")
return _MOE_EP
# kept for backward compatibility
get_moe_expert_parallel_group = get_moe_ep_group
def initialize_model_parallel(
parallel_config: ParallelConfig,
backend: Optional[str] = None,
) -> None:
"""
Initialize model parallel groups.
Arguments:
tensor_model_parallel_size: number of GPUs used for tensor model
parallelism.
pipeline_model_parallel_size: number of GPUs used for pipeline model
parallelism.
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
the model pipeline. The present function will
create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
4 tensor model-parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7]
2 pipeline model-parallel groups:
[g0, g2, g4, g6], [g1, g3, g5, g7]
Note that for efficiency, the caller should make sure adjacent ranks
are on the same DGX box. For example if we are using 2 DGX-1 boxes
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
ranks 8 to 15 belong to the second box.
"""
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
world_size: int = torch.distributed.get_world_size()
backend = backend or torch.distributed.get_backend(
get_world_group().device_group)
'''
=============================
Modify by vllm_mlu
=============================
@brief: get parallel_size from parallel_config and valid world_size
'''
tensor_model_parallel_size = parallel_config.tensor_parallel_size
pipeline_model_parallel_size = parallel_config.pipeline_parallel_size
context_model_parallel_size = parallel_config.context_parallel_size
moe_tensor_parallel_size = parallel_config.moe_tp_size
moe_expert_parallel_size = parallel_config.moe_ep_size
if (world_size !=
tensor_model_parallel_size * pipeline_model_parallel_size * context_model_parallel_size):
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size}) x "
f"context_model_parallel_size ({context_model_parallel_size})")
if (moe_tensor_parallel_size < 1 or moe_expert_parallel_size < 1 or tensor_model_parallel_size !=
moe_tensor_parallel_size * moe_expert_parallel_size):
raise RuntimeError(
f"tensor_model_parallel_size ({tensor_model_parallel_size}) is not equal to "
f"moe_tensor_parallel_size ({moe_tensor_parallel_size}) x "
f"moe_expert_parallel_size ({moe_expert_parallel_size})")
'''
==================
End of MLU Hijack
==================
'''
# Build the tensor model-parallel groups.
num_tensor_model_parallel_groups: int = (world_size //
tensor_model_parallel_size)
assert parallel_state_org._TP is None, ("tensor model parallel group is already initialized")
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
ranks = list(
range(i * tensor_model_parallel_size,
(i + 1) * tensor_model_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is only used in tensor model parallel group
parallel_state_org._TP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="tp")
# Build the pipeline model-parallel groups.
num_pipeline_model_parallel_groups: int = (world_size //
pipeline_model_parallel_size)
assert parallel_state_org._PP is None, (
"pipeline model parallel group is already initialized")
group_ranks = []
for i in range(num_pipeline_model_parallel_groups):
ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
group_ranks.append(ranks)
# pipeline parallel does not need custom allreduce
parallel_state_org._PP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_custom_allreduce=False,
group_name="pp")
'''
=============================
Modify by vllm_mlu
=============================
@brief: add _CP, _MOE_TP, MOE_EP
'''
# Build the context parallel groups.
num_context_model_parallel_groups: int = (world_size //
context_model_parallel_size)
global _CP
assert _CP is None, (
"context parallel group is already initialized")
group_ranks = []
for i in range(num_context_model_parallel_groups):
ranks = list(range(i, context_model_parallel_size * tensor_model_parallel_size + i, tensor_model_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is set to be used in context parallel group
_CP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="cp")
# Build the moe tensor parallel groups.
global _MOE_TP
assert _MOE_TP is None, ("moe tensor parallel group is already initialized")
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
for j in range(moe_expert_parallel_size):
ranks = list(range(i * tensor_model_parallel_size + j, (i + 1) * tensor_model_parallel_size,
moe_expert_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is set to be used in moe tensor parallel group
_MOE_TP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="moe_tp")
# Build the moe expert parallel groups.
global _MOE_EP
assert _MOE_EP is None, ("moe expert parallel group is already initialized")
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
for j in range(moe_tensor_parallel_size):
ranks = list(range(i * tensor_model_parallel_size + j * moe_expert_parallel_size,
i * tensor_model_parallel_size + (j + 1) * moe_expert_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is set to be used in moe expert parallel group
_MOE_EP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="moe_ep")
'''
==================
End of MLU Hijack
==================
'''
def ensure_model_parallel_initialized(
parallel_config: ParallelConfig,
backend: Optional[str] = None,
) -> None:
"""Helper to initialize model parallel groups if they are not initialized,
or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
values if the model parallel groups are initialized.
"""
backend = backend or torch.distributed.get_backend(
get_world_group().device_group)
if not model_parallel_is_initialized():
'''
=============================
Modify by vllm_mlu
=============================
@brief: replace all parallel_size to parallel_config
'''
initialize_model_parallel(parallel_config, backend)
'''
==================
End of MLU Hijack
==================
'''
return
'''
=============================
Modify by vllm_mlu
=============================
@brief: check parallel_size with prefix parallel_config
'''
assert (
get_tensor_model_parallel_world_size() == parallel_config.tensor_parallel_size
), ("tensor parallel group already initialized, but of unexpected size: "
f"{get_tensor_model_parallel_world_size()=} vs. "
f"{parallel_config.tensor_parallel_size=}")
pp_world_size = get_pp_group().world_size
assert (pp_world_size == parallel_config.pipeline_parallel_size), (
"pipeline parallel group already initialized, but of unexpected size: "
f"{pp_world_size=} vs. "
f"{parallel_config.pipeline_parallel_size=}")
cp_world_size = get_cp_group().world_size
assert (cp_world_size == parallel_config.context_parallel_size), (
"context parallel group already initialized, but of unexpected size: "
f"{cp_world_size=} vs. "
f"{parallel_config.context_parallel_size=}")
moe_tp_world_size = get_moe_tp_group().world_size
assert (moe_tp_world_size == parallel_config.moe_tp_size), (
"moe tensor parallel group already initialized, but of unexpected size: "
f"{moe_tp_world_size=} vs. "
f"{parallel_config.moe_tp_size=}")
moe_ep_world_size = get_moe_ep_group().world_size
assert (moe_ep_world_size == parallel_config.moe_ep_size), (
"moe expert parallel group already initialized, but of unexpected size: "
f"{moe_ep_world_size=} vs. "
f"{parallel_config.moe_ep_size=}")
'''
==================
End of MLU Hijack
==================
'''
def model_parallel_is_initialized():
"""Check if tensor, pipeline, context and moe parallel groups are initialized."""
return (model_parallel_is_initialized_org() and _CP is not None
and _MOE_TP is not None and _MOE_EP is not None)
def destroy_model_parallel():
"""Set the groups to none and destroy them."""
destroy_model_parallel_org()
global _CP
if _CP:
_CP.destroy()
_CP = None
global _MOE_TP
if _MOE_TP:
_MOE_TP.destroy()
_MOE_TP = None
global _MOE_EP
if _MOE_EP:
_MOE_EP.destroy()
_MOE_EP = None
def get_context_model_parallel_world_size():
"""Return world size for the context parallel group."""
return get_cp_group().world_size
def get_context_model_parallel_rank():
"""Return my rank for the context parallel group."""
return get_cp_group().rank_in_group
def get_moe_tensor_parallel_world_size():
"""Return world size for the moe tensor parallel group."""
return get_moe_tp_group().world_size
def get_moe_tensor_parallel_rank():
"""Return my rank for the moe tensor parallel group."""
return get_moe_tp_group().rank_in_group
def get_moe_expert_parallel_world_size():
"""Return world size for the moe expert parallel group."""
return get_moe_ep_group().world_size
def get_moe_expert_parallel_rank():
"""Return my rank for the moe expert parallel group."""
return get_moe_ep_group().rank_in_group
def get_parallel_world_size_with_group(group):
"""Return world size for the special group."""
if group is not None:
return group.world_size
else:
return get_tensor_model_parallel_world_size()
def get_parallel_rank_with_group(group):
"""Return my rank for the special group."""
if group is not None:
return group.rank_in_group
else:
return get_tensor_model_parallel_rank()
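To make the group layout above concrete, here is a small standalone sketch that replays the rank-grouping loops from initialize_model_parallel for an assumed configuration of world_size=8 with tp=4, pp=1, cp=2, moe_tp=2, moe_ep=2 (so tp = moe_tp x moe_ep). It is a restatement for reference, not part of the module:

# Standalone illustration of the grouping math above (no torch.distributed needed).
tp, pp, cp, moe_tp, moe_ep = 4, 1, 2, 2, 2
world_size = tp * pp * cp  # 8

tp_groups = [list(range(i * tp, (i + 1) * tp)) for i in range(world_size // tp)]
cp_groups = [list(range(i, cp * tp + i, tp)) for i in range(world_size // cp)]
moe_tp_groups = [list(range(i * tp + j, (i + 1) * tp, moe_ep))
                 for i in range(world_size // tp) for j in range(moe_ep)]
moe_ep_groups = [list(range(i * tp + j * moe_ep, i * tp + (j + 1) * moe_ep))
                 for i in range(world_size // tp) for j in range(moe_tp)]

print(tp_groups)      # [[0, 1, 2, 3], [4, 5, 6, 7]]
print(cp_groups)      # [[0, 4], [1, 5], [2, 6], [3, 7]]
print(moe_tp_groups)  # [[0, 2], [1, 3], [4, 6], [5, 7]]
print(moe_ep_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]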

View File

@@ -0,0 +1 @@
from . import arg_utils

View File

@@ -0,0 +1,141 @@
import argparse
import torch
from vllm.config import VllmConfig, ParallelConfig
from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
logger = init_logger(__name__)
vllm__engine__arg_utils__EngineArgs__create_engine_config_org = EngineArgs.create_engine_config
vllm__engine__arg_utils__EngineArgs__add_cli_args_org = EngineArgs.add_cli_args
vllm__engine__arg_utils__EngineArgs__from_cli_args_org = EngineArgs.from_cli_args
vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org = AsyncEngineArgs.from_cli_args
def vllm__engine__arg_utils__EngineArgs__create_engine_config(self) -> VllmConfig:
'''
=============================
Modify by vllm_mlu
=============================
@brief: chunked pipeline parallel only supports batch size 1 for now.
'''
if CHUNKED_PIPELINE_PARALLEL_EN:
self.max_num_seqs = 1
logger.info("Reset max_num_seqs to 1 because chunked pipeline parallel mode "
"only supports a batch size of 1.")
'''
@brief: disable custom_all_reduce, and re-set block_size to support paged and unpaged modes.
'''
# MLU does not support custom all-reduce
self.disable_custom_all_reduce = True
BlockSizeInfo.set_block_size(self.block_size)
if not USE_PAGED and self.enable_chunked_prefill:
raise ValueError("chunked_prefill is not supported in unpaged mode.")
# set parallel_config context_parallel_size, moe_tp_size, moe_ep_size
self.context_parallel_size = getattr(self, "context_parallel_size", 1)
self.moe_tp_size = getattr(self, "moe_tp_size", -1)
self.moe_ep_size = getattr(self, "moe_ep_size", -1)
# check whether context parallel is supported
if CONTEXT_PARALLEL_EN:
if self.context_parallel_size > 1 and get_device_major_capability() == 3:
raise ValueError('Context parallel is not supported on MLU370.')
else:
if self.context_parallel_size > 1:
raise ValueError('Context parallel is not supported when CONTEXT_PARALLEL_EN=False.')
# check whether expert parallel is supported
if not EXPERT_PARALLEL_EN and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
raise ValueError('Expert parallel is not supported when EXPERT_PARALLEL_EN=False.')
ParallelConfig.context_parallel_size = self.context_parallel_size
# set parallel_config moe_tp_size and moe_ep_size
if self.moe_tp_size < 1 and self.moe_ep_size < 1:
moe_tp_size = self.tensor_parallel_size
moe_ep_size = 1
elif self.moe_tp_size >= 1 and self.moe_ep_size < 1:
moe_tp_size = self.moe_tp_size
moe_ep_size = self.tensor_parallel_size // self.moe_tp_size
elif self.moe_tp_size < 1 and self.moe_ep_size >= 1:
moe_tp_size = self.tensor_parallel_size // self.moe_ep_size
moe_ep_size = self.moe_ep_size
else:
moe_tp_size = self.moe_tp_size
moe_ep_size = self.moe_ep_size
assert moe_tp_size * moe_ep_size == self.tensor_parallel_size, (
f"tensor_parallel_size ({self.tensor_parallel_size}) is not equal to "
f"moe_tp_size ({self.moe_tp_size}) x moe_ep_size ({self.moe_ep_size}); "
"alternatively, moe_tp_size and moe_ep_size may both be -1, or exactly one of them may be -1")
ParallelConfig.moe_tp_size = moe_tp_size
ParallelConfig.moe_ep_size = moe_ep_size
engine_config = vllm__engine__arg_utils__EngineArgs__create_engine_config_org(self)
engine_config.cache_config.block_size = BlockSizeInfo.BLOCK_SIZE
'''
==================
End of MLU Hijack
==================
'''
return engine_config
@staticmethod
def vllm__engine__arg_utils__EngineArgs__add_cli_args(
parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser = vllm__engine__arg_utils__EngineArgs__add_cli_args_org(parser)
'''
=============================
Modify by vllm_mlu
=============================
@brief: add --context-parallel-size, --moe-tp-size and --moe-ep-size
'''
parser.add_argument('--context-parallel-size',
'-cp',
type=int,
default=1,
help='number of context parallel replicas')
parser.add_argument('--moe-tp-size',
type=int,
default=-1,
help='Number of moe tensor parallel replicas')
parser.add_argument('--moe-ep-size',
type=int,
default=-1,
help='Number of moe expert parallel replicas')
'''
==================
End of MLU Hijack
==================
'''
return parser
@classmethod
def vllm__engine__arg_utils__EngineArgs__from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
if cls == AsyncEngineArgs:
engine_args = vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org(args)
else:
engine_args = vllm__engine__arg_utils__EngineArgs__from_cli_args_org(args)
setattr(engine_args, 'context_parallel_size', getattr(args, "context_parallel_size"))
setattr(engine_args, 'moe_tp_size', getattr(args, "moe_tp_size"))
setattr(engine_args, 'moe_ep_size', getattr(args, "moe_ep_size"))
return engine_args
MluHijackObject.apply_hijack(EngineArgs,
EngineArgs.create_engine_config,
vllm__engine__arg_utils__EngineArgs__create_engine_config)
MluHijackObject.apply_hijack(EngineArgs,
EngineArgs.add_cli_args,
vllm__engine__arg_utils__EngineArgs__add_cli_args)
MluHijackObject.apply_hijack(EngineArgs,
EngineArgs.from_cli_args,
vllm__engine__arg_utils__EngineArgs__from_cli_args)
MluHijackObject.apply_hijack(AsyncEngineArgs,
AsyncEngineArgs.from_cli_args,
vllm__engine__arg_utils__EngineArgs__from_cli_args)
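The moe_tp_size / moe_ep_size defaulting logic inside create_engine_config above can be summarised by the small helper below. It is only a restatement for reference; resolve_moe_sizes is a hypothetical name, not part of the package:

def resolve_moe_sizes(tensor_parallel_size: int, moe_tp_size: int, moe_ep_size: int):
    """Mirror of the fallback rule above: -1 means 'derive from the other value'."""
    if moe_tp_size < 1 and moe_ep_size < 1:
        moe_tp_size, moe_ep_size = tensor_parallel_size, 1
    elif moe_tp_size >= 1 and moe_ep_size < 1:
        moe_ep_size = tensor_parallel_size // moe_tp_size
    elif moe_tp_size < 1 and moe_ep_size >= 1:
        moe_tp_size = tensor_parallel_size // moe_ep_size
    assert moe_tp_size * moe_ep_size == tensor_parallel_size
    return moe_tp_size, moe_ep_size

# e.g. resolve_moe_sizes(8, -1, -1) -> (8, 1); resolve_moe_sizes(8, 2, -1) -> (2, 4)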

View File

@@ -0,0 +1 @@
from . import llm

View File

@@ -0,0 +1,98 @@
from typing import Optional, Dict, Any
from vllm.entrypoints.llm import LLM
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.logger import init_logger
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
TaskOption)
logger = init_logger(__name__)
vllm__entrypoints__llm__LLM____init__org = LLM.__init__
def vllm__entrypoints__llm__LLM____init__(
self,
model: str,
tokenizer: Optional[str] = None,
tokenizer_mode: str = "auto",
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
allowed_local_media_path: str = "",
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,
enforce_eager: Optional[bool] = None,
max_seq_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
disable_async_output_proc: bool = False,
hf_overrides: Optional[HfOverrides] = None,
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
# After positional args are removed, move this right below `model`
task: TaskOption = "auto",
override_pooler_config: Optional[PoolerConfig] = None,
**kwargs,
) -> None:
'''
LLM constructor.
Note: if enforce_eager is unset (enforce_eager is None)
it defaults to False.
'''
'''
=============================
Modify by vllm_mlu
=============================
@brief: add cp and ep parameter
'''
# pop context_parallel_size
EngineArgs.context_parallel_size = kwargs.pop("context_parallel_size", 1)
# pop moe_tp_size
EngineArgs.moe_tp_size = kwargs.pop("moe_tp_size", -1)
# pop moe_ep_size
EngineArgs.moe_ep_size = kwargs.pop("moe_ep_size", -1)
'''
==================
End of MLU Hijack
==================
'''
vllm__entrypoints__llm__LLM____init__org(
self=self,
model=model,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
allowed_local_media_path=allowed_local_media_path,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_seq_len_to_capture=max_seq_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
disable_async_output_proc=disable_async_output_proc,
hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs,
# After positional args are removed, move this right below `model`
task=task,
override_pooler_config=override_pooler_config,
**kwargs
)
MluHijackObject.apply_hijack(LLM,
LLM.__init__,
vllm__entrypoints__llm__LLM____init__)
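Because the hijacked constructor pops these keys from **kwargs before delegating to the stock LLM.__init__, they can be passed directly at construction time. A hedged example; the model path and the parallel sizes below are placeholders:

from vllm import LLM

# Placeholder model path and sizes, for illustration only.
llm = LLM(model="/path/to/qwen3-model",
          tensor_parallel_size=4,
          context_parallel_size=1,   # popped by the hijack above
          moe_tp_size=2,             # popped by the hijack above
          moe_ep_size=2)             # popped by the hijack above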

View File

@@ -0,0 +1,7 @@
print("Apply Custom VLLM Demo!")
from . import distributed
from . import engine
from . import entrypoints
from . import worker
from . import config
from . import model_executor

View File

@@ -0,0 +1,2 @@
from . import layers
from . import parameter

View File

@@ -0,0 +1,2 @@
from . import linear
from . import feed_forward

View File

@@ -0,0 +1,93 @@
from typing import Optional, Any
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear,
ColumnParallelLinear,
RowParallelLinear
)
from vllm_mlu.mlu_hijack_utils import set_is_gated, MluHijackObject
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from ....mlu_hijack.distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group)
logger = init_logger(__name__)
def vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
up_proj_name: str,
is_gated: bool,
down_proj_name: str,
bias: bool,
quant_config: Optional[QuantizationConfig] = None,
skip_bias_add: bool = False,
reduce_results: bool = True,
prefix: str = "",
tp_group: Any = None,
):
super(FeedForward, self).__init__()
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.is_gated = is_gated
self.bias = bias
self.up_proj_name = up_proj_name
self.down_proj_name = down_proj_name
self.quant_config = quant_config
self.is_initialized = False
self.skip_bias_add = skip_bias_add
self.reduce_results = reduce_results
self.use_bt_ffn = quant_config is None
set_is_gated(self.is_gated)
self.tp_size = get_parallel_world_size_with_group(tp_group)
self.tp_rank = get_parallel_rank_with_group(tp_group)
'''
=============================
Modify by vllm_mlu
=============================
@brief: add tp_group parameter at the end of each linear class
'''
self.tp_group = tp_group
# up_proj with gate or not
if self.is_gated:
up_proj = MergedColumnParallelLinear(hidden_size,
[intermediate_size] * 2,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.{up_proj_name}",
tp_group=tp_group)
else:
up_proj = ColumnParallelLinear(hidden_size,
intermediate_size,
bias=bias,
skip_bias_add=skip_bias_add,
quant_config=quant_config,
prefix=f"{prefix}.{up_proj_name}",
tp_group=tp_group)
self.register_module(up_proj_name, up_proj)
# down_proj
down_proj = RowParallelLinear(intermediate_size,
hidden_size,
bias=bias,
skip_bias_add=skip_bias_add,
reduce_results=reduce_results,
quant_config=quant_config,
prefix=f"{prefix}.{down_proj_name}",
tp_group=tp_group)
'''
==================
End of MLU Hijack
==================
'''
self.register_module(down_proj_name, down_proj)
MluHijackObject.apply_hijack(FeedForward,
FeedForward.__init__,
vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__)
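The extra tp_group argument threaded through the linear layers above is what allows an expert MLP to be sharded over the MoE tensor-parallel group instead of the global TP group. A hedged construction sketch follows; the sizes are placeholders, the absolute import path for get_moe_tp_group is an assumption (the module above uses a relative import), and it presumes the parallel groups are already initialized:

from vllm_mlu.model_executor.layers.feed_forward import FeedForward
# Assumed absolute path to the hijacked parallel_state module shown earlier.
from vllm_mlu.mlu_hijack.distributed.parallel_state import get_moe_tp_group

# Placeholder sizes; shards this MLP over the MoE TP group rather than the global TP group.
expert_mlp = FeedForward(hidden_size=4096,
                         intermediate_size=11008,
                         hidden_act="silu",
                         up_proj_name="gate_up_proj",
                         is_gated=True,
                         down_proj_name="down_proj",
                         bias=False,
                         tp_group=get_moe_tp_group())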

View File

@@ -0,0 +1,696 @@
from typing import Optional, List, Any, Tuple
import torch
from torch.nn.parameter import Parameter, UninitializedParameter
from vllm.distributed import (divide, split_tensor_along_last_dim)
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm.model_executor.utils import set_weight_attrs
from vllm.model_executor.parameter import (BasevLLMParameter,
PerTensorScaleParameter,
RowvLLMParameter)
from vllm.logger import init_logger
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED, LinearBase, ColumnParallelLinear,
MergedColumnParallelLinear, RowParallelLinear, adjust_marlin_shard,
adjust_scalar_to_fused_array)
from vllm import _mlu_ops as mlu_ops
from ....mlu_hijack.distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group,
get_tp_group)
from ....mlu_hijack.distributed.communication_op import (tensor_model_parallel_all_reduce,
tensor_model_parallel_all_gather)
vllm__model_executor__layers__linear__LinearBase____init__org = LinearBase.__init__
logger = init_logger(__name__)
def vllm__model_executor__layers__linear__LinearBase____init__(
self,
input_size: int,
output_size: int,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
tp_group: Any = None,
):
vllm__model_executor__layers__linear__LinearBase____init__org(self=self,
input_size=input_size,
output_size=output_size,
skip_bias_add=skip_bias_add,
params_dtype=params_dtype,
quant_config=quant_config,
prefix=prefix)
'''
=============================
Modify by vllm_mlu
=============================
@brief: add self.tp_group, world_size and tp_rank to support moe expert parallel
'''
self.tp_group = tp_group
self.tp_world_size = get_parallel_world_size_with_group(self.tp_group)
self.tp_rank = get_parallel_rank_with_group(self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
def vllm__model_executor__layers__linear__ColumnParallelLinear____init__(
self,
input_size: int,
output_size: int,
bias: bool = True,
gather_output: bool = False,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
output_sizes: Optional[List[int]] = None,
prefix: str = "",
tp_group: Any = None,
):
super(ColumnParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype,
quant_config, prefix, tp_group)
self.gather_output = gather_output
# Divide the weight matrix along the last dimension.
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
@brief: move checking output_sizes logic from MergedColumnParallelLinear to here
'''
tp_size = self.tp_world_size
if output_sizes is not None:
assert all(output_size_var % tp_size == 0 for output_size_var in output_sizes)
'''
=================
End of MLU Hijack
=================
'''
assert self.quant_method is not None
self.output_size_per_partition = divide(self.output_size, tp_size)
self.output_partition_sizes = [self.output_size_per_partition]
# If QKV or MergedColumn, use output size of each partition.
if hasattr(self, "output_sizes"):
self.output_partition_sizes = [
divide(output_size, tp_size)
for output_size in self.output_sizes
]
if output_sizes is None:
output_sizes = [output_size]
self.quant_method.create_weights(
layer=self,
input_size_per_partition=self.input_size,
output_partition_sizes=self.output_partition_sizes,
input_size=self.input_size,
output_size=self.output_size,
params_dtype=self.params_dtype,
weight_loader=(
self.weight_loader_v2 if self.quant_method.__class__.__name__
in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
if bias:
self.bias = Parameter(
torch.empty(self.output_size_per_partition,
dtype=params_dtype))
set_weight_attrs(self.bias, {
"output_dim": 0,
"weight_loader": self.weight_loader,
})
else:
self.register_parameter("bias", None)
def vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader(
self, param: Parameter, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
output_dim = getattr(param, "output_dim", None)
# Special case for GGUF
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type:
param.weight_type = loaded_weight.item()
# Materialize GGUF UninitializedParameter
if is_gguf_weight and isinstance(param, UninitializedParameter):
param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
param_data = param.data
# bitsandbytes loads the weights of the specific portion
# no need to narrow here
if output_dim is not None and not use_bitsandbytes_4bit:
shard_size = param_data.shape[output_dim]
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
# Special case for loading scales off disk, which often do not
# have a shape (such as in the case of AutoFP8).
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__layers__linear__ColumnParallelLinear__forward(
self, input_, smooth_quant_scale: Optional[torch.Tensor] = None):
bias = self.bias if not self.skip_bias_add else None
# Matrix multiply.
assert self.quant_method is not None
'''
=============================
Modify by vllm_mlu
=============================
@brief: Add input_scale parameter.
'''
if smooth_quant_scale is not None:
output_parallel = self.quant_method.apply(self, input_, bias,
input_scale=smooth_quant_scale)
else:
output_parallel = self.quant_method.apply(self, input_, bias)
'''
==================
End of MLU Hijack
==================
'''
if self.gather_output:
# All-gather across the partitions.
'''
=============================
Modify by vllm_mlu
=============================
@brief: add tp_group param to tensor_model_parallel_all_gather
'''
output = tensor_model_parallel_all_gather(output_parallel, self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
else:
output = output_parallel
output_bias = self.bias if self.skip_bias_add else None
return output, output_bias
def vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr(self) -> str:
s = f"in_features={self.input_size}"
s += f", output_features={self.output_size_per_partition}"
s += f", bias={self.bias is not None}"
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
s += f", tp_size={self.tp_world_size}"
'''
=================
End of MLU Hijack
=================
'''
s += f", gather_output={self.gather_output}"
return s
def vllm__model_executor__layers__linear__MergedColumnParallelLinear____init__(
self,
input_size: int,
output_sizes: List[int],
bias: bool = True,
gather_output: bool = False,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
tp_group: Any = None,
):
self.output_sizes = output_sizes
'''
=============================
Modify by vllm_mlu
=============================
@brief: move checking output_sizes logic from MergedColumnParallelLinear to ColumnParallelLinear.__init__
'''
# tp_size = get_tensor_model_parallel_world_size()
# assert all(output_size % tp_size == 0 for output_size in output_sizes)
'''
=================
End of MLU Hijack
=================
'''
super(MergedColumnParallelLinear, self).__init__(input_size=input_size,
output_size=sum(output_sizes),
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
params_dtype=params_dtype,
quant_config=quant_config,
output_sizes=self.output_sizes,
prefix=prefix,
tp_group=tp_group)
def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader(self,
param: Parameter,
loaded_weight: torch.Tensor,
loaded_shard_id: Optional[int] = None):
# Special case for GGUF
# initialize GGUF param after we know the quantize type
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type:
param.data[loaded_shard_id].copy_(loaded_weight)
param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
return
if is_gguf_weight:
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_rank = self.tp_rank
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
output_dim = getattr(param, "output_dim", None)
shard_size = loaded_weight.size(output_dim) // tp_size
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 2:
self.qweight = param.materialize_nested()
return
param_data = param.data
output_dim = getattr(param, "output_dim", None)
# Special case for AQLM codebooks.
is_metadata = getattr(param, "is_metadata", False)
# Special case for per-tensor scale to load scalar into fused array.
needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
if loaded_shard_id is None:
# Loaded weight is already fused on disk (qkv/mlp).
if output_dim is None:
if needs_scalar_to_array:
param_data, loaded_weight = adjust_scalar_to_fused_array(
param_data, loaded_weight, 0)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
return
current_shard_offset = 0
shard_offsets: List[Tuple[int, int, int]] = []
for i, output_size in enumerate(self.output_sizes):
shard_offsets.append((i, current_shard_offset, output_size))
current_shard_offset += output_size
packed_dim = getattr(param, "packed_dim", None)
for shard_id, shard_offset, shard_size in shard_offsets:
# Special case for Quantization.
# If quantized, we need to adjust the offset and size to account
# for the packing.
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor
# Special case for Marlin.
shard_size, shard_offset = adjust_marlin_shard(
param, shard_size, shard_offset)
loaded_weight_shard = loaded_weight.narrow(
output_dim, shard_offset, shard_size)
self.weight_loader(param, loaded_weight_shard, shard_id)
return
assert loaded_shard_id < len(self.output_sizes)
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_rank = self.tp_rank
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
if output_dim is not None:
shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
shard_size = self.output_sizes[loaded_shard_id] // tp_size
# Special case for quantization.
# If quantized, we need to adjust the offset and size to account
# for the packing.
packed_dim = getattr(param, "packed_dim", None)
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor
# Special case for Marlin.
shard_size, shard_offset = adjust_marlin_shard(
param, shard_size, shard_offset)
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
False)
if use_bitsandbytes_4bit:
shard_size = loaded_weight.shape[output_dim]
shard_offset = loaded_weight.shape[output_dim] * \
loaded_shard_id
param_data = param_data.narrow(output_dim, shard_offset,
shard_size)
start_idx = tp_rank * shard_size
# bitsandbytes loads the weights of the specific portion
# no need to narrow here
if not use_bitsandbytes_4bit:
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
# Special case for AQLM codebooks.
elif is_metadata:
# metadata indicates fixed size concatenated along dim 0
shard_size = loaded_weight.shape[0]
shard_offset = loaded_shard_id * shard_size
param_data = param_data.narrow(0, shard_offset, shard_size)
# Special case for per-tensor scales in fused case.
elif needs_scalar_to_array:
param_data, loaded_weight = adjust_scalar_to_fused_array(
param_data, loaded_weight, loaded_shard_id)
else:
ignore_warning = getattr(param, "ignore_warning", False)
if not ignore_warning:
logger.warning(
"Loading a weight without `output_dim` attribute in "
"MergedColumnParallelLinear, assume the weight is "
"the same for all partitions.")
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2(self,
param: BasevLLMParameter,
loaded_weight: torch.Tensor,
loaded_shard_id: Optional[int] = None):
if loaded_shard_id is None:
if isinstance(param, PerTensorScaleParameter):
param.load_merged_column_weight(loaded_weight=loaded_weight,
shard_id=0)
return
elif type(param) in (RowvLLMParameter, BasevLLMParameter):
param.load_merged_column_weight(loaded_weight=loaded_weight)
return
# TODO: @dsikka - move to parameter.py
self._load_fused_module_from_checkpoint(param, loaded_weight)
return
assert loaded_shard_id < len(self.output_sizes)
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
shard_size = self.output_sizes[loaded_shard_id] // tp_size
param.load_merged_column_weight(loaded_weight=loaded_weight,
shard_id=loaded_shard_id,
shard_offset=shard_offset,
shard_size=shard_size)
def vllm__model_executor__layers__linear__RowParallelLinear____init__(
self,
input_size: int,
output_size: int,
bias: bool = True,
input_is_parallel: bool = True,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
reduce_results: bool = True,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
tp_group: Any = None,
):
super(RowParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype,
quant_config, prefix, tp_group)
self.input_is_parallel = input_is_parallel
self.reduce_results = reduce_results
# Divide the weight matrix along the last dimension.
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
self.tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
self.input_size_per_partition = divide(input_size, self.tp_size)
assert self.quant_method is not None
self.quant_method.create_weights(
layer=self,
input_size_per_partition=self.input_size_per_partition,
output_partition_sizes=[self.output_size],
input_size=self.input_size,
output_size=self.output_size,
params_dtype=self.params_dtype,
weight_loader=(
self.weight_loader_v2 if self.quant_method.__class__.__name__
in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
if not reduce_results and (bias and not skip_bias_add):
raise ValueError("When not reduce the results, adding bias to the "
"results can lead to incorrect results")
if bias:
self.bias = Parameter(
torch.empty(self.output_size, dtype=params_dtype))
set_weight_attrs(self.bias, {
"output_dim": 0,
"weight_loader": self.weight_loader,
})
else:
self.register_parameter("bias", None)
def vllm__model_executor__layers__linear__RowParallelLinear__weight_loader(
self, param: Parameter, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_rank = self.tp_rank
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
input_dim = getattr(param, "input_dim", None)
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
# Special case for GGUF
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type:
param.weight_type = loaded_weight.item()
# Materialize GGUF UninitializedParameter
if is_gguf_weight and isinstance(param, UninitializedParameter):
weight_shape = list(loaded_weight.shape)
if input_dim:
weight_shape[input_dim] = weight_shape[input_dim] // tp_size
param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
param_data = param.data
# bitsandbytes loads the weights of the specific portion
# no need to narrow here
if input_dim is not None and not use_bitsandbytes_4bit:
shard_size = param_data.shape[input_dim]
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(input_dim, start_idx,
shard_size)
# Special case for loading scales off disk, which often do not
# have a shape (such as in the case of AutoFP8).
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__layers__linear__RowParallelLinear__forward(
self,
input_,
residual: Optional[torch.Tensor] = None
):
if self.input_is_parallel:
input_parallel = input_
else:
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
splitted_input = split_tensor_along_last_dim(
input_, num_partitions=self.tp_size)
input_parallel = splitted_input[tp_rank].contiguous()
# Matrix multiply.
assert self.quant_method is not None
# Only fuse bias add into GEMM for rank 0 (this ensures that
# bias will not get added more than once in TP>1 case)
bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
residual_ = None if self.tp_rank > 0 else residual
'''
=====================================================
Modify by custom vllm_mlu
=====================================================
@brief: abandon original reduce if parallel_num is set
'''
is_parallel_enable = hasattr(self.quant_method, 'parallel_num') and get_is_prompt()
'''
=====================================================
End of custom MLU Hijack
=====================================================
'''
output_parallel = self.quant_method.apply(self,
input_parallel,
bias=bias_,
residual=residual_)
'''
=============================
Modify by custom vllm_mlu
=============================
@brief: when preload_size is set, call GroupCoordinator.all_reduce() directly and
use async_op to set all_reduce paralleled with preload
'''
if self.reduce_results and self.tp_size > 1 and not is_parallel_enable:
if hasattr(self, 'preload_size') and self.preload_size > 0 and not self.is_prompt:
handle = get_tp_group(self.tp_group).all_reduce(output_parallel, async_op=True)
_MB = 1 << 20
mlu_ops.preload(self.preloaded_weights[0].data, self.preload_size * _MB)
preloaded_weights_size = self.preloaded_weights[0].numel() * self.preloaded_weights[0].element_size()
if preloaded_weights_size < (self.preload_size * _MB) and len(self.preloaded_weights) > 1:
mlu_ops.preload(self.preloaded_weights[1].data, (self.preload_size * _MB) - preloaded_weights_size)
handle.wait()
output = output_parallel
else:
'''
=============================
Modify by vllm_mlu
=============================
@brief: add tensor_model_parallel_all_reduce() with self.tp_group
'''
output = tensor_model_parallel_all_reduce(output_parallel, tp_group=self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
else:
output = output_parallel
'''
=========================
End of custom MLU Hijack
=========================
'''
output_bias = self.bias if self.skip_bias_add else None
return output, output_bias
MluHijackObject.apply_hijack(LinearBase,
LinearBase.__init__,
vllm__model_executor__layers__linear__LinearBase____init__)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.__init__,
vllm__model_executor__layers__linear__ColumnParallelLinear____init__)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.weight_loader,
vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.forward,
vllm__model_executor__layers__linear__ColumnParallelLinear__forward)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.extra_repr,
vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
MergedColumnParallelLinear.__init__,
vllm__model_executor__layers__linear__MergedColumnParallelLinear____init__)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
MergedColumnParallelLinear.weight_loader,
vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
MergedColumnParallelLinear.weight_loader_v2,
vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2)
MluHijackObject.apply_hijack(RowParallelLinear,
RowParallelLinear.__init__,
vllm__model_executor__layers__linear__RowParallelLinear____init__)
MluHijackObject.apply_hijack(RowParallelLinear,
RowParallelLinear.weight_loader,
vllm__model_executor__layers__linear__RowParallelLinear__weight_loader)
MluHijackObject.apply_hijack(RowParallelLinear,
RowParallelLinear.forward,
vllm__model_executor__layers__linear__RowParallelLinear__forward)

View File

@@ -0,0 +1,173 @@
from fractions import Fraction
from typing import Callable, Optional, Union, Any
import torch
from torch.nn import Parameter
from vllm.model_executor.parameter import (BasevLLMParameter,
PackedColumnParameter,
PackedvLLMParameter,
PerTensorScaleParameter,
RowvLLMParameter,
_ColumnvLLMParameter)
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.logger import init_logger
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from ..distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group)
logger = init_logger(__name__)
def vllm__model_executor__parameter__BasevLLMParameter____init__(self, data: torch.Tensor, weight_loader: Callable, tp_group: Any = None):
"""
Initialize the BasevLLMParameter
:param data: torch tensor with the parameter data
:param weight_loader: weight loader callable
:returns: a torch.nn.parameter
"""
self._weight_loader = weight_loader
'''
=============================
Modify by vllm_mlu
=============================
@brief: add self.tp_group, world_size and tp_rank to support moe expert parallel
'''
self.tp_group = tp_group
self.tp_world_size = get_parallel_world_size_with_group(self.tp_group)
self.tp_rank = get_parallel_rank_with_group(self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
def vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight(self, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
shard_size = self.data.shape[self.output_dim]
loaded_weight = loaded_weight.narrow(self.output_dim,
tp_rank * shard_size, shard_size)
assert self.data.shape == loaded_weight.shape
self.data.copy_(loaded_weight)
def vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
shard_offset = kwargs.get("shard_offset")
shard_size = kwargs.get("shard_size")
if isinstance(
self,
(PackedColumnParameter,
PackedvLLMParameter)) and self.packed_dim == self.output_dim:
shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
shard_offset=shard_offset, shard_size=shard_size)
param_data = self.data
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
param_data = param_data.narrow(self.output_dim, shard_offset,
shard_size)
loaded_weight = loaded_weight.narrow(self.output_dim,
tp_rank * shard_size, shard_size)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
shard_offset = kwargs.get("shard_offset")
shard_size = kwargs.get("shard_size")
shard_id = kwargs.get("shard_id")
num_heads = kwargs.get("num_heads")
if isinstance(
self,
(PackedColumnParameter,
PackedvLLMParameter)) and self.output_dim == self.packed_dim:
shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
shard_offset=shard_offset, shard_size=shard_size)
param_data = self.data
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
param_data = param_data.narrow(self.output_dim, shard_offset,
shard_size)
loaded_weight = loaded_weight.narrow(self.output_dim,
shard_id * shard_size, shard_size)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight(self, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
shard_size = self.data.shape[self.input_dim]
loaded_weight = loaded_weight.narrow(self.input_dim,
tp_rank * shard_size, shard_size)
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
assert self.data.shape == loaded_weight.shape
self.data.copy_(loaded_weight)
MluHijackObject.apply_hijack(BasevLLMParameter,
BasevLLMParameter.__init__,
vllm__model_executor__parameter__BasevLLMParameter____init__)
MluHijackObject.apply_hijack(_ColumnvLLMParameter,
_ColumnvLLMParameter.load_column_parallel_weight,
vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight)
MluHijackObject.apply_hijack(_ColumnvLLMParameter,
_ColumnvLLMParameter.load_merged_column_weight,
vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight)
MluHijackObject.apply_hijack(_ColumnvLLMParameter,
_ColumnvLLMParameter.load_qkv_weight,
vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight)
MluHijackObject.apply_hijack(RowvLLMParameter,
RowvLLMParameter.load_row_parallel_weight,
vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight)

View File

@@ -0,0 +1 @@
from . import mlu_worker

View File

@@ -0,0 +1,192 @@
import gc
import os
import torch
from typing import List, Optional, Set, Tuple, Type
from vllm.config import ParallelConfig
from vllm.distributed import init_distributed_environment, set_custom_all_reduce
from vllm.model_executor import set_random_seed
from vllm.worker.mlu_worker import MLUWorker, _check_if_gpu_supports_dtype
from vllm_mlu.worker.mlu_worker import MLUWorker_V2
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from ..distributed.parallel_state import ensure_model_parallel_initialized
import functools
from collections import defaultdict
from vllm.logger import init_logger
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
from ..distributed.parallel_state import (get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size,
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
logger = init_logger(__name__)
def vllm__worker__mlu_worker__init_worker_distributed_environment(
parallel_config: ParallelConfig,
rank: int,
distributed_init_method: Optional[str] = None,
local_rank: int = -1,
) -> None:
set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
init_distributed_environment(parallel_config.world_size, rank,
distributed_init_method, local_rank,
backend='cncl')
'''
=============================
Modify by vllm_mlu
=============================
@brief: add context_parallel_size, moe_tp_size, moe_ep_size
'''
ensure_model_parallel_initialized(parallel_config=parallel_config)
'''
==================
End of MLU Hijack
==================
'''
def vllm__worker__mlu_worker__MLUWorker__init_device(self) -> None:
if self.device_config.device.type == "mlu":
# torch.distributed.all_reduce does not free the input tensor until
# the synchronization point. This causes the memory usage to grow
# as the number of all_reduce calls increases. This env var disables
# this behavior.
# Related issue:
# https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
os.environ["TORCH_CNCL_AVOID_RECORD_STREAMS"] = "1"
# This env var set by Ray causes exceptions with graph building.
os.environ.pop("CNCL_ASYNC_ERROR_HANDLING", None)
self.device = torch.device(f"mlu:{self.local_rank}")
torch.mlu.set_device(self.device)
_check_if_gpu_supports_dtype(self.model_config.dtype)
gc.collect()
torch.mlu.empty_cache()
self.init_gpu_memory = torch.mlu.mem_get_info()[0]
else:
raise RuntimeError(
f"Not support device type: {self.device_config.device}")
# Initialize the distributed environment.
'''
=============================
Modify by vllm_mlu
=============================
@brief: call vllm__worker__mlu_worker__init_worker_distributed_environment instead of the stock helper
'''
vllm__worker__mlu_worker__init_worker_distributed_environment(self.parallel_config, self.rank,
self.distributed_init_method, self.local_rank)
'''
==================
End of MLU Hijack
==================
'''
# Set random seed.
set_random_seed(self.model_config.seed)
def default_act_range_value():
return {
"x": None,
"split": None,
"is_linear": False,
"is_qkv": False,
"q_proj_size": 0,
"num_kv_head_replicas": 1,
"is_merge": False,
"input_id": [],
"self_rank": 0,
"rank": None,
"tensor_rank": None,
"tp_world_size": None,
"moe_tp_rank": None,
"moe_tp_world_size": None,
"moe_ep_rank": None,
"moe_ep_world_size": None,
"weight": None,
}
def vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook(self,
is_save_input_id: bool = False,
is_save_moe_info: bool = False):
model = self.model_runner.model
self.act_range = defaultdict(default_act_range_value)
self.hooks = []
linear_class_list = (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
other_class_list = (VocabParallelEmbedding, ParallelLMHead)
class_list = linear_class_list + other_class_list
row_class_list = (RowParallelLinear,)
for name, m in model.named_modules():
if isinstance(m, FeedForward):
m.use_bt_ffn = False
if isinstance(m, SparseMoeMlp):
m.is_use_fused_moe = False
if isinstance(m, class_list):
is_linear = isinstance(m, linear_class_list)
split_type = "row" if isinstance(m, row_class_list) else "col"
self.act_range[name]["split"] = split_type
self.act_range[name]["is_linear"] = is_linear
if isinstance(m, QKVParallelLinear):
self.act_range[name]["is_qkv"] = True
self.act_range[name]["q_proj_size"] = m.num_heads * m.head_size
self.act_range[name]["num_kv_head_replicas"] = m.num_kv_head_replicas
self.act_range[name]["is_merge"] = isinstance(m, MergedColumnParallelLinear)
if is_save_moe_info:
self.act_range[name]["rank"] = torch.distributed.get_rank()
self.act_range[name]["tensor_rank"] = get_tensor_model_parallel_rank()
self.act_range[name]["tp_world_size"] = get_tensor_model_parallel_world_size()
self.act_range[name]["moe_tp_rank"] = get_moe_tensor_parallel_rank()
self.act_range[name]["moe_tp_world_size"] = get_moe_tensor_parallel_world_size()
self.act_range[name]["moe_ep_rank"] = get_moe_expert_parallel_rank()
self.act_range[name]["moe_ep_world_size"] = get_moe_expert_parallel_world_size()
if ".expert." in name:
self.act_range[name]["weight"] = m.weight
logger.info(f"rank:{self.rank}, add hook to {name}, is_linear:{is_linear}, split_type:{split_type}")
self.hooks.append(
m.register_forward_hook(
functools.partial(self.stat_input_hook,
name=name,
act_range=self.act_range,
is_linear=is_linear,
is_save_input_id=is_save_input_id)))
def vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range(self):
act_range = defaultdict(default_act_range_value)
for layer_name, layer_range in self.act_range.items():
for tensor_key, tensor_value in layer_range.items():
if isinstance(tensor_value, torch.Tensor):
act_range[layer_name][tensor_key] = tensor_value.to("cpu")
elif tensor_key == "input_id" and isinstance(tensor_value, list):
input_id_len = len(tensor_value)
for i in range(input_id_len):
if isinstance(tensor_value[i], torch.Tensor):
act_range[layer_name][tensor_key].append(tensor_value[i].to("cpu"))
else:
act_range[layer_name][tensor_key].append(tensor_value[i])
else:
act_range[layer_name][tensor_key] = tensor_value
return act_range
MluHijackObject.apply_hijack(MLUWorker,
MLUWorker.init_device,
vllm__worker__mlu_worker__MLUWorker__init_device)
MluHijackObject.apply_hijack(MLUWorker,
"setup_smooth_hook",
vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook)
MluHijackObject.apply_hijack(MLUWorker,
"get_act_range",
vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range)