add qwen3
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
import logging
|
||||
from logging import Logger
|
||||
|
||||
def init_logger(name: str) -> Logger:
    """Initialize a logger for the benchmarks module.

    The returned logger mirrors the level, propagation flag and handlers
    of the already-configured ``vllm`` logger (when one exists), so that
    benchmark log output stays consistent with the vllm module.

    Args:
        name: Logger name passed to ``logging.getLogger``.

    Returns:
        The configured ``Logger`` instance.
    """
    logger = logging.getLogger(name)

    vllm_logger = logging.Logger.manager.loggerDict.get('vllm', None)
    # loggerDict may hold a logging.PlaceHolder (created when only child
    # loggers such as 'vllm.xxx' exist). PlaceHolder has no level/propagate/
    # handlers, so only copy configuration from a real Logger instance.
    if isinstance(vllm_logger, logging.Logger):
        logger.setLevel(vllm_logger.level)
        logger.propagate = vllm_logger.propagate
        # Copy the handler list so later addHandler/removeHandler on one
        # logger does not silently mutate the other.
        logger.handlers = list(vllm_logger.handlers)

    return logger
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
import torch
|
||||
from vllm.config import ParallelConfig, TokenizerPoolConfig
|
||||
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
from vllm.platforms import current_platform
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
if TYPE_CHECKING:
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def vllm__config__ParallelConfig___init__(
    self,
    pipeline_parallel_size: int,
    tensor_parallel_size: int,
    worker_use_ray: Optional[bool] = None,
    max_parallel_loading_workers: Optional[int] = None,
    disable_custom_all_reduce: bool = False,
    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
    ray_workers_use_nsight: bool = False,
    placement_group: Optional["PlacementGroup"] = None,
    distributed_executor_backend: Optional[Union[
        str, Type["ExecutorBase"]]] = None,
) -> None:
    """MLU replacement for ``ParallelConfig.__init__`` (installed through
    ``MluHijackObject.apply_hijack``). Mirrors the upstream constructor;
    the only behavioral change is in the marked "Modify by vllm_mlu"
    section, where ``world_size`` additionally accounts for the
    context-parallel dimension.
    """
    self.pipeline_parallel_size = pipeline_parallel_size
    self.tensor_parallel_size = tensor_parallel_size
    self.distributed_executor_backend = distributed_executor_backend
    self.max_parallel_loading_workers = max_parallel_loading_workers
    self.disable_custom_all_reduce = disable_custom_all_reduce
    self.tokenizer_pool_config = tokenizer_pool_config
    self.ray_workers_use_nsight = ray_workers_use_nsight
    self.placement_group = placement_group

    '''
    ==========================
    Modify by vllm_mlu
    ==========================
    @brief: modify world_size
    '''
    # Promote the class-level attributes (set on ParallelConfig by the
    # arg_utils hijack before engine-config construction) to instance
    # attributes, so each config instance carries its own copy.
    self.context_parallel_size = self.context_parallel_size
    self.moe_tp_size = self.moe_tp_size
    self.moe_ep_size = self.moe_ep_size

    # world_size now includes the context-parallel dimension (upstream
    # only multiplies pp * tp).
    self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
    '''
    =======================
    End of MLU Hijack
    =======================
    '''
    if worker_use_ray:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        # NOTE(review): self.use_ray is presumably a property defined on the
        # upstream ParallelConfig -- confirm against the vllm version in use.
        elif not self.use_ray:
            raise ValueError(f"worker-use-ray can't be used with "
                             f"distributed executor backend "
                             f"'{self.distributed_executor_backend}'.")

    # TPU multi-device inference is Ray-only upstream; keep that invariant.
    if current_platform.is_tpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "TPU backend only supports Ray for distributed inference.")

    # Same Ray-only invariant for HPU.
    if current_platform.is_hpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "HPU backend only supports Ray for distributed inference.")

    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.

        from vllm.executor import ray_utils
        backend = "mp"
        ray_found = ray_utils.ray_is_available()
        if (current_platform.is_cuda()
                and cuda_device_count_stateless() < self.world_size):
            # More ranks than local CUDA devices => multi-node => Ray required.
            if not ray_found:
                raise ValueError("Unable to load Ray which is "
                                 "required for multi-node inference, "
                                 "please install Ray with `pip install "
                                 "ray`.") from ray_utils.ray_import_err
            backend = "ray"
        elif ray_found:
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized
                if ray_is_initialized():
                    from ray.util import get_current_placement_group
                    # Already inside a Ray placement group => prefer Ray.
                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.info("Defaulting to use %s for distributed inference",
                    backend)

    self._verify_args()
    # Rank is assigned later by the executor/worker setup.
    self.rank: int = 0
||||
|
||||
|
||||
# Install the MLU-aware constructor in place of the upstream
# ParallelConfig.__init__ (import side effect of this module).
MluHijackObject.apply_hijack(ParallelConfig,
                             ParallelConfig.__init__,
                             vllm__config__ParallelConfig___init__)
|
||||
@@ -0,0 +1,2 @@
|
||||
from . import communication_op
|
||||
from . import parallel_state
|
||||
@@ -0,0 +1,21 @@
|
||||
import torch
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
from .parallel_state import get_tp_group
|
||||
|
||||
def tensor_model_parallel_all_reduce(input_: torch.Tensor, tp_group: Any = None) -> torch.Tensor:
    """All-reduce *input_* across the (possibly caller-supplied) TP group."""
    coordinator = get_tp_group(tp_group)
    return coordinator.all_reduce(input_)
|
||||
|
||||
|
||||
def tensor_model_parallel_all_gather(input_: torch.Tensor,
                                     dim: int = -1, tp_group: Any = None) -> torch.Tensor:
    """All-gather *input_* along *dim* across the (possibly caller-supplied) TP group."""
    coordinator = get_tp_group(tp_group)
    return coordinator.all_gather(input_, dim)
|
||||
|
||||
|
||||
def tensor_model_parallel_gather(input_: torch.Tensor,
                                 dst: int = 0,
                                 dim: int = -1, tp_group: Any = None) -> Optional[torch.Tensor]:
    """Gather *input_* to rank *dst* along *dim* across the (possibly caller-supplied) TP group."""
    coordinator = get_tp_group(tp_group)
    return coordinator.gather(input_, dst, dim)
|
||||
@@ -0,0 +1,339 @@
|
||||
import torch
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.distributed.parallel_state import (init_model_parallel_group, get_tensor_model_parallel_world_size,
|
||||
get_tensor_model_parallel_rank, get_world_group, get_pp_group,
|
||||
GroupCoordinator)
|
||||
import vllm.distributed.parallel_state as parallel_state_org
|
||||
from vllm.distributed.parallel_state import model_parallel_is_initialized as model_parallel_is_initialized_org
|
||||
from vllm.distributed.parallel_state import destroy_model_parallel as destroy_model_parallel_org
|
||||
|
||||
def get_tp_group(tp_group: Any = None) -> GroupCoordinator:
    """Return *tp_group* when provided, else vLLM's global TP coordinator."""
    if tp_group is None:
        tp = parallel_state_org._TP
        assert tp is not None, ("tensor model parallel group is not initialized")
        return tp
    return tp_group
|
||||
|
||||
# Process-global coordinator for the context-parallel group; populated by
# initialize_model_parallel() and cleared by destroy_model_parallel().
_CP: Optional[GroupCoordinator] = None


def get_cp_group() -> GroupCoordinator:
    """Return the context parallel group coordinator (must be initialized)."""
    assert _CP is not None, ("context parallel group is not initialized")
    return _CP


# kept for backward compatibility
get_context_model_parallel_group = get_cp_group
|
||||
|
||||
# Process-global coordinator for the MoE tensor-parallel group; populated by
# initialize_model_parallel() and cleared by destroy_model_parallel().
_MOE_TP: Optional[GroupCoordinator] = None


def get_moe_tp_group() -> GroupCoordinator:
    """Return the MoE tensor parallel group coordinator (must be initialized)."""
    assert _MOE_TP is not None, ("moe tensor parallel group is not initialized")
    return _MOE_TP


# kept for backward compatibility
get_moe_tensor_parallel_group = get_moe_tp_group
|
||||
|
||||
# Process-global coordinator for the MoE expert-parallel group; populated by
# initialize_model_parallel() and cleared by destroy_model_parallel().
_MOE_EP: Optional[GroupCoordinator] = None


def get_moe_ep_group() -> GroupCoordinator:
    """Return the MoE expert parallel group coordinator (must be initialized)."""
    assert _MOE_EP is not None, ("moe expert parallel group is not initialized")
    return _MOE_EP


# kept for backward compatibility
get_moe_expert_parallel_group = get_moe_ep_group
|
||||
|
||||
|
||||
def initialize_model_parallel(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """Initialize model parallel groups: upstream TP and PP, plus the
    MLU-specific context-parallel (CP), MoE tensor-parallel (MOE_TP) and
    MoE expert-parallel (MOE_EP) groups.

    Unlike upstream vLLM, all parallel sizes are read from
    ``parallel_config`` (``tensor_parallel_size``, ``pipeline_parallel_size``,
    ``context_parallel_size``, ``moe_tp_size``, ``moe_ep_size``) instead of
    being passed individually.

    Args:
        parallel_config: source of all parallel-group sizes.
        backend: torch.distributed backend name; defaults to the backend of
            the world group's device group.

    Raises:
        RuntimeError: if world_size != tp * pp * cp, or tp != moe_tp * moe_ep.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: get parallel_size from parallel_config and valid world_size
    '''
    tensor_model_parallel_size = parallel_config.tensor_parallel_size
    pipeline_model_parallel_size = parallel_config.pipeline_parallel_size
    context_model_parallel_size = parallel_config.context_parallel_size
    moe_tensor_parallel_size = parallel_config.moe_tp_size
    moe_expert_parallel_size = parallel_config.moe_ep_size

    if (world_size !=
            tensor_model_parallel_size * pipeline_model_parallel_size * context_model_parallel_size):
        raise RuntimeError(
            f"world_size ({world_size}) is not equal to "
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size}) x"
            f"context_model_parallel_size ({context_model_parallel_size})")

    if (moe_tensor_parallel_size < 1 or moe_expert_parallel_size < 1 or tensor_model_parallel_size !=
            moe_tensor_parallel_size * moe_expert_parallel_size):
        # Fixed: the message previously interpolated world_size while
        # labeling it tensor_model_parallel_size.
        raise RuntimeError(
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) is not equal to "
            f"moe_tensor_parallel_size ({moe_tensor_parallel_size}) x "
            f"moe_expert_parallel_size ({moe_expert_parallel_size})")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    # Build the tensor model-parallel groups.
    num_tensor_model_parallel_groups: int = (world_size //
                                             tensor_model_parallel_size)
    assert parallel_state_org._TP is None, ("tensor model parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        ranks = list(
            range(i * tensor_model_parallel_size,
                  (i + 1) * tensor_model_parallel_size))
        group_ranks.append(ranks)

    # message queue broadcaster is only used in tensor model parallel group
    parallel_state_org._TP = init_model_parallel_group(group_ranks,
                                                      get_world_group().local_rank,
                                                      backend,
                                                      use_message_queue_broadcaster=True,
                                                      group_name="tp")

    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = (world_size //
                                               pipeline_model_parallel_size)
    assert parallel_state_org._PP is None, (
        "pipeline model parallel group is already initialized")
    group_ranks = []
    for i in range(num_pipeline_model_parallel_groups):
        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
        group_ranks.append(ranks)
    # pipeline parallel does not need custom allreduce
    parallel_state_org._PP = init_model_parallel_group(group_ranks,
                                                      get_world_group().local_rank,
                                                      backend,
                                                      use_custom_allreduce=False,
                                                      group_name="pp")

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add _CP, _MOE_TP, MOE_EP
    '''
    # Build the context parallel groups.
    num_context_model_parallel_groups: int = (world_size //
                                              context_model_parallel_size)
    global _CP
    assert _CP is None, (
        "context parallel group is already initialized")
    group_ranks = []
    for i in range(num_context_model_parallel_groups):
        # NOTE(review): for pipeline_parallel_size > 1 this layout produces
        # overlapping/incomplete rank sets (offsets beyond tp*cp are never
        # reached) -- confirm CP is only supported with pp == 1.
        ranks = list(range(i, context_model_parallel_size * tensor_model_parallel_size + i, tensor_model_parallel_size))
        group_ranks.append(ranks)
    # message queue broadcaster is set to be used in context parallel group
    _CP = init_model_parallel_group(group_ranks,
                                    get_world_group().local_rank,
                                    backend,
                                    use_message_queue_broadcaster=True,
                                    group_name="cp")

    # Build the moe tensor parallel groups.
    global _MOE_TP
    assert _MOE_TP is None, ("moe tensor parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        for j in range(moe_expert_parallel_size):
            ranks = list(range(i * tensor_model_parallel_size + j, (i + 1) * tensor_model_parallel_size,
                               moe_expert_parallel_size))
            group_ranks.append(ranks)

    # message queue broadcaster is set to be used in moe tensor parallel group
    _MOE_TP = init_model_parallel_group(group_ranks,
                                        get_world_group().local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_tp")

    # Build the moe expert parallel groups.
    global _MOE_EP
    assert _MOE_EP is None, ("moe expert parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        for j in range(moe_tensor_parallel_size):
            # Fixed: materialize the ranks as a list for consistency with
            # every other group above (a bare range object was appended).
            ranks = list(range(i * tensor_model_parallel_size + j * moe_expert_parallel_size,
                               i * tensor_model_parallel_size + (j + 1) * moe_expert_parallel_size))
            group_ranks.append(ranks)

    # message queue broadcaster is set to be used in moe expert parallel group
    _MOE_EP = init_model_parallel_group(group_ranks,
                                        get_world_group().local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_ep")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
|
||||
|
||||
|
||||
def ensure_model_parallel_initialized(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """Helper to initialize model parallel groups if they are not initialized,
    or ensure all parallel sizes (tp, pp, cp, moe_tp, moe_ep) are equal to
    the expected values from ``parallel_config`` if they are.

    Bug fix vs. the original hijack: the verification path read
    ``parallel_config.tensor_model_parallel_size`` and
    ``parallel_config.pipeline_model_parallel_size``, attributes that
    ``ParallelConfig`` never defines (it stores ``tensor_parallel_size`` /
    ``pipeline_parallel_size``), so the already-initialized path raised
    AttributeError instead of verifying.
    """
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)
    if not model_parallel_is_initialized():
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace all parallel_size to parallel_config
        '''
        initialize_model_parallel(parallel_config, backend)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        return

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: check parallel_size with prefix parallel_config
    '''
    assert (
        get_tensor_model_parallel_world_size() == parallel_config.tensor_parallel_size
    ), ("tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{parallel_config.tensor_parallel_size=}")
    pp_world_size = get_pp_group().world_size
    assert (pp_world_size == parallel_config.pipeline_parallel_size), (
        "pipeline parallel group already initialized, but of unexpected size: "
        f"{pp_world_size=} vs. "
        f"{parallel_config.pipeline_parallel_size=}")
    cp_world_size = get_cp_group().world_size
    assert (cp_world_size == parallel_config.context_parallel_size), (
        "context parallel group already initialized, but of unexpected size: "
        f"{cp_world_size=} vs. "
        f"{parallel_config.context_parallel_size=}")
    moe_tp_world_size = get_moe_tp_group().world_size
    assert (moe_tp_world_size == parallel_config.moe_tp_size), (
        "moe tensor parallel group already initialized, but of unexpected size: "
        f"{moe_tp_world_size=} vs. "
        f"{parallel_config.moe_tp_size=}")
    moe_ep_world_size = get_moe_ep_group().world_size
    assert (moe_ep_world_size == parallel_config.moe_ep_size), (
        "moe expert parallel group already initialized, but of unexpected size: "
        f"{moe_ep_world_size=} vs. "
        f"{parallel_config.moe_ep_size=}")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
|
||||
|
||||
|
||||
def model_parallel_is_initialized():
    """Check if tensor, pipeline, context, moe parallel groups are initialized."""
    # Fixes vs. original: the upstream check must be *called* -- the bare
    # function object `model_parallel_is_initialized_org` is always truthy,
    # so the original effectively skipped the TP/PP check -- and each MLU
    # group only needs to be tested once (the original duplicated every
    # `is not None` test).
    return (model_parallel_is_initialized_org()
            and _CP is not None
            and _MOE_TP is not None
            and _MOE_EP is not None)
|
||||
|
||||
|
||||
def destroy_model_parallel():
    """Destroy the parallel groups and reset the module globals to None."""
    global _CP, _MOE_TP, _MOE_EP

    # Upstream teardown first (TP/PP groups owned by vllm).
    destroy_model_parallel_org()

    if _CP:
        _CP.destroy()
    _CP = None

    if _MOE_TP:
        _MOE_TP.destroy()
    _MOE_TP = None

    if _MOE_EP:
        _MOE_EP.destroy()
    _MOE_EP = None
|
||||
|
||||
|
||||
def get_context_model_parallel_world_size():
    """World size of the context parallel group."""
    cp_group = get_cp_group()
    return cp_group.world_size


def get_context_model_parallel_rank():
    """This rank's position within the context parallel group."""
    cp_group = get_cp_group()
    return cp_group.rank_in_group
|
||||
|
||||
|
||||
def get_moe_tensor_parallel_world_size():
    """World size of the moe tensor parallel group."""
    moe_tp = get_moe_tp_group()
    return moe_tp.world_size


def get_moe_tensor_parallel_rank():
    """This rank's position within the moe tensor parallel group."""
    moe_tp = get_moe_tp_group()
    return moe_tp.rank_in_group
|
||||
|
||||
|
||||
def get_moe_expert_parallel_world_size():
    """World size of the moe expert parallel group."""
    moe_ep = get_moe_ep_group()
    return moe_ep.world_size


def get_moe_expert_parallel_rank():
    """This rank's position within the moe expert parallel group."""
    moe_ep = get_moe_ep_group()
    return moe_ep.rank_in_group
|
||||
|
||||
|
||||
def get_parallel_world_size_with_group(group):
    """World size of *group*, falling back to the TP group when None."""
    if group is None:
        return get_tensor_model_parallel_world_size()
    return group.world_size
|
||||
|
||||
|
||||
def get_parallel_rank_with_group(group):
    """Rank within *group*, falling back to the TP group rank when None."""
    if group is None:
        return get_tensor_model_parallel_rank()
    return group.rank_in_group
|
||||
@@ -0,0 +1 @@
|
||||
from . import arg_utils
|
||||
@@ -0,0 +1,141 @@
|
||||
import argparse
|
||||
import torch
|
||||
from vllm.config import VllmConfig, ParallelConfig
|
||||
from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
vllm__engine__arg_utils__EngineArgs__create_engine_config_org = EngineArgs.create_engine_config
|
||||
vllm__engine__arg_utils__EngineArgs__add_cli_args_org = EngineArgs.add_cli_args
|
||||
vllm__engine__arg_utils__EngineArgs__from_cli_args_org = EngineArgs.from_cli_args
|
||||
vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org = AsyncEngineArgs.from_cli_args
|
||||
|
||||
|
||||
def vllm__engine__arg_utils__EngineArgs__create_engine_config(self, ) -> VllmConfig:
    """MLU replacement for ``EngineArgs.create_engine_config``.

    Applies MLU constraints (no custom all-reduce, block-size override,
    chunked-pipeline batch limit), validates the cp/moe parallel sizes, and
    publishes them as ``ParallelConfig`` class attributes before delegating
    to the original implementation.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: chunked parallel pipeline only support batch size = 1 yet.
    '''
    # CHUNKED_PIPELINE_PARALLEL_EN comes from `vllm_mlu._mlu_utils import *`
    # -- presumably an env-driven feature flag; confirm in _mlu_utils.
    if CHUNKED_PIPELINE_PARALLEL_EN:
        self.max_num_seqs = 1
        logger.info("Reset max_num_seqs to 1 as the chunked parallel pipeline mode "
                    "only supports batch size to 1.")
    '''
    @brief: disable custom_all_reduce, re-set block_size to support paged and unpaged mode.
    '''
    # MLU not support custom all reduce
    self.disable_custom_all_reduce = True
    BlockSizeInfo.set_block_size(self.block_size)
    if not USE_PAGED and self.enable_chunked_prefill:
        raise ValueError("Not support chunked_prefill in unpaged mode.")

    # set parallel_config context_parallel_size, moe_tp_size, moe_ep_size
    # (default 1 / -1 / -1 when the CLI/LLM hijack did not provide them)
    self.context_parallel_size = getattr(self, "context_parallel_size", 1)
    self.moe_tp_size = getattr(self, "moe_tp_size", -1)
    self.moe_ep_size = getattr(self, "moe_ep_size", -1)
    # check context parallel whether supported or not
    if CONTEXT_PARALLEL_EN:
        if self.context_parallel_size > 1 and get_device_major_capability() == 3:
            raise ValueError('Context parallel does not support MLU370.')
    else:
        if self.context_parallel_size > 1:
            raise ValueError('Context parallel does not support when CONTEXT_PARALLEL_EN=False')
    # check expert parallel whether supported or not
    if not EXPERT_PARALLEL_EN and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
        raise ValueError('Expert parallel does not support when EXPERT_PARALLEL_EN=False')

    # Published as a *class* attribute so the hijacked
    # ParallelConfig.__init__ can pick it up as an instance attribute.
    ParallelConfig.context_parallel_size = self.context_parallel_size

    # set parallel_config moe_tp_size and moe_ep_size; -1 means "derive":
    # moe_tp * moe_ep must always equal tensor_parallel_size.
    if self.moe_tp_size < 1 and self.moe_ep_size < 1:
        moe_tp_size = self.tensor_parallel_size
        moe_ep_size = 1
    elif self.moe_tp_size >= 1 and self.moe_ep_size < 1:
        moe_tp_size = self.moe_tp_size
        moe_ep_size = self.tensor_parallel_size // self.moe_tp_size
    elif self.moe_tp_size < 1 and self.moe_ep_size >= 1:
        moe_tp_size = self.tensor_parallel_size // self.moe_ep_size
        moe_ep_size = self.moe_ep_size
    else:
        moe_tp_size = self.moe_tp_size
        moe_ep_size = self.moe_ep_size
    assert moe_tp_size * moe_ep_size == self.tensor_parallel_size, (
        f"tensor_parallel_size ({self.tensor_parallel_size}) is not equal to "
        f"moe_tp_size ({self.moe_tp_size}) x moe_ep_size ({self.moe_ep_size})"
        "or moe_tp_size and moe_ep_size should be -1 or one of them should be -1")

    ParallelConfig.moe_tp_size = moe_tp_size
    ParallelConfig.moe_ep_size = moe_ep_size

    # Delegate to the original implementation, then force the MLU block size.
    engine_config = vllm__engine__arg_utils__EngineArgs__create_engine_config_org(self)
    engine_config.cache_config.block_size = BlockSizeInfo.BLOCK_SIZE
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    return engine_config
|
||||
|
||||
|
||||
@staticmethod
def vllm__engine__arg_utils__EngineArgs__add_cli_args(
        parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """MLU replacement for ``EngineArgs.add_cli_args``: registers all
    upstream engine flags, then appends the MLU-specific parallelism
    flags. Returns the same parser instance."""
    parser = vllm__engine__arg_utils__EngineArgs__add_cli_args_org(parser)
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add --context-parallel-size, --moe-tp-size and --moe-ep-size
    '''
    parser.add_argument('--context-parallel-size',
                        '-cp',
                        type=int,
                        default=1,
                        help='number of context parallel replicas')
    # Default -1 means "derive from tensor_parallel_size" (resolved in the
    # create_engine_config hijack).
    parser.add_argument('--moe-tp-size',
                        type=int,
                        default=-1,
                        help='Number of moe tensor parallel replicas')
    parser.add_argument('--moe-ep-size',
                        type=int,
                        default=-1,
                        help='Number of moe expert parallel replicas')
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    return parser
|
||||
|
||||
|
||||
@classmethod
def vllm__engine__arg_utils__EngineArgs__from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
    """Build engine args from parsed CLI args, carrying over the
    MLU-specific context/MoE parallel sizes onto the result."""
    # Dispatch to the matching original constructor (sync vs. async args).
    if cls == AsyncEngineArgs:
        builder = vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org
    else:
        builder = vllm__engine__arg_utils__EngineArgs__from_cli_args_org
    engine_args = builder(args)

    # Plain attribute access/assignment is equivalent to the original
    # getattr/setattr calls with constant names.
    engine_args.context_parallel_size = args.context_parallel_size
    engine_args.moe_tp_size = args.moe_tp_size
    engine_args.moe_ep_size = args.moe_ep_size
    return engine_args
|
||||
|
||||
|
||||
# Install the MLU replacements on EngineArgs / AsyncEngineArgs at import
# time. Note the shared from_cli_args replacement is deliberately registered
# for AsyncEngineArgs as well: it branches on `cls == AsyncEngineArgs`
# internally to pick the right original implementation.
MluHijackObject.apply_hijack(EngineArgs,
                             EngineArgs.create_engine_config,
                             vllm__engine__arg_utils__EngineArgs__create_engine_config)
MluHijackObject.apply_hijack(EngineArgs,
                             EngineArgs.add_cli_args,
                             vllm__engine__arg_utils__EngineArgs__add_cli_args)
MluHijackObject.apply_hijack(EngineArgs,
                             EngineArgs.from_cli_args,
                             vllm__engine__arg_utils__EngineArgs__from_cli_args)
MluHijackObject.apply_hijack(AsyncEngineArgs,
                             AsyncEngineArgs.from_cli_args,
                             vllm__engine__arg_utils__EngineArgs__from_cli_args)
|
||||
@@ -0,0 +1 @@
|
||||
from . import llm
|
||||
@@ -0,0 +1,98 @@
|
||||
from typing import Optional, Dict, Any
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.logger import init_logger
|
||||
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
|
||||
TaskOption)
|
||||
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
vllm__entrypoints__llm__LLM____init__org = LLM.__init__
|
||||
|
||||
def vllm__entrypoints__llm__LLM____init__(
    self,
    model: str,
    tokenizer: Optional[str] = None,
    tokenizer_mode: str = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    tensor_parallel_size: int = 1,
    dtype: str = "auto",
    quantization: Optional[str] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: int = 0,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: Optional[bool] = None,
    max_seq_len_to_capture: int = 8192,
    disable_custom_all_reduce: bool = False,
    disable_async_output_proc: bool = False,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    # After positional args are removed, move this right below `model`
    task: TaskOption = "auto",
    override_pooler_config: Optional[PoolerConfig] = None,
    **kwargs,
) -> None:
    '''
    LLM constructor (MLU hijack of ``vllm.entrypoints.llm.LLM.__init__``).

    Pops the MLU-specific ``context_parallel_size`` / ``moe_tp_size`` /
    ``moe_ep_size`` keyword arguments and publishes them as ``EngineArgs``
    class attributes, then delegates to the original constructor with all
    remaining arguments unchanged.

    Note: if enforce_eager is unset (enforce_eager is None)
    it defaults to False.
    '''

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add cp and ep parameter
    '''
    # Stored as EngineArgs *class* attributes so the hijacked
    # create_engine_config can read them via getattr on the instance.
    # pop context_parallel_size (default 1)
    EngineArgs.context_parallel_size = kwargs.pop("context_parallel_size", 1)
    # pop moe_tp_size (default -1 = derive from tensor_parallel_size)
    EngineArgs.moe_tp_size = kwargs.pop("moe_tp_size", -1)
    # pop moe_ep_size (default -1 = derive from tensor_parallel_size)
    EngineArgs.moe_ep_size = kwargs.pop("moe_ep_size", -1)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Delegate to the original constructor with every argument forwarded
    # unchanged (the MLU-specific kwargs were popped above).
    vllm__entrypoints__llm__LLM____init__org(
        self=self,
        model=model,
        tokenizer=tokenizer,
        tokenizer_mode=tokenizer_mode,
        skip_tokenizer_init=skip_tokenizer_init,
        trust_remote_code=trust_remote_code,
        allowed_local_media_path=allowed_local_media_path,
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        quantization=quantization,
        revision=revision,
        tokenizer_revision=tokenizer_revision,
        seed=seed,
        gpu_memory_utilization=gpu_memory_utilization,
        swap_space=swap_space,
        cpu_offload_gb=cpu_offload_gb,
        enforce_eager=enforce_eager,
        max_seq_len_to_capture=max_seq_len_to_capture,
        disable_custom_all_reduce=disable_custom_all_reduce,
        disable_async_output_proc=disable_async_output_proc,
        hf_overrides=hf_overrides,
        mm_processor_kwargs=mm_processor_kwargs,
        # After positional args are removed, move this right below `model`
        task=task,
        override_pooler_config=override_pooler_config,
        **kwargs
    )
|
||||
|
||||
|
||||
# Install the MLU-aware constructor in place of LLM.__init__ at import time.
MluHijackObject.apply_hijack(LLM,
                             LLM.__init__,
                             vllm__entrypoints__llm__LLM____init__)
|
||||
@@ -0,0 +1,7 @@
|
||||
print("Apply Custom VLLM Demo!")
|
||||
from . import distributed
|
||||
from . import engine
|
||||
from . import entrypoints
|
||||
from . import worker
|
||||
from . import config
|
||||
from . import model_executor
|
||||
@@ -0,0 +1,2 @@
|
||||
from . import layers
|
||||
from . import parameter
|
||||
@@ -0,0 +1,2 @@
|
||||
from . import linear
|
||||
from . import feed_forward
|
||||
@@ -0,0 +1,93 @@
|
||||
from typing import Optional, Any
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.layers.linear import (
|
||||
MergedColumnParallelLinear,
|
||||
ColumnParallelLinear,
|
||||
RowParallelLinear
|
||||
)
|
||||
from vllm_mlu.mlu_hijack_utils import set_is_gated, MluHijackObject
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from ....mlu_hijack.distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group)
|
||||
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__(
    self,
    hidden_size: int,
    intermediate_size: int,
    hidden_act: str,
    up_proj_name: str,
    is_gated: bool,
    down_proj_name: str,
    bias: bool,
    quant_config: Optional[QuantizationConfig] = None,
    skip_bias_add: bool = False,
    reduce_results: bool = True,
    prefix: str = "",
    tp_group: Any = None,
):
    """MLU replacement for ``FeedForward.__init__`` that threads an optional
    ``tp_group`` through to each linear sublayer, so the MLP can shard over
    a caller-supplied group instead of the global TP group.

    Builds an up-projection (merged gate+up when *is_gated*) and a
    down-projection, registered under *up_proj_name* / *down_proj_name*.
    """
    super(FeedForward, self).__init__()
    self.hidden_size = hidden_size
    self.hidden_act = hidden_act
    self.is_gated = is_gated
    self.bias = bias
    self.up_proj_name = up_proj_name
    self.down_proj_name = down_proj_name
    self.quant_config = quant_config
    self.is_initialized = False
    self.skip_bias_add = skip_bias_add
    self.reduce_results = reduce_results
    # Fused "bt" FFN path is only used without quantization -- presumably an
    # MLU kernel limitation; confirm in the FeedForward forward pass.
    self.use_bt_ffn = True if quant_config is None else False
    # NOTE(review): set_is_gated looks like it stores a module-level flag in
    # mlu_hijack_utils -- confirm it is safe when gated and non-gated MLPs mix.
    set_is_gated(self.is_gated)
    # Sizes/ranks come from tp_group when given, else from the global TP group.
    self.tp_size = get_parallel_world_size_with_group(tp_group)
    self.tp_rank = get_parallel_rank_with_group(tp_group)

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add tp_group parameter at the end of each linear class
    '''
    self.tp_group = tp_group
    # up_proj with gate or not
    if self.is_gated:
        # Gated MLP: gate and up projections merged into one column-parallel
        # weight of width 2 * intermediate_size.
        up_proj = MergedColumnParallelLinear(hidden_size,
                                             [intermediate_size] * 2,
                                             bias=bias,
                                             quant_config=quant_config,
                                             prefix=f"{prefix}.{up_proj_name}",
                                             tp_group=tp_group)
    else:
        up_proj = ColumnParallelLinear(hidden_size,
                                       intermediate_size,
                                       bias=bias,
                                       skip_bias_add=skip_bias_add,
                                       quant_config=quant_config,
                                       prefix=f"{prefix}.{up_proj_name}",
                                       tp_group=tp_group)
    # Register under the model-specific attribute name (e.g. "gate_up_proj").
    self.register_module(up_proj_name, up_proj)

    # down_proj
    down_proj = RowParallelLinear(intermediate_size,
                                  hidden_size,
                                  bias=bias,
                                  skip_bias_add=skip_bias_add,
                                  reduce_results=reduce_results,
                                  quant_config=quant_config,
                                  prefix=f"{prefix}.{down_proj_name}",
                                  tp_group=tp_group)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.register_module(down_proj_name, down_proj)
|
||||
|
||||
|
||||
# Install the MLU replacement for FeedForward.__init__ at import time.
MluHijackObject.apply_hijack(FeedForward,
                             FeedForward.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__)
|
||||
@@ -0,0 +1,696 @@
|
||||
from typing import Optional, List, Any, Tuple
|
||||
import torch
|
||||
from torch.nn.parameter import Parameter, UninitializedParameter
|
||||
|
||||
from vllm.distributed import (divide, split_tensor_along_last_dim)
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.model_executor.parameter import (BasevLLMParameter,
|
||||
PerTensorScaleParameter,
|
||||
RowvLLMParameter)
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED, LinearBase, ColumnParallelLinear,
|
||||
MergedColumnParallelLinear, RowParallelLinear, adjust_marlin_shard,
|
||||
adjust_scalar_to_fused_array)
|
||||
from vllm import _mlu_ops as mlu_ops
|
||||
from ....mlu_hijack.distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group,
|
||||
get_tp_group)
|
||||
from ....mlu_hijack.distributed.communication_op import (tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_all_gather)
|
||||
|
||||
vllm__model_executor__layers__linear__LinearBase____init__org = LinearBase.__init__
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__LinearBase____init__(
    self,
    input_size: int,
    output_size: int,
    skip_bias_add: bool = False,
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
    tp_group: Any = None,
):
    """Hijacked ``LinearBase.__init__``: run the stock vllm constructor, then
    record tensor-parallel group state on the layer.

    The extra keyword ``tp_group`` (default ``None``) is the only interface
    change; all original parameters are forwarded untouched to the saved
    pre-hijack constructor.
    """
    # Delegate to the original (pre-hijack) constructor captured at module
    # import; keeps upstream behavior (quant_method selection, etc.) intact.
    vllm__model_executor__layers__linear__LinearBase____init__org(self=self,
                                                                 input_size=input_size,
                                                                 output_size=output_size,
                                                                 skip_bias_add=skip_bias_add,
                                                                 params_dtype=params_dtype,
                                                                 quant_config=quant_config,
                                                                 prefix=prefix)
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add self.tp_group, world_size and tp_rank to support moe expert parallel
    '''
    # Cache group handles so subclasses can shard without calling the global
    # get_tensor_model_parallel_* helpers.
    self.tp_group = tp_group
    self.tp_world_size = get_parallel_world_size_with_group(self.tp_group)
    self.tp_rank = get_parallel_rank_with_group(self.tp_group)
    '''
    =================
    End of MLU Hijack
    =================
    '''
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__ColumnParallelLinear____init__(
    self,
    input_size: int,
    output_size: int,
    bias: bool = True,
    gather_output: bool = False,
    skip_bias_add: bool = False,
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    output_sizes: Optional[List[int]] = None,
    prefix: str = "",
    tp_group: Any = None,
):
    """Hijacked ``ColumnParallelLinear.__init__`` with group-aware TP size.

    Mirrors upstream vllm: shards the output dimension across the TP group,
    asks ``self.quant_method`` to create the (possibly quantized) weights,
    and creates a sharded bias parameter when ``bias`` is set.  Differences
    from upstream: the TP world size comes from ``self.tp_world_size``
    (populated by the LinearBase hijack from ``tp_group``), and the
    divisibility check on ``output_sizes`` was moved here from
    ``MergedColumnParallelLinear``.
    """
    super(ColumnParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype,
                                               quant_config, prefix, tp_group)

    self.gather_output = gather_output

    # Divide the weight matrix along the last dimension.
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
    @brief: move checking output_sizes logic from MergedColumnParallelLinear to here
    '''
    tp_size = self.tp_world_size

    if output_sizes is not None:
        # Every fused output slice must shard evenly across the group.
        assert all(output_size_var % tp_size == 0 for output_size_var in output_sizes)
    '''
    =================
    End of MLU Hijack
    =================
    '''
    assert self.quant_method is not None
    self.output_size_per_partition = divide(self.output_size, tp_size)
    self.output_partition_sizes = [self.output_size_per_partition]
    # If QKV or MergedColumn, use output size of each partition.
    # (Subclasses set self.output_sizes BEFORE calling this constructor.)
    if hasattr(self, "output_sizes"):
        self.output_partition_sizes = [
            divide(output_size, tp_size)
            for output_size in self.output_sizes
        ]

    if output_sizes is None:
        output_sizes = [output_size]

    # Weight creation is delegated to the quantization method so each scheme
    # can lay out (packed) tensors and attach its own loader metadata.
    self.quant_method.create_weights(
        layer=self,
        input_size_per_partition=self.input_size,
        output_partition_sizes=self.output_partition_sizes,
        input_size=self.input_size,
        output_size=self.output_size,
        params_dtype=self.params_dtype,
        weight_loader=(
            self.weight_loader_v2 if self.quant_method.__class__.__name__
            in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
    if bias:
        # Bias is sharded like the output dimension; loader narrows by rank.
        self.bias = Parameter(
            torch.empty(self.output_size_per_partition,
                        dtype=params_dtype))
        set_weight_attrs(self.bias, {
            "output_dim": 0,
            "weight_loader": self.weight_loader,
        })
    else:
        self.register_parameter("bias", None)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader(
        self, param: Parameter, loaded_weight: torch.Tensor):
    """Load this rank's output-dim shard of ``loaded_weight`` into ``param``.

    Same as upstream vllm except the rank is read from ``self.tp_rank``
    (group-aware) rather than the global helper.  Handles GGUF lazily
    materialized parameters, bitsandbytes pre-sharded weights, and scalar
    scales stored without a shape.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
    '''
    tp_rank = self.tp_rank
    '''
    =================
    End of MLU Hijack
    =================
    '''
    output_dim = getattr(param, "output_dim", None)

    # Special case for GGUF
    is_gguf_weight = getattr(param, "is_gguf_weight", False)
    is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
    if is_gguf_weight_type:
        param.weight_type = loaded_weight.item()

    # Materialize GGUF UninitializedParameter
    if is_gguf_weight and isinstance(param, UninitializedParameter):
        param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)

    use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)

    param_data = param.data
    # bitsandbytes loads the weights of the specific portion
    # no need to narrow here
    if output_dim is not None and not use_bitsandbytes_4bit:
        shard_size = param_data.shape[output_dim]
        start_idx = tp_rank * shard_size
        loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                             shard_size)

    # Special case for loading scales off disk, which often do not
    # have a shape (such as in the case of AutoFP8).
    if len(loaded_weight.shape) == 0:
        loaded_weight = loaded_weight.reshape(1)

    assert param_data.shape == loaded_weight.shape
    param_data.copy_(loaded_weight)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__ColumnParallelLinear__forward(
        self, input_, smooth_quant_scale: Optional[torch.Tensor] = None):
    """Hijacked forward: GEMM via ``self.quant_method``, optional all-gather.

    Differences from upstream vllm:
    * accepts an optional ``smooth_quant_scale`` that is forwarded to the
      quant method as ``input_scale`` (smooth-quant activation scaling);
    * the all-gather (when ``gather_output``) uses ``self.tp_group``.

    Returns ``(output, output_bias)`` where ``output_bias`` is only set when
    ``skip_bias_add`` defers the bias addition to the caller.
    """
    bias = self.bias if not self.skip_bias_add else None

    # Matrix multiply.
    assert self.quant_method is not None
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: Add input_scale parameter.
    '''
    if smooth_quant_scale is not None:
        output_parallel = self.quant_method.apply(self, input_, bias,
                                                  input_scale=smooth_quant_scale)
    else:
        output_parallel = self.quant_method.apply(self, input_, bias)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    if self.gather_output:
        # All-gather across the partitions.
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add tp_group param to tensor_model_parallel_all_gather
        '''
        output = tensor_model_parallel_all_gather(output_parallel, self.tp_group)
        '''
        =================
        End of MLU Hijack
        =================
        '''
    else:
        output = output_parallel
    output_bias = self.bias if self.skip_bias_add else None
    return output, output_bias
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr(self) -> str:
    """Return a short configuration summary for ``repr`` of the layer.

    =============================
    Modify by vllm_mlu
    =============================
    @brief: tp_size is taken from self.tp_world_size (group-aware) instead of
            get_tensor_model_parallel_world_size()
    =============================
    End of MLU Hijack
    =============================
    """
    fields = (
        f"in_features={self.input_size}",
        f"output_features={self.output_size_per_partition}",
        f"bias={self.bias is not None}",
        f"tp_size={self.tp_world_size}",
        f"gather_output={self.gather_output}",
    )
    return ", ".join(fields)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__MergedColumnParallelLinear____init__(
    self,
    input_size: int,
    output_sizes: List[int],
    bias: bool = True,
    gather_output: bool = False,
    skip_bias_add: bool = False,
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
    tp_group: Any = None,
):
    """Hijacked ``MergedColumnParallelLinear.__init__``.

    Records ``output_sizes`` (the fused slice widths) BEFORE delegating to
    the ColumnParallelLinear constructor, which relies on the attribute being
    present to compute per-partition sizes.  The TP-divisibility assertion
    was relocated into the ColumnParallelLinear hijack, so only the
    delegation remains here.
    """
    self.output_sizes = output_sizes
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: move checking output_sizes logic from MergedColumnParallelLinear to ColumnParallelLinear.__init__
    '''
    # tp_size = get_tensor_model_parallel_world_size()
    # assert all(output_size % tp_size == 0 for output_size in output_sizes)
    '''
    =================
    End of MLU Hijack
    =================
    '''
    super(MergedColumnParallelLinear, self).__init__(input_size=input_size,
                                                     output_size=sum(output_sizes),
                                                     bias=bias,
                                                     gather_output=gather_output,
                                                     skip_bias_add=skip_bias_add,
                                                     params_dtype=params_dtype,
                                                     quant_config=quant_config,
                                                     output_sizes=self.output_sizes,
                                                     prefix=prefix,
                                                     tp_group=tp_group)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader(self,
                                                                                   param: Parameter,
                                                                                   loaded_weight: torch.Tensor,
                                                                                   loaded_shard_id: Optional[int] = None):
    """Load a (possibly fused) checkpoint weight into the merged parameter.

    Same control flow as upstream vllm, with TP rank/size read from
    ``self.tp_rank`` / ``self.tp_world_size`` (group-aware).  Three regimes:

    * ``loaded_shard_id is None`` — the checkpoint weight is already fused on
      disk; either copy whole (no ``output_dim``) or split it into per-slice
      shards and recurse with an explicit shard id;
    * explicit ``loaded_shard_id`` — narrow both the parameter and the loaded
      weight to that slice, adjusting for quantization packing (pack_factor,
      Marlin) and bitsandbytes pre-sharded layouts;
    * GGUF parameters get special lazy-materialization handling up front.
    """
    # Special case for GGUF
    # initialize GGUF param after we know the quantize type
    is_gguf_weight = getattr(param, "is_gguf_weight", False)
    is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
    if is_gguf_weight_type:
        param.data[loaded_shard_id].copy_(loaded_weight)
        param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
        return

    if is_gguf_weight:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
        @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
        '''
        tp_rank = self.tp_rank
        tp_size = self.tp_world_size
        '''
        =================
        End of MLU Hijack
        =================
        '''
        output_dim = getattr(param, "output_dim", None)
        shard_size = loaded_weight.size(output_dim) // tp_size
        start_idx = tp_rank * shard_size

        loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                             shard_size)

        # GGUF shards are collected into a container and materialized once
        # both (gate, up) slices have arrived.
        param.shard_id.append(loaded_shard_id)
        param.shard_id_map[loaded_shard_id] = len(param.data_container)
        param.data_container.append(loaded_weight)
        if len(param.data_container) == 2:
            self.qweight = param.materialize_nested()
        return

    param_data = param.data
    output_dim = getattr(param, "output_dim", None)
    # Special case for AQLM codebooks.
    is_metadata = getattr(param, "is_metadata", False)
    # Special case for per-tensor scale to load scalar into fused array.
    needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)

    if loaded_shard_id is None:
        # Loaded weight is already fused on disk (qkv/mlp).
        if output_dim is None:
            if needs_scalar_to_array:
                param_data, loaded_weight = adjust_scalar_to_fused_array(
                    param_data, loaded_weight, 0)

            assert param_data.shape == loaded_weight.shape
            param_data.copy_(loaded_weight)
            return
        current_shard_offset = 0
        shard_offsets: List[Tuple[int, int, int]] = []
        for i, output_size in enumerate(self.output_sizes):
            shard_offsets.append((i, current_shard_offset, output_size))
            current_shard_offset += output_size
        packed_dim = getattr(param, "packed_dim", None)
        for shard_id, shard_offset, shard_size in shard_offsets:
            # Special case for Quantization.
            # If quantized, we need to adjust the offset and size to account
            # for the packing.
            if packed_dim == output_dim:
                shard_size = shard_size // param.pack_factor
                shard_offset = shard_offset // param.pack_factor
                # Special case for Marlin.
                shard_size, shard_offset = adjust_marlin_shard(
                    param, shard_size, shard_offset)

            loaded_weight_shard = loaded_weight.narrow(
                output_dim, shard_offset, shard_size)
            # Recurse with an explicit shard id; hits the branch below.
            self.weight_loader(param, loaded_weight_shard, shard_id)
        return

    assert loaded_shard_id < len(self.output_sizes)
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
    @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
    '''
    tp_rank = self.tp_rank
    tp_size = self.tp_world_size
    '''
    =================
    End of MLU Hijack
    =================
    '''
    if output_dim is not None:
        shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
        shard_size = self.output_sizes[loaded_shard_id] // tp_size
        # Special case for quantization.
        # If quantized, we need to adjust the offset and size to account
        # for the packing.
        packed_dim = getattr(param, "packed_dim", None)
        if packed_dim == output_dim:
            shard_size = shard_size // param.pack_factor
            shard_offset = shard_offset // param.pack_factor
            # Special case for Marlin.
            shard_size, shard_offset = adjust_marlin_shard(
                param, shard_size, shard_offset)

        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
                                        False)
        if use_bitsandbytes_4bit:
            shard_size = loaded_weight.shape[output_dim]
            shard_offset = loaded_weight.shape[output_dim] * \
                loaded_shard_id

        param_data = param_data.narrow(output_dim, shard_offset,
                                       shard_size)
        start_idx = tp_rank * shard_size
        # bitsandbytes loads the weights of the specific portion
        # no need to narrow here
        if not use_bitsandbytes_4bit:
            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                 shard_size)
    # Special case for AQLM codebooks.
    elif is_metadata:
        # metadata indicates fixed size concatenated along dim 0
        shard_size = loaded_weight.shape[0]
        shard_offset = loaded_shard_id * shard_size
        param_data = param_data.narrow(0, shard_offset, shard_size)

    # Special case for per-tensor scales in fused case.
    elif needs_scalar_to_array:
        param_data, loaded_weight = adjust_scalar_to_fused_array(
            param_data, loaded_weight, loaded_shard_id)

    else:
        ignore_warning = getattr(param, "ignore_warning", False)
        if not ignore_warning:
            logger.warning(
                "Loading a weight without `output_dim` attribute in "
                "MergedColumnParallelLinear, assume the weight is "
                "the same for all partitions.")

    assert param_data.shape == loaded_weight.shape
    param_data.copy_(loaded_weight)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2(self,
                                                                                      param: BasevLLMParameter,
                                                                                      loaded_weight: torch.Tensor,
                                                                                      loaded_shard_id: Optional[int] = None):
    """v2 weight loader for merged column layers using vLLMParameter objects.

    When no shard id is given, delegates fused-on-disk handling to the
    parameter class (per-tensor scales, row/base params) or to
    ``_load_fused_module_from_checkpoint``.  With an explicit shard id it
    computes the slice offset/size from ``self.output_sizes`` — divided by
    ``self.tp_world_size`` (group-aware, the only MLU change) — and lets the
    parameter perform the narrow-and-copy.
    """
    if loaded_shard_id is None:
        if isinstance(param, PerTensorScaleParameter):
            param.load_merged_column_weight(loaded_weight=loaded_weight,
                                            shard_id=0)
            return
        elif type(param) in (RowvLLMParameter, BasevLLMParameter):
            param.load_merged_column_weight(loaded_weight=loaded_weight)
            return
        # TODO: @dsikka - move to parameter.py
        self._load_fused_module_from_checkpoint(param, loaded_weight)
        return

    assert loaded_shard_id < len(self.output_sizes)

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
    '''
    tp_size = self.tp_world_size
    '''
    =================
    End of MLU Hijack
    =================
    '''
    shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
    shard_size = self.output_sizes[loaded_shard_id] // tp_size

    param.load_merged_column_weight(loaded_weight=loaded_weight,
                                    shard_id=loaded_shard_id,
                                    shard_offset=shard_offset,
                                    shard_size=shard_size)
|
||||
|
||||
def vllm__model_executor__layers__linear__RowParallelLinear____init__(
    self,
    input_size: int,
    output_size: int,
    bias: bool = True,
    input_is_parallel: bool = True,
    skip_bias_add: bool = False,
    params_dtype: Optional[torch.dtype] = None,
    reduce_results: bool = True,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
    tp_group: Any = None,
):
    """Hijacked ``RowParallelLinear.__init__`` with group-aware TP size.

    Shards the INPUT dimension across the TP group (``self.tp_size`` copied
    from the group-derived ``self.tp_world_size``), delegates weight creation
    to ``self.quant_method``, and creates an unsharded bias.  Rejects the
    inconsistent ``reduce_results=False`` + active-bias combination, which
    would add the bias on every rank.
    """
    super(RowParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype,
                                            quant_config, prefix, tp_group)

    self.input_is_parallel = input_is_parallel
    self.reduce_results = reduce_results

    # Divide the weight matrix along the last dimension.
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
    @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
    '''
    self.tp_size = self.tp_world_size
    '''
    =================
    End of MLU Hijack
    =================
    '''
    self.input_size_per_partition = divide(input_size, self.tp_size)
    assert self.quant_method is not None

    self.quant_method.create_weights(
        layer=self,
        input_size_per_partition=self.input_size_per_partition,
        output_partition_sizes=[self.output_size],
        input_size=self.input_size,
        output_size=self.output_size,
        params_dtype=self.params_dtype,
        weight_loader=(
            self.weight_loader_v2 if self.quant_method.__class__.__name__
            in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
    if not reduce_results and (bias and not skip_bias_add):
        raise ValueError("When not reduce the results, adding bias to the "
                         "results can lead to incorrect results")

    if bias:
        # Row-parallel bias is full-size (not sharded); forward() only fuses
        # it on rank 0 to avoid double-adding under TP.
        self.bias = Parameter(
            torch.empty(self.output_size, dtype=params_dtype))
        set_weight_attrs(self.bias, {
            "output_dim": 0,
            "weight_loader": self.weight_loader,
        })
    else:
        self.register_parameter("bias", None)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__RowParallelLinear__weight_loader(
        self, param: Parameter, loaded_weight: torch.Tensor):
    """Load this rank's input-dim shard of ``loaded_weight`` into ``param``.

    Mirrors upstream vllm with TP rank/size taken from ``self.tp_rank`` /
    ``self.tp_world_size`` (group-aware).  Handles GGUF lazy
    materialization (shrinking the input dim by tp_size), bitsandbytes
    pre-sharded weights, and shapeless scalar scales.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
    @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
    '''
    tp_rank = self.tp_rank
    tp_size = self.tp_world_size
    '''
    =================
    End of MLU Hijack
    =================
    '''
    input_dim = getattr(param, "input_dim", None)
    use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)

    # Special case for GGUF
    is_gguf_weight = getattr(param, "is_gguf_weight", False)
    is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
    if is_gguf_weight_type:
        param.weight_type = loaded_weight.item()

    # Materialize GGUF UninitializedParameter
    if is_gguf_weight and isinstance(param, UninitializedParameter):
        weight_shape = list(loaded_weight.shape)
        # NOTE: truthiness check — input_dim 0 would be skipped here; this
        # matches upstream vllm behavior.
        if input_dim:
            weight_shape[input_dim] = weight_shape[input_dim] // tp_size
        param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)

    param_data = param.data
    # bitsandbytes loads the weights of the specific portion
    # no need to narrow here
    if input_dim is not None and not use_bitsandbytes_4bit:
        shard_size = param_data.shape[input_dim]
        start_idx = tp_rank * shard_size
        loaded_weight = loaded_weight.narrow(input_dim, start_idx,
                                             shard_size)

    # Special case for loading scales off disk, which often do not
    # have a shape (such as in the case of AutoFP8).
    if len(loaded_weight.shape) == 0:
        loaded_weight = loaded_weight.reshape(1)

    assert param_data.shape == loaded_weight.shape
    param_data.copy_(loaded_weight)
|
||||
|
||||
|
||||
def vllm__model_executor__layers__linear__RowParallelLinear__forward(
    self,
    input_,
    residual: Optional[torch.Tensor] = None
):
    """Hijacked row-parallel forward: shard input, GEMM, group all-reduce.

    MLU-specific behavior on top of upstream vllm:
    * accepts an optional fused ``residual`` (forwarded to the quant method;
      only applied on rank 0 to avoid double-adding under TP);
    * skips the all-reduce entirely when the quant method drives its own
      parallel scheme (``parallel_num`` set) during prompt processing;
    * when ``preload_size`` is configured (decode phase), issues the
      all-reduce asynchronously and overlaps it with weight preloading via
      ``mlu_ops.preload`` before waiting on the handle.

    Returns ``(output, output_bias)``.
    """
    if self.input_is_parallel:
        input_parallel = input_
    else:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
        '''
        tp_rank = self.tp_rank
        '''
        =================
        End of MLU Hijack
        =================
        '''
        splitted_input = split_tensor_along_last_dim(
            input_, num_partitions=self.tp_size)
        input_parallel = splitted_input[tp_rank].contiguous()

    # Matrix multiply.
    assert self.quant_method is not None
    # Only fuse bias add into GEMM for rank 0 (this ensures that
    # bias will not get added more than once in TP>1 case)
    bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
    residual_ = None if self.tp_rank > 0 else residual
    '''
    =====================================================
    Modify by custom vllm_mlu
    =====================================================
    @brief: abandon original reduce if parallel_num is set
    '''
    # NOTE(review): get_is_prompt appears to come from the
    # `from vllm_mlu._mlu_utils import *` star import — confirm.
    is_parallel_enable = hasattr(self.quant_method, 'parallel_num') and get_is_prompt()
    '''
    =====================================================
    End of custom MLU Hijack
    =====================================================
    '''
    output_parallel = self.quant_method.apply(self,
                                              input_parallel,
                                              bias=bias_,
                                              residual=residual_)
    '''
    =============================
    Modify by custom vllm_mlu
    =============================
    @brief: when preload_size is set, call GroupCoordinator.all_reduce() directly and
            use async_op to set all_reduce paralleled with preload
    '''
    if self.reduce_results and self.tp_size > 1 and not is_parallel_enable:
        if hasattr(self, 'preload_size') and self.preload_size > 0 and not self.is_prompt:
            # Async all-reduce overlapped with weight preload; preload_size
            # is in MB (hence the _MB scaling below).
            handle = get_tp_group(self.tp_group).all_reduce(output_parallel, async_op=True)
            _MB = 1 << 20
            mlu_ops.preload(self.preloaded_weights[0].data, self.preload_size * _MB)
            preloaded_weights_size = self.preloaded_weights[0].numel() * self.preloaded_weights[0].element_size()
            # Spill the remaining preload budget into the second weight blob.
            if preloaded_weights_size < (self.preload_size * _MB) and len(self.preloaded_weights) > 1:
                mlu_ops.preload(self.preloaded_weights[1].data, (self.preload_size * _MB) - preloaded_weights_size)
            handle.wait()
            output = output_parallel
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add tensor_model_parallel_all_reduce() with self.tp_group
            '''
            output = tensor_model_parallel_all_reduce(output_parallel, tp_group=self.tp_group)
            '''
            =================
            End of MLU Hijack
            =================
            '''
    else:
        output = output_parallel
    '''
    =========================
    End of custom MLU Hijack
    =========================
    '''
    output_bias = self.bias if self.skip_bias_add else None

    return output, output_bias
|
||||
|
||||
|
||||
# Install all linear-layer hijacks at import time. Each call replaces the
# named vllm method with its MLU-aware counterpart defined above.
MluHijackObject.apply_hijack(LinearBase,
                             LinearBase.__init__,
                             vllm__model_executor__layers__linear__LinearBase____init__)
MluHijackObject.apply_hijack(ColumnParallelLinear,
                             ColumnParallelLinear.__init__,
                             vllm__model_executor__layers__linear__ColumnParallelLinear____init__)
MluHijackObject.apply_hijack(ColumnParallelLinear,
                             ColumnParallelLinear.weight_loader,
                             vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader)
MluHijackObject.apply_hijack(ColumnParallelLinear,
                             ColumnParallelLinear.forward,
                             vllm__model_executor__layers__linear__ColumnParallelLinear__forward)
MluHijackObject.apply_hijack(ColumnParallelLinear,
                             ColumnParallelLinear.extra_repr,
                             vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
                             MergedColumnParallelLinear.__init__,
                             vllm__model_executor__layers__linear__MergedColumnParallelLinear____init__)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
                             MergedColumnParallelLinear.weight_loader,
                             vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
                             MergedColumnParallelLinear.weight_loader_v2,
                             vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2)
MluHijackObject.apply_hijack(RowParallelLinear,
                             RowParallelLinear.__init__,
                             vllm__model_executor__layers__linear__RowParallelLinear____init__)
MluHijackObject.apply_hijack(RowParallelLinear,
                             RowParallelLinear.weight_loader,
                             vllm__model_executor__layers__linear__RowParallelLinear__weight_loader)
MluHijackObject.apply_hijack(RowParallelLinear,
                             RowParallelLinear.forward,
                             vllm__model_executor__layers__linear__RowParallelLinear__forward)
|
||||
@@ -0,0 +1,173 @@
|
||||
from fractions import Fraction
|
||||
from typing import Callable, Optional, Union, Any
|
||||
|
||||
import torch
|
||||
from torch.nn import Parameter
|
||||
from vllm.model_executor.parameter import (BasevLLMParameter,
|
||||
PackedColumnParameter,
|
||||
PackedvLLMParameter,
|
||||
PerTensorScaleParameter,
|
||||
RowvLLMParameter,
|
||||
_ColumnvLLMParameter)
|
||||
|
||||
from vllm.distributed import get_tensor_model_parallel_rank
|
||||
from vllm.logger import init_logger
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from ..distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def vllm__model_executor__parameter__BasevLLMParameter____init__(self, data: torch.Tensor, weight_loader: Callable, tp_group: Any = None):
    """
    Initialize the BasevLLMParameter

    :param data: torch tensor with the parameter data
    :param weight_loader: weight loader callable
    :param tp_group: optional tensor-parallel group handle (MLU addition);
        rank/world-size are resolved from it and cached on the parameter

    :returns: a torch.nn.parameter
    """

    self._weight_loader = weight_loader
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add self.tp_group, world_size and tp_rank to support moe expert parallel
    '''
    # Cached so the load_* methods below can shard without the global
    # get_tensor_model_parallel_* helpers.
    self.tp_group = tp_group
    self.tp_world_size = get_parallel_world_size_with_group(self.tp_group)
    self.tp_rank = get_parallel_rank_with_group(self.tp_group)
    '''
    =================
    End of MLU Hijack
    =================
    '''
|
||||
|
||||
|
||||
def vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight(self, loaded_weight: torch.Tensor):
    """Copy this rank's output-dim shard of ``loaded_weight`` into the
    parameter, using the group-aware ``self.tp_rank`` (MLU change)."""
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
    '''
    tp_rank = self.tp_rank
    '''
    =================
    End of MLU Hijack
    =================
    '''
    shard_size = self.data.shape[self.output_dim]
    loaded_weight = loaded_weight.narrow(self.output_dim,
                                         tp_rank * shard_size, shard_size)
    assert self.data.shape == loaded_weight.shape
    self.data.copy_(loaded_weight)
|
||||
|
||||
def vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
    """Copy one fused slice (``shard_offset``/``shard_size`` kwargs) of
    ``loaded_weight`` into this merged parameter, rank-sharded along
    ``output_dim``; packed-quantization offsets are adjusted first.
    Uses the group-aware ``self.tp_rank`` (MLU change)."""

    shard_offset = kwargs.get("shard_offset")
    shard_size = kwargs.get("shard_size")
    if isinstance(
            self,
            (PackedColumnParameter,
             PackedvLLMParameter)) and self.packed_dim == self.output_dim:
        # Packed formats store several logical elements per physical one.
        shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
            shard_offset=shard_offset, shard_size=shard_size)

    param_data = self.data

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify get_tensor_model_parallel_rank() to self.tp_rank
    '''
    tp_rank = self.tp_rank
    '''
    =================
    End of MLU Hijack
    =================
    '''
    param_data = param_data.narrow(self.output_dim, shard_offset,
                                   shard_size)
    loaded_weight = loaded_weight.narrow(self.output_dim,
                                         tp_rank * shard_size, shard_size)
    assert param_data.shape == loaded_weight.shape
    param_data.copy_(loaded_weight)
|
||||
|
||||
def vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
    """MLU hijack of ``_ColumnvLLMParameter.load_qkv_weight``.

    Matches upstream vLLM except the tensor-parallel rank comes from
    ``self.tp_rank`` instead of ``get_tensor_model_parallel_rank()``.
    Copies the q/k/v shard for this rank into the proper window of
    ``self.data`` along ``output_dim``.
    """
    offset = kwargs.get("shard_offset")
    size = kwargs.get("shard_size")
    shard_id = kwargs.get("shard_id")
    num_heads = kwargs.get("num_heads")

    # Rescale shard indexes for packed (e.g. quantized) layouts.
    packed = isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
    if packed and self.output_dim == self.packed_dim:
        size, offset = self.adjust_shard_indexes_for_packing(
            shard_offset=offset, shard_size=size)

    # MLU Hijack: per-parameter rank instead of the global TP rank.
    rank = self.tp_rank
    # q shards are indexed by the TP rank directly; non-q shards divide the
    # rank by `num_heads` (same indexing as upstream vLLM).
    shard_index = rank if shard_id == "q" else rank // num_heads
    dim = self.output_dim
    dst = self.data.narrow(dim, offset, size)
    src = loaded_weight.narrow(dim, shard_index * size, size)
    assert dst.shape == src.shape
    dst.copy_(src)
|
||||
|
||||
|
||||
def vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight(self, loaded_weight: torch.Tensor):
    """MLU hijack of ``RowvLLMParameter.load_row_parallel_weight``.

    Same as upstream vLLM except the tensor-parallel rank is taken from
    ``self.tp_rank`` rather than ``get_tensor_model_parallel_rank()``.
    Copies this rank's shard of *loaded_weight* (along ``input_dim``)
    into ``self.data``.
    """
    dim = self.input_dim
    width = self.data.shape[dim]
    # MLU Hijack: per-parameter rank instead of the global TP rank.
    shard = loaded_weight.narrow(dim, self.tp_rank * width, width)

    if shard.dim() == 0:
        # A 0-d (scalar) shard must be viewed as shape (1,) before copying.
        shard = shard.reshape(1)

    assert self.data.shape == shard.shape
    self.data.copy_(shard)
|
||||
|
||||
|
||||
# Install the MLU overrides on the upstream vLLM parameter classes.
# Each entry is (target class, original attribute, replacement function).
_PARAMETER_HIJACKS = (
    (BasevLLMParameter, BasevLLMParameter.__init__,
     vllm__model_executor__parameter__BasevLLMParameter____init__),
    (_ColumnvLLMParameter, _ColumnvLLMParameter.load_column_parallel_weight,
     vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight),
    (_ColumnvLLMParameter, _ColumnvLLMParameter.load_merged_column_weight,
     vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight),
    (_ColumnvLLMParameter, _ColumnvLLMParameter.load_qkv_weight,
     vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight),
    (RowvLLMParameter, RowvLLMParameter.load_row_parallel_weight,
     vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight),
)
for _target_cls, _original, _replacement in _PARAMETER_HIJACKS:
    MluHijackObject.apply_hijack(_target_cls, _original, _replacement)
|
||||
@@ -0,0 +1 @@
|
||||
from . import mlu_worker
|
||||
@@ -0,0 +1,192 @@
|
||||
import gc
|
||||
import os
|
||||
import torch
|
||||
from typing import List, Optional, Set, Tuple, Type
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.distributed import init_distributed_environment, set_custom_all_reduce
|
||||
from vllm.model_executor import set_random_seed
|
||||
from vllm.worker.mlu_worker import MLUWorker, _check_if_gpu_supports_dtype
|
||||
from vllm_mlu.worker.mlu_worker import MLUWorker_V2
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from ..distributed.parallel_state import ensure_model_parallel_initialized
|
||||
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
from vllm.logger import init_logger
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
|
||||
from ..distributed.parallel_state import (get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size,
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
|
||||
|
||||
# Module-level logger for this hijack module.
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def vllm__worker__mlu_worker__init_worker_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
    local_rank: int = -1,
) -> None:
    """Initialize the distributed environment for an MLU worker.

    Configures custom all-reduce per the parallel config, initializes
    torch.distributed with the MLU 'cncl' backend, then initializes the
    model-parallel groups via the vllm_mlu-extended helper.
    """
    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)

    init_distributed_environment(parallel_config.world_size, rank,
                                 distributed_init_method, local_rank,
                                 backend='cncl')

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add context_parallel_size, moe_tp_size, moe_ep_size
    '''
    # vllm_mlu's ensure_model_parallel_initialized derives all group sizes
    # from the parallel config (including MOE/context parallel groups).
    ensure_model_parallel_initialized(parallel_config=parallel_config)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
|
||||
|
||||
def vllm__worker__mlu_worker__MLUWorker__init_device(self) -> None:
    """MLU hijack of ``MLUWorker.init_device``.

    Binds this worker to its local MLU device, validates the model dtype,
    clears cached device memory, initializes the distributed environment
    through the vllm_mlu helper, and seeds the RNGs.

    Raises:
        RuntimeError: if the configured device type is not "mlu".
    """
    if self.device_config.device.type == "mlu":
        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_CNCL_AVOID_RECORD_STREAMS"] = "1"

        # This env var set by Ray causes exceptions with graph building.
        os.environ.pop("CNCL_ASYNC_ERROR_HANDLING", None)
        self.device = torch.device(f"mlu:{self.local_rank}")
        torch.mlu.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)
        gc.collect()
        torch.mlu.empty_cache()
        # Free device memory recorded before model allocations — presumably
        # used later as a memory-profiling baseline; confirm against callers.
        self.init_gpu_memory = torch.mlu.mem_get_info()[0]
    else:
        raise RuntimeError(
            f"Not support device type: {self.device_config.device}")
    # Initialize the distributed environment.
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: modify to vllm__worker__mlu_worker__init_worker_distributed_environment
    '''
    vllm__worker__mlu_worker__init_worker_distributed_environment(self.parallel_config, self.rank,
                                                                  self.distributed_init_method, self.local_rank)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Set random seed.
    set_random_seed(self.model_config.seed)
|
||||
|
||||
|
||||
def default_act_range_value():
    """Factory for a per-module activation-range record.

    Used as the ``default_factory`` of the ``act_range`` defaultdict; a new
    dict (with a fresh ``input_id`` list) is built on every call so entries
    never share mutable state.
    """
    return dict(
        x=None,
        split=None,
        is_linear=False,
        is_qkv=False,
        q_proj_size=0,
        num_kv_head_replicas=1,
        is_merge=False,
        input_id=[],
        self_rank=0,
        rank=None,
        tensor_rank=None,
        tp_world_size=None,
        moe_tp_rank=None,
        moe_tp_world_size=None,
        moe_ep_rank=None,
        moe_ep_world_size=None,
        weight=None,
    )
|
||||
|
||||
def vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook(self,
                                                                  is_save_input_id: bool = False,
                                                                  is_save_moe_info: bool = False):
    """Register forward hooks that record activation ranges for smoothing.

    Walks the model's modules, disables fused execution paths that would
    hide per-layer activations (``FeedForward.use_bt_ffn``,
    ``SparseMoeMlp.is_use_fused_moe``), seeds ``self.act_range`` metadata
    for every parallel linear / embedding module, and attaches
    ``self.stat_input_hook`` as a forward hook on each of them.

    Args:
        is_save_input_id: forwarded to ``stat_input_hook`` — presumably
            makes the hook also record input ids; confirm in its definition.
        is_save_moe_info: when True, also records the distributed rank and
            tensor/MOE parallel rank & world-size per module, plus the
            weights of expert submodules.
    """
    model = self.model_runner.model
    self.act_range = defaultdict(default_act_range_value)
    self.hooks = []
    linear_class_list = (ColumnParallelLinear, MergedColumnParallelLinear,
                         QKVParallelLinear, RowParallelLinear)
    other_class_list = (VocabParallelEmbedding, ParallelLMHead)
    class_list = linear_class_list + other_class_list
    # Fix: was `(RowParallelLinear)` — parentheses without a comma do not
    # make a tuple; it worked only because isinstance also accepts a bare
    # class. Make the tuple explicit.
    row_class_list = (RowParallelLinear,)

    for name, m in model.named_modules():
        # Force unfused execution so the hooks observe real activations.
        if isinstance(m, FeedForward):
            m.use_bt_ffn = False
        if isinstance(m, SparseMoeMlp):
            m.is_use_fused_moe = False

        if isinstance(m, class_list):
            # Simplified from `True if isinstance(...) else False`.
            is_linear = isinstance(m, linear_class_list)
            split_type = "row" if isinstance(m, row_class_list) else "col"
            self.act_range[name]["split"] = split_type
            self.act_range[name]["is_linear"] = is_linear
            if isinstance(m, QKVParallelLinear):
                self.act_range[name]["is_qkv"] = True
                self.act_range[name]["q_proj_size"] = m.num_heads * m.head_size
                self.act_range[name]["num_kv_head_replicas"] = m.num_kv_head_replicas
            self.act_range[name]["is_merge"] = isinstance(m, MergedColumnParallelLinear)
            if is_save_moe_info:
                self.act_range[name]["rank"] = torch.distributed.get_rank()
                self.act_range[name]["tensor_rank"] = get_tensor_model_parallel_rank()
                self.act_range[name]["tp_world_size"] = get_tensor_model_parallel_world_size()
                self.act_range[name]["moe_tp_rank"] = get_moe_tensor_parallel_rank()
                self.act_range[name]["moe_tp_world_size"] = get_moe_tensor_parallel_world_size()
                self.act_range[name]["moe_ep_rank"] = get_moe_expert_parallel_rank()
                self.act_range[name]["moe_ep_world_size"] = get_moe_expert_parallel_world_size()
                if ".expert." in name:
                    self.act_range[name]["weight"] = m.weight
            # Lazy %-formatting avoids building the message when INFO is off.
            logger.info("rank:%s, add hook to %s, is_linear:%s, split_type:%s",
                        self.rank, name, is_linear, split_type)
            self.hooks.append(
                m.register_forward_hook(
                    functools.partial(self.stat_input_hook,
                                      name=name,
                                      act_range=self.act_range,
                                      is_linear=is_linear,
                                      is_save_input_id=is_save_input_id)))
|
||||
|
||||
|
||||
def vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range(self):
    """Return a copy of ``self.act_range`` with all tensors moved to CPU.

    Tensor values — and tensor elements inside the ``input_id`` lists —
    are transferred to the CPU; every other value is carried over as-is.
    """
    result = defaultdict(default_act_range_value)
    for layer_name, layer_range in self.act_range.items():
        for key, value in layer_range.items():
            if isinstance(value, torch.Tensor):
                result[layer_name][key] = value.to("cpu")
            elif key == "input_id" and isinstance(value, list):
                # Move tensor entries to CPU; keep non-tensor entries as-is.
                bucket = result[layer_name][key]
                for item in value:
                    bucket.append(
                        item.to("cpu") if isinstance(item, torch.Tensor) else item)
            else:
                result[layer_name][key] = value

    return result
|
||||
|
||||
|
||||
# Install the MLU worker overrides: replace init_device and attach the
# smooth-quant helper methods ("setup_smooth_hook", "get_act_range") to
# MLUWorker. String targets add new attributes rather than replacing ones.
for _target, _replacement in (
        (MLUWorker.init_device,
         vllm__worker__mlu_worker__MLUWorker__init_device),
        ("setup_smooth_hook",
         vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook),
        ("get_act_range",
         vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range),
):
    MluHijackObject.apply_hijack(MLUWorker, _target, _replacement)
|
||||
Reference in New Issue
Block a user