from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union

import torch

from vllm.config import ParallelConfig, TokenizerPoolConfig
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import cuda_device_count_stateless

from vllm_mlu.mlu_hijack_utils import MluHijackObject

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup

    from vllm.executor.executor_base import ExecutorBase

logger = init_logger(__name__)
|
|
|
|
|
|
def vllm__config__ParallelConfig___init__(
    self,
    pipeline_parallel_size: int,
    tensor_parallel_size: int,
    worker_use_ray: Optional[bool] = None,
    max_parallel_loading_workers: Optional[int] = None,
    disable_custom_all_reduce: bool = False,
    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
    ray_workers_use_nsight: bool = False,
    placement_group: Optional["PlacementGroup"] = None,
    distributed_executor_backend: Optional[Union[
        str, Type["ExecutorBase"]]] = None,
) -> None:
    """MLU replacement for ``vllm.config.ParallelConfig.__init__``.

    Mirrors the upstream constructor, with one deliberate change (see the
    "Modify by vllm_mlu" section below): ``world_size`` is computed as
    ``pipeline_parallel_size * tensor_parallel_size * context_parallel_size``
    instead of just ``pp * tp``.

    Args:
        pipeline_parallel_size: Number of pipeline-parallel stages.
        tensor_parallel_size: Number of tensor-parallel ranks.
        worker_use_ray: Deprecated toggle; when truthy, forces the Ray
            distributed executor backend (or errors if a conflicting
            backend was already chosen).
        max_parallel_loading_workers: Cap on concurrent model-loading
            workers; stored as-is.
        disable_custom_all_reduce: Disable the custom all-reduce kernel.
        tokenizer_pool_config: Optional tokenizer-pool configuration.
        ray_workers_use_nsight: Profile Ray workers with Nsight.
        placement_group: Pre-created Ray placement group, if any.
        distributed_executor_backend: "ray", "mp", or an ExecutorBase
            subclass; resolved automatically below when left as None.
    """
    self.pipeline_parallel_size = pipeline_parallel_size
    self.tensor_parallel_size = tensor_parallel_size
    self.distributed_executor_backend = distributed_executor_backend
    self.max_parallel_loading_workers = max_parallel_loading_workers
    self.disable_custom_all_reduce = disable_custom_all_reduce
    self.tokenizer_pool_config = tokenizer_pool_config
    self.ray_workers_use_nsight = ray_workers_use_nsight
    self.placement_group = placement_group

    '''
    ==========================
    Modify by vllm_mlu
    ==========================
    @brief: modify world_size
    '''
    # NOTE(review): these self-assignments read attributes that are NOT set
    # in this function -- presumably class-level defaults installed on the
    # hijacked ParallelConfig elsewhere in vllm_mlu. The assignment
    # materializes them as instance attributes. Confirm those class
    # attributes exist before this __init__ runs, otherwise this raises
    # AttributeError.
    self.context_parallel_size = self.context_parallel_size
    self.moe_tp_size = self.moe_tp_size
    self.moe_ep_size = self.moe_ep_size

    # The MLU change: world_size additionally scales with context parallel
    # size (upstream vLLM uses only pp * tp here).
    self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
    '''
    =======================
    End of MLU Hijack
    =======================
    '''
    # Legacy flag handling: worker_use_ray either selects the Ray backend
    # or must agree with an explicitly chosen backend.
    if worker_use_ray:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        elif not self.use_ray:
            # self.use_ray is a property defined on the patched
            # ParallelConfig (outside this file).
            raise ValueError(f"worker-use-ray can't be used with "
                             f"distributed executor backend "
                             f"'{self.distributed_executor_backend}'.")

    # TPU multi-device inference supports only the Ray backend.
    if current_platform.is_tpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "TPU backend only supports Ray for distributed inference.")

    # HPU multi-device inference likewise requires Ray.
    if current_platform.is_hpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "HPU backend only supports Ray for distributed inference.")

    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.

        from vllm.executor import ray_utils
        backend = "mp"
        ray_found = ray_utils.ray_is_available()
        if (current_platform.is_cuda()
                and cuda_device_count_stateless() < self.world_size):
            # Not enough local devices -> multi-node -> Ray is mandatory.
            if not ray_found:
                raise ValueError("Unable to load Ray which is "
                                 "required for multi-node inference, "
                                 "please install Ray with `pip install "
                                 "ray`.") from ray_utils.ray_import_err
            backend = "ray"
        elif ray_found:
            # Prefer Ray when we are already inside a placement group,
            # either one passed in explicitly or the ambient one of an
            # initialized Ray runtime.
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized
                if ray_is_initialized():
                    from ray.util import get_current_placement_group
                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.info("Defaulting to use %s for distributed inference",
                    backend)

    # Validate the assembled config (defined on ParallelConfig, outside
    # this file), then initialize the per-process rank.
    self._verify_args()
    self.rank: int = 0
|
|
|
|
|
|
# Install the MLU-specific __init__ over vllm.config.ParallelConfig at
# import time, so every ParallelConfig constructed afterwards uses the
# context-parallel-aware world_size computation above.
MluHijackObject.apply_hijack(ParallelConfig,
                             ParallelConfig.__init__,
                             vllm__config__ParallelConfig___init__)
|