import torch
from vllm.config import ParallelConfig, TokenizerPoolConfig
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless
from vllm.platforms import current_platform
from vllm_mlu.mlu_hijack_utils import MluHijackObject

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup
    from vllm.executor.executor_base import ExecutorBase

logger = init_logger(__name__)


def vllm__config__ParallelConfig___init__(
    self,
    pipeline_parallel_size: int,
    tensor_parallel_size: int,
    worker_use_ray: Optional[bool] = None,
    max_parallel_loading_workers: Optional[int] = None,
    disable_custom_all_reduce: bool = False,
    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
    ray_workers_use_nsight: bool = False,
    placement_group: Optional["PlacementGroup"] = None,
    distributed_executor_backend: Optional[Union[
        str, Type["ExecutorBase"]]] = None,
) -> None:
    """Hijacked replacement for ``vllm.config.ParallelConfig.__init__``.

    Mirrors the upstream vllm constructor, with one MLU-specific change:
    ``world_size`` additionally multiplies in ``context_parallel_size``
    (see the "Modify by vllm_mlu" section below).

    Installed over the original via ``MluHijackObject.apply_hijack`` at
    module import time (bottom of this file), so its signature must stay
    call-compatible with the upstream ``__init__``.
    """
    # Plain attribute copies of the constructor arguments (same as upstream).
    self.pipeline_parallel_size = pipeline_parallel_size
    self.tensor_parallel_size = tensor_parallel_size
    self.distributed_executor_backend = distributed_executor_backend
    self.max_parallel_loading_workers = max_parallel_loading_workers
    self.disable_custom_all_reduce = disable_custom_all_reduce
    self.tokenizer_pool_config = tokenizer_pool_config
    self.ray_workers_use_nsight = ray_workers_use_nsight
    self.placement_group = placement_group
    '''
    ==========================
    Modify by vllm_mlu
    ==========================
    @brief: modify world_size
    '''
    # NOTE(review): these self-assignments only work if
    # context_parallel_size / moe_tp_size / moe_ep_size already exist as
    # class-level attributes on ParallelConfig (presumably injected by
    # another vllm_mlu hijack) — they promote the class defaults to
    # instance attributes. TODO: confirm against the companion hijack.
    self.context_parallel_size = self.context_parallel_size
    self.moe_tp_size = self.moe_tp_size
    self.moe_ep_size = self.moe_ep_size
    # MLU change: world size also scales with context parallelism.
    self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
    '''
    =======================
    End of MLU Hijack
    =======================
    '''
    # Legacy flag: worker_use_ray forces the Ray backend; it conflicts
    # with any explicitly chosen non-Ray executor backend.
    if worker_use_ray:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        elif not self.use_ray:
            raise ValueError(f"worker-use-ray can't be used with "
                             f"distributed executor backend "
                             f"'{self.distributed_executor_backend}'.")
    # TPU multi-device inference supports only the Ray executor.
    if current_platform.is_tpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "TPU backend only supports Ray for distributed inference.")
    # HPU multi-device inference likewise supports only Ray.
    if current_platform.is_hpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "HPU backend only supports Ray for distributed inference.")
    # No backend chosen and more than one worker: pick a sensible default.
    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.
        from vllm.executor import ray_utils
        backend = "mp"
        ray_found = ray_utils.ray_is_available()
        if (current_platform.is_cuda()
                and cuda_device_count_stateless() < self.world_size):
            # Not enough local devices -> multi-node, which requires Ray.
            if not ray_found:
                raise ValueError("Unable to load Ray which is "
                                 "required for multi-node inference, "
                                 "please install Ray with `pip install "
                                 "ray`.") from ray_utils.ray_import_err
            backend = "ray"
        elif ray_found:
            # Prefer Ray when we were handed a placement group, or when a
            # live Ray session already has one attached.
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized
                if ray_is_initialized():
                    from ray.util import get_current_placement_group
                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.info("Defaulting to use %s for distributed inference",
                    backend)
    # Upstream validation of the assembled configuration.
    self._verify_args()
    # Process rank; assigned for real later by the distributed runtime.
    self.rank: int = 0


# Install the replacement __init__ over vllm's ParallelConfig.
MluHijackObject.apply_hijack(ParallelConfig, ParallelConfig.__init__,
                             vllm__config__ParallelConfig___init__)