from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union

import torch

from vllm.config import ParallelConfig, TokenizerPoolConfig
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import cuda_device_count_stateless

from vllm_mlu.mlu_hijack_utils import MluHijackObject

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup

    from vllm.executor.executor_base import ExecutorBase

logger = init_logger(__name__)
|
|
|
|
|
|
def vllm__config__ParallelConfig___init__(
    self,
    pipeline_parallel_size: int,
    tensor_parallel_size: int,
    worker_use_ray: Optional[bool] = None,
    max_parallel_loading_workers: Optional[int] = None,
    disable_custom_all_reduce: bool = False,
    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
    ray_workers_use_nsight: bool = False,
    placement_group: Optional["PlacementGroup"] = None,
    distributed_executor_backend: Optional[Union[
        str, Type["ExecutorBase"]]] = None,
) -> None:
    """MLU replacement for ``vllm.config.ParallelConfig.__init__``.

    Mirrors the upstream constructor, with one deliberate change (see the
    "Modify by vllm_mlu" section below): ``world_size`` is computed as
    ``pipeline_parallel_size * tensor_parallel_size * context_parallel_size``
    instead of just ``pp * tp``.

    Args:
        pipeline_parallel_size: Number of pipeline-parallel stages.
        tensor_parallel_size: Number of tensor-parallel ranks.
        worker_use_ray: Deprecated toggle; when truthy, forces the Ray
            distributed executor backend (or errors if a conflicting
            backend was already chosen).
        max_parallel_loading_workers: Cap on concurrent model-loading
            workers; stored as-is.
        disable_custom_all_reduce: Disable the custom all-reduce kernel.
        tokenizer_pool_config: Optional tokenizer-pool configuration.
        ray_workers_use_nsight: Profile Ray workers with Nsight.
        placement_group: Pre-created Ray placement group, if any.
        distributed_executor_backend: "ray", "mp", or an ExecutorBase
            subclass; resolved automatically below when left as None.
    """
    self.pipeline_parallel_size = pipeline_parallel_size
    self.tensor_parallel_size = tensor_parallel_size
    self.distributed_executor_backend = distributed_executor_backend
    self.max_parallel_loading_workers = max_parallel_loading_workers
    self.disable_custom_all_reduce = disable_custom_all_reduce
    self.tokenizer_pool_config = tokenizer_pool_config
    self.ray_workers_use_nsight = ray_workers_use_nsight
    self.placement_group = placement_group

    '''
    ==========================
    Modify by vllm_mlu
    ==========================
    @brief: modify world_size
    '''
    # NOTE(review): these self-assignments read attributes that are NOT set
    # in this function -- presumably class-level defaults installed on the
    # hijacked ParallelConfig elsewhere in vllm_mlu. The assignment
    # materializes them as instance attributes. Confirm those class
    # attributes exist before this __init__ runs, otherwise this raises
    # AttributeError.
    self.context_parallel_size = self.context_parallel_size
    self.moe_tp_size = self.moe_tp_size
    self.moe_ep_size = self.moe_ep_size

    # The MLU change: world_size additionally scales with context parallel
    # size (upstream vLLM uses only pp * tp here).
    self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
    '''
    =======================
    End of MLU Hijack
    =======================
    '''
    # Legacy flag handling: worker_use_ray either selects the Ray backend
    # or must agree with an explicitly chosen backend.
    if worker_use_ray:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        elif not self.use_ray:
            # self.use_ray is a property defined on the patched
            # ParallelConfig (outside this file).
            raise ValueError(f"worker-use-ray can't be used with "
                             f"distributed executor backend "
                             f"'{self.distributed_executor_backend}'.")

    # TPU multi-device inference supports only the Ray backend.
    if current_platform.is_tpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "TPU backend only supports Ray for distributed inference.")

    # HPU multi-device inference likewise requires Ray.
    if current_platform.is_hpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "HPU backend only supports Ray for distributed inference.")

    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.

        from vllm.executor import ray_utils
        backend = "mp"
        ray_found = ray_utils.ray_is_available()
        if (current_platform.is_cuda()
                and cuda_device_count_stateless() < self.world_size):
            # Not enough local devices -> multi-node -> Ray is mandatory.
            if not ray_found:
                raise ValueError("Unable to load Ray which is "
                                 "required for multi-node inference, "
                                 "please install Ray with `pip install "
                                 "ray`.") from ray_utils.ray_import_err
            backend = "ray"
        elif ray_found:
            # Prefer Ray when we are already inside a placement group,
            # either one passed in explicitly or the ambient one of an
            # initialized Ray runtime.
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized
                if ray_is_initialized():
                    from ray.util import get_current_placement_group
                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.info("Defaulting to use %s for distributed inference",
                    backend)

    # Validate the assembled config (defined on ParallelConfig, outside
    # this file), then initialize the per-process rank.
    self._verify_args()
    self.rank: int = 0
|
|
|
|
|
|
# Install the MLU-specific __init__ over vllm.config.ParallelConfig at
# import time, so every ParallelConfig constructed afterwards uses the
# context-parallel-aware world_size computation above.
MluHijackObject.apply_hijack(ParallelConfig,
                             ParallelConfig.__init__,
                             vllm__config__ParallelConfig___init__)
|