# File: enginex-mlu370-vllm/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/config.py
# Snapshot: 2026-02-04 17:22:39 +08:00 — 111 lines, 4.4 KiB, Python.
import torch
from vllm.config import ParallelConfig, TokenizerPoolConfig
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless
from vllm.platforms import current_platform
from vllm_mlu.mlu_hijack_utils import MluHijackObject

# Imported only for type annotations: keeps Ray and the executor module
# out of the runtime import graph (and avoids a potential import cycle).
if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup
    from vllm.executor.executor_base import ExecutorBase

# Module-level logger, following vLLM's logging convention.
logger = init_logger(__name__)
def vllm__config__ParallelConfig___init__(
    self,
    pipeline_parallel_size: int,
    tensor_parallel_size: int,
    worker_use_ray: Optional[bool] = None,
    max_parallel_loading_workers: Optional[int] = None,
    disable_custom_all_reduce: bool = False,
    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
    ray_workers_use_nsight: bool = False,
    placement_group: Optional["PlacementGroup"] = None,
    distributed_executor_backend: Optional[Union[
        str, Type["ExecutorBase"]]] = None,
) -> None:
    """MLU replacement for ``vllm.config.ParallelConfig.__init__``.

    Mirrors the upstream vLLM v0.6.2 constructor with a single behavioral
    change: ``world_size`` additionally multiplies in
    ``context_parallel_size`` so that MLU context-parallel ranks are
    counted in the world size.

    The signature is byte-compatible with upstream; ``worker_use_ray`` is
    the deprecated flag that maps onto
    ``distributed_executor_backend="ray"``.

    Raises:
        ValueError: if ``worker_use_ray`` conflicts with an explicit
            non-Ray backend, if a TPU/HPU multi-device setup requests a
            non-Ray backend, or if multi-node inference is needed but Ray
            is not installed.
    """
    self.pipeline_parallel_size = pipeline_parallel_size
    self.tensor_parallel_size = tensor_parallel_size
    self.distributed_executor_backend = distributed_executor_backend
    self.max_parallel_loading_workers = max_parallel_loading_workers
    self.disable_custom_all_reduce = disable_custom_all_reduce
    self.tokenizer_pool_config = tokenizer_pool_config
    self.ray_workers_use_nsight = ray_workers_use_nsight
    self.placement_group = placement_group
    # ==========================
    # Modified by vllm_mlu
    # ==========================
    # @brief: modify world_size.
    # Promote the MLU-specific parallel sizes to instance attributes.
    # NOTE(review): these self-reads assume context_parallel_size,
    # moe_tp_size and moe_ep_size were injected earlier (presumably as
    # class attributes by another hijack/arg-parsing patch); otherwise an
    # AttributeError is raised here — confirm against the hijack bootstrap.
    self.context_parallel_size = self.context_parallel_size
    self.moe_tp_size = self.moe_tp_size
    self.moe_ep_size = self.moe_ep_size
    # The hijack's purpose: world_size now includes the context-parallel
    # dimension in addition to upstream's PP * TP product.
    self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
    # =======================
    # End of MLU Hijack
    # =======================
    # Deprecated worker_use_ray flag: translate it into the executor
    # backend setting, rejecting a conflicting explicit backend.
    if worker_use_ray:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        elif not self.use_ray:
            raise ValueError(f"worker-use-ray can't be used with "
                             f"distributed executor backend "
                             f"'{self.distributed_executor_backend}'.")
    # TPU and HPU multi-device inference only support the Ray backend.
    if current_platform.is_tpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "TPU backend only supports Ray for distributed inference.")
    if current_platform.is_hpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "HPU backend only supports Ray for distributed inference.")
    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.
        from vllm.executor import ray_utils
        backend = "mp"
        ray_found = ray_utils.ray_is_available()
        if (current_platform.is_cuda()
                and cuda_device_count_stateless() < self.world_size):
            # Not enough local devices: multi-node inference needs Ray.
            if not ray_found:
                raise ValueError("Unable to load Ray which is "
                                 "required for multi-node inference, "
                                 "please install Ray with `pip install "
                                 "ray`.") from ray_utils.ray_import_err
            backend = "ray"
        elif ray_found:
            # Prefer Ray when we are already inside a placement group,
            # either our own or the ambient one of an initialized Ray.
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized
                if ray_is_initialized():
                    from ray.util import get_current_placement_group
                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.info("Defaulting to use %s for distributed inference",
                    backend)
    self._verify_args()
    # Rank is assigned later by the executor; default to 0 here.
    self.rank: int = 0
# Install the hijack at import time: route ParallelConfig.__init__ to the
# MLU-aware replacement defined above (original kept for reference/restore
# by MluHijackObject).
MluHijackObject.apply_hijack(ParallelConfig,
                             ParallelConfig.__init__,
                             vllm__config__ParallelConfig___init__)