add qwen3
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
import torch
|
||||
from vllm.config import ParallelConfig, TokenizerPoolConfig
|
||||
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
from vllm.platforms import current_platform
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
if TYPE_CHECKING:
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def vllm__config__ParallelConfig___init__(
    self,
    pipeline_parallel_size: int,
    tensor_parallel_size: int,
    worker_use_ray: Optional[bool] = None,
    max_parallel_loading_workers: Optional[int] = None,
    disable_custom_all_reduce: bool = False,
    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
    ray_workers_use_nsight: bool = False,
    placement_group: Optional["PlacementGroup"] = None,
    distributed_executor_backend: Optional[Union[
        str, Type["ExecutorBase"]]] = None,
) -> None:
    """MLU-patched replacement for ``vllm.config.ParallelConfig.__init__``.

    Installed via ``MluHijackObject.apply_hijack`` at module import time.
    The only intended behavioral difference from upstream (per the inline
    "Modify by vllm_mlu" marker) is that ``world_size`` additionally
    multiplies in ``context_parallel_size``.

    Args:
        pipeline_parallel_size: Number of pipeline-parallel stages.
        tensor_parallel_size: Number of tensor-parallel ranks.
        worker_use_ray: Deprecated upstream flag; when truthy it forces the
            ``"ray"`` distributed executor backend (or raises if another
            backend was explicitly requested). Note it is NOT stored on
            ``self``.
        max_parallel_loading_workers: Cap on concurrent model-loading
            workers; stored as-is.
        disable_custom_all_reduce: Disable the custom all-reduce kernel;
            stored as-is.
        tokenizer_pool_config: Optional tokenizer-pool configuration;
            stored as-is.
        ray_workers_use_nsight: Whether Ray workers run under nsight
            profiling; stored as-is.
        placement_group: Optional pre-existing Ray placement group; its
            presence biases backend selection toward ``"ray"``.
        distributed_executor_backend: Explicit backend choice ("ray",
            "mp", or an ExecutorBase subclass); auto-selected below when
            left as None and world_size > 1.

    Raises:
        ValueError: if ``worker_use_ray`` conflicts with a non-Ray
            backend, if a TPU/HPU platform is used with a non-Ray backend,
            or if multi-node CUDA inference is needed but Ray is not
            installed.
    """
    self.pipeline_parallel_size = pipeline_parallel_size
    self.tensor_parallel_size = tensor_parallel_size
    self.distributed_executor_backend = distributed_executor_backend
    self.max_parallel_loading_workers = max_parallel_loading_workers
    self.disable_custom_all_reduce = disable_custom_all_reduce
    self.tokenizer_pool_config = tokenizer_pool_config
    self.ray_workers_use_nsight = ray_workers_use_nsight
    self.placement_group = placement_group
    '''
    ==========================
    Modify by vllm_mlu
    ==========================
    @brief: modify world_size
    '''
    # NOTE(review): these three assignments are no-ops whose only effect is
    # to raise AttributeError if the attributes are absent — presumably
    # ``context_parallel_size`` / ``moe_tp_size`` / ``moe_ep_size`` are set
    # elsewhere (class attributes or another hijack) before __init__ runs.
    # TODO confirm where they are defined.
    self.context_parallel_size = self.context_parallel_size
    self.moe_tp_size = self.moe_tp_size
    self.moe_ep_size = self.moe_ep_size

    # The MLU-specific change: world_size includes context parallelism
    # (upstream computes only PP * TP).
    self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
    '''
    =======================
    End of MLU Hijack
    =======================
    '''
    if worker_use_ray:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        # NOTE(review): ``self.use_ray`` is presumably a property of the
        # original ParallelConfig class (not set in this function) —
        # verify against the hijacked upstream class.
        elif not self.use_ray:
            raise ValueError(f"worker-use-ray can't be used with "
                             f"distributed executor backend "
                             f"'{self.distributed_executor_backend}'.")

    # TPU supports only the Ray executor for multi-rank inference.
    if current_platform.is_tpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "TPU backend only supports Ray for distributed inference.")

    # HPU likewise supports only the Ray executor for multi-rank inference.
    if current_platform.is_hpu() and self.world_size > 1:
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                "HPU backend only supports Ray for distributed inference.")

    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.

        from vllm.executor import ray_utils
        backend = "mp"
        ray_found = ray_utils.ray_is_available()
        # More ranks than local CUDA devices implies multi-node, which
        # requires Ray.
        if (current_platform.is_cuda()
                and cuda_device_count_stateless() < self.world_size):
            if not ray_found:
                raise ValueError("Unable to load Ray which is "
                                 "required for multi-node inference, "
                                 "please install Ray with `pip install "
                                 "ray`.") from ray_utils.ray_import_err
            backend = "ray"
        elif ray_found:
            # Prefer Ray when we were handed a placement group, or when
            # the current process is already inside one.
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized
                if ray_is_initialized():
                    from ray.util import get_current_placement_group
                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.info("Defaulting to use %s for distributed inference",
                    backend)

    # Delegates remaining validation to the (original) class's
    # ``_verify_args``; rank defaults to 0 until assigned by the executor.
    self._verify_args()
    self.rank: int = 0
|
||||
|
||||
|
||||
# Module-level side effect: register the hijack so that
# ParallelConfig.__init__ is replaced by the MLU-aware version defined
# above (which folds context_parallel_size into world_size).
MluHijackObject.apply_hijack(ParallelConfig,
                             ParallelConfig.__init__,
                             vllm__config__ParallelConfig___init__)
|
||||
Reference in New Issue
Block a user