[Model] Support DeepSeek-V4

2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions
--- a/tools/ray_mlu/device_manager/init.py
+++ b/tools/ray_mlu/device_manager/init.py
@@ -0,0 +1,114 @@
+import logging
+import threading
+from typing import Optional
+
+import ray
+import ray._private.ray_constants as ray_constants
+from ray.air._internal.device_manager.cpu import CPUTorchDeviceManager
+from ray.air._internal.device_manager.hpu import HPUTorchDeviceManager
+from ray.air._internal.device_manager.npu import NPUTorchDeviceManager
+from ray.air._internal.device_manager.mlu import MLUTorchDeviceManager
+from ray.air._internal.device_manager.nvidia_gpu import CUDATorchDeviceManager
+from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_TORCH_DEVICE_MANAGER_CLS = CPUTorchDeviceManager  # fallback: no accelerator matched
+
+'''
+=============================
+Modify by vllm_mlu
+=============================
+@brief: use MLUTorchDeviceManager when key="GPU"
+'''
+SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER = {  # accelerator resource key -> manager class
+    ray_constants.GPU: MLUTorchDeviceManager,  # GPU key is served by the MLU manager
+    ray_constants.HPU: HPUTorchDeviceManager,
+    ray_constants.NPU: NPUTorchDeviceManager,
+}
+'''
+==================
+End of MLU Hijack
+==================
+'''
+
+
+def register_custom_torch_dist_backend(backend: Optional[str] = None) -> None:
+    """Run the HPU and NPU hooks for "hccl" (a name they share); else do nothing."""
+    if backend != "hccl":
+        return
+    for manager_cls in (HPUTorchDeviceManager, NPUTorchDeviceManager):
+        manager_cls.register_custom_torch_dist_backend()
+
+
+_torch_device_manager = None  # cached manager instance, created on first lookup
+_torch_device_manager_lock = threading.Lock()  # held while reading/creating the cache
+
+
+def get_torch_device_manager_by_context() -> TorchDeviceManager:
+    """Return the process-wide device manager, creating it on first use.
+
+    The manager class is chosen from the accelerator IDs of the current Ray
+    runtime context; ``DEFAULT_TORCH_DEVICE_MANAGER_CLS`` is used when no
+    supported accelerator type has IDs assigned. The instance is cached, so
+    later calls return the same object.
+
+    Raises:
+        RuntimeError: if more than one supported accelerator type has IDs.
+    """
+    global _torch_device_manager
+    with _torch_device_manager_lock:
+        if _torch_device_manager:
+            return _torch_device_manager
+        resources = ray.get_runtime_context().get_accelerator_ids()
+        # Manager classes of all supported accelerator types that have IDs.
+        matched = [
+            SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER[kind]
+            for kind, ids in resources.items()
+            if ids and SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER.get(kind)
+        ]
+        if len(matched) > 1:
+            raise RuntimeError(
+                "Unable to determine the appropriate DeviceManager "
+                f"for the specified resources {resources}."
+            )
+        manager_cls = matched[0] if matched else DEFAULT_TORCH_DEVICE_MANAGER_CLS
+        _torch_device_manager = manager_cls()
+        return _torch_device_manager
+
+
+def get_torch_device_manager_by_device_type(device_type: str):
+    """Return a new device manager for ``device_type`` (case-insensitive match).
+    ``ray_constants.GPU`` and "cuda" select the MLU manager; unknown types raise RuntimeError.
+    """
+    # =============================
+    # Modify by vllm_mlu
+    # =============================
+    # @brief: use MLUTorchDeviceManager when key="GPU"
+    kind = device_type.lower()
+    if kind in (ray_constants.GPU.lower(), "cuda"):
+        return MLUTorchDeviceManager()
+    if kind == ray_constants.NPU.lower():
+        return NPUTorchDeviceManager()
+    if kind == ray_constants.HPU.lower():
+        return HPUTorchDeviceManager()
+    if kind == "cpu":
+        return CPUTorchDeviceManager()
+    # ==================
+    # End of MLU Hijack
+    # ==================
+    raise RuntimeError(f"Device type {device_type} cannot be recognized.")
+
+
+__all__ = [  # entries must be strings, otherwise a star-import raises TypeError
+    "TorchDeviceManager",
+    "CPUTorchDeviceManager",
+    "CUDATorchDeviceManager",
+    "HPUTorchDeviceManager",
+    "NPUTorchDeviceManager",
+    "MLUTorchDeviceManager",
+    "register_custom_torch_dist_backend",
+    "get_torch_device_manager_by_context",
+    "get_torch_device_manager_by_device_type",
+]
--- a/tools/ray_mlu/device_manager/mlu.py
+++ b/tools/ray_mlu/device_manager/mlu.py
@@ -0,0 +1,103 @@
+import os
+from importlib.util import find_spec
+from typing import List, Union
+
+import torch
+
+import ray
+import ray._private.ray_constants as ray_constants
+from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager
+from ray._private.accelerators.mlu import MLU_VISIBLE_DEVICES_ENV_VAR
+
+
+def is_package_present(package_name: str) -> bool:
+    try:  # True when the import system can locate a spec for the package
+        return find_spec(package_name) is not None
+    except ModuleNotFoundError:  # a failed lookup is reported as "not present"
+        return False
+
+
+MLU_TORCH_PACKAGE_AVAILABLE = is_package_present("torch_mlu")  # probed once at import
+
+# Import torch_mlu eagerly when installed; the name itself is unused (hence noqa).
+if MLU_TORCH_PACKAGE_AVAILABLE:
+    import torch_mlu  # noqa: F401
+
+
+class MLUTorchDeviceManager(TorchDeviceManager):
+    """Cambricon MLU device manager built on the ``torch.mlu`` API."""
+
+    @staticmethod
+    def register_custom_torch_dist_backend():
+        """Import ``torch_mlu`` if it is installed; otherwise do nothing."""
+        if MLU_TORCH_PACKAGE_AVAILABLE:
+            import torch_mlu  # noqa: F401, F811
+
+    def is_available(self) -> bool:
+        """Return whether ``torch_mlu`` is installed and ``torch.mlu`` is available."""
+        return MLU_TORCH_PACKAGE_AVAILABLE and torch.mlu.is_available()
+
+    def get_devices(self) -> List[torch.device]:
+        """Return the torch MLU devices allocated to the current worker.
+
+        The accelerator IDs Ray reports under ``ray_constants.GPU`` are mapped
+        to their positions in ``MLU_VISIBLE_DEVICES``; those positions are the
+        torch device indices. With no IDs assigned (e.g. on the driver or
+        outside of Ray Train) the result is ``[mlu:0]``.
+
+        Raises:
+            RuntimeError: if torch MLU support is unavailable, or an assigned
+                ID is missing from ``MLU_VISIBLE_DEVICES``.
+        """
+        if not (MLU_TORCH_PACKAGE_AVAILABLE and torch.mlu.is_available()):
+            raise RuntimeError(
+                "Using MLUTorchDeviceManager but torch mlu is not available."
+            )
+
+        accelerator_ids = ray.get_runtime_context().get_accelerator_ids()
+        mlu_ids = [str(mlu_id) for mlu_id in accelerator_ids[ray_constants.GPU]]
+        if not mlu_ids:
+            # If called on the driver or outside of Ray Train, return the
+            # 0th device.
+            return [torch.device("mlu:0")]
+
+        mlu_visible_str = os.environ.get(MLU_VISIBLE_DEVICES_ENV_VAR, "")
+        if mlu_visible_str and mlu_visible_str != "NoDevFiles":
+            mlu_visible_list = mlu_visible_str.split(",")
+        else:
+            mlu_visible_list = []
+
+        devices = []
+        for mlu_id in mlu_ids:
+            try:
+                # list.index() reports a missing entry with ValueError.
+                device_index = mlu_visible_list.index(mlu_id)
+            except ValueError:
+                raise RuntimeError(
+                    "MLU_VISIBLE_DEVICES set incorrectly. "
+                    f"Got {mlu_visible_str}, expected to include {mlu_id}. "
+                    "Did you override the `MLU_VISIBLE_DEVICES` "
+                    "environment variable?"
+                ) from None
+            devices.append(torch.device(f"mlu:{device_index}"))
+        return devices
+
+    def set_device(self, device: Union[torch.device, int]):
+        """Select ``device`` through ``torch.mlu.set_device``."""
+        torch.mlu.set_device(device)
+
+    def supports_stream(self) -> bool:
+        """Return whether this device type supports creating streams."""
+        return True
+
+    def create_stream(self, device):
+        """Create a stream on MLU device"""
+        return torch.mlu.Stream(device)
+
+    def get_stream_context(self, stream):
+        """Get a torch.stream context on MLU device"""
+        return torch.mlu.stream(stream)
+
+    def get_current_stream(self):
+        """Get current stream for MLU device"""
+        return torch.mlu.current_stream()