[Model] Support DeepSeek-V4

This commit is contained in:
chenxb002
2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions

89
tools/ray_mlu/__init__.py Normal file
View File

@@ -0,0 +1,89 @@
from typing import Optional, Set
from ray._private.accelerators.accelerator import (
RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR,
AcceleratorManager,
)
from ray._private.accelerators.amd_gpu import AMDGPUAcceleratorManager
from ray._private.accelerators.hpu import HPUAcceleratorManager
from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager
from ray._private.accelerators.neuron import NeuronAcceleratorManager
from ray._private.accelerators.npu import NPUAcceleratorManager
from ray._private.accelerators.nvidia_gpu import NvidiaGPUAcceleratorManager
from ray._private.accelerators.rbln import RBLNAcceleratorManager
from ray._private.accelerators.tpu import TPUAcceleratorManager
from ray._private.accelerators.mlu import MLUAcceleratorManager
def get_all_accelerator_managers() -> Set[AcceleratorManager]:
    """Return every accelerator manager class supported by this Ray build."""
    managers = {
        NvidiaGPUAcceleratorManager,
        IntelGPUAcceleratorManager,
        AMDGPUAcceleratorManager,
        TPUAcceleratorManager,
        NeuronAcceleratorManager,
        HPUAcceleratorManager,
        NPUAcceleratorManager,
        RBLNAcceleratorManager,
        MLUAcceleratorManager,
    }
    return managers
def get_all_accelerator_resource_names() -> Set[str]:
    """Return the Ray resource name of every supported accelerator."""
    names = set()
    for manager in get_all_accelerator_managers():
        names.add(manager.get_resource_name())
    return names
def get_accelerator_manager_for_resource(
    resource_name: str,
) -> Optional[AcceleratorManager]:
    """Look up the accelerator manager owning the given resource name.

    E.g., TPUAcceleratorManager is returned if resource name is "TPU".
    The mapping is built lazily on the first call and memoized as a
    function attribute; "GPU" is special-cased because several managers
    share that resource name.
    """
    cache = getattr(
        get_accelerator_manager_for_resource,
        "_resource_name_to_accelerator_manager",
        None,
    )
    if cache is None:
        # Lazy initialization.
        cache = {
            manager.get_resource_name(): manager
            for manager in get_all_accelerator_managers()
        }
        # Several managers report the "GPU" resource name; probe the node
        # once to decide which one actually backs it (NVIDIA by default).
        if AMDGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
            cache["GPU"] = AMDGPUAcceleratorManager
        elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
            cache["GPU"] = IntelGPUAcceleratorManager
        elif MLUAcceleratorManager.get_current_node_num_accelerators() > 0:
            cache["GPU"] = MLUAcceleratorManager
        else:
            cache["GPU"] = NvidiaGPUAcceleratorManager
        get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
            cache
        )
    return cache.get(resource_name, None)
# Public API of this module: upstream Ray's accelerator managers plus the
# Cambricon MLUAcceleratorManager added by this patch set.
__all__ = [
    "NvidiaGPUAcceleratorManager",
    "IntelGPUAcceleratorManager",
    "AMDGPUAcceleratorManager",
    "TPUAcceleratorManager",
    "NeuronAcceleratorManager",
    "HPUAcceleratorManager",
    "NPUAcceleratorManager",
    "RBLNAcceleratorManager",
    "MLUAcceleratorManager",
    "get_all_accelerator_managers",
    "get_all_accelerator_resource_names",
    "get_accelerator_manager_for_resource",
    "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR",
]

View File

@@ -0,0 +1,114 @@
import logging
import threading
from typing import Optional
import ray
import ray._private.ray_constants as ray_constants
from ray.air._internal.device_manager.cpu import CPUTorchDeviceManager
from ray.air._internal.device_manager.hpu import HPUTorchDeviceManager
from ray.air._internal.device_manager.npu import NPUTorchDeviceManager
from ray.air._internal.device_manager.mlu import MLUTorchDeviceManager
from ray.air._internal.device_manager.nvidia_gpu import CUDATorchDeviceManager
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager
logger = logging.getLogger(__name__)

# Fallback manager used when no supported accelerator resource is assigned.
DEFAULT_TORCH_DEVICE_MANAGER_CLS = CPUTorchDeviceManager

# =============================
# Modified by vllm_mlu
# =============================
# MLU hijack: the "GPU" resource key maps to MLUTorchDeviceManager so that
# workers scheduled on "GPU" resources get MLU torch devices.
# (The banners were previously bare triple-quoted string statements; they are
# now real comments — behavior is unchanged since bare strings are no-ops.)
SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER = {
    ray_constants.GPU: MLUTorchDeviceManager,
    ray_constants.HPU: HPUTorchDeviceManager,
    ray_constants.NPU: NPUTorchDeviceManager,
}
# ==================
# End of MLU Hijack
# ==================
def register_custom_torch_dist_backend(backend: Optional[str] = None) -> None:
    """Register vendor torch.distributed backends for the given backend name."""
    if backend != "hccl":
        return
    # The name for the communication backend of Habana and torch-npu is the
    # same, so register both.
    HPUTorchDeviceManager.register_custom_torch_dist_backend()
    NPUTorchDeviceManager.register_custom_torch_dist_backend()
# Process-wide singleton device manager, created lazily by
# get_torch_device_manager_by_context() under the lock below.
_torch_device_manager = None
_torch_device_manager_lock = threading.Lock()
def get_torch_device_manager_by_context() -> TorchDeviceManager:
    """Return the process-wide TorchDeviceManager, creating it on first use.

    The manager class is selected from the accelerator resources assigned to
    this Ray worker; with none assigned, the CPU manager is used. Creation is
    guarded by a module-level lock, so concurrent first calls are safe.
    NOTE: because the "GPU" key is hijacked to MLUTorchDeviceManager in
    SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER above, GPU-resource workers
    receive MLU devices here.

    Raises:
        RuntimeError: if more than one supported accelerator type is assigned.
    """
    global _torch_device_manager
    with _torch_device_manager_lock:
        if not _torch_device_manager:
            existing_device_manager_cls = None
            resources = ray.get_runtime_context().get_accelerator_ids()
            # select correct accelerator type from resources
            for resource_type, resource_value in resources.items():
                device_manager_cls = SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER.get(
                    resource_type, None
                )
                # Only resource types with a non-empty id list and a known
                # manager participate in the selection.
                if resource_value and device_manager_cls:
                    # An error will raise when multiple accelerators are specified.
                    if existing_device_manager_cls:
                        raise RuntimeError(
                            "Unable to determine the appropriate DeviceManager "
                            f"for the specified resources {resources}."
                        )
                    else:
                        existing_device_manager_cls = device_manager_cls
            device_manager_cls = (
                existing_device_manager_cls or DEFAULT_TORCH_DEVICE_MANAGER_CLS
            )
            _torch_device_manager = device_manager_cls()
    return _torch_device_manager
def get_torch_device_manager_by_device_type(device_type: str) -> TorchDeviceManager:
    """Return a new device manager instance for the given device-type string.

    MLU hijack (vllm_mlu): the GPU resource name and the literal "cuda" both
    map to MLUTorchDeviceManager instead of the CUDA manager.
    (The hijack banners were previously bare string statements; now comments.)

    Args:
        device_type: e.g. "GPU", "cuda", "NPU", "HPU", or "cpu"
            (resource names are compared case-insensitively).

    Raises:
        RuntimeError: if the device type is not recognized.
    """
    # =============================
    # Modified by vllm_mlu: "GPU"/"cuda" is served by MLUTorchDeviceManager.
    # =============================
    normalized = device_type.lower()  # hoisted: was recomputed per branch
    if normalized == ray_constants.GPU.lower() or device_type == "cuda":
        return MLUTorchDeviceManager()
    if normalized == ray_constants.NPU.lower():
        return NPUTorchDeviceManager()
    if normalized == ray_constants.HPU.lower():
        return HPUTorchDeviceManager()
    if normalized == "cpu":
        return CPUTorchDeviceManager()
    # ==================
    # End of MLU Hijack
    # ==================
    raise RuntimeError(f"Device type {device_type} cannot be recognized.")
# Public API. NOTE: the entries were previously the objects themselves, which
# breaks `from ... import *` — the import system requires every item in
# __all__ to be a string and raises otherwise. Fixed to string names.
__all__ = [
    "TorchDeviceManager",
    "CPUTorchDeviceManager",
    "CUDATorchDeviceManager",
    "HPUTorchDeviceManager",
    "NPUTorchDeviceManager",
    "MLUTorchDeviceManager",
    "register_custom_torch_dist_backend",
    "get_torch_device_manager_by_context",
    "get_torch_device_manager_by_device_type",
]

View File

@@ -0,0 +1,103 @@
import os
from importlib.util import find_spec
from typing import List, Union
import torch
import ray
import ray._private.ray_constants as ray_constants
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager
from ray._private.accelerators.mlu import MLU_VISIBLE_DEVICES_ENV_VAR
def is_package_present(package_name: str) -> bool:
    """Report whether *package_name* is importable in this environment."""
    try:
        spec = find_spec(package_name)
    except ModuleNotFoundError:
        # find_spec raises (rather than returning None) when a parent
        # package in a dotted name is itself missing.
        return False
    return spec is not None
# True when the Cambricon torch_mlu extension is importable; gates every
# torch.mlu.* call in this module. Importing torch_mlu registers the "mlu"
# device with torch as a side effect — presumably why it is imported eagerly
# here; TODO confirm.
MLU_TORCH_PACKAGE_AVAILABLE = is_package_present("torch_mlu")
if MLU_TORCH_PACKAGE_AVAILABLE:
    import torch_mlu  # noqa: F401
class MLUTorchDeviceManager(TorchDeviceManager):
    """Cambricon MLU device manager."""

    @staticmethod
    def register_custom_torch_dist_backend():
        """Import torch_mlu so its torch.distributed backend is registered."""
        if MLU_TORCH_PACKAGE_AVAILABLE:
            import torch_mlu  # noqa: F401, F811

    def is_available(self) -> bool:
        """Return True when torch_mlu is installed and an MLU is usable."""
        if not MLU_TORCH_PACKAGE_AVAILABLE:
            return False
        return torch.mlu.is_available()

    def get_devices(self) -> List[torch.device]:
        """Gets the correct torch device list configured for this process.

        Returns a list of torch MLU devices allocated for the current worker.
        If no MLUs are assigned, then it returns a list with a single CPU device.

        Raises:
            RuntimeError: if torch mlu is unavailable, or if an assigned MLU id
                is not listed in MLU_VISIBLE_DEVICES.
        """
        if not (MLU_TORCH_PACKAGE_AVAILABLE and torch.mlu.is_available()):
            raise RuntimeError(
                "Using MLUTorchDeviceManager but torch mlu is not available."
            )
        # MLU ids live under the (hijacked) GPU resource key.
        mlu_ids = [
            str(accelerator_id)  # renamed: `id` shadowed the builtin
            for accelerator_id in ray.get_runtime_context().get_accelerator_ids()[
                ray_constants.GPU
            ]
        ]
        device_ids = []
        if len(mlu_ids) > 0:
            mlu_visible_str = os.environ.get(MLU_VISIBLE_DEVICES_ENV_VAR, "")
            if mlu_visible_str and mlu_visible_str != "NoDevFiles":
                mlu_visible_list = mlu_visible_str.split(",")
            else:
                mlu_visible_list = []
            for mlu_id in mlu_ids:
                try:
                    # Local device index = position within MLU_VISIBLE_DEVICES.
                    device_ids.append(mlu_visible_list.index(mlu_id))
                except ValueError:
                    # BUGFIX: list.index raises ValueError (not IndexError) for
                    # a missing element, so the previous `except IndexError`
                    # let the raw ValueError escape instead of this message.
                    raise RuntimeError(
                        "MLU_VISIBLE_DEVICES set incorrectly. "
                        f"Got {mlu_visible_str}, expected to include {mlu_id}. "
                        "Did you override the `MLU_VISIBLE_DEVICES` "
                        "environment variable?"
                    )
        else:
            # If called on the driver or outside of Ray Train, return the
            # 0th device.
            device_ids.append(0)
        return [torch.device(f"mlu:{device_id}") for device_id in device_ids]

    def set_device(self, device: Union[torch.device, int]):
        """Make *device* the current MLU device for this process."""
        torch.mlu.set_device(device)

    def supports_stream(self) -> bool:
        """Validate if the device type support to create a stream"""
        return True

    def create_stream(self, device):
        """Create a stream on MLU device"""
        return torch.mlu.Stream(device)

    def get_stream_context(self, stream):
        """Get a torch.stream context on MLU device"""
        return torch.mlu.stream(stream)

    def get_current_stream(self):
        """Get current stream for MLU device"""
        return torch.mlu.current_stream()

243
tools/ray_mlu/diff.patch Normal file
View File

@@ -0,0 +1,243 @@
commit 7376225d16e381ecae5cc07d84db9eed043ed06a
Author: tanhaojue <tanhaojue@cambricon.com>
Date: Thu Mar 7 15:54:09 2024 +0800
support mlu
diff --git a/python/ray/_private/accelerators/__init__.py b/python/ray/_private/accelerators/__init__.py
index 71550bc..07bdcd6 100644
--- a/python/ray/_private/accelerators/__init__.py
+++ b/python/ray/_private/accelerators/__init__.py
@@ -8,6 +8,7 @@ from ray._private.accelerators.tpu import TPUAcceleratorManager
from ray._private.accelerators.neuron import NeuronAcceleratorManager
from ray._private.accelerators.hpu import HPUAcceleratorManager
from ray._private.accelerators.npu import NPUAcceleratorManager
+from ray._private.accelerators.mlu import MLUAcceleratorManager
def get_all_accelerator_managers() -> Set[AcceleratorManager]:
@@ -20,6 +21,7 @@ def get_all_accelerator_managers() -> Set[AcceleratorManager]:
NeuronAcceleratorManager,
HPUAcceleratorManager,
NPUAcceleratorManager,
+ MLUAcceleratorManager,
}
@@ -55,6 +57,8 @@ def get_accelerator_manager_for_resource(
resource_name_to_accelerator_manager["GPU"] = AMDGPUAcceleratorManager
elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
resource_name_to_accelerator_manager["GPU"] = IntelGPUAcceleratorManager
+ elif MLUAcceleratorManager.get_current_node_num_accelerators() > 0:
+ resource_name_to_accelerator_manager["GPU"] = MLUAcceleratorManager
else:
resource_name_to_accelerator_manager["GPU"] = NvidiaGPUAcceleratorManager
get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
@@ -71,6 +75,7 @@ __all__ = [
"NeuronAcceleratorManager",
"HPUAcceleratorManager",
"NPUAcceleratorManager",
+ "MLUAcceleratorManager",
"get_all_accelerator_managers",
"get_all_accelerator_resource_names",
"get_accelerator_manager_for_resource",
diff --git a/python/ray/_private/accelerators/mlu.py b/python/ray/_private/accelerators/mlu.py
new file mode 100755
index 0000000..21a5771
--- /dev/null
+++ b/python/ray/_private/accelerators/mlu.py
@@ -0,0 +1,92 @@
+import os
+import glob
+import logging
+from typing import Optional, List, Tuple
+import torch
+import torch_mlu
+from ray._private.accelerators.accelerator import AcceleratorManager
+
+logger = logging.getLogger(__name__)
+
+MLU_VISIBLE_DEVICES_ENV_VAR = "MLU_VISIBLE_DEVICES"
+NOSET_MLU_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES"
+
+
+class MLUAcceleratorManager(AcceleratorManager):
+ """Cambricon MLU accelerators."""
+
+ @staticmethod
+ def get_resource_name() -> str:
+ return "GPU"
+
+ @staticmethod
+ def get_visible_accelerator_ids_env_var() -> str:
+ return MLU_VISIBLE_DEVICES_ENV_VAR
+
+ @staticmethod
+ def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
+ mlu_visible_devices = os.environ.get(
+ MLUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
+ )
+
+ if mlu_visible_devices is None:
+ return None
+
+ if mlu_visible_devices == "":
+ return []
+
+ if mlu_visible_devices == "NoDevFiles":
+ return []
+
+ return list(mlu_visible_devices.split(","))
+
+ @staticmethod
+ def get_current_node_num_accelerators() -> int:
+ """Attempt to detect the number of MLUs on this machine.
+
+ MLU chips are represented as devices within `/dev/`, either as `/dev/davinci?`.
+
+ Returns:
+ The number of MLUs if any were detected, otherwise 0.
+ """
+ try:
+ return torch.mlu.device_count()
+ except Exception as e:
+ logger.debug("Could not import CambriconCL: %s", e)
+
+ try:
+ mlu_files = glob.glob("/dev/cambricon_dev?")
+ return len(mlu_files)
+ except Exception as e:
+ logger.debug("Failed to detect number of MLUs: %s", e)
+ return 0
+
+ @staticmethod
+ def get_current_node_accelerator_type() -> Optional[str]:
+ """Get the type of the Cambricon MLU on the current node.
+
+ Returns:
+ A string of the type, such as "MLU370".
+ """
+ try:
+ return torch.mlu.get_device_name(0)
+ except Exception:
+ logger.exception("Failed to detect MLU type.")
+ return None
+
+ @staticmethod
+ def validate_resource_request_quantity(
+ quantity: float,
+ ) -> Tuple[bool, Optional[str]]:
+ return (True, None)
+
+ @staticmethod
+ def set_current_process_visible_accelerator_ids(
+ visible_mlu_devices: List[str],
+ ) -> None:
+ if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
+ return
+
+ os.environ[
+ MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
+ ] = ",".join([str(i) for i in visible_mlu_devices])
diff --git a/python/ray/tests/accelerators/test_mlu.py b/python/ray/tests/accelerators/test_mlu.py
new file mode 100755
index 0000000..70e81f7
--- /dev/null
+++ b/python/ray/tests/accelerators/test_mlu.py
@@ -0,0 +1,92 @@
+import os
+import sys
+import pytest
+from unittest.mock import patch
+
+import ray
+from ray._private.accelerators import MLUAcceleratorManager as Accelerator
+
+
+@patch("glob.glob")
+@patch("os.listdir")
+def test_autodetect_num_mlus(mock_list, mock_glob):
+ mock_glob.return_value = [f"/dev/davinci{i}" for i in range(4)]
+ # mock_list.return_value = []
+ assert Accelerator.get_current_node_num_accelerators() == 4
+
+
+@patch("glob.glob")
+@patch("os.listdir")
+def test_autodetect_num_mlus_without_devices(mock_list, mock_glob):
+ mock_glob.side_effect = Exception
+ # mock_list.return_value = []
+ assert Accelerator.get_current_node_num_accelerators() == 0
+
+
+def test_mlu_accelerator_manager_api():
+ assert Accelerator.get_resource_name() == "MLU"
+ assert Accelerator.get_visible_accelerator_ids_env_var() == "MLU_VISIBLE_DEVICES"
+ assert Accelerator.validate_resource_request_quantity(0.5) == (True, None)
+ assert Accelerator.validate_resource_request_quantity(1) == (True, None)
+
+
+def test_visible_mlu_type(monkeypatch, shutdown_only):
+ with patch.object(
+ Accelerator, "get_current_node_num_accelerators", return_value=4
+ ), patch.object(
+ Accelerator, "get_current_node_accelerator_type", return_value="MLU370"
+ ):
+ monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+ manager = ray._private.accelerators.get_accelerator_manager_for_resource("MLU")
+ assert manager.get_current_node_accelerator_type() == "MLU370"
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
+def test_visible_mlu_ids(monkeypatch, shutdown_only):
+ monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+ with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
+
+ ray.init()
+ manager = ray._private.accelerators.get_accelerator_manager_for_resource("MLU")
+ assert manager.get_current_node_num_accelerators() == 4
+ assert manager.__name__ == "MLUAcceleratorManager"
+ assert ray.available_resources()["MLU"] == 3
+
+def test_get_current_process_visible_accelerator_ids(monkeypatch, shutdown_only):
+ monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+ assert Accelerator.get_current_process_visible_accelerator_ids() == ["0", "1", "2"]
+
+ monkeypatch.delenv("MLU_VISIBLE_DEVICES")
+ assert Accelerator.get_current_process_visible_accelerator_ids() is None
+
+ monkeypatch.setenv("MLU_VISIBLE_DEVICES", "")
+ assert Accelerator.get_current_process_visible_accelerator_ids() == []
+
+ monkeypatch.setenv("MLU_VISIBLE_DEVICES", "NoDevFiles")
+ assert Accelerator.get_current_process_visible_accelerator_ids() == []
+
+
+def test_set_current_process_visible_accelerator_ids(shutdown_only):
+ Accelerator.set_current_process_visible_accelerator_ids(["0"])
+ assert os.environ["MLU_VISIBLE_DEVICES"] == "0"
+
+ Accelerator.set_current_process_visible_accelerator_ids(["0", "1"])
+ assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1"
+
+ Accelerator.set_current_process_visible_accelerator_ids(["0", "1", "2"])
+ assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1,2"
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
+def test_auto_detected_more_than_visible(monkeypatch, shutdown_only):
+ with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
+ # If more MLUs are detected than visible.
+ monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+
+ ray.init()
+ assert ray.available_resources()["MLU"] == 3
+
+if __name__ == "__main__":
+ if os.environ.get("PARALLEL_CI"):
+ sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
+ else:
+ sys.exit(pytest.main(["-sv", __file__]))
diff --git a/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl b/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl
new file mode 100644
index 0000000..8628a88
Binary files /dev/null and b/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl differ

View File

@@ -0,0 +1,11 @@
diff --git a/ray_mlu/mlu.py b/ray_mlu/mlu.py
index 21a57719..2c63fd5b 100755
--- a/ray_mlu/mlu.py
+++ b/ray_mlu/mlu.py
@@ -87,6 +87,3 @@ class MLUAcceleratorManager(AcceleratorManager):
if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
return
- os.environ[
- MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
- ] = ",".join([str(i) for i in visible_mlu_devices])

94
tools/ray_mlu/mlu.py Executable file
View File

@@ -0,0 +1,94 @@
import os
import glob
import logging
from typing import Optional, List, Tuple
import torch
import torch_mlu
from ray._private.accelerators.accelerator import AcceleratorManager
logger = logging.getLogger(__name__)

# Env var listing the MLU ids visible to this process (the MLU analogue of
# CUDA_VISIBLE_DEVICES).
MLU_VISIBLE_DEVICES_ENV_VAR = "MLU_VISIBLE_DEVICES"
# When set (to any non-empty value), Ray will not rewrite MLU_VISIBLE_DEVICES
# for worker processes.
NOSET_MLU_VISIBLE_DEVICES_ENV_VAR = (
    "RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES"
)
class MLUAcceleratorManager(AcceleratorManager):
    """Cambricon MLU accelerators.

    NOTE: get_resource_name() deliberately reports "GPU" so that Ray
    schedules MLUs through the standard GPU resource (the vllm_mlu hijack).
    """

    @staticmethod
    def get_resource_name() -> str:
        # Hijack: expose MLUs under the "GPU" resource name.
        return "GPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        return MLU_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return the visible MLU ids, [] if none, or None if unrestricted."""
        mlu_visible_devices = os.environ.get(
            MLUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )
        if mlu_visible_devices is None:
            return None
        # "" and "NoDevFiles" both mean "no devices visible".
        if mlu_visible_devices in ("", "NoDevFiles"):
            return []
        return list(mlu_visible_devices.split(","))

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of MLUs on this machine.

        MLU chips are represented as devices within `/dev/`, as
        `/dev/cambricon_dev<N>`. (The docstring previously said
        `/dev/davinci?`, which is the Ascend NPU naming, not MLU.)

        Returns:
            The number of MLUs if any were detected, otherwise 0.
        """
        try:
            return torch.mlu.device_count()
        except Exception as e:
            logger.debug("Could not import CambriconCL: %s", e)

        try:
            # `[0-9]*` (rather than the previous `?`) also matches nodes with
            # ten or more devices, e.g. /dev/cambricon_dev12.
            mlu_files = glob.glob("/dev/cambricon_dev[0-9]*")
            return len(mlu_files)
        except Exception as e:
            logger.debug("Failed to detect number of MLUs: %s", e)
            return 0

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Get the type of the Cambricon MLU on the current node.

        Returns:
            A string of the type, such as "MLU370".
        """
        try:
            return torch.mlu.get_device_name(0)
        except Exception:
            logger.exception("Failed to detect MLU type.")
            return None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        # Any (fractional) quantity is accepted.
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_mlu_devices: List[str],
    ) -> None:
        """Write *visible_mlu_devices* to MLU_VISIBLE_DEVICES unless opted out."""
        if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
            return
        os.environ[
            MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join(str(i) for i in visible_mlu_devices)

1890
tools/ray_mlu/node.py Normal file

File diff suppressed because it is too large Load Diff

142
tools/ray_mlu/nsight.py Normal file
View File

@@ -0,0 +1,142 @@
import asyncio
import copy
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from ray._common.utils import (
try_to_create_directory,
)
from ray._private.runtime_env.context import RuntimeEnvContext
from ray._private.runtime_env.plugin import RuntimeEnvPlugin
from ray.exceptions import RuntimeEnvSetupError
default_logger = logging.getLogger(__name__)

# Nsight options used when runtime_env={"_nsight": "default"}
# use default cnperf config, no need to specify any options
# (empty dict -> plain `cnperf-cli record` with no extra flags)
NSIGHT_DEFAULT_CONFIG = {}
def parse_nsight_config(nsight_config: Dict[str, str]) -> List[str]:
    """
    Function to convert dictionary of nsight options into
    nsight command line

    The function returns:
    - List[str]: nsys profile cmd line split into list of str
    """
    cmd = ["cnperf-cli", "record"]
    for name, value in nsight_config.items():
        # option standard based on
        # https://www.gnu.org/software/libc/manual/html_node/Argument-Syntax.html
        if len(name) > 1:
            cmd.append(f"--{name}={value}")
        else:
            cmd.extend([f"-{name}", value])
    return cmd
class NsightPlugin(RuntimeEnvPlugin):
    """Runtime-env plugin wrapping workers with the Cambricon CNPerf profiler.

    Despite the inherited "_nsight" plugin name, this variant launches
    ``cnperf-cli record`` (the MLU profiler), not NVIDIA Nsight.
    """

    name = "_nsight"

    def __init__(self, resources_dir: str):
        # Command prefix (e.g. ["cnperf-cli", "record", ...]) prepended to the
        # worker's python executable in modify_context().
        self.nsight_cmd = []

        # replace this with better way to get logs dir
        session_dir, runtime_dir = os.path.split(resources_dir)
        self._nsight_dir = Path(session_dir) / "logs" / "nsight"
        try_to_create_directory(self._nsight_dir)

    async def _check_nsight_script(
        self, nsight_config: Dict[str, str]
    ) -> Tuple[bool, str]:
        """
        Function to validate if nsight_config is a valid nsight profile options

        Args:
            nsight_config: dictionary mapping nsight option to it's value

        Returns:
            a tuple consists of a boolean indicating if the nsight_config
            is valid option and an error message if the nsight_config is invalid
        """
        # use empty as nsight report test filename
        nsight_config_copy = copy.deepcopy(nsight_config)
        try_to_create_directory(Path(self._nsight_dir) / "empty")
        nsight_config_copy["o"] = str(Path(self._nsight_dir) / "empty/test")
        nsight_cmd = parse_nsight_config(nsight_config_copy)
        try:
            # Run the profiler against a no-op python program as a smoke test.
            nsight_cmd = nsight_cmd + ["python", "-c", '""']
            process = await asyncio.create_subprocess_exec(
                *nsight_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            stdout, stderr = await process.communicate()
            # BUGFIX: communicate() yields bytes; the old code compared
            # `stderr.strip() != ""` (bytes vs str, always True), so an empty
            # stderr never fell back to stdout and error_msg was raw bytes.
            stdout_text = stdout.decode(errors="replace").strip()
            stderr_text = stderr.decode(errors="replace").strip()
            error_msg = stderr_text if stderr_text != "" else stdout_text

            # cleanup test.cnperf-rep file
            clean_up_cmd = ["rm", f"{nsight_config_copy['o']}.cnperf-rep"]
            cleanup_process = await asyncio.create_subprocess_exec(
                *clean_up_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            _, _ = await cleanup_process.communicate()

            if process.returncode == 0:
                return True, None
            else:
                return False, error_msg
        except FileNotFoundError:
            return False, ("cnperf-cli is not installed")

    async def create(
        self,
        uri: Optional[str],
        runtime_env: "RuntimeEnv",  # noqa: F821
        context: RuntimeEnvContext,
        logger: logging.Logger = default_logger,
    ) -> int:
        """Validate the nsight runtime-env config and build the profiler cmd.

        Raises:
            RuntimeEnvSetupError: on non-Linux platforms, unsupported config
                values, or when the cnperf-cli smoke test fails.
        """
        nsight_config = runtime_env.nsight()
        if not nsight_config:
            return 0

        if nsight_config and sys.platform != "linux":
            # BUGFIX: the old message linked to NVIDIA Nsight Compute docs,
            # which do not apply to the CNPerf profiler used here.
            raise RuntimeEnvSetupError("CNPerf CLI is only available in Linux.")

        if isinstance(nsight_config, str):
            if nsight_config == "default":
                nsight_config = NSIGHT_DEFAULT_CONFIG
            else:
                raise RuntimeEnvSetupError(
                    f"Unsupported nsight config: {nsight_config}. "
                    "The supported config is 'default' or "
                    "Dictionary of cnperf options"
                )

        is_valid_nsight_cmd, error_msg = await self._check_nsight_script(nsight_config)
        if not is_valid_nsight_cmd:
            logger.warning(error_msg)
            raise RuntimeEnvSetupError(
                "cnperf-cli failed to run with the following "
                f"error message:\n {error_msg}"
            )
        self.nsight_cmd = parse_nsight_config(nsight_config)
        return 0

    def modify_context(
        self,
        uris: List[str],
        runtime_env: "RuntimeEnv",  # noqa: F821
        context: RuntimeEnvContext,
        logger: Optional[logging.Logger] = default_logger,
    ):
        """Prefix the worker's python executable with the profiler command."""
        context.py_executable = " ".join(self.nsight_cmd) + " python"
        logger.info("Running CNPerf cmd: %s", context.py_executable)

92
tools/ray_mlu/test_mlu.py Executable file
View File

@@ -0,0 +1,92 @@
import os
import sys
import pytest
from unittest.mock import patch
import ray
from ray._private.accelerators import MLUAcceleratorManager as Accelerator
@patch("glob.glob")
@patch("os.listdir")
def test_autodetect_num_mlus(mock_list, mock_glob):
    # NOTE(review): the mocked paths use the Ascend-style "/dev/davinci{i}"
    # names while mlu.py globs "/dev/cambricon_dev?"; the count of 4 only
    # holds because glob.glob itself is mocked. The implementation also tries
    # torch.mlu.device_count() before globbing — presumably this test relies
    # on that call raising in the test environment; TODO confirm.
    mock_glob.return_value = [f"/dev/davinci{i}" for i in range(4)]
    # mock_list.return_value = []
    assert Accelerator.get_current_node_num_accelerators() == 4


@patch("glob.glob")
@patch("os.listdir")
def test_autodetect_num_mlus_without_devices(mock_list, mock_glob):
    # Detection must degrade to 0 when globbing /dev fails entirely.
    mock_glob.side_effect = Exception
    # mock_list.return_value = []
    assert Accelerator.get_current_node_num_accelerators() == 0
def test_mlu_accelerator_manager_api():
    # NOTE(review): tools/ray_mlu/mlu.py's MLUAcceleratorManager returns "GPU"
    # from get_resource_name() (the vllm_mlu hijack); this assertion expects
    # the pre-hijack "MLU" name — verify which manager variant is under test.
    assert Accelerator.get_resource_name() == "MLU"
    assert Accelerator.get_visible_accelerator_ids_env_var() == "MLU_VISIBLE_DEVICES"
    assert Accelerator.validate_resource_request_quantity(0.5) == (True, None)
    assert Accelerator.validate_resource_request_quantity(1) == (True, None)


def test_visible_mlu_type(monkeypatch, shutdown_only):
    # The node accelerator type is reported straight from the manager.
    with patch.object(
        Accelerator, "get_current_node_num_accelerators", return_value=4
    ), patch.object(
        Accelerator, "get_current_node_accelerator_type", return_value="MLU370"
    ):
        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("MLU")
        assert manager.get_current_node_accelerator_type() == "MLU370"
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
def test_visible_mlu_ids(monkeypatch, shutdown_only):
    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):

        ray.init()
        # Visible devices (3) cap the schedulable resource below the detected
        # count (4).
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("MLU")
        assert manager.get_current_node_num_accelerators() == 4
        assert manager.__name__ == "MLUAcceleratorManager"
        assert ray.available_resources()["MLU"] == 3


def test_get_current_process_visible_accelerator_ids(monkeypatch, shutdown_only):
    # Set env var -> parsed id list; unset -> None; "" / "NoDevFiles" -> [].
    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
    assert Accelerator.get_current_process_visible_accelerator_ids() == ["0", "1", "2"]

    monkeypatch.delenv("MLU_VISIBLE_DEVICES")
    assert Accelerator.get_current_process_visible_accelerator_ids() is None

    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "")
    assert Accelerator.get_current_process_visible_accelerator_ids() == []

    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "NoDevFiles")
    assert Accelerator.get_current_process_visible_accelerator_ids() == []
def test_set_current_process_visible_accelerator_ids(shutdown_only):
    # Writing ids joins them with commas into MLU_VISIBLE_DEVICES.
    Accelerator.set_current_process_visible_accelerator_ids(["0"])
    assert os.environ["MLU_VISIBLE_DEVICES"] == "0"

    Accelerator.set_current_process_visible_accelerator_ids(["0", "1"])
    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1"

    Accelerator.set_current_process_visible_accelerator_ids(["0", "1", "2"])
    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1,2"


@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
def test_auto_detected_more_than_visible(monkeypatch, shutdown_only):
    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
        # If more MLUs are detected than visible.
        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")

        ray.init()
        assert ray.available_resources()["MLU"] == 3


if __name__ == "__main__":
    # PARALLEL_CI runs the file under pytest-xdist; otherwise run serially.
    if os.environ.get("PARALLEL_CI"):
        sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
    else:
        sys.exit(pytest.main(["-sv", __file__]))

3785
tools/ray_mlu/worker.py Normal file

File diff suppressed because it is too large Load Diff