[Model] Support DeepSeek-V4
89
tools/ray_mlu/__init__.py
Normal file
@@ -0,0 +1,89 @@
from typing import Optional, Set

from ray._private.accelerators.accelerator import (
    RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR,
    AcceleratorManager,
)
from ray._private.accelerators.amd_gpu import AMDGPUAcceleratorManager
from ray._private.accelerators.hpu import HPUAcceleratorManager
from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager
from ray._private.accelerators.neuron import NeuronAcceleratorManager
from ray._private.accelerators.npu import NPUAcceleratorManager
from ray._private.accelerators.nvidia_gpu import NvidiaGPUAcceleratorManager
from ray._private.accelerators.rbln import RBLNAcceleratorManager
from ray._private.accelerators.tpu import TPUAcceleratorManager
from ray._private.accelerators.mlu import MLUAcceleratorManager


def get_all_accelerator_managers() -> Set[AcceleratorManager]:
    """Get all accelerator managers supported by Ray."""
    return {
        NvidiaGPUAcceleratorManager,
        IntelGPUAcceleratorManager,
        AMDGPUAcceleratorManager,
        TPUAcceleratorManager,
        NeuronAcceleratorManager,
        HPUAcceleratorManager,
        NPUAcceleratorManager,
        RBLNAcceleratorManager,
        MLUAcceleratorManager,
    }


def get_all_accelerator_resource_names() -> Set[str]:
    """Get all resource names for accelerators."""
    return {
        accelerator_manager.get_resource_name()
        for accelerator_manager in get_all_accelerator_managers()
    }


def get_accelerator_manager_for_resource(
    resource_name: str,
) -> Optional[AcceleratorManager]:
    """Get the corresponding accelerator manager for the given
    accelerator resource name.

    E.g., TPUAcceleratorManager is returned if the resource name is "TPU".
    """
    try:
        return get_accelerator_manager_for_resource._resource_name_to_accelerator_manager.get(  # noqa: E501
            resource_name, None
        )
    except AttributeError:
        # Lazy initialization.
        resource_name_to_accelerator_manager = {
            accelerator_manager.get_resource_name(): accelerator_manager
            for accelerator_manager in get_all_accelerator_managers()
        }
        # Special handling for the GPU resource name since multiple accelerator
        # managers share the same GPU resource name.
        if AMDGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
            resource_name_to_accelerator_manager["GPU"] = AMDGPUAcceleratorManager
        elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
            resource_name_to_accelerator_manager["GPU"] = IntelGPUAcceleratorManager
        elif MLUAcceleratorManager.get_current_node_num_accelerators() > 0:
            resource_name_to_accelerator_manager["GPU"] = MLUAcceleratorManager
        else:
            resource_name_to_accelerator_manager["GPU"] = NvidiaGPUAcceleratorManager
        get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
            resource_name_to_accelerator_manager
        )
        return resource_name_to_accelerator_manager.get(resource_name, None)


__all__ = [
    "NvidiaGPUAcceleratorManager",
    "IntelGPUAcceleratorManager",
    "AMDGPUAcceleratorManager",
    "TPUAcceleratorManager",
    "NeuronAcceleratorManager",
    "HPUAcceleratorManager",
    "NPUAcceleratorManager",
    "RBLNAcceleratorManager",
    "MLUAcceleratorManager",
    "get_all_accelerator_managers",
    "get_all_accelerator_resource_names",
    "get_accelerator_manager_for_resource",
    "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR",
]
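For reference, a minimal usage sketch of the resolution flow above (not part of the commit; it assumes the patched Ray tree is installed and that the node actually exposes MLUs — on a machine without MLUs, "GPU" falls through to NvidiaGPUAcceleratorManager instead):

# Usage sketch for the module above. "GPU" is ambiguous (NVIDIA/AMD/Intel/MLU
# all claim it), so the first call probes get_current_node_num_accelerators()
# on each candidate and caches the winner on the function object itself.
from ray._private.accelerators import (
    get_accelerator_manager_for_resource,
    get_all_accelerator_resource_names,
)

manager = get_accelerator_manager_for_resource("GPU")
print(manager.__name__)  # "MLUAcceleratorManager" on an MLU node
print(get_all_accelerator_resource_names())  # e.g. {"GPU", "TPU", ...}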
114
tools/ray_mlu/device_manager/__init__.py
Normal file
@@ -0,0 +1,114 @@
import logging
import threading
from typing import Optional

import ray
import ray._private.ray_constants as ray_constants
from ray.air._internal.device_manager.cpu import CPUTorchDeviceManager
from ray.air._internal.device_manager.hpu import HPUTorchDeviceManager
from ray.air._internal.device_manager.npu import NPUTorchDeviceManager
from ray.air._internal.device_manager.mlu import MLUTorchDeviceManager
from ray.air._internal.device_manager.nvidia_gpu import CUDATorchDeviceManager
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager

logger = logging.getLogger(__name__)


DEFAULT_TORCH_DEVICE_MANAGER_CLS = CPUTorchDeviceManager

'''
=============================
Modified by vllm_mlu
=============================
@brief: use MLUTorchDeviceManager when key="GPU"
'''
SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER = {
    ray_constants.GPU: MLUTorchDeviceManager,
    ray_constants.HPU: HPUTorchDeviceManager,
    ray_constants.NPU: NPUTorchDeviceManager,
}
'''
==================
End of MLU Hijack
==================
'''


def register_custom_torch_dist_backend(backend: Optional[str] = None) -> None:
    if backend == "hccl":
        # The communication backend name is the same for Habana and torch-npu.
        HPUTorchDeviceManager.register_custom_torch_dist_backend()

        NPUTorchDeviceManager.register_custom_torch_dist_backend()


_torch_device_manager = None
_torch_device_manager_lock = threading.Lock()


def get_torch_device_manager_by_context() -> TorchDeviceManager:
    global _torch_device_manager

    with _torch_device_manager_lock:
        if not _torch_device_manager:
            existing_device_manager_cls = None
            resources = ray.get_runtime_context().get_accelerator_ids()

            # Select the correct accelerator type from the resources.
            for resource_type, resource_value in resources.items():
                device_manager_cls = SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER.get(
                    resource_type, None
                )
                if resource_value and device_manager_cls:
                    # An error is raised when multiple accelerator types are
                    # specified.
                    if existing_device_manager_cls:
                        raise RuntimeError(
                            "Unable to determine the appropriate DeviceManager "
                            f"for the specified resources {resources}."
                        )
                    else:
                        existing_device_manager_cls = device_manager_cls

            device_manager_cls = (
                existing_device_manager_cls or DEFAULT_TORCH_DEVICE_MANAGER_CLS
            )

            _torch_device_manager = device_manager_cls()

    return _torch_device_manager


def get_torch_device_manager_by_device_type(device_type: str):
    '''
    =============================
    Modified by vllm_mlu
    =============================
    @brief: use MLUTorchDeviceManager when key="GPU"
    '''
    if device_type.lower() == ray_constants.GPU.lower() or device_type == "cuda":
        return MLUTorchDeviceManager()
    elif device_type.lower() == ray_constants.NPU.lower():
        return NPUTorchDeviceManager()
    elif device_type.lower() == ray_constants.HPU.lower():
        return HPUTorchDeviceManager()
    elif device_type.lower() == "cpu":
        return CPUTorchDeviceManager()
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    raise RuntimeError(f"Device type {device_type} cannot be recognized.")


__all__ = [
    "TorchDeviceManager",
    "CPUTorchDeviceManager",
    "CUDATorchDeviceManager",
    "HPUTorchDeviceManager",
    "NPUTorchDeviceManager",
    "MLUTorchDeviceManager",
    "register_custom_torch_dist_backend",
    "get_torch_device_manager_by_context",
    "get_torch_device_manager_by_device_type",
]
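A minimal sketch of the intended effect of the hijack above (not part of the commit; it assumes the patched Ray plus torch_mlu are installed):

# Sketch of the hijacked lookup: "cuda"/"GPU" now resolve to the MLU manager,
# so upstream Ray Train code that hardcodes "cuda" keeps working on MLU nodes.
from ray.air._internal.device_manager import (
    get_torch_device_manager_by_device_type,
)

manager = get_torch_device_manager_by_device_type("cuda")
print(type(manager).__name__)  # "MLUTorchDeviceManager", not CUDATorchDeviceManager
print(manager.is_available())  # True only if torch_mlu is importable and sees devices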
103
tools/ray_mlu/device_manager/mlu.py
Normal file
@@ -0,0 +1,103 @@
import os
from importlib.util import find_spec
from typing import List, Union

import torch

import ray
import ray._private.ray_constants as ray_constants
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager
from ray._private.accelerators.mlu import MLU_VISIBLE_DEVICES_ENV_VAR


def is_package_present(package_name: str) -> bool:
    try:
        return find_spec(package_name) is not None
    except ModuleNotFoundError:
        return False


MLU_TORCH_PACKAGE_AVAILABLE = is_package_present("torch_mlu")


if MLU_TORCH_PACKAGE_AVAILABLE:
    import torch_mlu  # noqa: F401


class MLUTorchDeviceManager(TorchDeviceManager):
    """Cambricon MLU device manager."""

    @staticmethod
    def register_custom_torch_dist_backend():
        if MLU_TORCH_PACKAGE_AVAILABLE:
            # Importing torch_mlu registers the MLU backend with torch.
            import torch_mlu  # noqa: F401, F811

    def is_available(self) -> bool:
        if not MLU_TORCH_PACKAGE_AVAILABLE:
            return False

        return torch.mlu.is_available()

    def get_devices(self) -> List[torch.device]:
        """Gets the correct torch device list configured for this process.

        Returns a list of torch MLU devices allocated for the current worker.
        If no MLUs are assigned to the worker, it falls back to the 0th MLU
        device.
        """
        if MLU_TORCH_PACKAGE_AVAILABLE and torch.mlu.is_available():
            mlu_ids = [
                str(id)
                for id in ray.get_runtime_context().get_accelerator_ids()[
                    ray_constants.GPU
                ]
            ]

            device_ids = []

            if len(mlu_ids) > 0:
                mlu_visible_str = os.environ.get(MLU_VISIBLE_DEVICES_ENV_VAR, "")
                if mlu_visible_str and mlu_visible_str != "NoDevFiles":
                    mlu_visible_list = mlu_visible_str.split(",")
                else:
                    mlu_visible_list = []

                for mlu_id in mlu_ids:
                    try:
                        device_ids.append(mlu_visible_list.index(mlu_id))
                    # list.index raises ValueError (not IndexError) when the
                    # ID is missing from MLU_VISIBLE_DEVICES.
                    except ValueError:
                        raise RuntimeError(
                            "MLU_VISIBLE_DEVICES set incorrectly. "
                            f"Got {mlu_visible_str}, expected to include {mlu_id}. "
                            "Did you override the `MLU_VISIBLE_DEVICES` "
                            "environment variable?"
                        )
            else:
                # If called on the driver or outside of Ray Train, return the
                # 0th device.
                device_ids.append(0)

            devices = [torch.device(f"mlu:{device_id}") for device_id in device_ids]
        else:
            raise RuntimeError(
                "Using MLUTorchDeviceManager but torch_mlu is not available."
            )

        return devices

    def set_device(self, device: Union[torch.device, int]):
        torch.mlu.set_device(device)

    def supports_stream(self) -> bool:
        """Validate that the device type supports creating a stream."""
        return True

    def create_stream(self, device):
        """Create a stream on an MLU device."""
        return torch.mlu.Stream(device)

    def get_stream_context(self, stream):
        """Get a torch.stream context on an MLU device."""
        return torch.mlu.stream(stream)

    def get_current_stream(self):
        """Get the current stream for the MLU device."""
        return torch.mlu.current_stream()
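The ID remapping in get_devices() above is easy to misread: Ray hands the worker global MLU IDs, while torch numbers devices by their position within MLU_VISIBLE_DEVICES. A self-contained sketch of that mapping, with hypothetical values (not part of the commit, no torch_mlu needed):

# Standalone illustration of the remapping done in get_devices() above.
def to_local_indices(assigned_ids, visible_env):
    visible = visible_env.split(",") if visible_env else []
    # list.index raises ValueError when an ID is missing, which get_devices()
    # surfaces as a RuntimeError about a misconfigured MLU_VISIBLE_DEVICES.
    return [visible.index(i) for i in assigned_ids]

# The worker was assigned global MLUs 5 and 7; the process can see 4,5,6,7.
print(to_local_indices(["5", "7"], "4,5,6,7"))  # [1, 3] -> mlu:1, mlu:3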
243
tools/ray_mlu/diff.patch
Normal file
@@ -0,0 +1,243 @@
commit 7376225d16e381ecae5cc07d84db9eed043ed06a
Author: tanhaojue <tanhaojue@cambricon.com>
Date:   Thu Mar 7 15:54:09 2024 +0800

    support mlu

diff --git a/python/ray/_private/accelerators/__init__.py b/python/ray/_private/accelerators/__init__.py
index 71550bc..07bdcd6 100644
--- a/python/ray/_private/accelerators/__init__.py
+++ b/python/ray/_private/accelerators/__init__.py
@@ -8,6 +8,7 @@ from ray._private.accelerators.tpu import TPUAcceleratorManager
 from ray._private.accelerators.neuron import NeuronAcceleratorManager
 from ray._private.accelerators.hpu import HPUAcceleratorManager
 from ray._private.accelerators.npu import NPUAcceleratorManager
+from ray._private.accelerators.mlu import MLUAcceleratorManager


 def get_all_accelerator_managers() -> Set[AcceleratorManager]:
@@ -20,6 +21,7 @@ def get_all_accelerator_managers() -> Set[AcceleratorManager]:
         NeuronAcceleratorManager,
         HPUAcceleratorManager,
         NPUAcceleratorManager,
+        MLUAcceleratorManager,
     }


@@ -55,6 +57,8 @@ def get_accelerator_manager_for_resource(
             resource_name_to_accelerator_manager["GPU"] = AMDGPUAcceleratorManager
         elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
             resource_name_to_accelerator_manager["GPU"] = IntelGPUAcceleratorManager
+        elif MLUAcceleratorManager.get_current_node_num_accelerators() > 0:
+            resource_name_to_accelerator_manager["GPU"] = MLUAcceleratorManager
         else:
             resource_name_to_accelerator_manager["GPU"] = NvidiaGPUAcceleratorManager
         get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
@@ -71,6 +75,7 @@ __all__ = [
     "NeuronAcceleratorManager",
     "HPUAcceleratorManager",
     "NPUAcceleratorManager",
+    "MLUAcceleratorManager",
     "get_all_accelerator_managers",
     "get_all_accelerator_resource_names",
     "get_accelerator_manager_for_resource",
diff --git a/python/ray/_private/accelerators/mlu.py b/python/ray/_private/accelerators/mlu.py
new file mode 100755
index 0000000..21a5771
--- /dev/null
+++ b/python/ray/_private/accelerators/mlu.py
@@ -0,0 +1,92 @@
+import os
+import glob
+import logging
+from typing import Optional, List, Tuple
+import torch
+import torch_mlu
+from ray._private.accelerators.accelerator import AcceleratorManager
+
+logger = logging.getLogger(__name__)
+
+MLU_VISIBLE_DEVICES_ENV_VAR = "MLU_VISIBLE_DEVICES"
+NOSET_MLU_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES"
+
+
+class MLUAcceleratorManager(AcceleratorManager):
+    """Cambricon MLU accelerators."""
+
+    @staticmethod
+    def get_resource_name() -> str:
+        return "GPU"
+
+    @staticmethod
+    def get_visible_accelerator_ids_env_var() -> str:
+        return MLU_VISIBLE_DEVICES_ENV_VAR
+
+    @staticmethod
+    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
+        mlu_visible_devices = os.environ.get(
+            MLUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
+        )
+
+        if mlu_visible_devices is None:
+            return None
+
+        if mlu_visible_devices == "":
+            return []
+
+        if mlu_visible_devices == "NoDevFiles":
+            return []
+
+        return list(mlu_visible_devices.split(","))
+
+    @staticmethod
+    def get_current_node_num_accelerators() -> int:
+        """Attempt to detect the number of MLUs on this machine.
+
+        MLU chips are represented as devices within `/dev/`, either as `/dev/davinci?`.
+
+        Returns:
+            The number of MLUs if any were detected, otherwise 0.
+        """
+        try:
+            return torch.mlu.device_count()
+        except Exception as e:
+            logger.debug("Could not import CambriconCL: %s", e)
+
+        try:
+            mlu_files = glob.glob("/dev/cambricon_dev?")
+            return len(mlu_files)
+        except Exception as e:
+            logger.debug("Failed to detect number of MLUs: %s", e)
+        return 0
+
+    @staticmethod
+    def get_current_node_accelerator_type() -> Optional[str]:
+        """Get the type of the Cambricon MLU on the current node.
+
+        Returns:
+            A string of the type, such as "MLU370".
+        """
+        try:
+            return torch.mlu.get_device_name(0)
+        except Exception:
+            logger.exception("Failed to detect MLU type.")
+            return None
+
+    @staticmethod
+    def validate_resource_request_quantity(
+        quantity: float,
+    ) -> Tuple[bool, Optional[str]]:
+        return (True, None)
+
+    @staticmethod
+    def set_current_process_visible_accelerator_ids(
+        visible_mlu_devices: List[str],
+    ) -> None:
+        if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
+            return
+
+        os.environ[
+            MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
+        ] = ",".join([str(i) for i in visible_mlu_devices])
diff --git a/python/ray/tests/accelerators/test_mlu.py b/python/ray/tests/accelerators/test_mlu.py
new file mode 100755
index 0000000..70e81f7
--- /dev/null
+++ b/python/ray/tests/accelerators/test_mlu.py
@@ -0,0 +1,92 @@
+import os
+import sys
+import pytest
+from unittest.mock import patch
+
+import ray
+from ray._private.accelerators import MLUAcceleratorManager as Accelerator
+
+
+@patch("glob.glob")
+@patch("os.listdir")
+def test_autodetect_num_mlus(mock_list, mock_glob):
+    mock_glob.return_value = [f"/dev/davinci{i}" for i in range(4)]
+    # mock_list.return_value = []
+    assert Accelerator.get_current_node_num_accelerators() == 4
+
+
+@patch("glob.glob")
+@patch("os.listdir")
+def test_autodetect_num_mlus_without_devices(mock_list, mock_glob):
+    mock_glob.side_effect = Exception
+    # mock_list.return_value = []
+    assert Accelerator.get_current_node_num_accelerators() == 0
+
+
+def test_mlu_accelerator_manager_api():
+    assert Accelerator.get_resource_name() == "MLU"
+    assert Accelerator.get_visible_accelerator_ids_env_var() == "MLU_VISIBLE_DEVICES"
+    assert Accelerator.validate_resource_request_quantity(0.5) == (True, None)
+    assert Accelerator.validate_resource_request_quantity(1) == (True, None)
+
+
+def test_visible_mlu_type(monkeypatch, shutdown_only):
+    with patch.object(
+        Accelerator, "get_current_node_num_accelerators", return_value=4
+    ), patch.object(
+        Accelerator, "get_current_node_accelerator_type", return_value="MLU370"
+    ):
+        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+        manager = ray._private.accelerators.get_accelerator_manager_for_resource("MLU")
+        assert manager.get_current_node_accelerator_type() == "MLU370"
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
+def test_visible_mlu_ids(monkeypatch, shutdown_only):
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
+
+        ray.init()
+        manager = ray._private.accelerators.get_accelerator_manager_for_resource("MLU")
+        assert manager.get_current_node_num_accelerators() == 4
+        assert manager.__name__ == "MLUAcceleratorManager"
+        assert ray.available_resources()["MLU"] == 3
+
+def test_get_current_process_visible_accelerator_ids(monkeypatch, shutdown_only):
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+    assert Accelerator.get_current_process_visible_accelerator_ids() == ["0", "1", "2"]
+
+    monkeypatch.delenv("MLU_VISIBLE_DEVICES")
+    assert Accelerator.get_current_process_visible_accelerator_ids() is None
+
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "")
+    assert Accelerator.get_current_process_visible_accelerator_ids() == []
+
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "NoDevFiles")
+    assert Accelerator.get_current_process_visible_accelerator_ids() == []
+
+
+def test_set_current_process_visible_accelerator_ids(shutdown_only):
+    Accelerator.set_current_process_visible_accelerator_ids(["0"])
+    assert os.environ["MLU_VISIBLE_DEVICES"] == "0"
+
+    Accelerator.set_current_process_visible_accelerator_ids(["0", "1"])
+    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1"
+
+    Accelerator.set_current_process_visible_accelerator_ids(["0", "1", "2"])
+    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1,2"
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
+def test_auto_detected_more_than_visible(monkeypatch, shutdown_only):
+    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
+        # If more MLUs are detected than visible.
+        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+
+        ray.init()
+        assert ray.available_resources()["MLU"] == 3
+
+if __name__ == "__main__":
+    if os.environ.get("PARALLEL_CI"):
+        sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
+    else:
+        sys.exit(pytest.main(["-sv", __file__]))
diff --git a/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl b/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl
new file mode 100644
index 0000000..8628a88
Binary files /dev/null and b/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl differ
11
tools/ray_mlu/diff_for_dump_info.patch
Normal file
@@ -0,0 +1,11 @@
diff --git a/ray_mlu/mlu.py b/ray_mlu/mlu.py
index 21a57719..2c63fd5b 100755
--- a/ray_mlu/mlu.py
+++ b/ray_mlu/mlu.py
@@ -87,6 +87,3 @@ class MLUAcceleratorManager(AcceleratorManager):
         if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
             return

-        os.environ[
-            MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
-        ] = ",".join([str(i) for i in visible_mlu_devices])
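The three deleted lines are the only side effect of set_current_process_visible_accelerator_ids, so with this patch applied the method becomes a no-op and MLU_VISIBLE_DEVICES is never narrowed per worker; presumably this keeps all devices visible for the dump/diagnostic tooling the patch is named after. A sketch of the behavioral difference, with hypothetical values (not part of the commit):

import os

# Behavioral difference introduced by diff_for_dump_info.patch. Before the
# patch, Ray narrows the env var to the worker's assigned devices; after it,
# the method returns without touching the environment.
os.environ["MLU_VISIBLE_DEVICES"] = "0,1,2,3"

def set_visible_patched(visible_mlu_devices):
    # The entire body after the NOSET guard was deleted: nothing happens.
    return

set_visible_patched(["2"])
print(os.environ["MLU_VISIBLE_DEVICES"])  # still "0,1,2,3", not "2"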
94
tools/ray_mlu/mlu.py
Executable file
@@ -0,0 +1,94 @@
import os
import glob
import logging
from typing import Optional, List, Tuple
import torch
import torch_mlu
from ray._private.accelerators.accelerator import AcceleratorManager

logger = logging.getLogger(__name__)

MLU_VISIBLE_DEVICES_ENV_VAR = "MLU_VISIBLE_DEVICES"
NOSET_MLU_VISIBLE_DEVICES_ENV_VAR = (
    "RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES"
)


class MLUAcceleratorManager(AcceleratorManager):
    """Cambricon MLU accelerators."""

    @staticmethod
    def get_resource_name() -> str:
        return "GPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        return MLU_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        mlu_visible_devices = os.environ.get(
            MLUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )

        if mlu_visible_devices is None:
            return None

        if mlu_visible_devices == "":
            return []

        if mlu_visible_devices == "NoDevFiles":
            return []

        return list(mlu_visible_devices.split(","))

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of MLUs on this machine.

        MLU chips are represented as device files within `/dev/`,
        e.g. `/dev/cambricon_dev0`.

        Returns:
            The number of MLUs if any were detected, otherwise 0.
        """
        try:
            return torch.mlu.device_count()
        except Exception as e:
            logger.debug("Could not query MLU count via torch_mlu: %s", e)

        try:
            mlu_files = glob.glob("/dev/cambricon_dev?")
            return len(mlu_files)
        except Exception as e:
            logger.debug("Failed to detect number of MLUs: %s", e)
        return 0

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Get the type of the Cambricon MLU on the current node.

        Returns:
            A string of the type, such as "MLU370".
        """
        try:
            return torch.mlu.get_device_name(0)
        except Exception:
            logger.exception("Failed to detect MLU type.")
            return None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_mlu_devices: List[str],
    ) -> None:
        if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
            return

        os.environ[
            MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join([str(i) for i in visible_mlu_devices])
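A standalone restatement of the MLU_VISIBLE_DEVICES contract parsed by get_current_process_visible_accelerator_ids above (not part of the commit; runnable without torch_mlu):

# Contract of MLU_VISIBLE_DEVICES as implemented above, mirroring the
# CUDA_VISIBLE_DEVICES convention:
#   unset        -> None  (no restriction; all devices visible)
#   ""           -> []    (explicitly no devices)
#   "NoDevFiles" -> []    (driver reports no device files)
#   "0,2"        -> ["0", "2"]
def parse_visible(value):
    if value is None:
        return None
    if value in ("", "NoDevFiles"):
        return []
    return value.split(",")

assert parse_visible(None) is None
assert parse_visible("") == []
assert parse_visible("NoDevFiles") == []
assert parse_visible("0,2") == ["0", "2"]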
1890
tools/ray_mlu/node.py
Normal file
File diff suppressed because it is too large
142
tools/ray_mlu/nsight.py
Normal file
@@ -0,0 +1,142 @@
import asyncio
import copy
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from ray._common.utils import (
    try_to_create_directory,
)
from ray._private.runtime_env.context import RuntimeEnvContext
from ray._private.runtime_env.plugin import RuntimeEnvPlugin
from ray.exceptions import RuntimeEnvSetupError

default_logger = logging.getLogger(__name__)

# Profiler options used when runtime_env={"_nsight": "default"}:
# use the default cnperf config; no options need to be specified.
NSIGHT_DEFAULT_CONFIG = {}


def parse_nsight_config(nsight_config: Dict[str, str]) -> List[str]:
    """Convert a dictionary of profiler options into a cnperf command line.

    Returns:
        List[str]: the `cnperf-cli record` command split into a list of strings.
    """
    nsight_cmd = ["cnperf-cli", "record"]
    for option, option_val in nsight_config.items():
        # Option style based on
        # https://www.gnu.org/software/libc/manual/html_node/Argument-Syntax.html
        if len(option) > 1:
            nsight_cmd.append(f"--{option}={option_val}")
        else:
            nsight_cmd += [f"-{option}", option_val]
    return nsight_cmd


class NsightPlugin(RuntimeEnvPlugin):
    name = "_nsight"

    def __init__(self, resources_dir: str):
        self.nsight_cmd = []

        # TODO: replace this with a better way to get the logs dir.
        session_dir, runtime_dir = os.path.split(resources_dir)
        self._nsight_dir = Path(session_dir) / "logs" / "nsight"
        try_to_create_directory(self._nsight_dir)

    async def _check_nsight_script(
        self, nsight_config: Dict[str, str]
    ) -> Tuple[bool, str]:
        """Validate that nsight_config contains valid profiler options.

        Args:
            nsight_config: dictionary mapping a profiler option to its value.
        Returns:
            A tuple of a boolean indicating whether nsight_config is a valid
            set of options, and an error message if it is not.
        """

        # Use "empty" as the directory for the test report.
        nsight_config_copy = copy.deepcopy(nsight_config)
        try_to_create_directory(Path(self._nsight_dir) / "empty")
        nsight_config_copy["o"] = str(Path(self._nsight_dir) / "empty/test")
        nsight_cmd = parse_nsight_config(nsight_config_copy)
        try:
            nsight_cmd = nsight_cmd + ["python", "-c", '""']
            process = await asyncio.create_subprocess_exec(
                *nsight_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            stdout, stderr = await process.communicate()
            # communicate() returns bytes; decode before comparing to strings.
            stdout, stderr = stdout.decode().strip(), stderr.decode().strip()
            error_msg = stderr if stderr != "" else stdout

            # Clean up the test.cnperf-rep file.
            clean_up_cmd = ["rm", f"{nsight_config_copy['o']}.cnperf-rep"]
            cleanup_process = await asyncio.create_subprocess_exec(
                *clean_up_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            _, _ = await cleanup_process.communicate()
            if process.returncode == 0:
                return True, None
            else:
                return False, error_msg
        except FileNotFoundError:
            return False, "cnperf-cli is not installed"

    async def create(
        self,
        uri: Optional[str],
        runtime_env: "RuntimeEnv",  # noqa: F821
        context: RuntimeEnvContext,
        logger: logging.Logger = default_logger,
    ) -> int:
        nsight_config = runtime_env.nsight()
        if not nsight_config:
            return 0

        if nsight_config and sys.platform != "linux":
            raise RuntimeEnvSetupError(
                "CNPerf CLI is only available on Linux."
            )

        if isinstance(nsight_config, str):
            if nsight_config == "default":
                nsight_config = NSIGHT_DEFAULT_CONFIG
            else:
                raise RuntimeEnvSetupError(
                    f"Unsupported nsight config: {nsight_config}. "
                    "The supported config is 'default' or "
                    "a dictionary of cnperf options."
                )

        is_valid_nsight_cmd, error_msg = await self._check_nsight_script(nsight_config)
        if not is_valid_nsight_cmd:
            logger.warning(error_msg)
            raise RuntimeEnvSetupError(
                "cnperf-cli failed to run with the following "
                f"error message:\n {error_msg}"
            )
        self.nsight_cmd = parse_nsight_config(nsight_config)
        return 0

    def modify_context(
        self,
        uris: List[str],
        runtime_env: "RuntimeEnv",  # noqa: F821
        context: RuntimeEnvContext,
        logger: Optional[logging.Logger] = default_logger,
    ):
        context.py_executable = " ".join(self.nsight_cmd) + " python"
        logger.info("Running CNPerf cmd: %s", context.py_executable)
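A hedged usage sketch for the plugin above (not part of the commit). It assumes the patched Ray registers NsightPlugin under the "_nsight" runtime_env key, which is kept from the upstream Nsight plugin even though the command it builds is cnperf-cli:

import ray

# "default" maps to NSIGHT_DEFAULT_CONFIG above, i.e. a plain
# `cnperf-cli record` prefix on the worker's python command.
ray.init()

@ray.remote(runtime_env={"_nsight": "default"})
def profiled_task():
    # modify_context() rewrote py_executable, so this worker process was
    # started as: cnperf-cli record python ...
    return "profiled"

# A dict of cnperf options is also accepted; each entry becomes --key=value
# (long) or -k value (short). The option name below is hypothetical, not
# taken from the cnperf documentation:
# runtime_env={"_nsight": {"output": "/tmp/my_report"}}
print(ray.get(profiled_task.remote()))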
92
tools/ray_mlu/test_mlu.py
Executable file
@@ -0,0 +1,92 @@
import os
import sys
import pytest
from unittest.mock import patch

import ray
from ray._private.accelerators import MLUAcceleratorManager as Accelerator


@patch("glob.glob")
@patch("os.listdir")
def test_autodetect_num_mlus(mock_list, mock_glob):
    mock_glob.return_value = [f"/dev/cambricon_dev{i}" for i in range(4)]
    # mock_list.return_value = []
    assert Accelerator.get_current_node_num_accelerators() == 4


@patch("glob.glob")
@patch("os.listdir")
def test_autodetect_num_mlus_without_devices(mock_list, mock_glob):
    mock_glob.side_effect = Exception
    # mock_list.return_value = []
    assert Accelerator.get_current_node_num_accelerators() == 0


def test_mlu_accelerator_manager_api():
    # The MLU manager deliberately registers under the "GPU" resource name
    # (see mlu.py), so MLUs are scheduled as GPUs.
    assert Accelerator.get_resource_name() == "GPU"
    assert Accelerator.get_visible_accelerator_ids_env_var() == "MLU_VISIBLE_DEVICES"
    assert Accelerator.validate_resource_request_quantity(0.5) == (True, None)
    assert Accelerator.validate_resource_request_quantity(1) == (True, None)


def test_visible_mlu_type(monkeypatch, shutdown_only):
    with patch.object(
        Accelerator, "get_current_node_num_accelerators", return_value=4
    ), patch.object(
        Accelerator, "get_current_node_accelerator_type", return_value="MLU370"
    ):
        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
        assert manager.get_current_node_accelerator_type() == "MLU370"


@pytest.mark.skipif(sys.platform == "win32", reason="Mock not supported on Windows")
def test_visible_mlu_ids(monkeypatch, shutdown_only):
    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
        ray.init()
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
        assert manager.get_current_node_num_accelerators() == 4
        assert manager.__name__ == "MLUAcceleratorManager"
        assert ray.available_resources()["GPU"] == 3


def test_get_current_process_visible_accelerator_ids(monkeypatch, shutdown_only):
    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
    assert Accelerator.get_current_process_visible_accelerator_ids() == ["0", "1", "2"]

    monkeypatch.delenv("MLU_VISIBLE_DEVICES")
    assert Accelerator.get_current_process_visible_accelerator_ids() is None

    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "")
    assert Accelerator.get_current_process_visible_accelerator_ids() == []

    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "NoDevFiles")
    assert Accelerator.get_current_process_visible_accelerator_ids() == []


def test_set_current_process_visible_accelerator_ids(shutdown_only):
    Accelerator.set_current_process_visible_accelerator_ids(["0"])
    assert os.environ["MLU_VISIBLE_DEVICES"] == "0"

    Accelerator.set_current_process_visible_accelerator_ids(["0", "1"])
    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1"

    Accelerator.set_current_process_visible_accelerator_ids(["0", "1", "2"])
    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1,2"


@pytest.mark.skipif(sys.platform == "win32", reason="Mock not supported on Windows")
def test_auto_detected_more_than_visible(monkeypatch, shutdown_only):
    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
        # More MLUs are detected than are visible.
        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")

        ray.init()
        assert ray.available_resources()["GPU"] == 3


if __name__ == "__main__":
    if os.environ.get("PARALLEL_CI"):
        sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
    else:
        sys.exit(pytest.main(["-sv", __file__]))
3785
tools/ray_mlu/worker.py
Normal file
File diff suppressed because it is too large