forked from EngineX-Cambricon/enginex-mlu370-vllm
244 lines
9.1 KiB
Diff
244 lines
9.1 KiB
Diff
commit 7376225d16e381ecae5cc07d84db9eed043ed06a
|
|
Author: tanhaojue <tanhaojue@cambricon.com>
|
|
Date: Thu Mar 7 15:54:09 2024 +0800
|
|
|
|
support mlu
|
|
|
|
diff --git a/python/ray/_private/accelerators/__init__.py b/python/ray/_private/accelerators/__init__.py
|
|
index 71550bc..07bdcd6 100644
|
|
--- a/python/ray/_private/accelerators/__init__.py
|
|
+++ b/python/ray/_private/accelerators/__init__.py
|
|
@@ -8,6 +8,7 @@ from ray._private.accelerators.tpu import TPUAcceleratorManager
|
|
from ray._private.accelerators.neuron import NeuronAcceleratorManager
|
|
from ray._private.accelerators.hpu import HPUAcceleratorManager
|
|
from ray._private.accelerators.npu import NPUAcceleratorManager
|
|
+from ray._private.accelerators.mlu import MLUAcceleratorManager
|
|
|
|
|
|
def get_all_accelerator_managers() -> Set[AcceleratorManager]:
|
|
@@ -20,6 +21,7 @@ def get_all_accelerator_managers() -> Set[AcceleratorManager]:
|
|
NeuronAcceleratorManager,
|
|
HPUAcceleratorManager,
|
|
NPUAcceleratorManager,
|
|
+ MLUAcceleratorManager,
|
|
}
|
|
|
|
|
|
@@ -55,6 +57,8 @@ def get_accelerator_manager_for_resource(
|
|
resource_name_to_accelerator_manager["GPU"] = AMDGPUAcceleratorManager
|
|
elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
|
|
resource_name_to_accelerator_manager["GPU"] = IntelGPUAcceleratorManager
|
|
+ elif MLUAcceleratorManager.get_current_node_num_accelerators() > 0:
|
|
+ resource_name_to_accelerator_manager["GPU"] = MLUAcceleratorManager
|
|
else:
|
|
resource_name_to_accelerator_manager["GPU"] = NvidiaGPUAcceleratorManager
|
|
get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
|
|
@@ -71,6 +75,7 @@ __all__ = [
|
|
"NeuronAcceleratorManager",
|
|
"HPUAcceleratorManager",
|
|
"NPUAcceleratorManager",
|
|
+ "MLUAcceleratorManager",
|
|
"get_all_accelerator_managers",
|
|
"get_all_accelerator_resource_names",
|
|
"get_accelerator_manager_for_resource",
|
|
diff --git a/python/ray/_private/accelerators/mlu.py b/python/ray/_private/accelerators/mlu.py
|
|
new file mode 100755
|
|
index 0000000..21a5771
|
|
--- /dev/null
|
|
+++ b/python/ray/_private/accelerators/mlu.py
|
|
@@ -0,0 +1,92 @@
|
|
+import os
|
|
+import glob
|
|
+import logging
|
|
+from typing import Optional, List, Tuple
|
|
+import torch
|
|
+import torch_mlu
|
|
+from ray._private.accelerators.accelerator import AcceleratorManager
|
|
+
|
|
logger = logging.getLogger(__name__)

# Comma-separated list of MLU device ids visible to this process —
# Cambricon's analogue of CUDA_VISIBLE_DEVICES.
MLU_VISIBLE_DEVICES_ENV_VAR = "MLU_VISIBLE_DEVICES"
# When set (to any non-empty value), Ray will not overwrite
# MLU_VISIBLE_DEVICES for workers (see set_current_process_visible_accelerator_ids).
NOSET_MLU_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES"
|
|
+
|
|
+
|
|
class MLUAcceleratorManager(AcceleratorManager):
    """Accelerator manager for Cambricon MLU devices.

    MLUs are exposed to Ray under the "GPU" resource name so that the
    existing GPU scheduling code paths work unchanged (see the "GPU"
    registration in accelerators/__init__.py).
    """

    @staticmethod
    def get_resource_name() -> str:
        """Return the Ray resource name MLUs are scheduled under."""
        # Intentionally "GPU", not "MLU": this fork makes MLUs masquerade
        # as GPUs for scheduling purposes.
        return "GPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        """Return the env var that restricts visible MLU device ids."""
        return MLU_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Parse MLU_VISIBLE_DEVICES for the current process.

        Returns:
            None when the variable is unset (no restriction), [] when it
            is empty or "NoDevFiles", otherwise the device id strings.
        """
        mlu_visible_devices = os.environ.get(
            MLUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )

        if mlu_visible_devices is None:
            return None

        # "" and the driver's "NoDevFiles" marker both mean "no devices".
        if mlu_visible_devices in ("", "NoDevFiles"):
            return []

        return list(mlu_visible_devices.split(","))

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of MLUs on this machine.

        Prefers the torch_mlu runtime; falls back to counting the
        ``/dev/cambricon_dev*`` device files.

        Returns:
            The number of MLUs if any were detected, otherwise 0.
        """
        try:
            return torch.mlu.device_count()
        except Exception as e:
            # Runtime probe failed (import happened at module load); fall
            # through to counting device files.
            logger.debug("torch.mlu.device_count() failed: %s", e)

        try:
            # "[0-9]*" rather than "?" so device indices >= 10 are also
            # counted ("?" matches exactly one character).
            mlu_files = glob.glob("/dev/cambricon_dev[0-9]*")
            return len(mlu_files)
        except Exception as e:
            logger.debug("Failed to detect number of MLUs: %s", e)
            return 0

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Get the type of the Cambricon MLU on the current node.

        Returns:
            A string of the type, such as "MLU370", or None on failure.
        """
        try:
            return torch.mlu.get_device_name(0)
        except Exception:
            logger.exception("Failed to detect MLU type.")
            return None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Any quantity, including fractional, is a valid MLU request."""
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_mlu_devices: List[str],
    ) -> None:
        """Set MLU_VISIBLE_DEVICES for this process to the given ids.

        No-op when RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES is set, so
        users can opt out of Ray managing device visibility.
        """
        if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
            return

        os.environ[
            MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join([str(i) for i in visible_mlu_devices])
|
|
diff --git a/python/ray/tests/accelerators/test_mlu.py b/python/ray/tests/accelerators/test_mlu.py
|
|
new file mode 100755
|
|
index 0000000..70e81f7
|
|
--- /dev/null
|
|
+++ b/python/ray/tests/accelerators/test_mlu.py
|
|
@@ -0,0 +1,92 @@
|
|
+import os
|
|
+import sys
|
|
+import pytest
|
|
+from unittest.mock import patch
|
|
+
|
|
+import ray
|
|
+from ray._private.accelerators import MLUAcceleratorManager as Accelerator
|
|
+
|
|
+
|
|
@patch("glob.glob")
def test_autodetect_num_mlus(mock_glob):
    """Fallback detection counts /dev/cambricon_dev* device files."""
    # Use the real Cambricon device-file names (the previous "davinci"
    # names were copied from the Ascend NPU test).
    mock_glob.return_value = [f"/dev/cambricon_dev{i}" for i in range(4)]
    # The glob fallback is only reached when the torch_mlu probe raises,
    # so stub the torch module seen by mlu.py to force that path.
    with patch("ray._private.accelerators.mlu.torch") as mock_torch:
        mock_torch.mlu.device_count.side_effect = Exception
        assert Accelerator.get_current_node_num_accelerators() == 4
|
|
+
|
|
+
|
|
@patch("glob.glob")
def test_autodetect_num_mlus_without_devices(mock_glob):
    """Detection returns 0 when neither torch_mlu nor device files work."""
    mock_glob.side_effect = Exception
    # Force the torch_mlu probe to fail as well, so both detection paths
    # are exercised and the final fallback of 0 is returned.
    with patch("ray._private.accelerators.mlu.torch") as mock_torch:
        mock_torch.mlu.device_count.side_effect = Exception
        assert Accelerator.get_current_node_num_accelerators() == 0
|
|
+
|
|
+
|
|
def test_mlu_accelerator_manager_api():
    """Basic MLUAcceleratorManager API contract checks."""
    # MLUs are exposed under the "GPU" resource name (the manager's
    # get_resource_name() returns "GPU" and __init__.py registers it
    # under the "GPU" key), so asserting "MLU" here would always fail.
    assert Accelerator.get_resource_name() == "GPU"
    assert Accelerator.get_visible_accelerator_ids_env_var() == "MLU_VISIBLE_DEVICES"
    # Fractional and whole quantities are both accepted.
    assert Accelerator.validate_resource_request_quantity(0.5) == (True, None)
    assert Accelerator.validate_resource_request_quantity(1) == (True, None)
|
|
+
|
|
+
|
|
def test_visible_mlu_type(monkeypatch, shutdown_only):
    """The resolved manager reports the Cambricon accelerator type."""
    with patch.object(
        Accelerator, "get_current_node_num_accelerators", return_value=4
    ), patch.object(
        Accelerator, "get_current_node_accelerator_type", return_value="MLU370"
    ):
        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
        # MLUs are registered under the "GPU" resource name, so the
        # manager must be looked up with "GPU" rather than "MLU".
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
        assert manager.get_current_node_accelerator_type() == "MLU370"
|
|
+
|
|
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
def test_visible_mlu_ids(monkeypatch, shutdown_only):
    """ray.init() respects MLU_VISIBLE_DEVICES when registering resources."""
    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
        ray.init()
        # MLUs are surfaced as the "GPU" resource (get_resource_name()
        # returns "GPU"), so look up and assert against "GPU", not "MLU".
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
        assert manager.get_current_node_num_accelerators() == 4
        assert manager.__name__ == "MLUAcceleratorManager"
        # Only the 3 visible devices are registered, not all 4 detected.
        assert ray.available_resources()["GPU"] == 3
|
|
+
|
|
def test_get_current_process_visible_accelerator_ids(monkeypatch, shutdown_only):
    """MLU_VISIBLE_DEVICES parsing: set, unset, empty, and NoDevFiles."""
    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
    assert Accelerator.get_current_process_visible_accelerator_ids() == ["0", "1", "2"]

    # Unset means "no restriction" and parses to None.
    monkeypatch.delenv("MLU_VISIBLE_DEVICES")
    assert Accelerator.get_current_process_visible_accelerator_ids() is None

    # The empty string and the driver's "NoDevFiles" marker both mean
    # that no devices are visible.
    for no_devices in ("", "NoDevFiles"):
        monkeypatch.setenv("MLU_VISIBLE_DEVICES", no_devices)
        assert Accelerator.get_current_process_visible_accelerator_ids() == []
|
|
+
|
|
+
|
|
def test_set_current_process_visible_accelerator_ids(shutdown_only):
    """Setting visible ids writes a comma-joined MLU_VISIBLE_DEVICES."""
    for device_ids in (["0"], ["0", "1"], ["0", "1", "2"]):
        Accelerator.set_current_process_visible_accelerator_ids(device_ids)
        assert os.environ["MLU_VISIBLE_DEVICES"] == ",".join(device_ids)
|
|
+
|
|
+
|
|
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
def test_auto_detected_more_than_visible(monkeypatch, shutdown_only):
    """Visible-device restriction wins over the auto-detected count."""
    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
        # 4 MLUs are detected but only 3 are visible.
        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")

        ray.init()
        # MLUs are registered under the "GPU" resource name
        # (get_resource_name() returns "GPU"), so the "MLU" key never exists.
        assert ray.available_resources()["GPU"] == 3
|
|
+
|
|
if __name__ == "__main__":
    # Under PARALLEL_CI, shard tests across workers in isolated
    # subprocesses; otherwise run them serially and verbosely.
    pytest_args = (
        ["-n", "auto", "--boxed", "-vs", __file__]
        if os.environ.get("PARALLEL_CI")
        else ["-sv", __file__]
    )
    sys.exit(pytest.main(pytest_args))
|
|
diff --git a/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl b/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl
|
|
new file mode 100644
|
|
index 0000000..8628a88
|
|
Binary files /dev/null and b/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl differ
|