commit 7376225d16e381ecae5cc07d84db9eed043ed06a
Author: tanhaojue
Date:   Thu Mar 7 15:54:09 2024 +0800

    support mlu

diff --git a/python/ray/_private/accelerators/__init__.py b/python/ray/_private/accelerators/__init__.py
index 71550bc..07bdcd6 100644
--- a/python/ray/_private/accelerators/__init__.py
+++ b/python/ray/_private/accelerators/__init__.py
@@ -8,6 +8,7 @@ from ray._private.accelerators.tpu import TPUAcceleratorManager
 from ray._private.accelerators.neuron import NeuronAcceleratorManager
 from ray._private.accelerators.hpu import HPUAcceleratorManager
 from ray._private.accelerators.npu import NPUAcceleratorManager
+from ray._private.accelerators.mlu import MLUAcceleratorManager
 
 
 def get_all_accelerator_managers() -> Set[AcceleratorManager]:
@@ -20,6 +21,7 @@ def get_all_accelerator_managers() -> Set[AcceleratorManager]:
         NeuronAcceleratorManager,
         HPUAcceleratorManager,
         NPUAcceleratorManager,
+        MLUAcceleratorManager,
     }
 
 
@@ -55,6 +57,8 @@ def get_accelerator_manager_for_resource(
             resource_name_to_accelerator_manager["GPU"] = AMDGPUAcceleratorManager
         elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
             resource_name_to_accelerator_manager["GPU"] = IntelGPUAcceleratorManager
+        elif MLUAcceleratorManager.get_current_node_num_accelerators() > 0:
+            resource_name_to_accelerator_manager["GPU"] = MLUAcceleratorManager
         else:
             resource_name_to_accelerator_manager["GPU"] = NvidiaGPUAcceleratorManager
         get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
@@ -71,6 +75,7 @@ __all__ = [
     "NeuronAcceleratorManager",
     "HPUAcceleratorManager",
     "NPUAcceleratorManager",
+    "MLUAcceleratorManager",
     "get_all_accelerator_managers",
     "get_all_accelerator_resource_names",
     "get_accelerator_manager_for_resource",
diff --git a/python/ray/_private/accelerators/mlu.py b/python/ray/_private/accelerators/mlu.py
new file mode 100755
index 0000000..21a5771
--- /dev/null
+++ b/python/ray/_private/accelerators/mlu.py
@@ -0,0 +1,122 @@
+import glob
+import importlib.util
+import logging
+import os
+from typing import List, Optional, Tuple
+
+from ray._private.accelerators.accelerator import AcceleratorManager
+
+logger = logging.getLogger(__name__)
+
+MLU_VISIBLE_DEVICES_ENV_VAR = "MLU_VISIBLE_DEVICES"
+NOSET_MLU_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES"
+
+
+def _import_torch_with_mlu():
+    """Import ``torch`` together with the ``torch_mlu`` plugin and return it.
+
+    The import is deferred to call time so that importing this module (which
+    ``ray._private.accelerators`` does unconditionally) cannot fail on nodes
+    where PyTorch or torch_mlu is not installed.
+
+    Raises:
+        ImportError: If ``torch_mlu`` or ``torch`` is not importable.
+    """
+    # Cheap existence check first, so nodes that have PyTorch but no
+    # torch_mlu do not pay for a full ``import torch``.
+    if importlib.util.find_spec("torch_mlu") is None:
+        raise ImportError("torch_mlu is not installed")
+
+    import torch
+    import torch_mlu  # noqa: F401
+
+    return torch
+
+
+class MLUAcceleratorManager(AcceleratorManager):
+    """Cambricon MLU accelerators, reported under the "GPU" resource name."""
+
+    @staticmethod
+    def get_resource_name() -> str:
+        return "GPU"
+
+    @staticmethod
+    def get_visible_accelerator_ids_env_var() -> str:
+        return MLU_VISIBLE_DEVICES_ENV_VAR
+
+    @staticmethod
+    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
+        """Return the MLU ids listed in ``MLU_VISIBLE_DEVICES``.
+
+        Returns:
+            None if the variable is unset, an empty list if it is empty or
+            "NoDevFiles", otherwise the comma-separated ids as strings.
+        """
+        mlu_visible_devices = os.environ.get(
+            MLUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
+        )
+
+        if mlu_visible_devices is None:
+            return None
+
+        if mlu_visible_devices in ("", "NoDevFiles"):
+            return []
+
+        return mlu_visible_devices.split(",")
+
+    @staticmethod
+    def get_current_node_num_accelerators() -> int:
+        """Attempt to detect the number of MLUs on this machine.
+
+        The count reported by torch_mlu is used when it can be queried;
+        otherwise device files matching `/dev/cambricon_dev?` are counted.
+
+        Returns:
+            The number of MLUs if any were detected, otherwise 0.
+        """
+        try:
+            return _import_torch_with_mlu().mlu.device_count()
+        except Exception as e:
+            logger.debug("Could not query torch_mlu for the MLU count: %s", e)
+
+        try:
+            return len(glob.glob("/dev/cambricon_dev?"))
+        except Exception as e:
+            logger.debug("Failed to detect number of MLUs: %s", e)
+        return 0
+
+    @staticmethod
+    def get_current_node_accelerator_type() -> Optional[str]:
+        """Get the type of the Cambricon MLU on the current node.
+
+        Returns:
+            A string of the type, such as "MLU370", or None if it could not
+            be determined.
+        """
+        try:
+            return _import_torch_with_mlu().mlu.get_device_name(0)
+        except Exception:
+            logger.exception("Failed to detect MLU type.")
+        return None
+
+    @staticmethod
+    def validate_resource_request_quantity(
+        quantity: float,
+    ) -> Tuple[bool, Optional[str]]:
+        return (True, None)
+
+    @staticmethod
+    def set_current_process_visible_accelerator_ids(
+        visible_mlu_devices: List[str],
+    ) -> None:
+        """Export the given ids through ``MLU_VISIBLE_DEVICES``.
+
+        Does nothing when ``RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES`` is
+        set to a non-empty value.
+        """
+        if os.environ.get(NOSET_MLU_VISIBLE_DEVICES_ENV_VAR):
+            return
+
+        os.environ[
+            MLUAcceleratorManager.get_visible_accelerator_ids_env_var()
+        ] = ",".join([str(i) for i in visible_mlu_devices])
diff --git a/python/ray/tests/accelerators/test_mlu.py b/python/ray/tests/accelerators/test_mlu.py
new file mode 100755
index 0000000..70e81f7
--- /dev/null
+++ b/python/ray/tests/accelerators/test_mlu.py
@@ -0,0 +1,117 @@
+import os
+import sys
+from unittest.mock import patch
+
+import pytest
+
+import ray
+from ray._private.accelerators import MLUAcceleratorManager as Accelerator
+
+# Patch target that makes the torch_mlu probe fail, so detection tests
+# exercise the device-file fallback regardless of what is installed locally.
+TORCH_MLU_IMPORT = "ray._private.accelerators.mlu._import_torch_with_mlu"
+
+
+@pytest.fixture(autouse=True)
+def reset_accelerator_manager_cache():
+    """Clear the memoized resource-name -> manager table around each test.
+
+    ``get_accelerator_manager_for_resource`` stores its lookup table on the
+    function object, so without a reset the manager returned for "GPU" would
+    depend on which test populated the table first.
+    """
+    lookup = ray._private.accelerators.get_accelerator_manager_for_resource
+    vars(lookup).pop("_resource_name_to_accelerator_manager", None)
+    yield
+    vars(lookup).pop("_resource_name_to_accelerator_manager", None)
+
+
+@patch(TORCH_MLU_IMPORT, side_effect=ImportError)
+@patch("glob.glob")
+def test_autodetect_num_mlus(mock_glob, mock_import):
+    mock_glob.return_value = [f"/dev/cambricon_dev{i}" for i in range(4)]
+    assert Accelerator.get_current_node_num_accelerators() == 4
+
+
+@patch(TORCH_MLU_IMPORT, side_effect=ImportError)
+@patch("glob.glob")
+def test_autodetect_num_mlus_without_devices(mock_glob, mock_import):
+    mock_glob.side_effect = Exception
+    assert Accelerator.get_current_node_num_accelerators() == 0
+
+
+def test_mlu_accelerator_manager_api():
+    # MLUs are registered under the shared "GPU" resource name.
+    assert Accelerator.get_resource_name() == "GPU"
+    assert Accelerator.get_visible_accelerator_ids_env_var() == "MLU_VISIBLE_DEVICES"
+    assert Accelerator.validate_resource_request_quantity(0.5) == (True, None)
+    assert Accelerator.validate_resource_request_quantity(1) == (True, None)
+
+
+def test_visible_mlu_type(monkeypatch, shutdown_only):
+    with patch.object(
+        Accelerator, "get_current_node_num_accelerators", return_value=4
+    ), patch.object(
+        Accelerator, "get_current_node_accelerator_type", return_value="MLU370"
+    ):
+        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
+        assert manager.get_current_node_accelerator_type() == "MLU370"
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
+def test_visible_mlu_ids(monkeypatch, shutdown_only):
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
+        ray.init()
+        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
+        assert manager.get_current_node_num_accelerators() == 4
+        assert manager.__name__ == "MLUAcceleratorManager"
+        assert ray.available_resources()["GPU"] == 3
+
+
+def test_get_current_process_visible_accelerator_ids(monkeypatch, shutdown_only):
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+    assert Accelerator.get_current_process_visible_accelerator_ids() == ["0", "1", "2"]
+
+    monkeypatch.delenv("MLU_VISIBLE_DEVICES")
+    assert Accelerator.get_current_process_visible_accelerator_ids() is None
+
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "")
+    assert Accelerator.get_current_process_visible_accelerator_ids() == []
+
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "NoDevFiles")
+    assert Accelerator.get_current_process_visible_accelerator_ids() == []
+
+
+def test_set_current_process_visible_accelerator_ids(monkeypatch, shutdown_only):
+    # Register the variable with monkeypatch so the writes below are undone at
+    # teardown instead of leaking into later tests.
+    monkeypatch.setenv("MLU_VISIBLE_DEVICES", "")
+    monkeypatch.delenv("RAY_EXPERIMENTAL_NOSET_MLU_VISIBLE_DEVICES", raising=False)
+
+    Accelerator.set_current_process_visible_accelerator_ids(["0"])
+    assert os.environ["MLU_VISIBLE_DEVICES"] == "0"
+
+    Accelerator.set_current_process_visible_accelerator_ids(["0", "1"])
+    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1"
+
+    Accelerator.set_current_process_visible_accelerator_ids(["0", "1", "2"])
+    assert os.environ["MLU_VISIBLE_DEVICES"] == "0,1,2"
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
+def test_auto_detected_more_than_visible(monkeypatch, shutdown_only):
+    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
+        # If more MLUs are detected than visible.
+        monkeypatch.setenv("MLU_VISIBLE_DEVICES", "0,1,2")
+
+        ray.init()
+        assert ray.available_resources()["GPU"] == 3
+
+
+if __name__ == "__main__":
+    if os.environ.get("PARALLEL_CI"):
+        sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
+    else:
+        sys.exit(pytest.main(["-sv", __file__]))