From 5ce9daea59774f64933036790ad7bb659fc10386 Mon Sep 17 00:00:00 2001 From: Hui Liu <96135754+hliuca@users.noreply.github.com> Date: Tue, 17 Dec 2024 03:45:14 -0800 Subject: [PATCH] ROCm support for sglang.check_env (#2426) --- python/sglang/check_env.py | 188 +++++++++++++++++++++++++++---------- 1 file changed, 140 insertions(+), 48 deletions(-) diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py index aafb8c101..19b8a8f9b 100644 --- a/python/sglang/check_env.py +++ b/python/sglang/check_env.py @@ -9,6 +9,13 @@ from collections import OrderedDict, defaultdict import torch +from sglang.srt.utils import is_hip + + +def is_cuda_v2(): + return torch.version.cuda is not None + + # List of packages to check versions PACKAGE_LIST = [ "sglang", @@ -63,13 +70,22 @@ def get_cuda_info(): """ Get CUDA-related information if available. """ - cuda_info = {"CUDA available": torch.cuda.is_available()} + if is_cuda_v2(): + cuda_info = {"CUDA available": torch.cuda.is_available()} - if cuda_info["CUDA available"]: - cuda_info.update(_get_gpu_info()) - cuda_info.update(_get_cuda_version_info()) + if cuda_info["CUDA available"]: + cuda_info.update(_get_gpu_info()) + cuda_info.update(_get_cuda_version_info()) - return cuda_info + return cuda_info + elif is_hip(): + cuda_info = {"ROCM available": torch.cuda.is_available()} + + if cuda_info["ROCM available"]: + cuda_info.update(_get_gpu_info()) + cuda_info.update(_get_cuda_version_info()) + + return cuda_info def _get_gpu_info(): @@ -103,34 +119,72 @@ def _get_cuda_version_info(): """ Get CUDA version information. """ - from torch.utils.cpp_extension import CUDA_HOME + if is_cuda_v2(): + from torch.utils.cpp_extension import CUDA_HOME - cuda_info = {"CUDA_HOME": CUDA_HOME} + cuda_info = {"CUDA_HOME": CUDA_HOME} - if CUDA_HOME and os.path.isdir(CUDA_HOME): - cuda_info.update(_get_nvcc_info()) - cuda_info.update(_get_cuda_driver_version()) + if CUDA_HOME and os.path.isdir(CUDA_HOME): + cuda_info.update(_get_nvcc_info()) + cuda_info.update(_get_cuda_driver_version()) - return cuda_info + return cuda_info + elif is_hip(): + from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME + + cuda_info = {"ROCM_HOME": ROCM_HOME} + + if ROCM_HOME and os.path.isdir(ROCM_HOME): + cuda_info.update(_get_nvcc_info()) + cuda_info.update(_get_cuda_driver_version()) + + return cuda_info + else: + cuda_info = {"CUDA_HOME": ""} + return cuda_info def _get_nvcc_info(): """ Get NVCC version information. """ - from torch.utils.cpp_extension import CUDA_HOME + if is_cuda_v2(): + from torch.utils.cpp_extension import CUDA_HOME - try: - nvcc = os.path.join(CUDA_HOME, "bin/nvcc") - nvcc_output = ( - subprocess.check_output(f'"{nvcc}" -V', shell=True).decode("utf-8").strip() - ) - return { - "NVCC": nvcc_output[ - nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind("Build") - ].strip() - } - except subprocess.SubprocessError: + try: + nvcc = os.path.join(CUDA_HOME, "bin/nvcc") + nvcc_output = ( + subprocess.check_output(f'"{nvcc}" -V', shell=True) + .decode("utf-8") + .strip() + ) + return { + "NVCC": nvcc_output[ + nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind( + "Build" + ) + ].strip() + } + except subprocess.SubprocessError: + return {"NVCC": "Not Available"} + elif is_hip(): + from torch.utils.cpp_extension import ROCM_HOME + + try: + hipcc = os.path.join(ROCM_HOME, "bin/hipcc") + hipcc_output = ( + subprocess.check_output(f'"{hipcc}" --version', shell=True) + .decode("utf-8") + .strip() + ) + return { + "HIPCC": hipcc_output[ + hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang") + ].strip() + } + except subprocess.SubprocessError: + return {"HIPCC": "Not Available"} + else: return {"NVCC": "Not Available"} @@ -139,20 +193,40 @@ def _get_cuda_driver_version(): Get CUDA driver version. """ versions = set() - try: - output = subprocess.check_output( - [ - "nvidia-smi", - "--query-gpu=driver_version", - "--format=csv,noheader,nounits", - ] - ) - versions = set(output.decode().strip().split("\n")) - if len(versions) == 1: - return {"CUDA Driver Version": versions.pop()} - else: - return {"CUDA Driver Versions": ", ".join(sorted(versions))} - except subprocess.SubprocessError: + if is_cuda_v2(): + try: + output = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=driver_version", + "--format=csv,noheader,nounits", + ] + ) + versions = set(output.decode().strip().split("\n")) + if len(versions) == 1: + return {"CUDA Driver Version": versions.pop()} + else: + return {"CUDA Driver Versions": ", ".join(sorted(versions))} + except subprocess.SubprocessError: + return {"CUDA Driver Version": "Not Available"} + elif is_hip(): + try: + output = subprocess.check_output( + [ + "rocm-smi", + "--showdriverversion", + "--csv", + ] + ) + versions = set(output.decode().strip().split("\n")) + versions.discard("name, value") + ver = versions.pop() + ver = ver.replace('"Driver version", ', "").replace('"', "") + + return {"ROCM Driver Version": ver} + except subprocess.SubprocessError: + return {"ROCM Driver Version": "Not Available"} + else: return {"CUDA Driver Version": "Not Available"} @@ -160,16 +234,31 @@ def get_gpu_topology(): """ Get GPU topology information. """ - try: - result = subprocess.run( - ["nvidia-smi", "topo", "-m"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - check=True, - ) - return "\n" + result.stdout if result.returncode == 0 else None - except subprocess.SubprocessError: + if is_cuda_v2(): + try: + result = subprocess.run( + ["nvidia-smi", "topo", "-m"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + return "\n" + result.stdout if result.returncode == 0 else None + except subprocess.SubprocessError: + return None + elif is_hip(): + try: + result = subprocess.run( + ["rocm-smi", "--showtopotype"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + return "\n" + result.stdout if result.returncode == 0 else None + except subprocess.SubprocessError: + return None + else: return None @@ -196,7 +285,10 @@ def check_env(): gpu_topo = get_gpu_topology() if gpu_topo: - env_info["NVIDIA Topology"] = gpu_topo + if is_cuda_v2(): + env_info["NVIDIA Topology"] = gpu_topo + elif is_hip(): + env_info["AMD Topology"] = gpu_topo hypervisor_vendor = get_hypervisor_vendor() if hypervisor_vendor: