ROCm support for sglang.check_env (#2426)
This commit is contained in:
@@ -9,6 +9,13 @@ from collections import OrderedDict, defaultdict
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from sglang.srt.utils import is_hip
|
||||||
|
|
||||||
|
|
||||||
|
def is_cuda_v2():
|
||||||
|
return torch.version.cuda is not None
|
||||||
|
|
||||||
|
|
||||||
# List of packages to check versions
|
# List of packages to check versions
|
||||||
PACKAGE_LIST = [
|
PACKAGE_LIST = [
|
||||||
"sglang",
|
"sglang",
|
||||||
@@ -63,13 +70,22 @@ def get_cuda_info():
|
|||||||
"""
|
"""
|
||||||
Get CUDA-related information if available.
|
Get CUDA-related information if available.
|
||||||
"""
|
"""
|
||||||
cuda_info = {"CUDA available": torch.cuda.is_available()}
|
if is_cuda_v2():
|
||||||
|
cuda_info = {"CUDA available": torch.cuda.is_available()}
|
||||||
|
|
||||||
if cuda_info["CUDA available"]:
|
if cuda_info["CUDA available"]:
|
||||||
cuda_info.update(_get_gpu_info())
|
cuda_info.update(_get_gpu_info())
|
||||||
cuda_info.update(_get_cuda_version_info())
|
cuda_info.update(_get_cuda_version_info())
|
||||||
|
|
||||||
return cuda_info
|
return cuda_info
|
||||||
|
elif is_hip():
|
||||||
|
cuda_info = {"ROCM available": torch.cuda.is_available()}
|
||||||
|
|
||||||
|
if cuda_info["ROCM available"]:
|
||||||
|
cuda_info.update(_get_gpu_info())
|
||||||
|
cuda_info.update(_get_cuda_version_info())
|
||||||
|
|
||||||
|
return cuda_info
|
||||||
|
|
||||||
|
|
||||||
def _get_gpu_info():
|
def _get_gpu_info():
|
||||||
@@ -103,34 +119,72 @@ def _get_cuda_version_info():
|
|||||||
"""
|
"""
|
||||||
Get CUDA version information.
|
Get CUDA version information.
|
||||||
"""
|
"""
|
||||||
from torch.utils.cpp_extension import CUDA_HOME
|
if is_cuda_v2():
|
||||||
|
from torch.utils.cpp_extension import CUDA_HOME
|
||||||
|
|
||||||
cuda_info = {"CUDA_HOME": CUDA_HOME}
|
cuda_info = {"CUDA_HOME": CUDA_HOME}
|
||||||
|
|
||||||
if CUDA_HOME and os.path.isdir(CUDA_HOME):
|
if CUDA_HOME and os.path.isdir(CUDA_HOME):
|
||||||
cuda_info.update(_get_nvcc_info())
|
cuda_info.update(_get_nvcc_info())
|
||||||
cuda_info.update(_get_cuda_driver_version())
|
cuda_info.update(_get_cuda_driver_version())
|
||||||
|
|
||||||
return cuda_info
|
return cuda_info
|
||||||
|
elif is_hip():
|
||||||
|
from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
|
||||||
|
|
||||||
|
cuda_info = {"ROCM_HOME": ROCM_HOME}
|
||||||
|
|
||||||
|
if ROCM_HOME and os.path.isdir(ROCM_HOME):
|
||||||
|
cuda_info.update(_get_nvcc_info())
|
||||||
|
cuda_info.update(_get_cuda_driver_version())
|
||||||
|
|
||||||
|
return cuda_info
|
||||||
|
else:
|
||||||
|
cuda_info = {"CUDA_HOME": ""}
|
||||||
|
return cuda_info
|
||||||
|
|
||||||
|
|
||||||
def _get_nvcc_info():
|
def _get_nvcc_info():
|
||||||
"""
|
"""
|
||||||
Get NVCC version information.
|
Get NVCC version information.
|
||||||
"""
|
"""
|
||||||
from torch.utils.cpp_extension import CUDA_HOME
|
if is_cuda_v2():
|
||||||
|
from torch.utils.cpp_extension import CUDA_HOME
|
||||||
|
|
||||||
try:
|
try:
|
||||||
nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
|
nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
|
||||||
nvcc_output = (
|
nvcc_output = (
|
||||||
subprocess.check_output(f'"{nvcc}" -V', shell=True).decode("utf-8").strip()
|
subprocess.check_output(f'"{nvcc}" -V', shell=True)
|
||||||
)
|
.decode("utf-8")
|
||||||
return {
|
.strip()
|
||||||
"NVCC": nvcc_output[
|
)
|
||||||
nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind("Build")
|
return {
|
||||||
].strip()
|
"NVCC": nvcc_output[
|
||||||
}
|
nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
|
||||||
except subprocess.SubprocessError:
|
"Build"
|
||||||
|
)
|
||||||
|
].strip()
|
||||||
|
}
|
||||||
|
except subprocess.SubprocessError:
|
||||||
|
return {"NVCC": "Not Available"}
|
||||||
|
elif is_hip():
|
||||||
|
from torch.utils.cpp_extension import ROCM_HOME
|
||||||
|
|
||||||
|
try:
|
||||||
|
hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
|
||||||
|
hipcc_output = (
|
||||||
|
subprocess.check_output(f'"{hipcc}" --version', shell=True)
|
||||||
|
.decode("utf-8")
|
||||||
|
.strip()
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"HIPCC": hipcc_output[
|
||||||
|
hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
|
||||||
|
].strip()
|
||||||
|
}
|
||||||
|
except subprocess.SubprocessError:
|
||||||
|
return {"HIPCC": "Not Available"}
|
||||||
|
else:
|
||||||
return {"NVCC": "Not Available"}
|
return {"NVCC": "Not Available"}
|
||||||
|
|
||||||
|
|
||||||
@@ -139,20 +193,40 @@ def _get_cuda_driver_version():
|
|||||||
Get CUDA driver version.
|
Get CUDA driver version.
|
||||||
"""
|
"""
|
||||||
versions = set()
|
versions = set()
|
||||||
try:
|
if is_cuda_v2():
|
||||||
output = subprocess.check_output(
|
try:
|
||||||
[
|
output = subprocess.check_output(
|
||||||
"nvidia-smi",
|
[
|
||||||
"--query-gpu=driver_version",
|
"nvidia-smi",
|
||||||
"--format=csv,noheader,nounits",
|
"--query-gpu=driver_version",
|
||||||
]
|
"--format=csv,noheader,nounits",
|
||||||
)
|
]
|
||||||
versions = set(output.decode().strip().split("\n"))
|
)
|
||||||
if len(versions) == 1:
|
versions = set(output.decode().strip().split("\n"))
|
||||||
return {"CUDA Driver Version": versions.pop()}
|
if len(versions) == 1:
|
||||||
else:
|
return {"CUDA Driver Version": versions.pop()}
|
||||||
return {"CUDA Driver Versions": ", ".join(sorted(versions))}
|
else:
|
||||||
except subprocess.SubprocessError:
|
return {"CUDA Driver Versions": ", ".join(sorted(versions))}
|
||||||
|
except subprocess.SubprocessError:
|
||||||
|
return {"CUDA Driver Version": "Not Available"}
|
||||||
|
elif is_hip():
|
||||||
|
try:
|
||||||
|
output = subprocess.check_output(
|
||||||
|
[
|
||||||
|
"rocm-smi",
|
||||||
|
"--showdriverversion",
|
||||||
|
"--csv",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
versions = set(output.decode().strip().split("\n"))
|
||||||
|
versions.discard("name, value")
|
||||||
|
ver = versions.pop()
|
||||||
|
ver = ver.replace('"Driver version", ', "").replace('"', "")
|
||||||
|
|
||||||
|
return {"ROCM Driver Version": ver}
|
||||||
|
except subprocess.SubprocessError:
|
||||||
|
return {"ROCM Driver Version": "Not Available"}
|
||||||
|
else:
|
||||||
return {"CUDA Driver Version": "Not Available"}
|
return {"CUDA Driver Version": "Not Available"}
|
||||||
|
|
||||||
|
|
||||||
@@ -160,16 +234,31 @@ def get_gpu_topology():
|
|||||||
"""
|
"""
|
||||||
Get GPU topology information.
|
Get GPU topology information.
|
||||||
"""
|
"""
|
||||||
try:
|
if is_cuda_v2():
|
||||||
result = subprocess.run(
|
try:
|
||||||
["nvidia-smi", "topo", "-m"],
|
result = subprocess.run(
|
||||||
stdout=subprocess.PIPE,
|
["nvidia-smi", "topo", "-m"],
|
||||||
stderr=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
text=True,
|
stderr=subprocess.PIPE,
|
||||||
check=True,
|
text=True,
|
||||||
)
|
check=True,
|
||||||
return "\n" + result.stdout if result.returncode == 0 else None
|
)
|
||||||
except subprocess.SubprocessError:
|
return "\n" + result.stdout if result.returncode == 0 else None
|
||||||
|
except subprocess.SubprocessError:
|
||||||
|
return None
|
||||||
|
elif is_hip():
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["rocm-smi", "--showtopotype"],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True,
|
||||||
|
check=True,
|
||||||
|
)
|
||||||
|
return "\n" + result.stdout if result.returncode == 0 else None
|
||||||
|
except subprocess.SubprocessError:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@@ -196,7 +285,10 @@ def check_env():
|
|||||||
|
|
||||||
gpu_topo = get_gpu_topology()
|
gpu_topo = get_gpu_topology()
|
||||||
if gpu_topo:
|
if gpu_topo:
|
||||||
env_info["NVIDIA Topology"] = gpu_topo
|
if is_cuda_v2():
|
||||||
|
env_info["NVIDIA Topology"] = gpu_topo
|
||||||
|
elif is_hip():
|
||||||
|
env_info["AMD Topology"] = gpu_topo
|
||||||
|
|
||||||
hypervisor_vendor = get_hypervisor_vendor()
|
hypervisor_vendor = get_hypervisor_vendor()
|
||||||
if hypervisor_vendor:
|
if hypervisor_vendor:
|
||||||
|
|||||||
Reference in New Issue
Block a user