195 lines
4.8 KiB
Python
195 lines
4.8 KiB
Python
"""Check environment configurations and dependency versions."""
|
|
|
|
import importlib
|
|
import os
|
|
import resource
|
|
import subprocess
|
|
import sys
|
|
from collections import OrderedDict, defaultdict
|
|
|
|
import torch
|
|
|
|
# List of packages to check versions for
|
|
PACKAGE_LIST = [
|
|
"sglang",
|
|
"flashinfer",
|
|
"triton",
|
|
"requests",
|
|
"tqdm",
|
|
"numpy",
|
|
"aiohttp",
|
|
"fastapi",
|
|
"hf_transfer",
|
|
"huggingface_hub",
|
|
"interegular",
|
|
"packaging",
|
|
"PIL",
|
|
"psutil",
|
|
"pydantic",
|
|
"uvicorn",
|
|
"uvloop",
|
|
"zmq",
|
|
"vllm",
|
|
"outlines",
|
|
"multipart",
|
|
"openai",
|
|
"tiktoken",
|
|
"anthropic",
|
|
"litellm",
|
|
]
|
|
|
|
|
|
def get_package_versions(packages):
|
|
"""
|
|
Get versions of specified packages.
|
|
"""
|
|
versions = {}
|
|
for package in packages:
|
|
package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
|
|
try:
|
|
module = importlib.import_module(package_name)
|
|
if hasattr(module, "__version__"):
|
|
versions[package_name] = module.__version__
|
|
except ModuleNotFoundError:
|
|
versions[package_name] = "Module Not Found"
|
|
return versions
|
|
|
|
|
|
def get_cuda_info():
|
|
"""
|
|
Get CUDA-related information if available.
|
|
"""
|
|
cuda_info = {"CUDA available": torch.cuda.is_available()}
|
|
|
|
if cuda_info["CUDA available"]:
|
|
cuda_info.update(_get_gpu_info())
|
|
cuda_info.update(_get_cuda_version_info())
|
|
|
|
return cuda_info
|
|
|
|
|
|
def _get_gpu_info():
|
|
"""
|
|
Get information about available GPUs.
|
|
"""
|
|
devices = defaultdict(list)
|
|
capabilities = defaultdict(list)
|
|
for k in range(torch.cuda.device_count()):
|
|
devices[torch.cuda.get_device_name(k)].append(str(k))
|
|
capability = torch.cuda.get_device_capability(k)
|
|
capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
|
|
|
|
gpu_info = {}
|
|
for name, device_ids in devices.items():
|
|
gpu_info[f"GPU {','.join(device_ids)}"] = name
|
|
|
|
if len(capabilities) == 1:
|
|
# All GPUs have the same compute capability
|
|
cap, gpu_ids = list(capabilities.items())[0]
|
|
gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
|
|
else:
|
|
# GPUs have different compute capabilities
|
|
for cap, gpu_ids in capabilities.items():
|
|
gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
|
|
|
|
return gpu_info
|
|
|
|
|
|
def _get_cuda_version_info():
|
|
"""
|
|
Get CUDA version information.
|
|
"""
|
|
from torch.utils.cpp_extension import CUDA_HOME
|
|
|
|
cuda_info = {"CUDA_HOME": CUDA_HOME}
|
|
|
|
if CUDA_HOME and os.path.isdir(CUDA_HOME):
|
|
cuda_info.update(_get_nvcc_info())
|
|
cuda_info.update(_get_cuda_driver_version())
|
|
|
|
return cuda_info
|
|
|
|
|
|
def _get_nvcc_info():
|
|
"""
|
|
Get NVCC version information.
|
|
"""
|
|
from torch.utils.cpp_extension import CUDA_HOME
|
|
|
|
try:
|
|
nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
|
|
nvcc_output = (
|
|
subprocess.check_output(f'"{nvcc}" -V', shell=True).decode("utf-8").strip()
|
|
)
|
|
return {
|
|
"NVCC": nvcc_output[
|
|
nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind("Build")
|
|
].strip()
|
|
}
|
|
except subprocess.SubprocessError:
|
|
return {"NVCC": "Not Available"}
|
|
|
|
|
|
def _get_cuda_driver_version():
|
|
"""
|
|
Get CUDA driver version.
|
|
"""
|
|
versions = set()
|
|
try:
|
|
output = subprocess.check_output(
|
|
[
|
|
"nvidia-smi",
|
|
"--query-gpu=driver_version",
|
|
"--format=csv,noheader,nounits",
|
|
]
|
|
)
|
|
versions = set(output.decode().strip().split("\n"))
|
|
if len(versions) == 1:
|
|
return {"CUDA Driver Version": versions.pop()}
|
|
else:
|
|
return {"CUDA Driver Versions": ", ".join(sorted(versions))}
|
|
except subprocess.SubprocessError:
|
|
return {"CUDA Driver Version": "Not Available"}
|
|
|
|
|
|
def get_gpu_topology():
|
|
"""
|
|
Get GPU topology information.
|
|
"""
|
|
try:
|
|
result = subprocess.run(
|
|
["nvidia-smi", "topo", "-m"],
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.PIPE,
|
|
text=True,
|
|
check=True,
|
|
)
|
|
return "\n" + result.stdout if result.returncode == 0 else None
|
|
except subprocess.SubprocessError:
|
|
return None
|
|
|
|
|
|
def check_env():
|
|
"""
|
|
Check and print environment information.
|
|
"""
|
|
env_info = OrderedDict()
|
|
env_info["Python"] = sys.version.replace("\n", "")
|
|
env_info.update(get_cuda_info())
|
|
env_info["PyTorch"] = torch.__version__
|
|
env_info.update(get_package_versions(PACKAGE_LIST))
|
|
|
|
gpu_topo = get_gpu_topology()
|
|
if gpu_topo:
|
|
env_info["NVIDIA Topology"] = gpu_topo
|
|
|
|
ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
|
|
env_info["ulimit soft"] = ulimit_soft
|
|
|
|
for k, v in env_info.items():
|
|
print(f"{k}: {v}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
check_env()
|