[Misc] add collect_env feat (#218)

Signed-off-by: Lidang-Jiang <lidangjiang@gmail.com>
This commit is contained in:
Lidang Jiang
2026-02-27 12:19:58 +08:00
committed by GitHub
parent d425a0d0e9
commit 153093d3b3

695
collect_env.py Normal file
View File

@@ -0,0 +1,695 @@
# SPDX-License-Identifier: Apache-2.0
# vLLM-Kunlun Environment Information Collection Tool (Fixed Version)
"""
Environment information collection script for Kunlun XPU
Fixed the following issues:
1. Device name displayed as "GPU" → Now correctly shows "P800 OAM"
2. XRE version command error → Now parsed from xpu-smi output
3. vLLM-Kunlun version hardcoded → Now fetched from pip package metadata
"""
import os
import re
import subprocess
import sys
from collections import namedtuple
# =============================================================================
# Part 1: Basic Utility Functions
# =============================================================================
def run(command):
    """Execute a command and capture its exit status and output.

    A string command is run through the shell; any other value (typically a
    list of arguments) is executed directly without a shell.

    Args:
        command: Command as a string or a list of arguments.

    Returns:
        tuple: (return_code, stdout, stderr), with both streams decoded as
        UTF-8 and stripped of surrounding whitespace. When the executable
        cannot be found the result is (127, "", "Command not found"); when it
        cannot be started for another OS-level reason (for example missing
        execute permission) the result is (126, "", <error text>).
    """
    use_shell = isinstance(command, str)
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=use_shell,
            check=False,
        )
    except FileNotFoundError:
        return 127, "", "Command not found"
    except OSError as exc:
        # A diagnostics tool must not abort because one probe cannot start.
        return 126, "", str(exc)
    # errors="replace": vendor tools may emit bytes that are not valid UTF-8;
    # a lossy decode is preferable to crashing the whole report.
    stdout = completed.stdout.decode("utf-8", errors="replace").strip()
    stderr = completed.stderr.decode("utf-8", errors="replace").strip()
    return completed.returncode, stdout, stderr
def run_and_read_all(run_lambda, command):
    """Run *command* through *run_lambda* and return its stdout.

    Returns None when the command exits with a non-zero status.
    """
    status, stdout, _stderr = run_lambda(command)
    return stdout if status == 0 else None
def run_and_parse_first_match(run_lambda, command, regex):
    """Run *command* and return group 1 of the first *regex* match in its stdout.

    Returns None when the command exits non-zero or the pattern does not match.
    """
    status, stdout, _stderr = run_lambda(command)
    if status != 0:
        return None
    found = re.search(regex, stdout)
    return found.group(1) if found is not None else None
# Probe for PyTorch once at import time. Besides ImportError, the other
# exception types are treated the same way: torch counts as unavailable.
try:
    import torch
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False
else:
    TORCH_AVAILABLE = True
# =============================================================================
# Part 2: General System Information Collection (Reusing vLLM Original Logic)
# =============================================================================
def get_platform():
    """Return a normalized platform name.

    Yields "linux", "win32" or "darwin" when sys.platform starts with one of
    those prefixes; otherwise returns sys.platform unchanged.
    """
    for known in ("linux", "win32", "darwin"):
        if sys.platform.startswith(known):
            return known
    return sys.platform
def get_os(run_lambda):
    """Return a human-readable OS description followed by the machine type.

    On Linux the distribution name is looked up first via the PRETTY_NAME entry
    of /etc/*-release and then via `lsb_release -d`. When neither probe yields
    a name, or on other platforms, the normalized platform name is used.
    """
    from platform import machine

    if get_platform() == "linux":
        # (shell command, regex whose group 1 is the OS name), in priority order.
        probes = (
            (
                "cat /etc/*-release 2>/dev/null | grep PRETTY_NAME | head -1",
                r'PRETTY_NAME="(.*)"',
            ),
            ("lsb_release -d 2>/dev/null", r"Description:\s*(.*)"),
        )
        for command, pattern in probes:
            status, stdout, _stderr = run_lambda(command)
            if status != 0 or not stdout:
                continue
            found = re.search(pattern, stdout)
            if found:
                return f"{found.group(1)} ({machine()})"
    return f"{get_platform()} ({machine()})"
def get_gcc_version(run_lambda):
    """Return the text following "gcc " in `gcc --version`, or None if unavailable."""
    version_pattern = r"gcc (.*)"
    return run_and_parse_first_match(run_lambda, "gcc --version", version_pattern)
def get_clang_version(run_lambda):
    """Return the text following "clang version " in `clang --version`, or None."""
    version_pattern = r"clang version (.*)"
    return run_and_parse_first_match(run_lambda, "clang --version", version_pattern)
def get_cmake_version(run_lambda):
    """Return the text following "cmake " in `cmake --version`, or None if unavailable."""
    version_pattern = r"cmake (.*)"
    return run_and_parse_first_match(run_lambda, "cmake --version", version_pattern)
def get_libc_version():
    """Return the libc name and version joined by "-", or "N/A" off Linux."""
    import platform

    if get_platform() == "linux":
        return "-".join(platform.libc_ver())
    return "N/A"
def get_python_platform():
    """Return the platform identification string reported by platform.platform()."""
    from platform import platform as describe_platform

    return describe_platform()
def get_cpu_info(run_lambda):
    """Return `lscpu` output on Linux, its stderr on failure, or "N/A" elsewhere."""
    if get_platform() != "linux":
        return "N/A"
    status, stdout, stderr = run_lambda("lscpu")
    if status == 0:
        return stdout
    return stderr
def get_pip_packages(run_lambda, patterns=None):
    """List installed pip packages whose freeze line contains a pattern.

    Args:
        run_lambda: Callable that executes a command and returns
            (return_code, stdout, stderr).
        patterns: Iterable of case-insensitive substrings to keep. Defaults to
            a set of torch / vLLM / Kunlun related names.

    Returns:
        tuple: (pip label, newline-joined matching lines). When `pip list`
        fails the result is ("pip3", "").
    """
    if patterns is None:
        patterns = {
            "torch",
            "numpy",
            "triton",
            "transformers",
            "vllm",
            "kunlun",
            "xpu",
            "bkcl",
            "xmlir",
        }
    listing = run_and_read_all(
        run_lambda, [sys.executable, "-mpip", "list", "--format=freeze"]
    )
    if listing is None:
        return "pip3", ""
    kept = []
    for entry in listing.splitlines():
        lowered = entry.lower()
        if any(name.lower() in lowered for name in patterns):
            kept.append(entry)
    pip_version = "pip3" if sys.version[0] == "3" else "pip"
    return pip_version, "\n".join(kept)
def get_conda_packages(run_lambda, patterns=None):
    """List conda packages whose `conda list` line contains a pattern.

    The conda executable is taken from the CONDA_EXE environment variable,
    falling back to "conda". Comment lines (starting with "#") are dropped.

    Args:
        run_lambda: Callable that executes a command and returns
            (return_code, stdout, stderr).
        patterns: Iterable of case-insensitive substrings to keep. Defaults to
            a set of torch / Kunlun related names.

    Returns:
        str: Newline-joined matching lines, or None when `conda list` fails.
    """
    if patterns is None:
        patterns = {
            "torch",
            "numpy",
            "triton",
            "transformers",
            "kunlun",
            "xpu",
            "bkcl",
            "xmlir",
        }
    conda_exe = os.environ.get("CONDA_EXE", "conda")
    listing = run_and_read_all(run_lambda, [conda_exe, "list"])
    if listing is None:
        return None
    kept = []
    for entry in listing.splitlines():
        if entry.startswith("#"):
            continue
        lowered = entry.lower()
        if any(name.lower() in lowered for name in patterns):
            kept.append(entry)
    return "\n".join(kept)
# =============================================================================
# Part 3: Kunlun-Specific Information Collection (Core Fix)
# =============================================================================
def parse_xpu_smi_output(run_lambda):
    """Run `xpu-smi` and parse its table output.

    The expected layout resembles nvidia-smi, for example:

        | XPU-SMI        Driver Version: 515.58      XPU-RT Version: N/A      |
        |   0  P800 OAM        N/A  | 00000000:52:00.0 N/A |                0 |
        | N/A  43C  N/A  85W / 400W |      4MiB / 98304MiB |    0%    Default |

    Args:
        run_lambda: Callable that executes a command and returns
            (return_code, stdout, stderr).

    Returns:
        dict: Keys "raw_output", "driver_version", "xre_version" (None when
        the header shows "N/A") and "devices", a list of dicts with "id",
        "name", "bus_id", "memory_used_mib" and "memory_total_mib".
        Returns None when the command fails or prints nothing.
    """
    rc, output, _ = run_lambda("xpu-smi")
    if rc != 0 or not output:
        return None

    result = {
        "raw_output": output,
        "driver_version": None,
        "xre_version": None,
        "devices": [],
    }

    # Header row carries both the driver and the runtime (XRE) version.
    header_match = re.search(
        r"Driver Version:\s*(\S+)\s+XPU-RT Version:\s*(\S+)", output
    )
    if header_match:
        result["driver_version"] = header_match.group(1)
        xre = header_match.group(2)
        result["xre_version"] = xre if xre != "N/A" else None

    # Device row: numeric id, a one- or two-word name, a persistence flag,
    # then the bus id in the next column.
    device_pattern = re.compile(
        r"\|\s*(\d+)\s+(\S+(?:\s+\S+)?)\s+(?:N/A|On|Off)\s*\|"  # ID and name
        r"\s*([0-9a-fA-F:\.]+)\s*"  # Bus-Id
    )
    # Anchor only on the "<used>MiB / <total>MiB" pair. Requiring exact
    # fan/temperature/power text in the first column made the memory size
    # fall back to 0 whenever one of those fields was formatted differently.
    memory_pattern = re.compile(r"(\d+)MiB\s*/\s*(\d+)MiB")

    lines = output.splitlines()
    for index, line in enumerate(lines):
        device_match = device_pattern.search(line)
        if device_match is None:
            continue

        # Memory usage is reported on the row right after the device row.
        memory_used = 0
        memory_total = 0
        if index + 1 < len(lines):
            mem_match = memory_pattern.search(lines[index + 1])
            if mem_match:
                memory_used = int(mem_match.group(1))
                memory_total = int(mem_match.group(2))

        result["devices"].append(
            {
                "id": int(device_match.group(1)),
                "name": device_match.group(2).strip(),
                "bus_id": device_match.group(3),
                "memory_used_mib": memory_used,
                "memory_total_mib": memory_total,
            }
        )
    return result
def get_kunlun_gpu_info(run_lambda):
    """Describe the detected Kunlun XPU devices, one per line.

    Device names come from the parsed `xpu-smi` output when available. If that
    yields no devices and PyTorch is importable, the torch.cuda device
    properties are used instead.

    Returns:
        str: Lines of the form "XPU <id>: <name> (<size>GB)", an
        "Error: ..." string if the PyTorch fallback raises, or None when
        neither source is available.
    """
    parsed = parse_xpu_smi_output(run_lambda)
    if parsed and parsed["devices"]:
        return "\n".join(
            f"XPU {dev['id']}: {dev['name']} ({dev['memory_total_mib'] / 1024:.1f}GB)"
            for dev in parsed["devices"]
        )

    if not TORCH_AVAILABLE:
        return None

    # Fallback path: query the torch.cuda interface.
    try:
        descriptions = []
        for index in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(index)
            name = getattr(props, "name", "Kunlun XPU")
            if hasattr(props, "total_memory"):
                memory_gb = props.total_memory / (1024**3)
            else:
                memory_gb = 0
            descriptions.append(f"XPU {index}: {name} ({memory_gb:.1f}GB)")
        return "\n".join(descriptions)
    except Exception as e:
        return f"Error: {e}"
def get_kunlun_driver_version(run_lambda):
    """Return the driver version from the `xpu-smi` header.

    Returns:
        str: Driver version such as "515.58", or None when `xpu-smi` output
        is unavailable or the header could not be parsed.
    """
    parsed = parse_xpu_smi_output(run_lambda)
    if not parsed:
        return None
    return parsed["driver_version"] or None
def get_kunlun_xre_version(run_lambda):
    """Return the XRE (runtime) version from the `xpu-smi` header.

    The value is the "XPU-RT Version" field of the parsed header.

    Returns:
        str: The runtime version, or the placeholder
        "N/A (not installed or not detected)" when it is missing.
    """
    placeholder = "N/A (not installed or not detected)"
    parsed = parse_xpu_smi_output(run_lambda)
    if not parsed:
        return placeholder
    return parsed["xre_version"] or placeholder
def get_kunlun_topo(run_lambda):
    """Return the XPU topology reported by `xpu-smi topo -m`.

    When that command fails or prints nothing and PyTorch is importable, a
    one-line device count is returned instead.

    Returns:
        str: Topology text or a device-count summary; None if neither
        source produced a result.
    """
    topology = run_and_read_all(run_lambda, "xpu-smi topo -m")
    if topology:
        return topology

    if not TORCH_AVAILABLE:
        return None
    try:
        return f"Detected {torch.cuda.device_count()} Kunlun XPU device(s)"
    except Exception:
        return None
def get_bkcl_version(run_lambda):
    """Locate the BKCL communication library.

    Despite the name, this reports where the library was found rather than a
    version number. Two lookups are tried in order:

    1. `libbkcl.so` next to the installed `torch_xmlir` package.
    2. The first `ldconfig -p` entry that mentions "bkcl".

    Args:
        run_lambda: Callable that executes a command and returns
            (return_code, stdout, stderr).

    Returns:
        str: "Found at: <path>" for lookup 1, the raw ldconfig line for
        lookup 2, or None when neither finds the library.
    """
    try:
        import torch_xmlir
    except (ImportError, NameError, AttributeError, OSError):
        # Same failure set as the module-level torch import guard: a broken
        # native extension must not abort environment collection.
        torch_xmlir = None

    if torch_xmlir is not None:
        # __file__ can be missing or None (e.g. namespace packages).
        module_file = getattr(torch_xmlir, "__file__", None)
        if module_file:
            candidate = os.path.join(os.path.dirname(module_file), "libbkcl.so")
            if os.path.exists(candidate):
                return f"Found at: {candidate}"

    rc, out, _ = run_lambda("ldconfig -p 2>/dev/null | grep -i bkcl | head -1")
    if rc == 0 and out:
        return out
    return None
def get_vllm_kunlun_version():
    """Return the installed vLLM-Kunlun version.

    Sources are tried in order; the first that succeeds wins:

    1. importlib.metadata for the "vllm-kunlun" distribution.
    2. pkg_resources for the same distribution.
    3. vllm_kunlun.platforms.version.get_xvllm_version(), with a suffix
       marking the value as coming from code.

    Returns:
        str: Version string, or "N/A" when no source is available.
    """
    distribution = "vllm-kunlun"

    # Source 1: installed package metadata.
    try:
        from importlib.metadata import version as installed_version

        return installed_version(distribution)
    except Exception:
        pass

    # Source 2: setuptools' pkg_resources.
    try:
        import pkg_resources

        return pkg_resources.get_distribution(distribution).version
    except Exception:
        pass

    # Source 3: the version reported by the package code itself.
    try:
        from vllm_kunlun.platforms.version import get_xvllm_version

        return get_xvllm_version() + " (from code, may be inaccurate)"
    except ImportError:
        pass

    return "N/A"
def get_vllm_version():
    """Return the installed vLLM version.

    Package metadata is consulted first, then vllm.__version__.

    Returns:
        str: Version string, or "N/A" when vLLM cannot be found.
    """
    try:
        from importlib.metadata import version as installed_version

        return installed_version("vllm")
    except Exception:
        pass

    try:
        from vllm import __version__ as code_version

        return code_version
    except ImportError:
        pass

    return "N/A"
# =============================================================================
# Part 4: Environment Variable Collection
# =============================================================================
def get_kunlun_env_vars():
    """Return Kunlun-related environment variables as "NAME=value" lines.

    Variables are visited in sorted order. A variable is skipped when its
    name contains one of the secret terms (case-insensitive); otherwise it is
    kept when its upper-cased name starts with one of the known prefixes.

    Returns:
        str: One "NAME=value" line per kept variable, each terminated by a
        newline; an empty string when nothing matches.
    """
    kunlun_prefixes = (
        "XPU",
        "KUNLUN",
        "BKCL",
        "XCCL",
        "XRE",
        "TORCH",
        "VLLM",
    )
    secret_terms = ("secret", "token", "api", "access", "password")

    rendered = []
    for name, value in sorted(os.environ.items()):
        lowered = name.lower()
        if any(term in lowered for term in secret_terms):
            continue
        if name.upper().startswith(kunlun_prefixes):
            rendered.append(f"{name}={value}\n")
    return "".join(rendered)
# =============================================================================
# Part 5: Define Data Structure and Formatted Output
# =============================================================================
# Record of everything the collector gathers, grouped by origin.
_GENERAL_FIELDS = (
    "os",
    "gcc_version",
    "clang_version",
    "cmake_version",
    "libc_version",
    "python_version",
    "python_platform",
    "pip_version",
    "pip_packages",
    "conda_packages",
    "cpu_info",
)
_TORCH_FIELDS = ("torch_version", "is_debug_build")
_KUNLUN_FIELDS = (
    "kunlun_xpu_info",
    "kunlun_driver_version",
    "kunlun_xre_version",
    "bkcl_version",
    "kunlun_topo",
)
_VLLM_FIELDS = ("vllm_version", "vllm_kunlun_version", "env_vars")

KunlunSystemEnv = namedtuple(
    "KunlunSystemEnv",
    _GENERAL_FIELDS + _TORCH_FIELDS + _KUNLUN_FIELDS + _VLLM_FIELDS,
)
def get_kunlun_env_info():
    """Gather every piece of environment information into a KunlunSystemEnv.

    All external commands are executed through the module-level `run` helper.
    PyTorch fields are reported as "N/A" when torch could not be imported.

    Returns:
        KunlunSystemEnv: The populated record.
    """
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)

    if TORCH_AVAILABLE:
        torch_version, debug_mode_str = torch.__version__, str(torch.version.debug)
    else:
        torch_version = debug_mode_str = "N/A"

    # sys.version may span several lines; flatten it for single-line display.
    flat_version = sys.version.replace("\n", " ")
    pointer_bits = sys.maxsize.bit_length() + 1

    return KunlunSystemEnv(
        # General system information
        os=get_os(run_lambda),
        gcc_version=get_gcc_version(run_lambda),
        clang_version=get_clang_version(run_lambda),
        cmake_version=get_cmake_version(run_lambda),
        libc_version=get_libc_version(),
        python_version=f"{flat_version} ({pointer_bits}-bit runtime)",
        python_platform=get_python_platform(),
        pip_version=pip_version,
        pip_packages=pip_list_output,
        conda_packages=get_conda_packages(run_lambda),
        cpu_info=get_cpu_info(run_lambda),
        # PyTorch information
        torch_version=torch_version,
        is_debug_build=debug_mode_str,
        # Kunlun-specific information
        kunlun_xpu_info=get_kunlun_gpu_info(run_lambda),
        kunlun_driver_version=get_kunlun_driver_version(run_lambda),
        kunlun_xre_version=get_kunlun_xre_version(run_lambda),
        bkcl_version=get_bkcl_version(run_lambda),
        kunlun_topo=get_kunlun_topo(run_lambda),
        # vLLM related
        vllm_version=get_vllm_version(),
        vllm_kunlun_version=get_vllm_kunlun_version(),
        env_vars=get_kunlun_env_vars(),
    )
# Report template. pretty_str() fills the {placeholders} with str.format(),
# passing the KunlunSystemEnv fields as keyword arguments. The trailing
# .strip() removes the leading and trailing newlines of the literal.
kunlun_env_info_fmt = """
==============================
System Info
==============================
OS : {os}
GCC version : {gcc_version}
Clang version : {clang_version}
CMake version : {cmake_version}
Libc version : {libc_version}
==============================
PyTorch Info
==============================
PyTorch version : {torch_version}
Is debug build : {is_debug_build}
==============================
Python Environment
==============================
Python version : {python_version}
Python platform : {python_platform}
==============================
Kunlun / XPU Info
==============================
XPU models and configuration :
{kunlun_xpu_info}
Kunlun driver version : {kunlun_driver_version}
XRE (Runtime) version : {kunlun_xre_version}
BKCL version : {bkcl_version}
XPU Topology:
{kunlun_topo}
==============================
CPU Info
==============================
{cpu_info}
==============================
Versions of relevant libraries
==============================
{pip_packages}
{conda_packages}
==============================
vLLM-Kunlun Info
==============================
vLLM Version : {vllm_version}
vLLM-Kunlun Version : {vllm_kunlun_version}
==============================
Environment Variables
==============================
{env_vars}
""".strip()
def pretty_str(envinfo):
    """Render a KunlunSystemEnv record as the final text report.

    None values are shown as "Could not collect". Each non-empty line of the
    pip and conda package lists is prefixed with its source tag; an empty pip
    list becomes "No relevant packages" and an empty conda list stays empty.

    Args:
        envinfo: The KunlunSystemEnv record to format.

    Returns:
        str: The report produced from kunlun_env_info_fmt.
    """

    def _tag_lines(tag, text):
        # Prefix every non-empty line with "[tag] ".
        return "\n".join(f"[{tag}] {line}" for line in text.split("\n") if line)

    fields = {
        key: "Could not collect" if value is None else value
        for key, value in envinfo._asdict().items()
    }

    pip_listing = fields["pip_packages"]
    if pip_listing:
        fields["pip_packages"] = _tag_lines(envinfo.pip_version, pip_listing)
    else:
        fields["pip_packages"] = "No relevant packages"

    conda_listing = fields["conda_packages"]
    fields["conda_packages"] = (
        _tag_lines("conda", conda_listing) if conda_listing else ""
    )

    return kunlun_env_info_fmt.format(**fields)
def get_pretty_kunlun_env_info():
    """Collect the environment information and return it as a formatted report."""
    collected = get_kunlun_env_info()
    return pretty_str(collected)
def main():
    """Script entry point: print a progress notice, then the full report."""
    print("Collecting Kunlun XPU environment information...")
    print(get_pretty_kunlun_env_info())


if __name__ == "__main__":
    main()