696 lines
20 KiB
Python
696 lines
20 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# vLLM-Kunlun Environment Information Collection Tool (Fixed Version)
|
||
|
|
"""
|
||
|
|
Environment information collection script for Kunlun XPU
|
||
|
|
Fixed the following issues:
|
||
|
|
1. Device name displayed as "GPU" → Now correctly shows "P800 OAM"
|
||
|
|
2. XRE version command error → Now parsed from xpu-smi output
|
||
|
|
3. vLLM-Kunlun version hardcoded → Now fetched from pip package metadata
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
from collections import namedtuple
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# Part 1: Basic Utility Functions
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
|
||
|
|
def run(command):
    """
    Execute a command and capture its exit code, stdout and stderr.

    A string command is run through the shell (so pipes and redirections
    work); any other command (e.g. a list of arguments) is run directly.

    Args:
        command: Command as string or list

    Returns:
        tuple: (return_code, stdout, stderr). Both streams are decoded as
        UTF-8 and stripped of surrounding whitespace. If the executable does
        not exist the result is (127, "", "Command not found"); any other
        OS-level launch failure yields (126, "", <error message>).
    """
    shell = isinstance(command, str)
    try:
        p = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,  # Capture standard output
            stderr=subprocess.PIPE,  # Capture error output
            shell=shell,
        )
        raw_output, raw_err = p.communicate()
    except FileNotFoundError:
        return 127, "", "Command not found"
    except OSError as exc:
        # e.g. PermissionError for a non-executable path: report it as a
        # failed command instead of aborting the whole collection run.
        return 126, "", str(exc)

    # Tool output is not guaranteed to be valid UTF-8 (localized messages,
    # binary noise); replace undecodable bytes rather than raising.
    output = raw_output.decode("utf-8", errors="replace").strip()
    err = raw_err.decode("utf-8", errors="replace").strip()
    return p.returncode, output, err
|
||
|
|
|
||
|
|
|
||
|
|
def run_and_read_all(run_lambda, command):
    """
    Run a command and hand back its standard output.

    Args:
        run_lambda: Callable taking the command and returning
            (return_code, stdout, stderr)
        command: Command forwarded unchanged to run_lambda

    Returns:
        The stdout value when the return code is 0, otherwise None
    """
    return_code, stdout, _stderr = run_lambda(command)
    return stdout if return_code == 0 else None
|
||
|
|
|
||
|
|
|
||
|
|
def run_and_parse_first_match(run_lambda, command, regex):
    """
    Run a command and pull the first capture group out of its output.

    Args:
        run_lambda: Callable taking the command and returning
            (return_code, stdout, stderr)
        command: Command forwarded unchanged to run_lambda
        regex: Pattern with at least one capture group, searched in stdout

    Returns:
        Group 1 of the first match, or None when the command fails or the
        pattern does not match
    """
    return_code, stdout, _stderr = run_lambda(command)
    if return_code == 0:
        found = re.search(regex, stdout)
        if found is not None:
            return found.group(1)
    return None
|
||
|
|
|
||
|
|
|
||
|
|
# Check if PyTorch is available
|
||
|
|
# Optional dependency probe: TORCH_AVAILABLE records whether `import torch`
# succeeded. Any of the listed exception types raised during the import is
# treated as "PyTorch not available" rather than aborting the script.
try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# Part 2: General System Information Collection (Reusing vLLM Original Logic)
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
|
||
|
|
def get_platform():
    """
    Normalize sys.platform to a short platform family name.

    Returns:
        str: "linux", "win32" or "darwin" when sys.platform starts with one
        of those names, otherwise sys.platform unchanged
    """
    current = sys.platform
    for family in ("linux", "win32", "darwin"):
        if current.startswith(family):
            return family
    return current
|
||
|
|
|
||
|
|
|
||
|
|
def get_os(run_lambda):
    """
    Describe the operating system together with the machine architecture.

    On Linux, two probes are tried in order: PRETTY_NAME from /etc/*-release,
    then the Description line of `lsb_release -d`. If neither yields a match
    (or the platform is not Linux) the platform name is used instead.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        str: "<os description> (<machine>)"
    """
    from platform import machine

    if get_platform() == "linux":
        # (shell command, regex whose group 1 is the OS description)
        probes = (
            (
                "cat /etc/*-release 2>/dev/null | grep PRETTY_NAME | head -1",
                r'PRETTY_NAME="(.*)"',
            ),
            ("lsb_release -d 2>/dev/null", r"Description:\s*(.*)"),
        )
        for shell_cmd, pattern in probes:
            code, text, _ = run_lambda(shell_cmd)
            if code != 0 or not text:
                continue
            hit = re.search(pattern, text)
            if hit:
                return f"{hit.group(1)} ({machine()})"

    return f"{get_platform()} ({machine()})"
|
||
|
|
|
||
|
|
|
||
|
|
def get_gcc_version(run_lambda):
    """
    Query the GCC version.

    Returns:
        The text following "gcc " in the output of `gcc --version`, or None
        when the command fails or the pattern is absent
    """
    gcc_cmd = "gcc --version"
    version_regex = r"gcc (.*)"
    return run_and_parse_first_match(run_lambda, gcc_cmd, version_regex)
|
||
|
|
|
||
|
|
|
||
|
|
def get_clang_version(run_lambda):
    """
    Query the Clang version.

    Returns:
        The text following "clang version " in the output of
        `clang --version`, or None when the command fails or the pattern is
        absent
    """
    clang_cmd = "clang --version"
    version_regex = r"clang version (.*)"
    return run_and_parse_first_match(run_lambda, clang_cmd, version_regex)
|
||
|
|
|
||
|
|
|
||
|
|
def get_cmake_version(run_lambda):
    """
    Query the CMake version.

    Returns:
        The text following "cmake " in the output of `cmake --version`, or
        None when the command fails or the pattern is absent
    """
    cmake_cmd = "cmake --version"
    version_regex = r"cmake (.*)"
    return run_and_parse_first_match(run_lambda, cmake_cmd, version_regex)
|
||
|
|
|
||
|
|
|
||
|
|
def get_libc_version():
    """
    Report the C library the Python interpreter is linked against.

    Returns:
        str: The parts of platform.libc_ver() joined with "-" on Linux,
        "N/A" on every other platform
    """
    import platform

    if get_platform() == "linux":
        libc_name_and_version = platform.libc_ver()
        return "-".join(libc_name_and_version)
    return "N/A"
|
||
|
|
|
||
|
|
|
||
|
|
def get_python_platform():
    """
    Return the platform identification string from platform.platform().
    """
    from platform import platform as describe_platform

    return describe_platform()
|
||
|
|
|
||
|
|
|
||
|
|
def get_cpu_info(run_lambda):
    """
    Collect CPU details via `lscpu`.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        str: lscpu's stdout on success, its stderr on failure, or "N/A" on
        non-Linux platforms
    """
    if get_platform() != "linux":
        return "N/A"

    code, stdout, stderr = run_lambda("lscpu")
    if code == 0:
        return stdout
    return stderr
|
||
|
|
|
||
|
|
|
||
|
|
def get_pip_packages(run_lambda, patterns=None):
    """
    List installed pip packages whose freeze line mentions a relevant name.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)
        patterns: Iterable of name fragments, matched case-insensitively as
            substrings of each `pip list --format=freeze` line. Defaults to
            the torch / vLLM / Kunlun related names below.

    Returns:
        tuple: (pip_label, packages) where packages is the matching lines
        joined by newlines. When the pip command fails, ("pip3", "").
    """
    if patterns is None:
        patterns = {
            "torch",
            "numpy",
            "triton",
            "transformers",
            "vllm",
            "kunlun",
            "xpu",
            "bkcl",
            "xmlir",
        }

    # Use the running interpreter's pip so the listing matches this environment.
    listing = run_and_read_all(
        run_lambda, [sys.executable, "-mpip", "list", "--format=freeze"]
    )
    if listing is None:
        return "pip3", ""

    matched = []
    for entry in listing.splitlines():
        lowered = entry.lower()
        if any(name.lower() in lowered for name in patterns):
            matched.append(entry)

    pip_label = "pip3" if sys.version[0] == "3" else "pip"
    return pip_label, "\n".join(matched)
|
||
|
|
|
||
|
|
|
||
|
|
def get_conda_packages(run_lambda, patterns=None):
    """
    List conda packages whose `conda list` line mentions a relevant name.

    The conda executable is taken from the CONDA_EXE environment variable,
    falling back to "conda".

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)
        patterns: Iterable of name fragments, matched case-insensitively as
            substrings of each line. Defaults to the names below.

    Returns:
        str: Matching non-comment lines joined by newlines, or None when the
        conda command fails
    """
    if patterns is None:
        patterns = {
            "torch",
            "numpy",
            "triton",
            "transformers",
            "kunlun",
            "xpu",
            "bkcl",
            "xmlir",
        }

    conda_exe = os.environ.get("CONDA_EXE", "conda")
    listing = run_and_read_all(run_lambda, [conda_exe, "list"])
    if listing is None:
        return None

    kept = []
    for entry in listing.splitlines():
        if entry.startswith("#"):
            # Header/comment lines emitted by `conda list`
            continue
        lowered = entry.lower()
        if any(name.lower() in lowered for name in patterns):
            kept.append(entry)
    return "\n".join(kept)
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# Part 3: Kunlun-Specific Information Collection (Core Fix)
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
|
||
|
|
def parse_xpu_smi_output(run_lambda):
    """
    Run `xpu-smi` and parse its table output.

    The table resembles nvidia-smi output, for example:

        | XPU-SMI Driver Version: 515.58 XPU-RT Version: N/A |
        | 0 P800 OAM N/A | 00000000:52:00.0 N/A | 0 |
        | N/A 43C N/A 85W / 400W | 4MiB / 98304MiB | 0% Default |

    A device row carries the ID, name and bus ID; the row directly after it
    is checked for the memory usage figures.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        dict: Keys "raw_output", "driver_version", "xre_version" (None when
        reported as "N/A") and "devices" (list of dicts with "id", "name",
        "bus_id", "memory_used_mib", "memory_total_mib"). None when the
        command fails or prints nothing.
    """
    rc, output, _ = run_lambda("xpu-smi")
    if rc != 0 or not output:
        return None

    # Header row: driver version and XPU-RT (XRE) version
    header = re.search(r"Driver Version:\s*(\S+)\s+XPU-RT Version:\s*(\S+)", output)
    driver_version = header.group(1) if header else None
    xre_version = None
    if header and header.group(2) != "N/A":
        xre_version = header.group(2)

    device_re = re.compile(
        r"\|\s*(\d+)\s+(\S+(?:\s+\S+)?)\s+(?:N/A|On|Off)\s*\|"  # ID and Name
        r"\s*([0-9a-fA-F:\.]+)\s*"  # Bus-Id
    )
    memory_re = re.compile(
        r"\|\s*N/A\s+\d+C\s+N/A\s+\d+W\s*/\s*\d+W\s*\|"
        r"\s*(\d+)MiB\s*/\s*(\d+)MiB\s*\|"  # Memory-Usage / Total
    )

    rows = output.split("\n")
    devices = []
    for idx, row in enumerate(rows):
        dev = device_re.search(row)
        if not dev:
            continue

        # Memory figures default to 0 when the following row is missing or
        # does not look like a memory row.
        used_mib = total_mib = 0
        if idx + 1 < len(rows):
            mem = memory_re.search(rows[idx + 1])
            if mem:
                used_mib, total_mib = int(mem.group(1)), int(mem.group(2))

        devices.append(
            {
                "id": int(dev.group(1)),
                "name": dev.group(2).strip(),  # e.g. "P800 OAM"
                "bus_id": dev.group(3),
                "memory_used_mib": used_mib,
                "memory_total_mib": total_mib,
            }
        )

    return {
        "raw_output": output,
        "driver_version": driver_version,
        "xre_version": xre_version,
        "devices": devices,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def get_kunlun_gpu_info(run_lambda):
    """
    Describe the Kunlun XPU devices, one line per device.

    The device names come from parsed xpu-smi output when available. Only if
    that yields no devices is the torch.cuda interface consulted (used when
    PyTorch imported successfully).

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        str: Lines of the form "XPU <id>: <name> (<size>GB)", an
        "Error: ..." string if the PyTorch query raises, or None when no
        source of information is available
    """
    smi = parse_xpu_smi_output(run_lambda)
    if smi and smi["devices"]:
        described = []
        for dev in smi["devices"]:
            # xpu-smi reports MiB; convert to GiB for display
            memory_gb = dev["memory_total_mib"] / 1024
            described.append(f"XPU {dev['id']}: {dev['name']} ({memory_gb:.1f}GB)")
        return "\n".join(described)

    if not TORCH_AVAILABLE:
        return None

    # Fallback path: query the devices through torch.cuda
    try:
        described = []
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            name = getattr(props, "name", "Kunlun XPU")
            if hasattr(props, "total_memory"):
                memory_gb = props.total_memory / (1024**3)
            else:
                memory_gb = 0
            described.append(f"XPU {i}: {name} ({memory_gb:.1f}GB)")
        return "\n".join(described)
    except Exception as e:
        return f"Error: {e}"
|
||
|
|
|
||
|
|
|
||
|
|
def get_kunlun_driver_version(run_lambda):
    """
    Read the Kunlun driver version from the xpu-smi output header.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        str: Driver version, e.g., "515.58", or None when xpu-smi output is
        unavailable or has no driver version
    """
    smi = parse_xpu_smi_output(run_lambda)
    if not smi:
        return None
    return smi["driver_version"] or None
|
||
|
|
|
||
|
|
|
||
|
|
def get_kunlun_xre_version(run_lambda):
    """
    Read the Kunlun XRE (runtime) version from the xpu-smi output header.

    The value is the "XPU-RT Version" field of the parsed header.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        str: The XRE version, or "N/A (not installed or not detected)" when
        no version could be parsed
    """
    smi = parse_xpu_smi_output(run_lambda)
    detected = smi["xre_version"] if smi else None
    return detected or "N/A (not installed or not detected)"
|
||
|
|
|
||
|
|
|
||
|
|
def get_kunlun_topo(run_lambda):
    """
    Get Kunlun XPU topology information.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        str: Output of `xpu-smi topo -m` when it succeeds with non-empty
        output; otherwise a device-count summary from torch.cuda when
        PyTorch is available; otherwise None
    """
    topology = run_and_read_all(run_lambda, "xpu-smi topo -m")
    if topology:
        return topology

    if not TORCH_AVAILABLE:
        return None

    # Fallback: only the device count is reported
    try:
        count = torch.cuda.device_count()
        return f"Detected {count} Kunlun XPU device(s)"
    except Exception:
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def get_bkcl_version(run_lambda):
    """
    Locate the BKCL (Baidu Kunlun Communication Library) shared library.

    Two lookups are tried in order:
      1. A libbkcl.so file next to the installed torch_xmlir module.
      2. The first BKCL entry listed by `ldconfig -p`.

    Args:
        run_lambda: Callable taking a command and returning
            (return_code, stdout, stderr)

    Returns:
        str: "Found at: <path>" for lookup 1, the ldconfig line for lookup 2,
        or None when BKCL was not found
    """
    # Method 1: look beside torch_xmlir. Importing a native extension can fail
    # with more than ImportError (e.g. OSError for a missing shared library),
    # so mirror the exception set used for the module-level torch import.
    try:
        import torch_xmlir
    except (ImportError, NameError, AttributeError, OSError):
        torch_xmlir = None

    # __file__ may be absent or None (namespace packages); skip in that case.
    module_file = getattr(torch_xmlir, "__file__", None)
    if module_file:
        candidate = os.path.join(os.path.dirname(module_file), "libbkcl.so")
        if os.path.exists(candidate):
            return f"Found at: {candidate}"

    # Method 2: Search from ldconfig
    rc, out, _ = run_lambda("ldconfig -p 2>/dev/null | grep -i bkcl | head -1")
    if rc == 0 and out:
        return out

    return None
|
||
|
|
|
||
|
|
|
||
|
|
def get_vllm_kunlun_version():
    """
    Get the vLLM-Kunlun version.

    Sources are tried in order of reliability:
      1. importlib.metadata (installed package metadata)
      2. pkg_resources (installed package metadata, older tooling)
      3. vllm_kunlun.platforms.version.get_xvllm_version() (value from code,
         flagged in the result because it may differ from the installed
         package version)

    Every source is best-effort: a failure of any kind moves on to the next
    source instead of aborting the environment report.

    Returns:
        str: Version number, or "N/A" when no source succeeds
    """
    # Method 1 (recommended): Use importlib.metadata (Python 3.8+)
    try:
        from importlib.metadata import version

        return version("vllm-kunlun")
    except Exception:
        pass

    # Method 2: Use pkg_resources
    try:
        import pkg_resources

        return pkg_resources.get_distribution("vllm-kunlun").version
    except Exception:
        pass

    # Method 3 (fallback): Get from code (may be inaccurate).
    # Importing vllm_kunlun may load native libraries and fail with errors
    # other than ImportError, so this is as tolerant as methods 1 and 2.
    try:
        from vllm_kunlun.platforms.version import get_xvllm_version

        return f"{get_xvllm_version()} (from code, may be inaccurate)"
    except Exception:
        pass

    return "N/A"
|
||
|
|
|
||
|
|
|
||
|
|
def get_vllm_version():
    """
    Get the vLLM main package version.

    The installed package metadata is consulted first; if that fails,
    vllm.__version__ is used. Both lookups are best-effort: importing vllm
    can fail with errors other than ImportError on a misconfigured system,
    and this tool must still produce a report in that case.

    Returns:
        str: Version number, or "N/A" when neither lookup succeeds
    """
    try:
        from importlib.metadata import version

        return version("vllm")
    except Exception:
        pass

    try:
        from vllm import __version__

        return __version__
    except Exception:
        pass

    return "N/A"
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# Part 4: Environment Variable Collection
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
|
||
|
|
def get_kunlun_env_vars():
    """
    Collect Kunlun-related environment variables.

    A variable is reported when its upper-cased name starts with one of the
    known prefixes. Variables whose name contains a secret-looking term are
    always left out.

    Returns:
        str: "NAME=value" entries sorted by name, each terminated by a
        newline; an empty string when nothing matches
    """
    kunlun_prefixes = (
        "XPU",
        "KUNLUN",
        "BKCL",
        "XCCL",
        "XRE",
        "TORCH",
        "VLLM",
    )
    secret_terms = ("secret", "token", "api", "access", "password")

    def _is_reportable(name):
        # Secret check comes first so credentials are never emitted.
        lowered = name.lower()
        if any(term in lowered for term in secret_terms):
            return False
        return name.upper().startswith(kunlun_prefixes)

    return "".join(
        f"{k}={v}\n" for k, v in sorted(os.environ.items()) if _is_reportable(k)
    )
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# Part 5: Define Data Structure and Formatted Output
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
# Immutable record holding every value shown in the environment report.
# Field order is part of the type: positional construction relies on it.
KunlunSystemEnv = namedtuple(
    "KunlunSystemEnv",
    (
        # General system information
        "os",
        "gcc_version",
        "clang_version",
        "cmake_version",
        "libc_version",
        "python_version",
        "python_platform",
        "pip_version",
        "pip_packages",
        "conda_packages",
        "cpu_info",
        # PyTorch information
        "torch_version",
        "is_debug_build",
        # Kunlun-specific information
        "kunlun_xpu_info",
        "kunlun_driver_version",
        "kunlun_xre_version",
        "bkcl_version",
        "kunlun_topo",
        # vLLM related
        "vllm_version",
        "vllm_kunlun_version",
        "env_vars",
    ),
)
|
||
|
|
|
||
|
|
|
||
|
|
def get_kunlun_env_info():
    """
    Collect all environment information into a KunlunSystemEnv record.

    Each field is filled by the matching get_* helper; commands are executed
    through the module's run() function. PyTorch fields are "N/A" when torch
    could not be imported.

    Returns:
        KunlunSystemEnv: The populated record
    """
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)

    # PyTorch information
    torch_version, debug_mode_str = (
        (torch.__version__, str(torch.version.debug))
        if TORCH_AVAILABLE
        else ("N/A", "N/A")
    )

    # sys.version may span several lines; flatten it for one-line display
    sys_version = sys.version.replace("\n", " ")

    # Filled step by step in the same order as the record's fields.
    fields = {}

    # General system information
    fields["os"] = get_os(run_lambda)
    fields["gcc_version"] = get_gcc_version(run_lambda)
    fields["clang_version"] = get_clang_version(run_lambda)
    fields["cmake_version"] = get_cmake_version(run_lambda)
    fields["libc_version"] = get_libc_version()
    fields["python_version"] = (
        f"{sys_version} ({sys.maxsize.bit_length() + 1}-bit runtime)"
    )
    fields["python_platform"] = get_python_platform()
    fields["pip_version"] = pip_version
    fields["pip_packages"] = pip_list_output
    fields["conda_packages"] = get_conda_packages(run_lambda)
    fields["cpu_info"] = get_cpu_info(run_lambda)

    # PyTorch information
    fields["torch_version"] = torch_version
    fields["is_debug_build"] = debug_mode_str

    # Kunlun-specific information
    fields["kunlun_xpu_info"] = get_kunlun_gpu_info(run_lambda)
    fields["kunlun_driver_version"] = get_kunlun_driver_version(run_lambda)
    fields["kunlun_xre_version"] = get_kunlun_xre_version(run_lambda)
    fields["bkcl_version"] = get_bkcl_version(run_lambda)
    fields["kunlun_topo"] = get_kunlun_topo(run_lambda)

    # vLLM related
    fields["vllm_version"] = get_vllm_version()
    fields["vllm_kunlun_version"] = get_vllm_kunlun_version()
    fields["env_vars"] = get_kunlun_env_vars()

    return KunlunSystemEnv(**fields)
|
||
|
|
|
||
|
|
|
||
|
|
# Output format template
|
||
|
|
kunlun_env_info_fmt = """
|
||
|
|
==============================
|
||
|
|
System Info
|
||
|
|
==============================
|
||
|
|
OS : {os}
|
||
|
|
GCC version : {gcc_version}
|
||
|
|
Clang version : {clang_version}
|
||
|
|
CMake version : {cmake_version}
|
||
|
|
Libc version : {libc_version}
|
||
|
|
==============================
|
||
|
|
PyTorch Info
|
||
|
|
==============================
|
||
|
|
PyTorch version : {torch_version}
|
||
|
|
Is debug build : {is_debug_build}
|
||
|
|
==============================
|
||
|
|
Python Environment
|
||
|
|
==============================
|
||
|
|
Python version : {python_version}
|
||
|
|
Python platform : {python_platform}
|
||
|
|
==============================
|
||
|
|
Kunlun / XPU Info
|
||
|
|
==============================
|
||
|
|
XPU models and configuration :
|
||
|
|
{kunlun_xpu_info}
|
||
|
|
Kunlun driver version : {kunlun_driver_version}
|
||
|
|
XRE (Runtime) version : {kunlun_xre_version}
|
||
|
|
BKCL version : {bkcl_version}
|
||
|
|
XPU Topology:
|
||
|
|
{kunlun_topo}
|
||
|
|
==============================
|
||
|
|
CPU Info
|
||
|
|
==============================
|
||
|
|
{cpu_info}
|
||
|
|
==============================
|
||
|
|
Versions of relevant libraries
|
||
|
|
==============================
|
||
|
|
{pip_packages}
|
||
|
|
{conda_packages}
|
||
|
|
==============================
|
||
|
|
vLLM-Kunlun Info
|
||
|
|
==============================
|
||
|
|
vLLM Version : {vllm_version}
|
||
|
|
vLLM-Kunlun Version : {vllm_kunlun_version}
|
||
|
|
==============================
|
||
|
|
Environment Variables
|
||
|
|
==============================
|
||
|
|
{env_vars}
|
||
|
|
""".strip()
|
||
|
|
|
||
|
|
|
||
|
|
def pretty_str(envinfo):
    """
    Render a KunlunSystemEnv record as the human-readable report.

    Args:
        envinfo: Record providing _asdict() and a pip_version attribute

    Returns:
        str: kunlun_env_info_fmt filled with the record's values, where None
        values read "Could not collect" and package lines carry a
        "[<pip_version>]" or "[conda]" tag
    """
    # Replace None with "Could not collect"
    values = {
        field: "Could not collect" if value is None else value
        for field, value in envinfo._asdict().items()
    }

    # Handle pip package list
    pip_text = values["pip_packages"]
    if not pip_text:
        values["pip_packages"] = "No relevant packages"
    else:
        tagged_pip = [
            f"[{envinfo.pip_version}] {line}" for line in pip_text.split("\n") if line
        ]
        values["pip_packages"] = "\n".join(tagged_pip)

    # Handle conda package list
    conda_text = values["conda_packages"]
    if not conda_text:
        values["conda_packages"] = ""
    else:
        tagged_conda = [f"[conda] {line}" for line in conda_text.split("\n") if line]
        values["conda_packages"] = "\n".join(tagged_conda)

    return kunlun_env_info_fmt.format(**values)
|
||
|
|
|
||
|
|
|
||
|
|
def get_pretty_kunlun_env_info():
    """
    Collect the environment information and return it as formatted text.

    Returns:
        str: Result of pretty_str() applied to get_kunlun_env_info()
    """
    collected = get_kunlun_env_info()
    return pretty_str(collected)
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """
    Script entry point: print a progress notice, then the formatted report.
    """
    print("Collecting Kunlun XPU environment information...")
    print(get_pretty_kunlun_env_info())
|
||
|
|
|
||
|
|
|
||
|
|
# Run the collection only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|