diff --git a/collect_env.py b/collect_env.py new file mode 100644 index 0000000..1ccd881 --- /dev/null +++ b/collect_env.py @@ -0,0 +1,695 @@ +# SPDX-License-Identifier: Apache-2.0 +# vLLM-Kunlun Environment Information Collection Tool (Fixed Version) +""" +Environment information collection script for Kunlun XPU +Fixed the following issues: +1. Device name displayed as "GPU" → Now correctly shows "P800 OAM" +2. XRE version command error → Now parsed from xpu-smi output +3. vLLM-Kunlun version hardcoded → Now fetched from pip package metadata +""" + +import os +import re +import subprocess +import sys +from collections import namedtuple + +# ============================================================================= +# Part 1: Basic Utility Functions +# ============================================================================= + + +def run(command): + """ + Execute shell command and return result + [Principle Explanation - Web Development Analogy] + This is like the fetch() function in frontend development, sending a request and getting a response. + - command: The command to execute (similar to a URL) + - returns: (return_code, stdout, stderr) + Args: + command: Command as string or list + Returns: + tuple: (return_code, stdout, stderr) + """ + shell = True if isinstance(command, str) else False + try: + p = subprocess.Popen( + command, + stdout=subprocess.PIPE, # Capture standard output + stderr=subprocess.PIPE, # Capture error output + shell=shell, + ) + raw_output, raw_err = p.communicate() + rc = p.returncode + # Decode byte stream to string + output = raw_output.decode("utf-8").strip() + err = raw_err.decode("utf-8").strip() + return rc, output, err + except FileNotFoundError: + return 127, "", "Command not found" + + +def run_and_read_all(run_lambda, command): + """Execute command, return output if successful, None otherwise""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Execute command and extract first regex match""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + + +# Check if PyTorch is available +try: + import torch + + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + + +# ============================================================================= +# Part 2: General System Information Collection (Reusing vLLM Original Logic) +# ============================================================================= + + +def get_platform(): + """Get operating system platform""" + if sys.platform.startswith("linux"): + return "linux" + elif sys.platform.startswith("win32"): + return "win32" + elif sys.platform.startswith("darwin"): + return "darwin" + return sys.platform + + +def get_os(run_lambda): + """Get detailed operating system information""" + from platform import machine + + if get_platform() == "linux": + # Try reading /etc/*-release + rc, out, _ = run_lambda( + "cat /etc/*-release 2>/dev/null | grep PRETTY_NAME | head -1" + ) + if rc == 0 and out: + match = re.search(r'PRETTY_NAME="(.*)"', out) + if match: + return f"{match.group(1)} ({machine()})" + # Fallback: use lsb_release + rc, out, _ = run_lambda("lsb_release -d 2>/dev/null") + if rc == 0 and out: + match = re.search(r"Description:\s*(.*)", out) + if match: + return f"{match.group(1)} ({machine()})" + return f"{get_platform()} ({machine()})" + + +def get_gcc_version(run_lambda): + """Get GCC version""" + return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)") + + +def get_clang_version(run_lambda): + """Get Clang version""" + return run_and_parse_first_match( + run_lambda, "clang --version", r"clang version (.*)" + ) + + +def get_cmake_version(run_lambda): + """Get CMake version""" + return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)") + + +def get_libc_version(): + """Get libc version""" + import platform + + if get_platform() != "linux": + return "N/A" + return "-".join(platform.libc_ver()) + + +def get_python_platform(): + """Get Python platform information""" + import platform + + return platform.platform() + + +def get_cpu_info(run_lambda): + """Get CPU information""" + if get_platform() == "linux": + rc, out, err = run_lambda("lscpu") + return out if rc == 0 else err + return "N/A" + + +def get_pip_packages(run_lambda, patterns=None): + """Get pip package list""" + if patterns is None: + patterns = { + "torch", + "numpy", + "triton", + "transformers", + "vllm", + "kunlun", + "xpu", + "bkcl", + "xmlir", + } + + cmd = [sys.executable, "-mpip", "list", "--format=freeze"] + out = run_and_read_all(run_lambda, cmd) + if out is None: + return "pip3", "" + + filtered = "\n".join( + line + for line in out.splitlines() + if any(name.lower() in line.lower() for name in patterns) + ) + pip_version = "pip3" if sys.version[0] == "3" else "pip" + return pip_version, filtered + + +def get_conda_packages(run_lambda, patterns=None): + """Get conda package list""" + if patterns is None: + patterns = { + "torch", + "numpy", + "triton", + "transformers", + "kunlun", + "xpu", + "bkcl", + "xmlir", + } + + conda = os.environ.get("CONDA_EXE", "conda") + out = run_and_read_all(run_lambda, [conda, "list"]) + if out is None: + return None + + return "\n".join( + line + for line in out.splitlines() + if not line.startswith("#") + and any(name.lower() in line.lower() for name in patterns) + ) + + +# ============================================================================= +# Part 3: Kunlun-Specific Information Collection (Core Fix) +# ============================================================================= + + +def parse_xpu_smi_output(run_lambda): + """ + Parse the complete output of xpu-smi command + [Principle Explanation] + The xpu-smi output format is similar to nvidia-smi, we need to parse it with regex. + Example output format: + +-----------------------------------------------------------------------------+ + | XPU-SMI Driver Version: 515.58 XPU-RT Version: N/A | + |-------------------------------+----------------------+----------------------+ + | 0 P800 OAM N/A | 00000000:52:00.0 N/A | 0 | + | N/A 43C N/A 85W / 400W | 4MiB / 98304MiB | 0% Default | + Returns: + dict: Dictionary containing parsing results + """ + rc, output, _ = run_lambda("xpu-smi") + if rc != 0 or not output: + return None + + result = { + "raw_output": output, + "driver_version": None, + "xre_version": None, + "devices": [], + } + + # Parse header: Driver Version and XPU-RT Version + # Format: | XPU-SMI Driver Version: 515.58 XPU-RT Version: N/A | + header_match = re.search( + r"Driver Version:\s*(\S+)\s+XPU-RT Version:\s*(\S+)", output + ) + if header_match: + result["driver_version"] = header_match.group(1) + xre = header_match.group(2) + result["xre_version"] = xre if xre != "N/A" else None + + # Parse device information + # Format: | 0 P800 OAM N/A | 00000000:52:00.0 N/A | + # Following: | N/A 43C N/A 85W / 400W | 4MiB / 98304MiB | + + # Find all device lines (containing device ID and name) + device_pattern = re.compile( + r"\|\s*(\d+)\s+(\S+(?:\s+\S+)?)\s+(?:N/A|On|Off)\s*\|" # ID and Name + r"\s*([0-9a-fA-F:\.]+)\s*" # Bus-Id + ) + + # Find memory information + memory_pattern = re.compile( + r"\|\s*N/A\s+\d+C\s+N/A\s+\d+W\s*/\s*\d+W\s*\|" + r"\s*(\d+)MiB\s*/\s*(\d+)MiB\s*\|" # Memory-Usage / Total + ) + + lines = output.split("\n") + i = 0 + while i < len(lines): + line = lines[i] + device_match = device_pattern.search(line) + if device_match: + device_id = int(device_match.group(1)) + device_name = device_match.group(2).strip() + bus_id = device_match.group(3) + + # Next line should have memory info + memory_used = 0 + memory_total = 0 + if i + 1 < len(lines): + mem_match = memory_pattern.search(lines[i + 1]) + if mem_match: + memory_used = int(mem_match.group(1)) + memory_total = int(mem_match.group(2)) + + result["devices"].append( + { + "id": device_id, + "name": device_name, # This will correctly get "P800 OAM" + "bus_id": bus_id, + "memory_used_mib": memory_used, + "memory_total_mib": memory_total, + } + ) + i += 1 + + return result + + +def get_kunlun_gpu_info(run_lambda): + """ + Get Kunlun XPU device information + [Fix Explanation] + Previously used torch.cuda.get_device_properties() to get the name, + but it only returns "GPU" (because Kunlun masquerades as CUDA). + Now parse xpu-smi output to correctly get "P800 OAM". + Returns: + str: Device information string + """ + parsed = parse_xpu_smi_output(run_lambda) + + if parsed and parsed["devices"]: + # Get real device name from xpu-smi parsing + lines = [] + for dev in parsed["devices"]: + memory_gb = dev["memory_total_mib"] / 1024 + # Correctly display: XPU 0: P800 OAM (96.0GB) + lines.append(f"XPU {dev['id']}: {dev['name']} ({memory_gb:.1f}GB)") + return "\n".join(lines) + + # Fallback: Use PyTorch interface (but will display as GPU) + if TORCH_AVAILABLE: + try: + device_count = torch.cuda.device_count() + lines = [] + for i in range(device_count): + props = torch.cuda.get_device_properties(i) + name = props.name if hasattr(props, "name") else "Kunlun XPU" + memory_gb = ( + props.total_memory / (1024**3) + if hasattr(props, "total_memory") + else 0 + ) + lines.append(f"XPU {i}: {name} ({memory_gb:.1f}GB)") + return "\n".join(lines) + except Exception as e: + return f"Error: {e}" + + return None + + +def get_kunlun_driver_version(run_lambda): + """ + Get Kunlun driver version + [Fix Explanation] + Parse directly from xpu-smi output header instead of calling incorrect commands. + Returns: + str: Driver version, e.g., "515.58" + """ + parsed = parse_xpu_smi_output(run_lambda) + if parsed and parsed["driver_version"]: + return parsed["driver_version"] + return None + + +def get_kunlun_xre_version(run_lambda): + """ + Get Kunlun XRE (Runtime) version + [Fix Explanation] + Previously used `xpu-smi --version` but that parameter doesn't exist. + Now parse the "XPU-RT Version" field from xpu-smi standard output header. + Returns: + str: XRE version, or None (if showing N/A) + """ + parsed = parse_xpu_smi_output(run_lambda) + if parsed and parsed["xre_version"]: + return parsed["xre_version"] + return "N/A (not installed or not detected)" + + +def get_kunlun_topo(run_lambda): + """ + Get Kunlun XPU topology information + Returns: + str: Topology information + """ + # xpu-smi topo -m command can get topology + output = run_and_read_all(run_lambda, "xpu-smi topo -m") + if output: + return output + + # Fallback: Show device count + if TORCH_AVAILABLE: + try: + count = torch.cuda.device_count() + return f"Detected {count} Kunlun XPU device(s)" + except Exception: + pass + + return None + + +def get_bkcl_version(run_lambda): + """ + Get BKCL (communication library) version + [Principle Explanation] + BKCL = Baidu Kunlun Communication Library + Similar to NVIDIA's NCCL, used for multi-card communication. + Returns: + str: BKCL version information + """ + # Method 1: From your logs, BKCL prints version when loading + # [WARN][BKCL][globals.cpp:268] xccl version: 6ab4ffb [rdma] ... + # We can try importing related modules + try: + # Try getting from torch_xmlir + import torch_xmlir + + # Find path to libbkcl.so + bkcl_path = None + if hasattr(torch_xmlir, "__file__"): + import os + + base = os.path.dirname(torch_xmlir.__file__) + candidate = os.path.join(base, "libbkcl.so") + if os.path.exists(candidate): + bkcl_path = candidate + if bkcl_path: + return f"Found at: {bkcl_path}" + except ImportError: + pass + + # Method 2: Search from ldconfig + rc, out, _ = run_lambda("ldconfig -p 2>/dev/null | grep -i bkcl | head -1") + if rc == 0 and out: + return out + + return None + + +def get_vllm_kunlun_version(): + """ + Get vLLM-Kunlun version + [Fix Explanation] + Previously got hardcoded version "0.9.2" from vllm_kunlun.platforms.version, + but actual pip installed version is "0.1.0". + Now prioritize using importlib.metadata to get real installed version. + Returns: + str: Version number + """ + # Method 1 (recommended): Use importlib.metadata (Python 3.8+) + try: + from importlib.metadata import version + + return version("vllm-kunlun") + except Exception: + pass + + # Method 2: Use pkg_resources + try: + import pkg_resources + + return pkg_resources.get_distribution("vllm-kunlun").version + except Exception: + pass + + # Method 3 (fallback): Get from code (may be inaccurate) + try: + from vllm_kunlun.platforms.version import get_xvllm_version + + return get_xvllm_version() + " (from code, may be inaccurate)" + except ImportError: + pass + + return "N/A" + + +def get_vllm_version(): + """Get vLLM main package version""" + try: + from importlib.metadata import version + + return version("vllm") + except Exception: + pass + + try: + from vllm import __version__ + + return __version__ + except ImportError: + pass + + return "N/A" + + +# ============================================================================= +# Part 4: Environment Variable Collection +# ============================================================================= + + +def get_kunlun_env_vars(): + """Get Kunlun-related environment variables""" + env_vars = "" + kunlun_prefixes = ( + "XPU", + "KUNLUN", + "BKCL", + "XCCL", + "XRE", + "TORCH", + "VLLM", + ) + secret_terms = ("secret", "token", "api", "access", "password") + + for k, v in sorted(os.environ.items()): + if any(term in k.lower() for term in secret_terms): + continue + if any(k.upper().startswith(prefix) for prefix in kunlun_prefixes): + env_vars += f"{k}={v}\n" + + return env_vars + + +# ============================================================================= +# Part 5: Define Data Structure and Formatted Output +# ============================================================================= + +KunlunSystemEnv = namedtuple( + "KunlunSystemEnv", + [ + # General system information + "os", + "gcc_version", + "clang_version", + "cmake_version", + "libc_version", + "python_version", + "python_platform", + "pip_version", + "pip_packages", + "conda_packages", + "cpu_info", + # PyTorch information + "torch_version", + "is_debug_build", + # Kunlun-specific information + "kunlun_xpu_info", + "kunlun_driver_version", + "kunlun_xre_version", + "bkcl_version", + "kunlun_topo", + # vLLM related + "vllm_version", + "vllm_kunlun_version", + "env_vars", + ], +) + + +def get_kunlun_env_info(): + """Collect all environment information""" + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + # PyTorch information + if TORCH_AVAILABLE: + torch_version = torch.__version__ + debug_mode_str = str(torch.version.debug) + else: + torch_version = "N/A" + debug_mode_str = "N/A" + + sys_version = sys.version.replace("\n", " ") + + return KunlunSystemEnv( + # General system information + os=get_os(run_lambda), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + libc_version=get_libc_version(), + python_version=f"{sys_version} ({sys.maxsize.bit_length() + 1}-bit runtime)", + python_platform=get_python_platform(), + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=get_conda_packages(run_lambda), + cpu_info=get_cpu_info(run_lambda), + # PyTorch information + torch_version=torch_version, + is_debug_build=debug_mode_str, + # Kunlun-specific information + kunlun_xpu_info=get_kunlun_gpu_info(run_lambda), + kunlun_driver_version=get_kunlun_driver_version(run_lambda), + kunlun_xre_version=get_kunlun_xre_version(run_lambda), + bkcl_version=get_bkcl_version(run_lambda), + kunlun_topo=get_kunlun_topo(run_lambda), + # vLLM related + vllm_version=get_vllm_version(), + vllm_kunlun_version=get_vllm_kunlun_version(), + env_vars=get_kunlun_env_vars(), + ) + + +# Output format template +kunlun_env_info_fmt = """ +============================== + System Info +============================== +OS : {os} +GCC version : {gcc_version} +Clang version : {clang_version} +CMake version : {cmake_version} +Libc version : {libc_version} +============================== + PyTorch Info +============================== +PyTorch version : {torch_version} +Is debug build : {is_debug_build} +============================== + Python Environment +============================== +Python version : {python_version} +Python platform : {python_platform} +============================== + Kunlun / XPU Info +============================== +XPU models and configuration : +{kunlun_xpu_info} +Kunlun driver version : {kunlun_driver_version} +XRE (Runtime) version : {kunlun_xre_version} +BKCL version : {bkcl_version} +XPU Topology: +{kunlun_topo} +============================== + CPU Info +============================== +{cpu_info} +============================== +Versions of relevant libraries +============================== +{pip_packages} +{conda_packages} +============================== + vLLM-Kunlun Info +============================== +vLLM Version : {vllm_version} +vLLM-Kunlun Version : {vllm_kunlun_version} +============================== + Environment Variables +============================== +{env_vars} +""".strip() + + +def pretty_str(envinfo): + """Format environment information""" + mutable_dict = envinfo._asdict() + + # Replace None with "Could not collect" + for key in mutable_dict: + if mutable_dict[key] is None: + mutable_dict[key] = "Could not collect" + + # Handle pip package list + if mutable_dict["pip_packages"]: + mutable_dict["pip_packages"] = "\n".join( + f"[{envinfo.pip_version}] {line}" + for line in mutable_dict["pip_packages"].split("\n") + if line + ) + else: + mutable_dict["pip_packages"] = "No relevant packages" + + # Handle conda package list + if mutable_dict["conda_packages"]: + mutable_dict["conda_packages"] = "\n".join( + f"[conda] {line}" + for line in mutable_dict["conda_packages"].split("\n") + if line + ) + else: + mutable_dict["conda_packages"] = "" + + return kunlun_env_info_fmt.format(**mutable_dict) + + +def get_pretty_kunlun_env_info(): + """Get formatted environment information""" + return pretty_str(get_kunlun_env_info()) + + +def main(): + """Main entry point""" + print("Collecting Kunlun XPU environment information...") + output = get_pretty_kunlun_env_info() + print(output) + + +if __name__ == "__main__": + main()