[Misc] add collect_env feat (#218)

Signed-off-by: Lidang-Jiang <lidangjiang@gmail.com>
2026-02-27 12:19:58 +08:00
parent d425a0d0e9
commit 153093d3b3
1 changed files with 695 additions and 0 deletions
--- a/collect_env.py
+++ b/collect_env.py
@@ -0,0 +1,695 @@
+# SPDX-License-Identifier: Apache-2.0
+# vLLM-Kunlun Environment Information Collection Tool (Fixed Version)
+"""
+Environment information collection script for Kunlun XPU
+Fixed the following issues:
+1. Device name displayed as "GPU" → Now correctly shows "P800 OAM"
+2. XRE version command error → Now parsed from xpu-smi output
+3. vLLM-Kunlun version hardcoded → Now fetched from pip package metadata
+"""
+
+import os
+import re
+import subprocess
+import sys
+from collections import namedtuple
+
+# =============================================================================
+# Part 1: Basic Utility Functions
+# =============================================================================
+
+
+def run(command):
+    """
+    Execute a command and capture its result.
+
+    A string command is handed to the shell (so pipes and redirections
+    work); a list command is executed directly without a shell.
+
+    Args:
+        command: Command as string or list
+    Returns:
+        tuple: (return_code, stdout, stderr). Both streams are decoded as
+        UTF-8 (undecodable bytes become U+FFFD) and stripped of surrounding
+        whitespace. If the process cannot be started, no exception is
+        raised: a missing executable yields (127, "", "Command not found")
+        and any other OS-level failure yields (126, "", <error text>).
+    """
+    shell = isinstance(command, str)
+    try:
+        p = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,  # Capture standard output
+            stderr=subprocess.PIPE,  # Capture error output
+            shell=shell,
+        )
+        raw_output, raw_err = p.communicate()
+    except FileNotFoundError:
+        return 127, "", "Command not found"
+    except OSError as exc:
+        # e.g. PermissionError on a non-executable file. This is a best-effort
+        # diagnostic tool, so report the failure instead of aborting.
+        return 126, "", str(exc)
+
+    # Tool output is not guaranteed to be valid UTF-8; never let a stray byte
+    # abort the whole report.
+    output = raw_output.decode("utf-8", errors="replace").strip()
+    err = raw_err.decode("utf-8", errors="replace").strip()
+    return p.returncode, output, err
+
+
+def run_and_read_all(run_lambda, command):
+    """Run ``command`` through ``run_lambda``; return its stdout, or None on a non-zero exit code."""
+    status, stdout, _stderr = run_lambda(command)
+    return stdout if status == 0 else None
+
+
+def run_and_parse_first_match(run_lambda, command, regex):
+    """
+    Run ``command`` and return capture group 1 of the first ``regex`` match.
+
+    Returns None when the command exits non-zero or the pattern does not
+    match its stdout. ``regex`` must therefore contain at least one group.
+    """
+    status, stdout, _stderr = run_lambda(command)
+    found = re.search(regex, stdout) if status == 0 else None
+    return found.group(1) if found else None
+
+
+# Check if PyTorch is available.
+# TORCH_AVAILABLE gates every later use of the `torch` name in this file.
+# The except clause is wider than ImportError so that any of the listed
+# errors raised while importing torch marks it as unavailable instead of
+# aborting the script.
+try:
+    import torch
+
+    TORCH_AVAILABLE = True
+except (ImportError, NameError, AttributeError, OSError):
+    TORCH_AVAILABLE = False
+
+
+# =============================================================================
+# Part 2: General System Information Collection (Reusing vLLM Original Logic)
+# =============================================================================
+
+
+def get_platform():
+    """Return "linux", "win32" or "darwin" when sys.platform starts with one of them, else sys.platform unchanged."""
+    for known in ("linux", "win32", "darwin"):
+        if sys.platform.startswith(known):
+            return known
+    return sys.platform
+
+
+def get_os(run_lambda):
+    """
+    Describe the operating system as "<name> (<machine>)".
+
+    On Linux the name is taken from the PRETTY_NAME entry of /etc/*-release,
+    then from ``lsb_release -d``; if neither yields a match (or on any other
+    platform) the short platform name is used instead.
+    """
+    from platform import machine
+
+    if get_platform() == "linux":
+        # (command, pattern) pairs, tried in order; the first match wins.
+        probes = (
+            (
+                "cat /etc/*-release 2>/dev/null | grep PRETTY_NAME | head -1",
+                r'PRETTY_NAME="(.*)"',
+            ),
+            ("lsb_release -d 2>/dev/null", r"Description:\s*(.*)"),
+        )
+        for command, pattern in probes:
+            status, stdout, _ = run_lambda(command)
+            if status != 0 or not stdout:
+                continue
+            found = re.search(pattern, stdout)
+            if found:
+                return f"{found.group(1)} ({machine()})"
+    return f"{get_platform()} ({machine()})"
+
+
+def get_gcc_version(run_lambda):
+    """Return the text following "gcc " in ``gcc --version`` output, or None if unavailable."""
+    command, pattern = "gcc --version", r"gcc (.*)"
+    return run_and_parse_first_match(run_lambda, command, pattern)
+
+
+def get_clang_version(run_lambda):
+    """Return the text following "clang version " in ``clang --version`` output, or None if unavailable."""
+    command, pattern = "clang --version", r"clang version (.*)"
+    return run_and_parse_first_match(run_lambda, command, pattern)
+
+
+def get_cmake_version(run_lambda):
+    """Return the text following "cmake " in ``cmake --version`` output, or None if unavailable."""
+    command, pattern = "cmake --version", r"cmake (.*)"
+    return run_and_parse_first_match(run_lambda, command, pattern)
+
+
+def get_libc_version():
+    """Return platform.libc_ver() joined with "-" on Linux, "N/A" elsewhere."""
+    import platform
+
+    on_linux = get_platform() == "linux"
+    return "-".join(platform.libc_ver()) if on_linux else "N/A"
+
+
+def get_python_platform():
+    """Return the string produced by platform.platform()."""
+    from platform import platform as describe_platform
+
+    return describe_platform()
+
+
+def get_cpu_info(run_lambda):
+    """Return ``lscpu`` stdout on Linux (its stderr if the command fails); "N/A" on other platforms."""
+    if get_platform() != "linux":
+        return "N/A"
+    status, stdout, stderr = run_lambda("lscpu")
+    if status == 0:
+        return stdout
+    return stderr
+
+
+def get_pip_packages(run_lambda, patterns=None):
+    """
+    List installed pip packages whose freeze line contains a pattern.
+
+    Args:
+        run_lambda: Callable that executes a command and returns
+            (return_code, stdout, stderr).
+        patterns: Substrings to look for (case-insensitive) anywhere in a
+            ``pip list --format=freeze`` line; defaults to a built-in set.
+    Returns:
+        tuple: (pip label, newline-joined matching lines). If pip cannot be
+        run the result is ("pip3", "").
+    """
+    wanted = patterns
+    if wanted is None:
+        wanted = {
+            "torch",
+            "numpy",
+            "triton",
+            "transformers",
+            "vllm",
+            "kunlun",
+            "xpu",
+            "bkcl",
+            "xmlir",
+        }
+
+    # Use the running interpreter's pip so the report matches this environment.
+    listing = run_and_read_all(
+        run_lambda, [sys.executable, "-mpip", "list", "--format=freeze"]
+    )
+    if listing is None:
+        return "pip3", ""
+
+    kept = []
+    for entry in listing.splitlines():
+        lowered = entry.lower()
+        if any(name.lower() in lowered for name in wanted):
+            kept.append(entry)
+
+    label = "pip3" if sys.version[0] == "3" else "pip"
+    return label, "\n".join(kept)
+
+
+def get_conda_packages(run_lambda, patterns=None):
+    """
+    List conda packages whose ``conda list`` line contains a pattern.
+
+    Args:
+        run_lambda: Callable that executes a command and returns
+            (return_code, stdout, stderr).
+        patterns: Substrings to look for (case-insensitive); defaults to a
+            built-in set.
+    Returns:
+        str: Newline-joined matching lines ("#" comment lines are skipped),
+        or None when ``conda list`` cannot be run.
+    """
+    wanted = patterns
+    if wanted is None:
+        wanted = {
+            "torch",
+            "numpy",
+            "triton",
+            "transformers",
+            "kunlun",
+            "xpu",
+            "bkcl",
+            "xmlir",
+        }
+
+    # Honour CONDA_EXE when set; otherwise fall back to "conda" on PATH.
+    conda_exe = os.environ.get("CONDA_EXE", "conda")
+    listing = run_and_read_all(run_lambda, [conda_exe, "list"])
+    if listing is None:
+        return None
+
+    kept = []
+    for entry in listing.splitlines():
+        if entry.startswith("#"):
+            continue
+        lowered = entry.lower()
+        if any(name.lower() in lowered for name in wanted):
+            kept.append(entry)
+    return "\n".join(kept)
+
+
+# =============================================================================
+# Part 3: Kunlun-Specific Information Collection (Core Fix)
+# =============================================================================
+
+
+def parse_xpu_smi_output(run_lambda):
+    """
+    Run ``xpu-smi`` and parse its table output.
+
+    The layout resembles nvidia-smi. Example output format:
+    +-----------------------------------------------------------------------------+
+    | XPU-SMI               Driver Version: 515.58       XPU-RT Version: N/A      |
+    |-------------------------------+----------------------+----------------------+
+    |   0  P800 OAM           N/A   | 00000000:52:00.0 N/A |                    0 |
+    | N/A   43C  N/A     85W / 400W |      4MiB / 98304MiB |      0%      Default |
+    Each device is described by two consecutive rows: the first carries the
+    index, name and bus id, the second carries the memory usage.
+
+    Args:
+        run_lambda: Callable that executes a command and returns
+            (return_code, stdout, stderr).
+    Returns:
+        dict: None when the command fails or prints nothing. Otherwise a
+        dictionary with the keys "raw_output", "driver_version",
+        "xre_version" (None when reported as "N/A") and "devices", a list of
+        dicts with "id", "name", "bus_id", "memory_used_mib" and
+        "memory_total_mib" (both 0 when no memory figures are found).
+    """
+    rc, output, _ = run_lambda("xpu-smi")
+    if rc != 0 or not output:
+        return None
+
+    result = {
+        "raw_output": output,
+        "driver_version": None,
+        "xre_version": None,
+        "devices": [],
+    }
+
+    # Header row: Driver Version and XPU-RT Version.
+    header_match = re.search(
+        r"Driver Version:\s*(\S+)\s+XPU-RT Version:\s*(\S+)", output
+    )
+    if header_match:
+        result["driver_version"] = header_match.group(1)
+        xre = header_match.group(2)
+        result["xre_version"] = xre if xre != "N/A" else None
+
+    # First row of a device: index, one- or two-word name, then the bus id.
+    device_pattern = re.compile(
+        r"\|\s*(\d+)\s+(\S+(?:\s+\S+)?)\s+(?:N/A|On|Off)\s*\|"  # ID and Name
+        r"\s*([0-9a-fA-F:\.]+)\s*"  # Bus-Id
+    )
+
+    # Second row of a device: anchor only on "<used>MiB / <total>MiB".
+    # Matching the fan/temperature/perf/power columns as well would drop the
+    # memory figures whenever one of them is not in the exact sample form
+    # (e.g. a fan percentage instead of "N/A").
+    memory_pattern = re.compile(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB")
+
+    lines = output.splitlines()
+    for index, line in enumerate(lines):
+        device_match = device_pattern.search(line)
+        if device_match is None:
+            continue
+
+        # The memory figures live on the row directly below the device row.
+        memory_used = 0
+        memory_total = 0
+        if index + 1 < len(lines):
+            mem_match = memory_pattern.search(lines[index + 1])
+            if mem_match:
+                memory_used = int(mem_match.group(1))
+                memory_total = int(mem_match.group(2))
+
+        result["devices"].append(
+            {
+                "id": int(device_match.group(1)),
+                "name": device_match.group(2).strip(),  # e.g. "P800 OAM"
+                "bus_id": device_match.group(3),
+                "memory_used_mib": memory_used,
+                "memory_total_mib": memory_total,
+            }
+        )
+
+    return result
+
+
+def get_kunlun_gpu_info(run_lambda):
+    """
+    Describe the Kunlun XPU devices, one "XPU <id>: <name> (<mem>GB)" line each.
+
+    Device names come from the parsed xpu-smi output when it lists any
+    devices. Otherwise, if PyTorch was importable, the torch.cuda device
+    properties are used as a fallback.
+
+    Returns:
+        str: Device information string; "Error: <message>" if the PyTorch
+        fallback raises; None when neither source is available.
+    """
+    smi = parse_xpu_smi_output(run_lambda)
+    if smi and smi["devices"]:
+        described = []
+        for dev in smi["devices"]:
+            memory_gb = dev["memory_total_mib"] / 1024
+            described.append(f"XPU {dev['id']}: {dev['name']} ({memory_gb:.1f}GB)")
+        return "\n".join(described)
+
+    if not TORCH_AVAILABLE:
+        return None
+
+    # Fallback: query the devices through the torch.cuda interface.
+    try:
+        described = []
+        for i in range(torch.cuda.device_count()):
+            props = torch.cuda.get_device_properties(i)
+            if hasattr(props, "name"):
+                name = props.name
+            else:
+                name = "Kunlun XPU"
+            if hasattr(props, "total_memory"):
+                memory_gb = props.total_memory / (1024**3)
+            else:
+                memory_gb = 0
+            described.append(f"XPU {i}: {name} ({memory_gb:.1f}GB)")
+        return "\n".join(described)
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def get_kunlun_driver_version(run_lambda):
+    """
+    Get the Kunlun driver version from the xpu-smi header row.
+
+    Returns:
+        str: Driver version, e.g., "515.58"; None when xpu-smi output is
+        unavailable or carries no driver version.
+    """
+    smi = parse_xpu_smi_output(run_lambda)
+    if not smi:
+        return None
+    return smi["driver_version"] or None
+
+
+def get_kunlun_xre_version(run_lambda):
+    """
+    Get the Kunlun XRE (Runtime) version.
+
+    The value is the "XPU-RT Version" field of the xpu-smi header row.
+
+    Returns:
+        str: XRE version, or the placeholder
+        "N/A (not installed or not detected)" when xpu-smi output is
+        unavailable or the field reads "N/A".
+    """
+    smi = parse_xpu_smi_output(run_lambda)
+    detected = smi["xre_version"] if smi else None
+    return detected or "N/A (not installed or not detected)"
+
+
+def get_kunlun_topo(run_lambda):
+    """
+    Get Kunlun XPU topology information.
+
+    Returns:
+        str: Output of ``xpu-smi topo -m`` when it succeeds with non-empty
+        output; otherwise a "Detected <n> Kunlun XPU device(s)" line based
+        on torch.cuda.device_count() if PyTorch is usable; otherwise None.
+    """
+    topo = run_and_read_all(run_lambda, "xpu-smi topo -m")
+    if topo:
+        return topo
+
+    if not TORCH_AVAILABLE:
+        return None
+
+    # Fallback: only the device count is known.
+    try:
+        return f"Detected {torch.cuda.device_count()} Kunlun XPU device(s)"
+    except Exception:
+        return None
+
+
+def get_bkcl_version(run_lambda):
+    """
+    Locate BKCL (Baidu Kunlun Communication Library, the multi-card
+    communication library).
+
+    Despite its name, this function does not extract a version number: it
+    reports where the library was found.
+
+    Args:
+        run_lambda: Callable that executes a command and returns
+            (return_code, stdout, stderr).
+    Returns:
+        str: "Found at: <path>" when libbkcl.so sits next to the torch_xmlir
+        package, else the first ``ldconfig -p`` line mentioning bkcl, else
+        None.
+    """
+    # Method 1: look for libbkcl.so inside the torch_xmlir package directory.
+    # A broken install can fail with more than ImportError, so mirror the
+    # exception set used for the torch import at the top of this file.
+    try:
+        import torch_xmlir
+    except (ImportError, NameError, AttributeError, OSError):
+        torch_xmlir = None
+
+    # __file__ may be absent or None (e.g. for namespace packages); calling
+    # os.path.dirname(None) would raise TypeError.
+    package_file = getattr(torch_xmlir, "__file__", None)
+    if package_file:
+        candidate = os.path.join(os.path.dirname(package_file), "libbkcl.so")
+        if os.path.exists(candidate):
+            return f"Found at: {candidate}"
+
+    # Method 2: Search from ldconfig
+    rc, out, _ = run_lambda("ldconfig -p 2>/dev/null | grep -i bkcl | head -1")
+    if rc == 0 and out:
+        return out
+
+    return None
+
+
+def get_vllm_kunlun_version():
+    """
+    Get the vLLM-Kunlun version, preferring installed package metadata.
+
+    Sources are tried in order: importlib.metadata, pkg_resources, and
+    finally the version function shipped in the vllm_kunlun code (whose
+    result is labelled as possibly inaccurate).
+
+    Returns:
+        str: Version number, or "N/A" when no source is available.
+    """
+    dist_name = "vllm-kunlun"
+
+    # Source 1: metadata of the installed distribution (Python 3.8+).
+    try:
+        import importlib.metadata as dist_metadata
+
+        return dist_metadata.version(dist_name)
+    except Exception:
+        pass
+
+    # Source 2: pkg_resources.
+    try:
+        import pkg_resources
+
+        return pkg_resources.get_distribution(dist_name).version
+    except Exception:
+        pass
+
+    # Source 3: the value embedded in the code (may be inaccurate).
+    try:
+        from vllm_kunlun.platforms.version import get_xvllm_version
+
+        code_version = get_xvllm_version()
+        return code_version + " (from code, may be inaccurate)"
+    except ImportError:
+        return "N/A"
+
+
+def get_vllm_version():
+    """Return the installed vLLM version (package metadata first, then vllm.__version__), or "N/A"."""
+    try:
+        import importlib.metadata as dist_metadata
+
+        return dist_metadata.version("vllm")
+    except Exception:
+        pass
+
+    try:
+        from vllm import __version__ as code_version
+
+        return code_version
+    except ImportError:
+        return "N/A"
+
+
+# =============================================================================
+# Part 4: Environment Variable Collection
+# =============================================================================
+
+
+def get_kunlun_env_vars():
+    """
+    Get Kunlun-related environment variables.
+
+    Returns:
+        str: One "NAME=value" line (each newline-terminated) per variable,
+        sorted by name. A variable is included when its upper-cased name
+        starts with one of the known prefixes and its name contains none of
+        the secret-looking terms.
+    """
+    prefixes = (
+        "XPU",
+        "KUNLUN",
+        "BKCL",
+        "XCCL",
+        "XRE",
+        "TORCH",
+        "VLLM",
+    )
+    redacted_terms = ("secret", "token", "api", "access", "password")
+
+    def _is_reportable(name):
+        # Never echo anything that looks like a credential.
+        lowered = name.lower()
+        if any(term in lowered for term in redacted_terms):
+            return False
+        return name.upper().startswith(prefixes)
+
+    return "".join(
+        f"{name}={value}\n"
+        for name, value in sorted(os.environ.items())
+        if _is_reportable(name)
+    )
+
+
+# =============================================================================
+# Part 5: Define Data Structure and Formatted Output
+# =============================================================================
+
+# Record holding every collected value; built by get_kunlun_env_info().
+# pretty_str() formats kunlun_env_info_fmt with this tuple's _asdict(), so
+# every placeholder used in that template must be a field name listed here.
+KunlunSystemEnv = namedtuple(
+    "KunlunSystemEnv",
+    [
+        # General system information
+        "os",
+        "gcc_version",
+        "clang_version",
+        "cmake_version",
+        "libc_version",
+        "python_version",
+        "python_platform",
+        "pip_version",
+        "pip_packages",
+        "conda_packages",
+        "cpu_info",
+        # PyTorch information
+        "torch_version",
+        "is_debug_build",
+        # Kunlun-specific information
+        "kunlun_xpu_info",
+        "kunlun_driver_version",
+        "kunlun_xre_version",
+        "bkcl_version",
+        "kunlun_topo",
+        # vLLM related
+        "vllm_version",
+        "vllm_kunlun_version",
+        "env_vars",
+    ],
+)
+
+
+def get_kunlun_env_info():
+    """
+    Collect all environment information.
+
+    Returns:
+        KunlunSystemEnv: One field per collected item. Individual fields may
+        be None when the corresponding collector could not obtain a value.
+    """
+    run_lambda = run
+    pip_version, pip_list_output = get_pip_packages(run_lambda)
+
+    # PyTorch information
+    torch_version, debug_mode_str = (
+        (torch.__version__, str(torch.version.debug))
+        if TORCH_AVAILABLE
+        else ("N/A", "N/A")
+    )
+
+    sys_version = sys.version.replace("\n", " ")
+
+    # Collected in the same order as the KunlunSystemEnv field declaration.
+    collected = {}
+    # General system information
+    collected["os"] = get_os(run_lambda)
+    collected["gcc_version"] = get_gcc_version(run_lambda)
+    collected["clang_version"] = get_clang_version(run_lambda)
+    collected["cmake_version"] = get_cmake_version(run_lambda)
+    collected["libc_version"] = get_libc_version()
+    collected["python_version"] = (
+        f"{sys_version} ({sys.maxsize.bit_length() + 1}-bit runtime)"
+    )
+    collected["python_platform"] = get_python_platform()
+    collected["pip_version"] = pip_version
+    collected["pip_packages"] = pip_list_output
+    collected["conda_packages"] = get_conda_packages(run_lambda)
+    collected["cpu_info"] = get_cpu_info(run_lambda)
+    # PyTorch information
+    collected["torch_version"] = torch_version
+    collected["is_debug_build"] = debug_mode_str
+    # Kunlun-specific information
+    collected["kunlun_xpu_info"] = get_kunlun_gpu_info(run_lambda)
+    collected["kunlun_driver_version"] = get_kunlun_driver_version(run_lambda)
+    collected["kunlun_xre_version"] = get_kunlun_xre_version(run_lambda)
+    collected["bkcl_version"] = get_bkcl_version(run_lambda)
+    collected["kunlun_topo"] = get_kunlun_topo(run_lambda)
+    # vLLM related
+    collected["vllm_version"] = get_vllm_version()
+    collected["vllm_kunlun_version"] = get_vllm_kunlun_version()
+    collected["env_vars"] = get_kunlun_env_vars()
+
+    return KunlunSystemEnv(**collected)
+
+
+# Output format template for the final report.
+# pretty_str() fills each {name} placeholder with the KunlunSystemEnv field
+# of the same name via str.format(); the trailing .strip() removes the blank
+# lines the triple-quoted literal would otherwise start and end with.
+kunlun_env_info_fmt = """
+==============================
+        System Info
+==============================
+OS                           : {os}
+GCC version                  : {gcc_version}
+Clang version                : {clang_version}
+CMake version                : {cmake_version}
+Libc version                 : {libc_version}
+==============================
+       PyTorch Info
+==============================
+PyTorch version              : {torch_version}
+Is debug build               : {is_debug_build}
+==============================
+      Python Environment
+==============================
+Python version               : {python_version}
+Python platform              : {python_platform}
+==============================
+    Kunlun / XPU Info
+==============================
+XPU models and configuration :
+{kunlun_xpu_info}
+Kunlun driver version        : {kunlun_driver_version}
+XRE (Runtime) version        : {kunlun_xre_version}
+BKCL version                 : {bkcl_version}
+XPU Topology:
+{kunlun_topo}
+==============================
+          CPU Info
+==============================
+{cpu_info}
+==============================
+Versions of relevant libraries
+==============================
+{pip_packages}
+{conda_packages}
+==============================
+      vLLM-Kunlun Info
+==============================
+vLLM Version                 : {vllm_version}
+vLLM-Kunlun Version          : {vllm_kunlun_version}
+==============================
+     Environment Variables
+==============================
+{env_vars}
+""".strip()
+
+
+def pretty_str(envinfo):
+    """
+    Render a KunlunSystemEnv as the human-readable report.
+
+    None fields are shown as "Could not collect". Each non-empty pip line
+    is prefixed with "[<pip_version>]" and each non-empty conda line with
+    "[conda]". An empty pip list becomes "No relevant packages" and an
+    empty conda list becomes an empty string.
+    """
+    # Replace None with "Could not collect"
+    fields = {
+        key: ("Could not collect" if value is None else value)
+        for key, value in envinfo._asdict().items()
+    }
+
+    # Handle pip package list
+    pip_text = fields["pip_packages"]
+    if not pip_text:
+        fields["pip_packages"] = "No relevant packages"
+    else:
+        tagged = [
+            f"[{envinfo.pip_version}] {line}"
+            for line in pip_text.split("\n")
+            if line
+        ]
+        fields["pip_packages"] = "\n".join(tagged)
+
+    # Handle conda package list
+    conda_text = fields["conda_packages"]
+    if not conda_text:
+        fields["conda_packages"] = ""
+    else:
+        tagged = [f"[conda] {line}" for line in conda_text.split("\n") if line]
+        fields["conda_packages"] = "\n".join(tagged)
+
+    return kunlun_env_info_fmt.format(**fields)
+
+
+def get_pretty_kunlun_env_info():
+    """Collect the environment information and return it as the formatted report string."""
+    collected = get_kunlun_env_info()
+    return pretty_str(collected)
+
+
+def main():
+    """Script entry point: announce the collection, then print the formatted report."""
+    print("Collecting Kunlun XPU environment information...")
+    print(get_pretty_kunlun_env_info())
+
+
+if __name__ == "__main__":
+    main()