# SPDX-License-Identifier: Apache-2.0
# vLLM-Kunlun Environment Information Collection Tool (Fixed Version)
"""
Environment information collection script for Kunlun XPU

Fixed the following issues:
1. Device name displayed as "GPU" → Now correctly shows "P800 OAM"
2. XRE version command error → Now parsed from xpu-smi output
3. vLLM-Kunlun version hardcoded → Now fetched from pip package metadata
"""

import os
import re
import subprocess
import sys
from collections import namedtuple

# =============================================================================
# Part 1: Basic Utility Functions
# =============================================================================


def run(command):
    """
    Execute a shell command and return the result.

    Args:
        command: Command as a string (executed through the shell, so pipes
            and globs work) or a list of argv tokens (executed directly).

    Returns:
        tuple: (return_code, stdout, stderr). Both streams are decoded as
        UTF-8 and stripped. If the executable does not exist, returns
        (127, "", "Command not found") instead of raising.
    """
    # String commands need the shell (they may contain pipes/redirects);
    # list commands are safer run without one.
    shell = isinstance(command, str)
    try:
        p = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,  # Capture standard output
            stderr=subprocess.PIPE,  # Capture error output
            shell=shell,
        )
        raw_output, raw_err = p.communicate()
        rc = p.returncode
        # Decode byte stream to string
        output = raw_output.decode("utf-8").strip()
        err = raw_err.decode("utf-8").strip()
        return rc, output, err
    except FileNotFoundError:
        # 127 is the conventional shell exit code for "command not found".
        return 127, "", "Command not found"


def run_and_read_all(run_lambda, command):
    """Execute command, return its stdout if successful, None otherwise."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out


def run_and_parse_first_match(run_lambda, command, regex):
    """Execute command and return the first regex capture group, or None."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    match = re.search(regex, out)
    if match is None:
        return None
    return match.group(1)


# Check if PyTorch is available.  Kunlun's torch plugin can fail with more
# than ImportError while loading its native libraries, hence the broad tuple.
try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False


# =============================================================================
# Part 2: General System Information Collection (Reusing vLLM Original Logic)
# =============================================================================


def get_platform():
    """Return a normalized platform name ("linux", "win32", "darwin", ...)."""
    if sys.platform.startswith("linux"):
        return "linux"
    elif sys.platform.startswith("win32"):
        return "win32"
    elif sys.platform.startswith("darwin"):
        return "darwin"
    return sys.platform


def get_os(run_lambda):
    """Get detailed operating system information (distro name + arch)."""
    from platform import machine

    if get_platform() == "linux":
        # Try reading /etc/*-release
        rc, out, _ = run_lambda(
            "cat /etc/*-release 2>/dev/null | grep PRETTY_NAME | head -1"
        )
        if rc == 0 and out:
            match = re.search(r'PRETTY_NAME="(.*)"', out)
            if match:
                return f"{match.group(1)} ({machine()})"
        # Fallback: use lsb_release
        rc, out, _ = run_lambda("lsb_release -d 2>/dev/null")
        if rc == 0 and out:
            match = re.search(r"Description:\s*(.*)", out)
            if match:
                return f"{match.group(1)} ({machine()})"
    return f"{get_platform()} ({machine()})"


def get_gcc_version(run_lambda):
    """Get GCC version."""
    return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")


def get_clang_version(run_lambda):
    """Get Clang version."""
    return run_and_parse_first_match(
        run_lambda, "clang --version", r"clang version (.*)"
    )


def get_cmake_version(run_lambda):
    """Get CMake version."""
    return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")


def get_libc_version():
    """Get libc version (Linux only)."""
    import platform

    if get_platform() != "linux":
        return "N/A"
    return "-".join(platform.libc_ver())


def get_python_platform():
    """Get Python platform information."""
    import platform

    return platform.platform()


def get_cpu_info(run_lambda):
    """Get CPU information via lscpu (Linux only)."""
    if get_platform() == "linux":
        rc, out, err = run_lambda("lscpu")
        return out if rc == 0 else err
    return "N/A"


def get_pip_packages(run_lambda, patterns=None):
    """
    Get the pip package list, filtered to Kunlun/vLLM-relevant packages.

    Returns:
        tuple: (pip_version_label, filtered_package_lines)
    """
    if patterns is None:
        patterns = {
            "torch",
            "numpy",
            "triton",
            "transformers",
            "vllm",
            "kunlun",
            "xpu",
            "bkcl",
            "xmlir",
        }
    cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
    out = run_and_read_all(run_lambda, cmd)
    if out is None:
        return "pip3", ""
    filtered = "\n".join(
        line
        for line in out.splitlines()
        if any(name.lower() in line.lower() for name in patterns)
    )
    # Use version_info rather than indexing the human-readable version string.
    pip_version = "pip3" if sys.version_info.major == 3 else "pip"
    return pip_version, filtered


def get_conda_packages(run_lambda, patterns=None):
    """Get the conda package list, filtered like get_pip_packages. None on failure."""
    if patterns is None:
        patterns = {
            "torch",
            "numpy",
            "triton",
            "transformers",
            "kunlun",
            "xpu",
            "bkcl",
            "xmlir",
        }
    conda = os.environ.get("CONDA_EXE", "conda")
    out = run_and_read_all(run_lambda, [conda, "list"])
    if out is None:
        return None
    return "\n".join(
        line
        for line in out.splitlines()
        if not line.startswith("#")
        and any(name.lower() in line.lower() for name in patterns)
    )


# =============================================================================
# Part 3: Kunlun-Specific Information Collection (Core Fix)
# =============================================================================


def parse_xpu_smi_output(run_lambda):
    """
    Parse the complete output of the xpu-smi command.

    The xpu-smi output format is similar to nvidia-smi; it is parsed with
    regular expressions. Example output format::

        +------------------------------------------------------------------+
        | XPU-SMI Driver Version: 515.58       XPU-RT Version: N/A         |
        |---------------------------+----------------------+---------------+
        | 0 P800 OAM           N/A  | 00000000:52:00.0 N/A |             0 |
        | N/A 43C N/A 85W / 400W    | 4MiB / 98304MiB      |    0% Default |

    Returns:
        dict: {"raw_output", "driver_version", "xre_version", "devices"} or
        None when xpu-smi is unavailable. "xre_version" is None when the
        header shows "N/A". Each device dict has id, name, bus_id,
        memory_used_mib, memory_total_mib.
    """
    rc, output, _ = run_lambda("xpu-smi")
    if rc != 0 or not output:
        return None

    result = {
        "raw_output": output,
        "driver_version": None,
        "xre_version": None,
        "devices": [],
    }

    # Parse header: Driver Version and XPU-RT Version
    # Format: | XPU-SMI Driver Version: 515.58 XPU-RT Version: N/A |
    header_match = re.search(
        r"Driver Version:\s*(\S+)\s+XPU-RT Version:\s*(\S+)", output
    )
    if header_match:
        result["driver_version"] = header_match.group(1)
        xre = header_match.group(2)
        result["xre_version"] = xre if xre != "N/A" else None

    # Parse device information
    # Format: | 0 P800 OAM N/A | 00000000:52:00.0 N/A |
    # Following: | N/A 43C N/A 85W / 400W | 4MiB / 98304MiB |
    # Find all device lines (containing device ID and name)
    device_pattern = re.compile(
        r"\|\s*(\d+)\s+(\S+(?:\s+\S+)?)\s+(?:N/A|On|Off)\s*\|"  # ID and Name
        r"\s*([0-9a-fA-F:\.]+)\s*"  # Bus-Id
    )
    # Find memory information
    memory_pattern = re.compile(
        r"\|\s*N/A\s+\d+C\s+N/A\s+\d+W\s*/\s*\d+W\s*\|"
        r"\s*(\d+)MiB\s*/\s*(\d+)MiB\s*\|"  # Memory-Usage / Total
    )

    lines = output.split("\n")
    i = 0
    while i < len(lines):
        line = lines[i]
        device_match = device_pattern.search(line)
        if device_match:
            device_id = int(device_match.group(1))
            device_name = device_match.group(2).strip()
            bus_id = device_match.group(3)
            # The line following a device row carries its memory figures.
            memory_used = 0
            memory_total = 0
            if i + 1 < len(lines):
                mem_match = memory_pattern.search(lines[i + 1])
                if mem_match:
                    memory_used = int(mem_match.group(1))
                    memory_total = int(mem_match.group(2))
            result["devices"].append(
                {
                    "id": device_id,
                    "name": device_name,  # This will correctly get "P800 OAM"
                    "bus_id": bus_id,
                    "memory_used_mib": memory_used,
                    "memory_total_mib": memory_total,
                }
            )
        i += 1

    return result


def get_kunlun_gpu_info(run_lambda):
    """
    Get Kunlun XPU device information.

    [Fix Explanation]
    Previously used torch.cuda.get_device_properties() to get the name, but
    it only returns "GPU" (because Kunlun masquerades as CUDA). Now parse
    xpu-smi output to correctly get "P800 OAM".

    Returns:
        str: Device information string, one device per line, or None.
    """
    parsed = parse_xpu_smi_output(run_lambda)
    if parsed and parsed["devices"]:
        # Get real device name from xpu-smi parsing
        lines = []
        for dev in parsed["devices"]:
            memory_gb = dev["memory_total_mib"] / 1024
            # Correctly display: XPU 0: P800 OAM (96.0GB)
            lines.append(f"XPU {dev['id']}: {dev['name']} ({memory_gb:.1f}GB)")
        return "\n".join(lines)

    # Fallback: Use PyTorch interface (but will display as GPU)
    if TORCH_AVAILABLE:
        try:
            device_count = torch.cuda.device_count()
            lines = []
            for i in range(device_count):
                props = torch.cuda.get_device_properties(i)
                name = props.name if hasattr(props, "name") else "Kunlun XPU"
                memory_gb = (
                    props.total_memory / (1024**3)
                    if hasattr(props, "total_memory")
                    else 0
                )
                lines.append(f"XPU {i}: {name} ({memory_gb:.1f}GB)")
            return "\n".join(lines)
        except Exception as e:
            return f"Error: {e}"
    return None


def get_kunlun_driver_version(run_lambda):
    """
    Get the Kunlun driver version.

    [Fix Explanation]
    Parse directly from the xpu-smi output header instead of calling
    incorrect commands.

    Returns:
        str: Driver version, e.g., "515.58", or None.
    """
    parsed = parse_xpu_smi_output(run_lambda)
    if parsed and parsed["driver_version"]:
        return parsed["driver_version"]
    return None


def get_kunlun_xre_version(run_lambda):
    """
    Get the Kunlun XRE (Runtime) version.

    [Fix Explanation]
    Previously used `xpu-smi --version` but that parameter doesn't exist.
    Now parse the "XPU-RT Version" field from the xpu-smi header.

    Returns:
        str: XRE version, or a placeholder when the header shows N/A.
    """
    parsed = parse_xpu_smi_output(run_lambda)
    if parsed and parsed["xre_version"]:
        return parsed["xre_version"]
    return "N/A (not installed or not detected)"


def get_kunlun_topo(run_lambda):
    """
    Get Kunlun XPU topology information.

    Returns:
        str: Topology matrix, a device-count summary, or None.
    """
    # xpu-smi topo -m command can get topology
    output = run_and_read_all(run_lambda, "xpu-smi topo -m")
    if output:
        return output
    # Fallback: Show device count
    if TORCH_AVAILABLE:
        try:
            count = torch.cuda.device_count()
            return f"Detected {count} Kunlun XPU device(s)"
        except Exception:
            pass
    return None


def get_bkcl_version(run_lambda):
    """
    Get BKCL (communication library) version.

    BKCL = Baidu Kunlun Communication Library — similar to NVIDIA's NCCL,
    used for multi-card communication.

    Returns:
        str: BKCL location / version line, or None.
    """
    # Method 1: BKCL prints its version when loading, e.g.
    # [WARN][BKCL][globals.cpp:268] xccl version: 6ab4ffb [rdma] ...
    # Here we try locating libbkcl.so next to torch_xmlir instead.
    try:
        import torch_xmlir

        bkcl_path = None
        if hasattr(torch_xmlir, "__file__"):
            base = os.path.dirname(torch_xmlir.__file__)
            candidate = os.path.join(base, "libbkcl.so")
            if os.path.exists(candidate):
                bkcl_path = candidate
        if bkcl_path:
            return f"Found at: {bkcl_path}"
    except ImportError:
        pass
    # Method 2: Search from ldconfig
    rc, out, _ = run_lambda("ldconfig -p 2>/dev/null | grep -i bkcl | head -1")
    if rc == 0 and out:
        return out
    return None


def get_vllm_kunlun_version():
    """
    Get the vLLM-Kunlun version.

    [Fix Explanation]
    Previously got hardcoded version "0.9.2" from vllm_kunlun.platforms.version,
    but the actual pip-installed version is "0.1.0". Now prioritize
    importlib.metadata to get the real installed version.

    Returns:
        str: Version number, or "N/A".
    """
    # Method 1 (recommended): Use importlib.metadata (Python 3.8+)
    try:
        from importlib.metadata import version

        return version("vllm-kunlun")
    except Exception:
        pass
    # Method 2: Use pkg_resources
    try:
        import pkg_resources

        return pkg_resources.get_distribution("vllm-kunlun").version
    except Exception:
        pass
    # Method 3 (fallback): Get from code (may be inaccurate)
    try:
        from vllm_kunlun.platforms.version import get_xvllm_version

        return get_xvllm_version() + " (from code, may be inaccurate)"
    except ImportError:
        pass
    return "N/A"


def get_vllm_version():
    """Get the vLLM main package version, or "N/A"."""
    try:
        from importlib.metadata import version

        return version("vllm")
    except Exception:
        pass
    try:
        from vllm import __version__

        return __version__
    except ImportError:
        pass
    return "N/A"


# =============================================================================
# Part 4: Environment Variable Collection
# =============================================================================


def get_kunlun_env_vars():
    """Get Kunlun-related environment variables, redacting secret-like keys."""
    env_vars = ""
    kunlun_prefixes = (
        "XPU",
        "KUNLUN",
        "BKCL",
        "XCCL",
        "XRE",
        "TORCH",
        "VLLM",
    )
    secret_terms = ("secret", "token", "api", "access", "password")
    for k, v in sorted(os.environ.items()):
        # Never print anything that looks like a credential.
        if any(term in k.lower() for term in secret_terms):
            continue
        if any(k.upper().startswith(prefix) for prefix in kunlun_prefixes):
            env_vars += f"{k}={v}\n"
    return env_vars


# =============================================================================
# Part 5: Define Data Structure and Formatted Output
# =============================================================================

KunlunSystemEnv = namedtuple(
    "KunlunSystemEnv",
    [
        # General system information
        "os",
        "gcc_version",
        "clang_version",
        "cmake_version",
        "libc_version",
        "python_version",
        "python_platform",
        "pip_version",
        "pip_packages",
        "conda_packages",
        "cpu_info",
        # PyTorch information
        "torch_version",
        "is_debug_build",
        # Kunlun-specific information
        "kunlun_xpu_info",
        "kunlun_driver_version",
        "kunlun_xre_version",
        "bkcl_version",
        "kunlun_topo",
        # vLLM related
        "vllm_version",
        "vllm_kunlun_version",
        "env_vars",
    ],
)


def get_kunlun_env_info():
    """Collect all environment information into a KunlunSystemEnv tuple."""
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)

    # PyTorch information
    if TORCH_AVAILABLE:
        torch_version = torch.__version__
        debug_mode_str = str(torch.version.debug)
    else:
        torch_version = "N/A"
        debug_mode_str = "N/A"

    sys_version = sys.version.replace("\n", " ")

    return KunlunSystemEnv(
        # General system information
        os=get_os(run_lambda),
        gcc_version=get_gcc_version(run_lambda),
        clang_version=get_clang_version(run_lambda),
        cmake_version=get_cmake_version(run_lambda),
        libc_version=get_libc_version(),
        python_version=f"{sys_version} ({sys.maxsize.bit_length() + 1}-bit runtime)",
        python_platform=get_python_platform(),
        pip_version=pip_version,
        pip_packages=pip_list_output,
        conda_packages=get_conda_packages(run_lambda),
        cpu_info=get_cpu_info(run_lambda),
        # PyTorch information
        torch_version=torch_version,
        is_debug_build=debug_mode_str,
        # Kunlun-specific information
        kunlun_xpu_info=get_kunlun_gpu_info(run_lambda),
        kunlun_driver_version=get_kunlun_driver_version(run_lambda),
        kunlun_xre_version=get_kunlun_xre_version(run_lambda),
        bkcl_version=get_bkcl_version(run_lambda),
        kunlun_topo=get_kunlun_topo(run_lambda),
        # vLLM related
        vllm_version=get_vllm_version(),
        vllm_kunlun_version=get_vllm_kunlun_version(),
        env_vars=get_kunlun_env_vars(),
    )


# Output format template
kunlun_env_info_fmt = """
==============================
System Info
==============================
OS : {os}
GCC version : {gcc_version}
Clang version : {clang_version}
CMake version : {cmake_version}
Libc version : {libc_version}

==============================
PyTorch Info
==============================
PyTorch version : {torch_version}
Is debug build : {is_debug_build}

==============================
Python Environment
==============================
Python version : {python_version}
Python platform : {python_platform}

==============================
Kunlun / XPU Info
==============================
XPU models and configuration : {kunlun_xpu_info}
Kunlun driver version : {kunlun_driver_version}
XRE (Runtime) version : {kunlun_xre_version}
BKCL version : {bkcl_version}
XPU Topology:
{kunlun_topo}

==============================
CPU Info
==============================
{cpu_info}

==============================
Versions of relevant libraries
==============================
{pip_packages}
{conda_packages}

==============================
vLLM-Kunlun Info
==============================
vLLM Version : {vllm_version}
vLLM-Kunlun Version : {vllm_kunlun_version}

==============================
Environment Variables
==============================
{env_vars}
""".strip()


def pretty_str(envinfo):
    """Render a KunlunSystemEnv as the human-readable report string."""
    mutable_dict = envinfo._asdict()

    # Replace None with "Could not collect"
    for key in mutable_dict:
        if mutable_dict[key] is None:
            mutable_dict[key] = "Could not collect"

    # Handle pip package list
    if mutable_dict["pip_packages"]:
        mutable_dict["pip_packages"] = "\n".join(
            f"[{envinfo.pip_version}] {line}"
            for line in mutable_dict["pip_packages"].split("\n")
            if line
        )
    else:
        mutable_dict["pip_packages"] = "No relevant packages"

    # Handle conda package list
    if mutable_dict["conda_packages"]:
        mutable_dict["conda_packages"] = "\n".join(
            f"[conda] {line}"
            for line in mutable_dict["conda_packages"].split("\n")
            if line
        )
    else:
        mutable_dict["conda_packages"] = ""

    return kunlun_env_info_fmt.format(**mutable_dict)


def get_pretty_kunlun_env_info():
    """Get formatted environment information."""
    return pretty_str(get_kunlun_env_info())


def main():
    """Main entry point."""
    print("Collecting Kunlun XPU environment information...")
    output = get_pretty_kunlun_env_info()
    print(output)


if __name__ == "__main__":
    main()