[Misc] Add collect_env.py scripts for bug reporting (#175)

### What this PR does / why we need it? Add `collect_env.py` scripts from vLLM and remove `nvidia`, `gpu`, `cuda` related codes, thus users of vllm-ascend can collect their env info when reporting bugs. ### Does this PR introduce _any_ user-facing change? no. ### How was this patch tested? Run `python collect_env.py` works Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-03-04 14:14:37 +08:00
parent 839dac8d60
commit b9f0e25c16
2 changed files with 504 additions and 3 deletions
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -14,9 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      npu-smi info
-      cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/collect_env.py
+++ b/collect_env.py
@@ -0,0 +1,503 @@
+#
+# Adapted from https://github.com/vllm-project/vllm/blob/main/collect_env.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import locale
+import os
+import re
+import subprocess
+import sys
+from collections import namedtuple
+
+from vllm.envs import environment_variables
+
+try:
+    import torch
+    TORCH_AVAILABLE = True
+except (ImportError, NameError, AttributeError, OSError):
+    TORCH_AVAILABLE = False
+
+# System Environment Information
+SystemEnv = namedtuple(
+    'SystemEnv',
+    [
+        'torch_version',
+        'is_debug_build',
+        'gcc_version',
+        'clang_version',
+        'cmake_version',
+        'os',
+        'libc_version',
+        'python_version',
+        'python_platform',
+        'pip_version',  # 'pip' or 'pip3'
+        'pip_packages',
+        'conda_packages',
+        'is_xnnpack_available',
+        'cpu_info',
+        'vllm_version',  # vllm specific field
+        'vllm_build_flags',  # vllm specific field
+        'env_vars',
+        'npu_info',  # ascend specific field
+        'cann_info',  # ascend specific field
+    ])
+
+DEFAULT_CONDA_PATTERNS = {
+    "torch",
+    "numpy",
+    "soumith",
+    "mkl",
+    "magma",
+    "triton",
+    "optree",
+    "transformers",
+    "zmq",
+    "pynvml",
+}
+
+DEFAULT_PIP_PATTERNS = {
+    "torch",
+    "numpy",
+    "mypy",
+    "flake8",
+    "triton",
+    "optree",
+    "onnx",
+    "transformers",
+    "zmq",
+    "pynvml",
+}
+
+
+def run(command):
+    """Return (return-code, stdout, stderr)."""
+    shell = True if type(command) is str else False
+    p = subprocess.Popen(command,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE,
+                         shell=shell)
+    raw_output, raw_err = p.communicate()
+    rc = p.returncode
+    if get_platform() == 'win32':
+        enc = 'oem'
+    else:
+        enc = locale.getpreferredencoding()
+    output = raw_output.decode(enc)
+    err = raw_err.decode(enc)
+    return rc, output.strip(), err.strip()
+
+
+def run_and_read_all(run_lambda, command):
+    """Run command using run_lambda; reads and returns entire output if rc is 0."""
+    rc, out, _ = run_lambda(command)
+    if rc != 0:
+        return None
+    return out
+
+
+def run_and_parse_first_match(run_lambda, command, regex):
+    """Run command using run_lambda, returns the first regex match if it exists."""
+    rc, out, _ = run_lambda(command)
+    if rc != 0:
+        return None
+    match = re.search(regex, out)
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def run_and_return_first_line(run_lambda, command):
+    """Run command using run_lambda and returns first line if output is not empty."""
+    rc, out, _ = run_lambda(command)
+    if rc != 0:
+        return None
+    return out.split('\n')[0]
+
+
+def get_conda_packages(run_lambda, patterns=None):
+    if patterns is None:
+        patterns = DEFAULT_CONDA_PATTERNS
+    conda = os.environ.get('CONDA_EXE', 'conda')
+    out = run_and_read_all(run_lambda, "{} list".format(conda))
+    if out is None:
+        return out
+
+    return "\n".join(line for line in out.splitlines()
+                     if not line.startswith("#") and any(name in line
+                                                         for name in patterns))
+
+
+def get_gcc_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')
+
+
+def get_clang_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, 'clang --version',
+                                     r'clang version (.*)')
+
+
+def get_cmake_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, 'cmake --version',
+                                     r'cmake (.*)')
+
+
+def get_vllm_version():
+    from vllm import __version__, __version_tuple__
+
+    if __version__ == "dev":
+        return "N/A (dev)"
+
+    if len(__version_tuple__) == 4:  # dev build
+        git_sha = __version_tuple__[-1][1:]  # type: ignore
+        return f"{__version__} (git sha: {git_sha}"
+
+    return __version__
+
+
+def get_cpu_info(run_lambda):
+    rc, out, err = 0, '', ''
+    if get_platform() == 'linux':
+        rc, out, err = run_lambda('lscpu')
+    elif get_platform() == 'win32':
+        rc, out, err = run_lambda(
+            'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
+        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE'
+        )
+    elif get_platform() == 'darwin':
+        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
+    cpu_info = 'None'
+    if rc == 0:
+        cpu_info = out
+    else:
+        cpu_info = err
+    return cpu_info
+
+
+def get_platform():
+    if sys.platform.startswith('linux'):
+        return 'linux'
+    elif sys.platform.startswith('win32'):
+        return 'win32'
+    elif sys.platform.startswith('cygwin'):
+        return 'cygwin'
+    elif sys.platform.startswith('darwin'):
+        return 'darwin'
+    else:
+        return sys.platform
+
+
+def get_mac_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion',
+                                     r'(.*)')
+
+
+def get_windows_version(run_lambda):
+    system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
+    wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic')
+    findstr_cmd = os.path.join(system_root, 'System32', 'findstr')
+    return run_and_read_all(
+        run_lambda,
+        '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd))
+
+
+def get_lsb_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, 'lsb_release -a',
+                                     r'Description:\t(.*)')
+
+
+def check_release_file(run_lambda):
+    return run_and_parse_first_match(run_lambda, 'cat /etc/*-release',
+                                     r'PRETTY_NAME="(.*)"')
+
+
+def get_os(run_lambda):
+    from platform import machine
+    platform = get_platform()
+
+    if platform == 'win32' or platform == 'cygwin':
+        return get_windows_version(run_lambda)
+
+    if platform == 'darwin':
+        version = get_mac_version(run_lambda)
+        if version is None:
+            return None
+        return 'macOS {} ({})'.format(version, machine())
+
+    if platform == 'linux':
+        # Ubuntu/Debian based
+        desc = get_lsb_version(run_lambda)
+        if desc is not None:
+            return '{} ({})'.format(desc, machine())
+
+        # Try reading /etc/*-release
+        desc = check_release_file(run_lambda)
+        if desc is not None:
+            return '{} ({})'.format(desc, machine())
+
+        return '{} ({})'.format(platform, machine())
+
+    # Unknown platform
+    return platform
+
+
+def get_python_platform():
+    import platform
+    return platform.platform()
+
+
+def get_libc_version():
+    import platform
+    if get_platform() != 'linux':
+        return 'N/A'
+    return '-'.join(platform.libc_ver())
+
+
+def get_pip_packages(run_lambda, patterns=None):
+    """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
+    if patterns is None:
+        patterns = DEFAULT_PIP_PATTERNS
+
+    # People generally have `pip` as `pip` or `pip3`
+    # But here it is invoked as `python -mpip`
+    def run_with_pip(pip):
+        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
+        return "\n".join(line for line in out.splitlines()
+                         if any(name in line for name in patterns))
+
+    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
+    out = run_with_pip([sys.executable, '-mpip'])
+
+    return pip_version, out
+
+
+def summarize_vllm_build_flags():
+    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
+    return 'ROCm: {}; Neuron: {}'.format(
+        'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
+        'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled',
+    )
+
+
+def is_xnnpack_available():
+    if TORCH_AVAILABLE:
+        import torch.backends.xnnpack
+        return str(
+            torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]
+    else:
+        return "N/A"
+
+
+def get_npu_info(run_lambda):
+    return run_and_read_all(run_lambda, 'npu-smi info')
+
+
+def get_cann_info(run_lambda):
+    out = run_and_read_all(run_lambda, 'lscpu | grep Architecture:')
+    cpu_arch = str(out).split()[-1]
+    return run_and_read_all(
+        run_lambda,
+        'cat /usr/local/Ascend/ascend-toolkit/latest/{}-linux/ascend_toolkit_install.info'
+        .format(cpu_arch))
+
+
+def get_env_vars():
+    env_vars = ''
+    secret_terms = ('secret', 'token', 'api', 'access', 'password')
+    report_prefix = ("TORCH", "PYTORCH", "ASCEND_")
+    for k, v in os.environ.items():
+        if any(term in k.lower() for term in secret_terms):
+            continue
+        if k in environment_variables:
+            env_vars = env_vars + "{}={}".format(k, v) + "\n"
+        if k.startswith(report_prefix):
+            env_vars = env_vars + "{}={}".format(k, v) + "\n"
+
+    return env_vars
+
+
+def get_env_info():
+    run_lambda = run
+    pip_version, pip_list_output = get_pip_packages(run_lambda)
+
+    if TORCH_AVAILABLE:
+        version_str = torch.__version__
+        debug_mode_str = str(torch.version.debug)
+    else:
+        version_str = debug_mode_str = 'N/A'
+
+    sys_version = sys.version.replace("\n", " ")
+
+    conda_packages = get_conda_packages(run_lambda)
+
+    vllm_version = get_vllm_version()
+    vllm_build_flags = summarize_vllm_build_flags()
+
+    return SystemEnv(
+        torch_version=version_str,
+        is_debug_build=debug_mode_str,
+        python_version='{} ({}-bit runtime)'.format(
+            sys_version,
+            sys.maxsize.bit_length() + 1),
+        python_platform=get_python_platform(),
+        pip_version=pip_version,
+        pip_packages=pip_list_output,
+        conda_packages=conda_packages,
+        os=get_os(run_lambda),
+        libc_version=get_libc_version(),
+        gcc_version=get_gcc_version(run_lambda),
+        clang_version=get_clang_version(run_lambda),
+        cmake_version=get_cmake_version(run_lambda),
+        is_xnnpack_available=is_xnnpack_available(),
+        cpu_info=get_cpu_info(run_lambda),
+        vllm_version=vllm_version,
+        vllm_build_flags=vllm_build_flags,
+        env_vars=get_env_vars(),
+        npu_info=get_npu_info(run_lambda),
+        cann_info=get_cann_info(run_lambda),
+    )
+
+
+env_info_fmt = """
+PyTorch version: {torch_version}
+Is debug build: {is_debug_build}
+
+OS: {os}
+GCC version: {gcc_version}
+Clang version: {clang_version}
+CMake version: {cmake_version}
+Libc version: {libc_version}
+
+Python version: {python_version}
+Python platform: {python_platform}
+Is XNNPACK available: {is_xnnpack_available}
+
+CPU:
+{cpu_info}
+
+Versions of relevant libraries:
+{pip_packages}
+{conda_packages}
+""".strip()
+
+# both the above code and the following code use `strip()` to
+# remove leading/trailing whitespaces, so we need to add a newline
+# in between to separate the two sections
+env_info_fmt += "\n"
+
+env_info_fmt += """
+vLLM Version: {vllm_version}
+vLLM Build Flags:
+{vllm_build_flags}
+
+{env_vars}
+NPU:
+{npu_info}
+
+CANN:
+{cann_info}
+""".strip()
+
+
+def pretty_str(envinfo):
+
+    def replace_nones(dct, replacement='Could not collect'):
+        for key in dct.keys():
+            if dct[key] is not None:
+                continue
+            dct[key] = replacement
+        return dct
+
+    def replace_bools(dct, true='Yes', false='No'):
+        for key in dct.keys():
+            if dct[key] is True:
+                dct[key] = true
+            elif dct[key] is False:
+                dct[key] = false
+        return dct
+
+    def prepend(text, tag='[prepend]'):
+        lines = text.split('\n')
+        updated_lines = [tag + line for line in lines]
+        return '\n'.join(updated_lines)
+
+    def replace_if_empty(text, replacement='No relevant packages'):
+        if text is not None and len(text) == 0:
+            return replacement
+        return text
+
+    def maybe_start_on_next_line(string):
+        # If `string` is multiline, prepend a \n to it.
+        if string is not None and len(string.split('\n')) > 1:
+            return '\n{}\n'.format(string)
+        return string
+
+    mutable_dict = envinfo._asdict()
+
+    # Replace True with Yes, False with No
+    mutable_dict = replace_bools(mutable_dict)
+
+    # Replace all None objects with 'Could not collect'
+    mutable_dict = replace_nones(mutable_dict)
+
+    # If either of these are '', replace with 'No relevant packages'
+    mutable_dict['pip_packages'] = replace_if_empty(
+        mutable_dict['pip_packages'])
+    mutable_dict['conda_packages'] = replace_if_empty(
+        mutable_dict['conda_packages'])
+
+    # Tag conda and pip packages with a prefix
+    # If they were previously None, they'll show up as ie '[conda] Could not collect'
+    if mutable_dict['pip_packages']:
+        mutable_dict['pip_packages'] = prepend(
+            mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
+    if mutable_dict['conda_packages']:
+        mutable_dict['conda_packages'] = prepend(
+            mutable_dict['conda_packages'], '[conda] ')
+    mutable_dict['cpu_info'] = envinfo.cpu_info
+    mutable_dict['npu_info'] = envinfo.npu_info
+    mutable_dict['cann_info'] = envinfo.cann_info
+    return env_info_fmt.format(**mutable_dict)
+
+
+def get_pretty_env_info():
+    return pretty_str(get_env_info())
+
+
+def main():
+    print("Collecting environment information...")
+    output = get_pretty_env_info()
+    print(output)
+
+    if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
+            torch.utils, '_crash_handler'):
+        minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
+        if sys.platform == "linux" and os.path.exists(minidump_dir):
+            dumps = [
+                os.path.join(minidump_dir, dump)
+                for dump in os.listdir(minidump_dir)
+            ]
+            latest = max(dumps, key=os.path.getctime)
+            ctime = os.path.getctime(latest)
+            creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
+                '%Y-%m-%d %H:%M:%S')
+            msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \
+                  "if this is related to your bug please include it when you file a report ***"
+            print(msg, file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()