diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml index 6132b2b67..2dacbb18e 100644 --- a/.github/ISSUE_TEMPLATE/1-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -20,7 +20,7 @@ body: attributes: label: Reproduction description: | - 1. What command or script did you run? + What command or script did you run? placeholder: | A placeholder for the command. validations: @@ -29,7 +29,7 @@ body: attributes: label: Environment description: | - Please provide necessary environment information here. + Please provide necessary environment information here with `python3 -m sglang.check_env`. placeholder: Environment here. render: Shell validations: diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py new file mode 100644 index 000000000..825a51c75 --- /dev/null +++ b/python/sglang/check_env.py @@ -0,0 +1,163 @@ +import importlib +import os +import subprocess +import sys +from collections import OrderedDict, defaultdict + +import torch + +# List of packages to check versions for +PACKAGE_LIST = [ + "sglang", + "flashinfer", + "aiohttp", + "fastapi", + "hf_transfer", + "huggingface_hub", + "interegular", + "packaging", + "pillow", + "psutil", + "pydantic", + "rpyc", + "uvicorn", + "uvloop", + "zmq", + "vllm", + "outlines", + "openai", + "tiktoken", + "anthropic", + "litellm", +] + + +def get_package_versions(packages): + """ + Get versions of specified packages. + """ + versions = {} + for package in packages: + package_name = package.split("==")[0].split(">=")[0].split("<=")[0] + try: + module = importlib.import_module(package_name) + if hasattr(module, "__version__"): + versions[package_name] = module.__version__ + except ModuleNotFoundError: + versions[package_name] = "Module Not Found" + return versions + + +def get_cuda_info(): + """ + Get CUDA-related information if available.
+ """ + cuda_info = {"CUDA available": torch.cuda.is_available()} + + if cuda_info["CUDA available"]: + cuda_info.update(_get_gpu_info()) + cuda_info.update(_get_cuda_version_info()) + + return cuda_info + + +def _get_gpu_info(): + """ + Get information about available GPUs. + """ + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + + return {f"GPU {','.join(device_ids)}": name for name, device_ids in devices.items()} + + +def _get_cuda_version_info(): + """ + Get CUDA version information. + """ + from torch.utils.cpp_extension import CUDA_HOME + + cuda_info = {"CUDA_HOME": CUDA_HOME} + + if CUDA_HOME and os.path.isdir(CUDA_HOME): + cuda_info.update(_get_nvcc_info()) + cuda_info.update(_get_cuda_driver_version()) + + return cuda_info + + +def _get_nvcc_info(): + """ + Get NVCC version information. + """ + from torch.utils.cpp_extension import CUDA_HOME + + try: + nvcc = os.path.join(CUDA_HOME, "bin/nvcc") + nvcc_output = ( + subprocess.check_output(f'"{nvcc}" -V', shell=True).decode("utf-8").strip() + ) + return { + "NVCC": nvcc_output[ + nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind("Build") + ].strip() + } + except subprocess.SubprocessError: + return {"NVCC": "Not Available"} + + +def _get_cuda_driver_version(): + """ + Get CUDA driver version. + """ + try: + output = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=driver_version", + "--format=csv,noheader,nounits", + ] + ) + return {"CUDA Driver Version": output.decode().strip()} + except subprocess.SubprocessError: + return {"CUDA Driver Version": "Not Available"} + + +def get_gpu_topology(): + """ + Get GPU topology information.
+ """ + try: + result = subprocess.run( + ["nvidia-smi", "topo", "-m"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + return "\n" + result.stdout if result.returncode == 0 else None + except subprocess.SubprocessError: + return None + + +def check_env(): + """ + Check and print environment information. + """ + env_info = OrderedDict() + env_info["Python"] = sys.version.replace("\n", "") + env_info.update(get_cuda_info()) + env_info["PyTorch"] = torch.__version__ + env_info.update(get_package_versions(PACKAGE_LIST)) + + gpu_topo = get_gpu_topology() + if gpu_topo: + env_info["NVIDIA Topology"] = gpu_topo + + for k, v in env_info.items(): + print(f"{k}: {v}") + + +if __name__ == "__main__": + check_env() diff --git a/python/sglang/srt/managers/controller/infer_batch.py b/python/sglang/srt/managers/controller/infer_batch.py index 5fd125756..6a03ba97c 100644 --- a/python/sglang/srt/managers/controller/infer_batch.py +++ b/python/sglang/srt/managers/controller/infer_batch.py @@ -327,8 +327,10 @@ class Batch: req_pool_indices = self.req_to_token_pool.alloc(bs) if req_pool_indices is None: - raise RuntimeError("Out of memory. " - "Please set a smaller number for `--max-running-requests`.") + raise RuntimeError( + "Out of memory. " + "Please set a smaller number for `--max-running-requests`."
+ ) req_pool_indices_cpu = req_pool_indices.cpu().numpy() for i in range(bs): diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py index ff76189a9..dd576f7a6 100644 --- a/python/sglang/srt/managers/controller/model_runner.py +++ b/python/sglang/srt/managers/controller/model_runner.py @@ -168,7 +168,10 @@ class ModelRunner: ) self.req_to_token_pool = ReqToTokenPool( - max(int(self.max_total_num_tokens / self.model_config.context_len * 512), 2048), + max( + int(self.max_total_num_tokens / self.model_config.context_len * 512), + 2048, + ), self.model_config.context_len + 8, ) self.token_to_kv_pool = TokenToKVPool( diff --git a/python/sglang/srt/memory_pool.py b/python/sglang/srt/memory_pool.py index 7d1813c6a..573771334 100644 --- a/python/sglang/srt/memory_pool.py +++ b/python/sglang/srt/memory_pool.py @@ -44,7 +44,14 @@ class ReqToTokenPool: class TokenToKVPool: """A memory pool that maps a token to its kv cache locations""" - def __init__(self, size: int, dtype: torch.dtype, head_num: int, head_dim: int, layer_num: int): + def __init__( + self, + size: int, + dtype: torch.dtype, + head_num: int, + head_dim: int, + layer_num: int, + ): self.size = size # We also add one slot. This slot is used for writing dummy output from padded tokens.