import importlib.util
import io
import logging
import os
import re
import subprocess
import sys
from shutil import which
from typing import Dict, List

import torch
import torch_musa
from packaging.version import Version, parse
from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from torch.utils.cpp_extension import CUDA_HOME
from torch_musa.utils.simple_porting import SimplePorting
from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension


def load_module_from_path(module_name, path):
    """Load the Python source file at *path* as a module named *module_name*.

    The new module object is stored in ``sys.modules`` under *module_name*
    before its code is executed, and is returned after execution.
    """
    module_spec = importlib.util.spec_from_file_location(module_name, path)
    loaded = importlib.util.module_from_spec(module_spec)
    sys.modules[module_name] = loaded
    module_spec.loader.exec_module(loaded)
    return loaded


# Directory containing this setup script; all relative paths start here.
ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__)

# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))

VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE

# vLLM only supports Linux platform
assert sys.platform.startswith(
    "linux"), "vLLM only supports Linux platform (including WSL)."
# CUDA release whose builds carry no "+cuXYZ" local version suffix.
MAIN_CUDA_VERSION = "12.1"


def is_sccache_available() -> bool:
    """Return True if an ``sccache`` executable is found on PATH."""
    return which("sccache") is not None


def is_ccache_available() -> bool:
    """Return True if a ``ccache`` executable is found on PATH."""
    return which("ccache") is not None


def is_ninja_available() -> bool:
    """Return True if a ``ninja`` executable is found on PATH."""
    return which("ninja") is not None


def remove_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if absent.

    Hand-rolled instead of ``str.removeprefix`` because the package still
    declares support for Python 3.8.
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


class CMakeExtension(Extension):
    """Extension with no sources that records the CMakeLists.txt directory."""

    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
        super().__init__(name, sources=[], **kwa)
        # Stored as an absolute path; used as the key in
        # ``cmake_build_ext.did_config``.
        self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)


# Native extension built via torch_musa's MUSAExtension from csrc_musa/.
ext_modules = []
ext_modules.append(
    MUSAExtension(
        name="vllm_C",
        sources=[
            "csrc_musa/cache_kernels.mu",
            "csrc_musa/attention/attention_kernels.mu",
            "csrc_musa/pos_encoding_kernels.mu",
            "csrc_musa/activation_kernels.mu",
            "csrc_musa/layernorm_kernels.mu",
            "csrc_musa/musa_utils_kernels.mu",
            "csrc_musa/moe_align_block_size_kernels.mu",
            "csrc_musa/pybind.cpp",
            "csrc_musa/custom_all_reduce.mu",
        ],
        extra_compile_args={
            "cxx": ['-O3', '-std=c++17'],
        },
    )
)


class cmake_build_ext(build_ext):
    """``build_ext`` command that configures and builds extensions via CMake.

    NOTE: the ``setup()`` call at the bottom of this file registers
    ``BuildExtension`` as the ``build_ext`` command, not this class.
    """

    # A dict of extension directories that have been configured.
    did_config: Dict[str, bool] = {}

    #
    # Determine number of compilation jobs and optionally nvcc compile threads.
    #
    def compute_num_jobs(self):
        """Return ``(num_jobs, nvcc_threads)``.

        ``nvcc_threads`` is always ``None`` in this version of the script.
        """
        # `num_jobs` is either the value of the MAX_JOBS environment variable
        # (if defined) or the number of CPUs available.
        num_jobs = envs.MAX_JOBS
        if num_jobs is not None:
            num_jobs = int(num_jobs)
            logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
        else:
            try:
                # os.sched_getaffinity() isn't universally available, so fall
                # back to os.cpu_count() if we get an error here.
                num_jobs = len(os.sched_getaffinity(0))
            except AttributeError:
                # os.cpu_count() may return None; never pass "-j None".
                num_jobs = os.cpu_count() or 1

        nvcc_threads = None

        return num_jobs, nvcc_threads

    #
    # Perform cmake configuration for a single extension.
    #
    def configure(self, ext: CMakeExtension) -> None:
        """Run the CMake configure step for *ext* once per CMakeLists dir."""
        # If we've already configured using the CMakeLists.txt for
        # this extension, exit early.
        if ext.cmake_lists_dir in cmake_build_ext.did_config:
            return

        cmake_build_ext.did_config[ext.cmake_lists_dir] = True

        # Select the build type.
        # Note: optimization level + debug info are set by the build type
        default_cfg = "Debug" if self.debug else "RelWithDebInfo"
        cfg = envs.CMAKE_BUILD_TYPE or default_cfg

        # where .so files will be written, should be the same for all
        # extensions that use the same CMakeLists.txt.
        outdir = os.path.abspath(
            os.path.dirname(self.get_ext_fullpath(ext.name)))

        cmake_args = [
            f'-DCMAKE_BUILD_TYPE={cfg}',
            f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={outdir}',
            f'-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={self.build_temp}',
            f'-DVLLM_TARGET_DEVICE={VLLM_TARGET_DEVICE}',
        ]

        verbose = envs.VERBOSE
        if verbose:
            cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']

        # Prefer sccache over ccache when both are installed.
        if is_sccache_available():
            cmake_args += [
                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
            ]
        elif is_ccache_available():
            cmake_args += [
                '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
            ]

        # Pass the python executable to cmake so it can find an exact
        # match.
        cmake_args += [f'-DVLLM_PYTHON_EXECUTABLE={sys.executable}']

        if _install_punica():
            cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON']

        #
        # Setup parallelism and build tool
        #
        num_jobs, nvcc_threads = self.compute_num_jobs()

        if nvcc_threads:
            cmake_args += [f'-DNVCC_THREADS={nvcc_threads}']

        if is_ninja_available():
            build_tool = ['-G', 'Ninja']
            cmake_args += [
                '-DCMAKE_JOB_POOL_COMPILE:STRING=compile',
                f'-DCMAKE_JOB_POOLS:STRING=compile={num_jobs}',
            ]
        else:
            # Default build tool to whatever cmake picks.
            build_tool = []

        subprocess.check_call(
            ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
            cwd=self.build_temp)

    def build_extensions(self) -> None:
        """Configure and build every extension in ``self.extensions``.

        Raises:
            RuntimeError: if the ``cmake`` executable cannot be run.
        """
        # Ensure that CMake is present and working
        try:
            subprocess.check_output(['cmake', '--version'])
        except OSError as e:
            raise RuntimeError('Cannot find CMake executable') from e

        # Create build directory if it does not exist.
        os.makedirs(self.build_temp, exist_ok=True)

        # Build all the extensions
        for ext in self.extensions:
            self.configure(ext)

            # The CMake target name is the extension name without "vllm.".
            ext_target_name = remove_prefix(ext.name, "vllm.")

            num_jobs, _ = self.compute_num_jobs()

            build_args = [
                '--build', '.', '--target', ext_target_name, '-j',
                str(num_jobs)
            ]

            subprocess.check_call(['cmake', *build_args], cwd=self.build_temp)


def _is_cuda() -> bool:
    """True when targeting "cuda" with a CUDA torch build and no Neuron."""
    return VLLM_TARGET_DEVICE == "cuda" \
            and torch.version.cuda is not None \
            and not _is_neuron()


def _is_musa() -> bool:
    """True when targeting "musa" and ``torch.version.musa`` is set."""
    return VLLM_TARGET_DEVICE == "musa" \
            and torch.version.musa is not None


def _is_hip() -> bool:
    """True when targeting "cuda" or "rocm" with a HIP torch build."""
    return (VLLM_TARGET_DEVICE == "cuda"
            or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None


def _is_neuron() -> bool:
    """True if ``neuron-ls`` runs successfully or the env flag is set.

    The flag is ``envs.VLLM_BUILD_WITH_NEURON``.
    """
    torch_neuronx_installed = True
    try:
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except (FileNotFoundError, PermissionError,
            subprocess.CalledProcessError):
        torch_neuronx_installed = False
    return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON


def _is_cpu() -> bool:
    """True when the target device is "cpu"."""
    return VLLM_TARGET_DEVICE == "cpu"


def _install_punica() -> bool:
    """Return the value of ``envs.VLLM_INSTALL_PUNICA_KERNELS``."""
    return envs.VLLM_INSTALL_PUNICA_KERNELS


def get_hipcc_rocm_version():
    """Return the HIP version reported by ``hipcc --version``.

    Returns ``None`` (after printing a message) if the command exits with a
    non-zero status or its output has no "HIP version: ..." entry.
    """
    # Run the hipcc --version command
    result = subprocess.run(['hipcc', '--version'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            text=True)

    # Check if the command was executed successfully
    if result.returncode != 0:
        print("Error running 'hipcc --version'")
        return None

    # Extract the version using a regular expression
    match = re.search(r'HIP version: (\S+)', result.stdout)
    if match:
        # Return the version string
        return match.group(1)
    else:
        print("Could not find HIP version in the output")
        return None


def get_neuronxcc_version():
    """Return the ``__version__`` string of the installed neuronxcc package.

    The value is read from ``neuronxcc/version/__init__.py`` under the
    interpreter's ``purelib`` directory.

    Raises:
        RuntimeError: if the file contains no ``__version__ = '...'`` line.
    """
    import sysconfig
    site_dir = sysconfig.get_paths()["purelib"]
    version_file = os.path.join(site_dir, "neuronxcc", "version",
                                "__init__.py")

    # Read the version module's source instead of importing the package.
    with open(version_file, "rt") as fp:
        content = fp.read()

    # Extract the version using a regular expression
    match = re.search(r"__version__ = '(\S+)'", content)
    if match:
        # Return the version string
        return match.group(1)
    else:
        raise RuntimeError(
            "Could not find neuronxcc version in " + version_file)


def get_mcc_musa_version() -> Version:
    """Get the CUDA version from nvcc.

    Despite its name, this function runs ``nvcc -V`` under ``CUDA_HOME``; in
    this file it is only called from the CUDA branch of
    ``get_vllm_version``.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    assert CUDA_HOME is not None, "CUDA_HOME is not set"
    nvcc_output = subprocess.check_output(
        [os.path.join(CUDA_HOME, "bin", "nvcc"), "-V"],
        universal_newlines=True)
    output = nvcc_output.split()
    # The version is the token after "release", e.g. "12.1," -> "12.1".
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


def get_path(*filepath) -> str:
    """Join *filepath* components onto ``ROOT_DIR``."""
    return os.path.join(ROOT_DIR, *filepath)


def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py

    Raises:
        RuntimeError: if no ``__version__ = "..."`` assignment is found.
    """
    with open(filepath) as fp:
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                                  fp.read(), re.M)
        if version_match:
            return version_match.group(1)
        raise RuntimeError("Unable to find version string.")


def get_vllm_version() -> str:
    """Return the package version with a backend-specific local suffix.

    The base version comes from ``vllm/__init__.py``; a ``+<backend>``
    suffix is appended depending on which backend predicate matches first.

    Raises:
        RuntimeError: if no backend matches, or the HIP version cannot be
            determined for a ROCm build.
    """
    version = find_version(get_path("vllm", "__init__.py"))

    if _is_cuda():
        cuda_version = str(get_mcc_musa_version())
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
            version += f"+cu{cuda_version_str}"
    elif _is_musa():
        version += "+musa"
    elif _is_hip():
        # Get the HIP version
        hipcc_version = get_hipcc_rocm_version()
        if hipcc_version is None:
            # Fail clearly rather than with AttributeError on None.replace.
            raise RuntimeError(
                "Unable to determine the HIP version from 'hipcc --version'")
        if hipcc_version != MAIN_CUDA_VERSION:
            rocm_version_str = hipcc_version.replace(".", "")[:3]
            version += f"+rocm{rocm_version_str}"
    elif _is_neuron():
        # Get the Neuron version
        neuron_version = str(get_neuronxcc_version())
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"+neuron{neuron_version_str}"
    elif _is_cpu():
        version += "+cpu"
    else:
        raise RuntimeError("Unknown runtime environment")

    return version


def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
        # Context manager so the file handle is closed deterministically.
        with io.open(p, "r", encoding="utf-8") as fp:
            return fp.read()
    else:
        return ""


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt.

    Raises:
        ValueError: if no supported backend is detected.
    """

    def _read_requirements(filename: str) -> List[str]:
        """Read *filename* under ROOT_DIR, expanding ``-r <file>`` lines."""
        with open(get_path(filename)) as f:
            requirements = f.read().strip().split("\n")
        resolved_requirements = []
        for line in requirements:
            if line.startswith("-r "):
                resolved_requirements += _read_requirements(line.split()[1])
            else:
                resolved_requirements.append(line)
        return resolved_requirements

    if _is_cuda():
        requirements = _read_requirements("requirements-cuda.txt")
        cuda_major = torch.version.cuda.split(".")[0]
        # Point the vllm-nccl requirement at torch's CUDA major version.
        requirements = [
            req.replace("vllm-nccl-cu12", f"vllm-nccl-cu{cuda_major}")
            if "vllm-nccl-cu12" in req else req for req in requirements
        ]
    elif _is_musa():
        requirements = _read_requirements("requirements-musa.txt")
    elif _is_hip():
        requirements = _read_requirements("requirements-rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("requirements-neuron.txt")
    elif _is_cpu():
        requirements = _read_requirements("requirements-cpu.txt")
    else:
        raise ValueError(
            "Unsupported platform, please use CUDA, MUSA, ROCm, Neuron, "
            "or CPU.")
    return requirements


# NOTE: the CMake-based extension list below is disabled in this script;
# `ext_modules` is populated with a MUSAExtension earlier in the file.
# ext_modules = []

# if _is_cuda() or _is_musa():
#     ext_modules.append(CMakeExtension(name="vllm._moe_C"))

#     if _install_punica():
#         ext_modules.append(CMakeExtension(name="vllm._punica_C"))

# if not _is_neuron():
#     ext_modules.append(CMakeExtension(name="vllm._C"))

package_data = {
    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
}
# if envs.VLLM_USE_PRECOMPILED:
#     ext_modules = []
#     package_data["vllm"].append("*.so")

setup(
    name="vllm",
    version=get_vllm_version(),
    author="vLLM Team",
    license="Apache 2.0",
    description=("A high-throughput and memory-efficient inference and "
                 "serving engine for LLMs"),
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                    "tests*")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    extras_require={
        "tensorizer": ["tensorizer==2.9.0"],
    },
    # torch_musa's BuildExtension drives the build when extensions exist.
    cmdclass={"build_ext": BuildExtension} if ext_modules else {},
    package_data=package_data,
)