xc-llm-ascend/setup.py
Rui Kang 427b17e2da [Misc] Add a model loader that utilizes HCCL for weight loading (#2888)
### What this PR does / why we need it?

This PR introduces a new model loader, Netloader, which uses high-bandwidth
P2P transfers between NPU cards to load model weights. Netloader is
implemented as a plugin through the `register_model_loader` function added in
vLLM 0.10. It accelerates weight loading by sending weights from an already
loaded model (the server) to the empty model of a newly started instance (the
client). The server runs concurrently with normal inference tasks in
sub-threads, using vLLM's `stateless_init_torch_distributed_process_group`.
The client verifies that its model and partitioning scheme match the
server's, then initiates a transfer request and receives the weights via
HCCL send/recv in the order they are stored in the model.
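
For context, registering a loader under a custom load format looks roughly like this (a minimal sketch of the `register_model_loader` plugin API; the Netloader internals, i.e. HCCL setup, the server thread, and the weight send/recv loop, are omitted, and the exact base-class method signatures may differ between vLLM versions):

```python
from vllm.model_executor.model_loader import register_model_loader
from vllm.model_executor.model_loader.base_loader import BaseModelLoader


@register_model_loader("netloader")
class NetLoader(BaseModelLoader):
    """Receives weights over HCCL from an already loaded peer instance."""

    def download_model(self, model_config) -> None:
        # Nothing to download: weights arrive over the network from a
        # serving peer rather than from disk or a remote repository.
        pass

    def load_weights(self, model, model_config) -> None:
        # Sketch: verify the model and partitioning scheme against the
        # server, then receive each parameter via HCCL send/recv in the
        # order the parameters are stored in the model.
        ...
```

Instances started with `--load-format=netloader` are then routed to this loader.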

Application scenarios:
1. Significantly reduces inference instance startup time. By reusing the
weights of already loaded instances and transferring them at high speed
directly between computing cards, this method lowers model loading latency
compared to traditional remote/local pull methods.
2. Reduces network and storage pressure. It avoids repeatedly downloading
weight files from remote repositories, reducing the load on centralized
storage and network traffic and thereby improving overall system stability
and service quality.
3. Improves resource utilization and reduces costs. Accelerating the loading
process reduces reliance on redundant computing pools, allowing computing
resources to be elastically scaled and reclaimed as needed.
4. Enhances business continuity and high availability. In fault-recovery
scenarios, new instances can quickly take over existing services, avoiding
prolonged business interruptions and improving the system's availability and
user experience.

### Does this PR introduce _any_ user-facing change?

Netloader is activated through the existing `--load-format=netloader` and
`--model-loader-extra-config` options. As before,
`--model-loader-extra-config` must be passed as a JSON string.

Afterwards, you can check that the outputs for the same prompt are
consistent when the temperature is set to 0.
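
For example (a hypothetical offline invocation; the model name is a placeholder and the Netloader extra-config schema is not reproduced here):

```python
from vllm import LLM, SamplingParams

# Client instance: receive weights over HCCL from an already running
# server instance instead of pulling them from disk or a remote repo.
# On the CLI this corresponds to --load-format=netloader plus
# --model-loader-extra-config '<JSON string>'.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model name
    load_format="netloader",
)

# With temperature=0, the same prompt should produce output identical to
# the server instance's, which makes for a quick consistency check.
params = SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(["What is vLLM?"], params)
print(outputs[0].outputs[0].text)
```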

Signed-off-by: destinysky <kangrui10@126.com>

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

2025-10-23 15:56:07 +08:00


#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from https://github.com/vllm-project/vllm/blob/main/setup.py
#
import importlib.util
import logging
import os
import subprocess
import sys
from sysconfig import get_paths
from typing import Dict, List
from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from setuptools.command.build_py import build_py
from setuptools.command.develop import develop
from setuptools.command.install import install
from setuptools_scm import get_version
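# Import a module directly from its file path so that vllm_ascend/envs.py can
# be read before the vllm_ascend package itself is importable.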
def load_module_from_path(module_name, path):
spec = importlib.util.spec_from_file_location(module_name, path)
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
spec.loader.exec_module(module)
return module
ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__)
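# Append -D<env_name>=<value> to cmake_args, falling back to default_path when
# the corresponding environment variable is unset.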
def check_or_set_default_env(cmake_args,
env_name,
env_variable,
default_path=""):
if env_variable is None:
        logging.warning(
            f"No {env_name} found in your environment. Please set {env_name} "
            "if you have customized the installation path of this library; "
            "otherwise, a default path will be used for this build.")
logging.warning(f"Set default {env_name}: {default_path}")
env_variable = default_path
else:
logging.info(f"Found existing {env_name}: {env_variable}")
    # The CANN package checks this environment variable during the cmake run, so write it back.
if env_name == "ASCEND_HOME_PATH":
os.environ["ASCEND_HOME_PATH"] = env_variable
cmake_args += [f"-D{env_name}={env_variable}"]
return cmake_args
envs = load_module_from_path("envs",
os.path.join(ROOT_DIR, "vllm_ascend", "envs.py"))
class CMakeExtension(Extension):
def __init__(self,
name: str,
cmake_lists_dir: str = ".",
**kwargs) -> None:
super().__init__(name, sources=[], py_limited_api=False, **kwargs)
self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
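# build_py hook that writes vllm_ascend/_build_info.py (SOC version and
# sleep-mode flag) before the normal Python build runs.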
class custom_build_info(build_py):
def run(self):
soc_version = envs.SOC_VERSION
if not soc_version:
raise ValueError(
"SOC version is not set. Please set SOC_VERSION environment variable."
)
if "310" in soc_version and not envs.COMPILE_CUSTOM_KERNELS:
raise ValueError(
"SOC version 310 only supports custom kernels. Please set COMPILE_CUSTOM_KERNELS=1 to enable custom kernels."
)
package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py")
with open(package_dir, "w+") as f:
f.write('# Auto-generated file\n')
f.write(f"__soc_version__ = '{soc_version}'\n")
f.write(
f"__sleep_mode_enabled__ = {envs.COMPILE_CUSTOM_KERNELS}\n")
logging.info(
f"Generated _build_info.py with SOC version: {soc_version}")
super().run()
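# build_ext hook that drives the CMake configure/build/install cycle for the
# C++/AscendC extensions.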
class cmake_build_ext(build_ext):
# A dict of extension directories that have been configured.
did_config: Dict[str, bool] = {}
#
# Determine number of compilation jobs
#
def compute_num_jobs(self):
# `num_jobs` is either the value of the MAX_JOBS environment variable
# (if defined) or the number of CPUs available.
num_jobs = envs.MAX_JOBS
if num_jobs is not None:
num_jobs = int(num_jobs)
logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
else:
try:
# os.sched_getaffinity() isn't universally available, so fall
# back to os.cpu_count() if we get an error here.
num_jobs = len(os.sched_getaffinity(0))
except AttributeError:
num_jobs = os.cpu_count()
num_jobs = max(1, num_jobs)
return num_jobs
#
# Perform cmake configuration for a single extension.
#
def configure(self, ext: CMakeExtension) -> None:
build_temp = self.build_temp
os.makedirs(build_temp, exist_ok=True)
source_dir = os.path.abspath(ROOT_DIR)
python_executable = sys.executable
cmake_args = ["cmake"]
# Default use release mode to compile the csrc code
# Turbo now support compiled with Release, Debug and RelWithDebugInfo
if envs.CMAKE_BUILD_TYPE is None or envs.CMAKE_BUILD_TYPE not in [
"Debug",
"Release",
"RelWithDebugInfo",
]:
envs.CMAKE_BUILD_TYPE = "Release"
cmake_args += [f"-DCMAKE_BUILD_TYPE={envs.CMAKE_BUILD_TYPE}"]
# Default dump the compile commands for lsp
cmake_args += ["-DCMAKE_EXPORT_COMPILE_COMMANDS=1"]
if envs.CXX_COMPILER is not None:
cmake_args += [f"-DCMAKE_CXX_COMPILER={envs.CXX_COMPILER}"]
if envs.C_COMPILER is not None:
cmake_args += [f"-DCMAKE_C_COMPILER={envs.C_COMPILER}"]
if envs.VERBOSE:
cmake_args += ["-DCMAKE_VERBOSE_MAKEFILE=ON"]
# find ASCEND_HOME_PATH
check_or_set_default_env(
cmake_args,
"ASCEND_HOME_PATH",
envs.ASCEND_HOME_PATH,
"/usr/local/Ascend/ascend-toolkit/latest",
)
# find PYTHON_EXECUTABLE
check_or_set_default_env(cmake_args, "PYTHON_EXECUTABLE",
sys.executable)
# find PYTHON_INCLUDE_PATH
check_or_set_default_env(cmake_args, "PYTHON_INCLUDE_PATH",
get_paths()["include"])
# ccache and ninja can not be applied at ascendc kernels now
try:
# if pybind11 is installed via pip
pybind11_cmake_path = (subprocess.check_output(
[python_executable, "-m", "pybind11",
"--cmakedir"]).decode().strip())
        except subprocess.CalledProcessError as e:
            # pybind11 is not importable here (it may have been installed
            # from source on the CI container), so fail with a clear error.
            raise RuntimeError(
                f"Failed to locate the pybind11 CMake directory: {e}")
install_path = os.path.join(ROOT_DIR, self.build_lib)
if isinstance(self.distribution.get_command_obj("develop"), develop):
install_path = os.path.join(ROOT_DIR, "vllm_ascend")
# add CMAKE_INSTALL_PATH
cmake_args += [f"-DCMAKE_INSTALL_PREFIX={install_path}"]
cmake_args += [f"-DCMAKE_PREFIX_PATH={pybind11_cmake_path}"]
cmake_args += [f"-DSOC_VERSION={envs.SOC_VERSION}"]
# Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles,
# and plays more nicely with sccache.
# To override this, set the FETCHCONTENT_BASE_DIR environment variable.
fc_base_dir = os.path.join(ROOT_DIR, ".deps")
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]
        # Locate the torch-npu installation directory for CMake.
        torch_npu_command = "python3 -m pip show torch-npu | grep '^Location:' | awk '{print $2}'"
        try:
            torch_npu_path = subprocess.check_output(
                torch_npu_command, shell=True).decode().strip()
            torch_npu_path += "/torch_npu"
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                f"Failed to retrieve the torch-npu installation path: {e}")
# add TORCH_NPU_PATH
cmake_args += [f"-DTORCH_NPU_PATH={torch_npu_path}"]
        build_tool = []
        # TODO(ganyi): ninja and ccache support for ascend c auto codegen. now we can only use make build
        # if which('ninja') is not None:
        #     build_tool += ['-G', 'Ninja']
        # Default build tool to whatever cmake picks.
        cmake_args += [*build_tool, source_dir]
        logging.info(f"cmake config command: {cmake_args}")
        try:
            subprocess.check_call(cmake_args, cwd=self.build_temp)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"CMake configuration failed: {e}")
def build_extensions(self) -> None:
if not envs.COMPILE_CUSTOM_KERNELS:
return
# Ensure that CMake is present and working
try:
subprocess.check_output(["cmake", "--version"])
except OSError as e:
raise RuntimeError(f"Cannot find CMake executable: {e}")
# Create build directory if it does not exist.
if not os.path.exists(self.build_temp):
os.makedirs(self.build_temp)
targets = []
os.makedirs(os.path.join(self.build_lib, "vllm_ascend"), exist_ok=True)
def target_name(s: str) -> str:
return s.removeprefix("vllm_ascend.")
# Build all the extensions
for ext in self.extensions:
self.configure(ext)
targets.append(target_name(ext.name))
num_jobs = self.compute_num_jobs()
build_args = [
"--build",
".",
f"-j={num_jobs}",
*[f"--target={name}" for name in targets],
]
try:
subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
        except (OSError, subprocess.CalledProcessError) as e:
            raise RuntimeError(f"Build library failed: {e}")
# Install the libraries
install_args = [
"cmake",
"--install",
".",
]
try:
subprocess.check_call(install_args, cwd=self.build_temp)
        except (OSError, subprocess.CalledProcessError) as e:
            raise RuntimeError(f"Install library failed: {e}")
# copy back to build folder for editable build
if isinstance(self.distribution.get_command_obj("develop"), develop):
import shutil
for root, _, files in os.walk(self.build_temp):
for file in files:
if file.endswith(".so"):
src_path = os.path.join(root, file)
dst_path = os.path.join(self.build_lib, "vllm_ascend",
file)
shutil.copy(src_path, dst_path)
print(f"Copy: {src_path} -> {dst_path}")
def run(self):
        # Run the standard build_ext command to compile the extensions.
super().run()
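# install hook that forces build_ext to run first, so compiled extensions are
# included when installing from source.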
class custom_install(install):
def run(self):
self.run_command("build_ext")
install.run(self)
try:
VERSION = get_version(write_to="vllm_ascend/_version.py")
except LookupError:
# The checkout action in github action CI does not checkout the tag. It
# only checks out the commit. In this case, we set a dummy version.
VERSION = "0.0.0"
ext_modules = []
if envs.COMPILE_CUSTOM_KERNELS:
ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]
def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath)
def read_readme() -> str:
"""Read the README file if present."""
p = get_path("README.md")
if os.path.isfile(p):
with open(get_path("README.md"), encoding="utf-8") as f:
return f.read()
else:
return ""
def get_requirements() -> List[str]:
"""Get Python package dependencies from requirements.txt."""
def _read_requirements(filename: str) -> List[str]:
with open(get_path(filename)) as f:
requirements = f.read().strip().split("\n")
resolved_requirements = []
for line in requirements:
if line.startswith("-r "):
resolved_requirements += _read_requirements(line.split()[1])
elif line.startswith("--"):
continue
else:
resolved_requirements.append(line)
return resolved_requirements
    try:
        requirements = _read_requirements("requirements.txt")
    except (ValueError, FileNotFoundError):
        print("Failed to read requirements.txt in vllm_ascend.")
        requirements = []
    return requirements
cmdclass = {
"build_py": custom_build_info,
"build_ext": cmake_build_ext,
"install": custom_install
}
setup(
name="vllm_ascend",
# Follow:
# https://packaging.python.org/en/latest/specifications/version-specifiers
version=VERSION,
author="vLLM-Ascend team",
license="Apache 2.0",
description="vLLM Ascend backend plugin",
long_description=read_readme(),
long_description_content_type="text/markdown",
url="https://github.com/vllm-project/vllm-ascend",
project_urls={
"Homepage": "https://github.com/vllm-project/vllm-ascend",
},
# TODO: Add 3.12 back when torch-npu support 3.12
classifiers=[
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
],
packages=find_packages(exclude=("docs", "examples", "tests*", "csrc")),
python_requires=">=3.9",
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass=cmdclass,
extras_require={},
entry_points={
"vllm.platform_plugins": ["ascend = vllm_ascend:register"],
"vllm.general_plugins": [
"ascend_enhanced_model = vllm_ascend:register_model",
"ascend_kv_connector = vllm_ascend:register_connector",
"ascend_model_loader = vllm_ascend:register_model_loader"
],
},
)