diff --git a/.github/Dockerfile.buildwheel b/.github/Dockerfile.buildwheel index 3374e8b9..cf5eaf2a 100644 --- a/.github/Dockerfile.buildwheel +++ b/.github/Dockerfile.buildwheel @@ -17,12 +17,10 @@ ARG PY_VERSION=3.11 FROM quay.io/ascend/manylinux:8.3.rc2-910b-manylinux_2_28-py${PY_VERSION} -ARG COMPILE_CUSTOM_KERNELS=1 ARG SOC_VERSION="ascend910b1" # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV SOC_VERSION=$SOC_VERSION RUN yum update -y && \ yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ diff --git a/.github/Dockerfile.nightly.a2 b/.github/Dockerfile.nightly.a2 index e7e395f6..a7945bdd 100644 --- a/.github/Dockerfile.nightly.a2 +++ b/.github/Dockerfile.nightly.a2 @@ -23,7 +23,6 @@ ARG AIS_BENCH_URL="https://gitee.com/aisbench/benchmark.git" # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} WORKDIR /workspace diff --git a/.github/Dockerfile.nightly.a3 b/.github/Dockerfile.nightly.a3 index 0012c544..70705b32 100644 --- a/.github/Dockerfile.nightly.a3 +++ b/.github/Dockerfile.nightly.a3 @@ -23,7 +23,6 @@ ARG AIS_BENCH_URL="https://gitee.com/aisbench/benchmark.git" # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} WORKDIR /workspace diff --git a/Dockerfile b/Dockerfile index 9511698a..bff22191 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,13 +18,11 @@ FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -ARG COMPILE_CUSTOM_KERNELS=1 ARG MOONCAKE_TAG="v0.3.7.post2" ARG SOC_VERSION="ascend910b1" # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV SOC_VERSION=$SOC_VERSION WORKDIR /workspace diff --git a/Dockerfile.310p b/Dockerfile.310p index 0245bb87..44476d87 100644 --- a/Dockerfile.310p +++ 
b/Dockerfile.310p @@ -18,12 +18,10 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -ARG COMPILE_CUSTOM_KERNELS=1 ARG SOC_VERSION="ascend310p1" # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV SOC_VERSION=$SOC_VERSION RUN apt-get update -y && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 13e38df5..6d03629a 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -18,10 +18,8 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -ARG COMPILE_CUSTOM_KERNELS=1 ARG SOC_VERSION="ascend310p1" -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV SOC_VERSION=$SOC_VERSION RUN yum update -y && \ diff --git a/Dockerfile.a3 b/Dockerfile.a3 index d10d3374..59975d99 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -18,14 +18,12 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -ARG COMPILE_CUSTOM_KERNELS=1 ARG MOONCAKE_TAG=v0.3.7.post2 ARG SOC_VERSION="ascend910_9391" COPY . 
/vllm-workspace/vllm-ascend/ # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV SOC_VERSION=$SOC_VERSION RUN pip config set global.index-url ${PIP_INDEX_URL} diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index a270fbf7..80a965bd 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -18,11 +18,9 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -ARG COMPILE_CUSTOM_KERNELS=1 ARG MOONCAKE_TAG="v0.3.7.post2" ARG SOC_VERSION="ascend910_9391" -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV SOC_VERSION=$SOC_VERSION RUN pip config set global.index-url ${PIP_INDEX_URL} diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 746667ff..7d509c0a 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -18,11 +18,9 @@ FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -ARG COMPILE_CUSTOM_KERNELS=1 ARG MOONCAKE_TAG="v0.3.7.post2" ARG SOC_VERSION="ascend910b1" -ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV SOC_VERSION=$SOC_VERSION RUN pip config set global.index-url ${PIP_INDEX_URL} diff --git a/docs/source/installation.md b/docs/source/installation.md index 375e8fac..aac8ee0e 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -162,12 +162,10 @@ pip install -v -e . cd .. ``` -vllm-ascend will build custom operators by default. If you don't want to build it, set `COMPILE_CUSTOM_KERNELS=0` environment to disable it. If you are building custom operators for Atlas A3, you should run `git submodule update --init --recursive` manually, or ensure your environment has Internet access. ::: ```{note} -If you are building from v0.7.3-dev and intend to use sleep mode feature, you should set `COMPILE_CUSTOM_KERNELS=1` manually. 
To build custom operators, gcc/g++ higher than 8 and c++ 17 or higher is required. If you're using `pip install -e .` and encounter a torch-npu version conflict, please install with `pip install --no-build-isolation -e .` to build on system env. If you encounter other problems during compiling, it is probably because unexpected compiler is being used, you may export `CXX_COMPILER` and `C_COMPILER` in environment to specify your g++ and gcc locations before compiling. ``` diff --git a/docs/source/user_guide/feature_guide/lora.md b/docs/source/user_guide/feature_guide/lora.md index 4678c024..a2322180 100644 --- a/docs/source/user_guide/feature_guide/lora.md +++ b/docs/source/user_guide/feature_guide/lora.md @@ -19,5 +19,3 @@ vllm serve meta-llama/Llama-2-7b \ ## Custom LoRA Operators We have implemented LoRA-related AscendC operators, such as bgmv_shrink, bgmv_expand, sgmv_shrink and sgmv_expand. You can find them under the "csrc/kernels" directory of [vllm-ascend repo](https://github.com/vllm-project/vllm-ascend.git). - -When you install vllm and vllm-ascend, those operators mentioned above will be compiled and installed automatically. If you do not want to use AscendC operators when you run vllm-ascend, you should set `COMPILE_CUSTOM_KERNELS=0` and reinstall vllm-ascend. For more instructions about installation and compilation, you can refer to [installation guide](../../installation.md). diff --git a/docs/source/user_guide/feature_guide/sleep_mode.md b/docs/source/user_guide/feature_guide/sleep_mode.md index 6fc36521..845e5a9b 100644 --- a/docs/source/user_guide/feature_guide/sleep_mode.md +++ b/docs/source/user_guide/feature_guide/sleep_mode.md @@ -23,7 +23,7 @@ The engine (v0/v1) supports two sleep levels to manage memory during idle period - Memory: The content of both the model weights and KV cache is forgotten. - Use Case: Ideal when switching to a different model or updating the current one. 
-Since this feature uses the low-level API [AscendCL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html), in order to use sleep mode, you should follow the [installation guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) and build from source. If you are using v0.7.3, remember to set `export COMPILE_CUSTOM_KERNELS=1`. For the latest version (v0.9.x+), the environment variable `COMPILE_CUSTOM_KERNELS` will be set to 1 by default while building from source. +Since this feature uses the low-level API [AscendCL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html), in order to use sleep mode, you should follow the [installation guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) and build from source. If you are using a version earlier than v0.12.0rc1, remember to set `export COMPILE_CUSTOM_KERNELS=1`. ## Usage diff --git a/setup.py b/setup.py index 890b5228..2f07b7c1 100644 --- a/setup.py +++ b/setup.py @@ -165,16 +165,10 @@ def gen_build_info(): assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend." device_type = soc_to_device[soc_version] - if device_type == "_310P" and not envs.COMPILE_CUSTOM_KERNELS: - raise ValueError( - "device type 310P only supports custom kernels. Please set COMPILE_CUSTOM_KERNELS=1 to enable custom kernels." 
- ) - package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py") with open(package_dir, "w+") as f: f.write('# Auto-generated file\n') f.write(f"__device_type__ = '{device_type}'\n") - f.write(f"__sleep_mode_enabled__ = {envs.COMPILE_CUSTOM_KERNELS}\n") logging.info(f"Generated _build_info.py with SOC version: {soc_version}") @@ -357,8 +351,6 @@ class cmake_build_ext(build_ext): ) def build_extensions(self) -> None: - if not envs.COMPILE_CUSTOM_KERNELS: - return # Ensure that CMake is present and working try: subprocess.check_output(["cmake", "--version"]) @@ -450,9 +442,7 @@ except LookupError: # only checks out the commit. In this case, we set a dummy version. VERSION = "0.0.0" -ext_modules = [] -if envs.COMPILE_CUSTOM_KERNELS: - ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")] +ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")] def get_path(*filepath) -> str: diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index 8ff1419e..31c01c23 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -43,16 +43,6 @@ class TestUtils(TestBase): 0): self.assertFalse(utils.is_enable_nz()) - def test_sleep_mode_enabled(self): - utils._SLEEP_MODE_ENABLED = None - with mock.patch("vllm_ascend._build_info.__sleep_mode_enabled__", - True): - self.assertTrue(utils.sleep_mode_enabled()) - utils._SLEEP_MODE_ENABLED = None - with mock.patch("vllm_ascend._build_info.__sleep_mode_enabled__", - False): - self.assertFalse(utils.sleep_mode_enabled()) - def test_nd_to_nz_2d(self): # can be divided by 16 input_tensor = torch.randn(32, 64) diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index 1b98b30d..fc4cc595 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -1,4 +1,3 @@ -import os import unittest from unittest.mock import MagicMock, patch @@ -216,68 +215,13 @@ class TestNPUWorker(TestBase): self.assertEqual(worker.cache_config.num_gpu_blocks, 100) 
self.assertEqual(worker.cache_config.num_cpu_blocks, 50) - @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") - @patch("vllm_ascend.worker.worker_v1.NPUPlatform") - @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") - @patch("vllm_ascend.worker.worker_v1.logger") - def test_sleep_mode_enabled(self, mock_logger, mock_allocator_class, - mock_platform, mock_sleep_mode_enabled): - """Test sleep method when sleep mode is enabled""" - from vllm_ascend.worker.worker_v1 import NPUWorker - - # Setup mock - mock_sleep_mode_enabled.return_value = True - mock_platform.mem_get_info.side_effect = [ - (1000, 2000), - (1200, 2000), - ] # before, after - mock_allocator = MagicMock() - mock_allocator_class.get_instance.return_value = mock_allocator - - # Create worker mock - with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): - worker = NPUWorker() - - # Test sleep method - worker.sleep(level=1) - - # Verify calls - mock_sleep_mode_enabled.assert_called_once() - mock_allocator.sleep.assert_called_once_with( - offload_tags=("weights", )) - self.assertEqual(mock_platform.mem_get_info.call_count, - 2) # Called 2 times in sleep method - # Verify log output - mock_logger.info.assert_called_once() - - @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") - def test_sleep_mode_disabled_raises_error(self, mock_sleep_mode_enabled): - """Test sleep method raises exception when sleep mode is disabled""" - from vllm_ascend.worker.worker_v1 import NPUWorker - - # Set sleep mode disabled - mock_sleep_mode_enabled.return_value = False - - # Create worker mock - with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): - worker = NPUWorker() - - # Test sleep method should raise exception - with self.assertRaises(ValueError) as cm: - worker.sleep() - - self.assertIn("Sleep mode is not enabled", str(cm.exception)) - - @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") @patch.dict("os.environ", 
{"VLLM_ASCEND_ENABLE_NZ": "0"}) - def test_wake_up_mode_enabled(self, mock_allocator_class, - mock_sleep_mode_enabled): + def test_wake_up_mode_enabled(self, mock_allocator_class): """Test wake_up method when sleep mode is enabled""" from vllm_ascend.worker.worker_v1 import NPUWorker # Setup mock - mock_sleep_mode_enabled.return_value = True mock_allocator = MagicMock() mock_allocator_class.get_instance.return_value = mock_allocator @@ -301,29 +245,8 @@ class TestNPUWorker(TestBase): # Test wake_up method worker.wake_up(tags=["test_tag"]) - # Verify calls - mock_sleep_mode_enabled.assert_called_once() mock_allocator.wake_up.assert_called_once_with(tags=["test_tag"]) - @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") - @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"}) - def test_wake_up_mode_disabled_raises_error(self, mock_sleep_mode_enabled): - """Test wake_up method raises exception when sleep mode is disabled""" - from vllm_ascend.worker.worker_v1 import NPUWorker - - # Set sleep mode disabled - mock_sleep_mode_enabled.return_value = False - - # Create worker mock - with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): - worker = NPUWorker() - - # Test wake_up method should raise exception - with self.assertRaises(ValueError) as cm: - worker.wake_up() - - self.assertIn("Sleep mode is not enabled", str(cm.exception)) - @patch( "vllm_ascend.worker.worker_v1.NPUWorker._init_worker_distributed_environment" ) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 60ccdf0d..d9376acc 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -36,12 +36,6 @@ env_variables: Dict[str, Callable[[], Any]] = { # Release, Debug, RelWithDebugInfo. If not set, the default value is Release. "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"), - # Whether to compile custom kernels. If not set, the default value is True. - # If set to False, the custom kernels will not be compiled. 
Please note that - # the sleep mode feature will be disabled as well if custom kernels are not - # compiled. - "COMPILE_CUSTOM_KERNELS": - lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))), # The CXX compiler used for compiling the package. If not set, the default # value is None, which means the system default CXX compiler will be used. "CXX_COMPILER": diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 7e207c7d..d55a2af7 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -49,7 +49,6 @@ ACL_FORMAT_FRACTAL_ND = 2 ACL_FORMAT_FRACTAL_NZ = 29 _CUSTOM_OP_ENABLED = None -_SLEEP_MODE_ENABLED = None _CURRENT_STREAM = None _PREFETCH_STREAM = None _SHARED_EXPERTS_CALCULATION_STREAM = None @@ -125,14 +124,6 @@ def is_enable_nz(): return envs_ascend.VLLM_ASCEND_ENABLE_NZ -def sleep_mode_enabled(): - global _SLEEP_MODE_ENABLED - if _SLEEP_MODE_ENABLED is None: - from vllm_ascend import _build_info # type: ignore - _SLEEP_MODE_ENABLED = _build_info.__sleep_mode_enabled__ - return _SLEEP_MODE_ENABLED - - def _round_up(x: int, align: int): # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc. 
# input: 15, 16 -> output: 16 diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 265e5211..6e28a503 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -54,7 +54,7 @@ from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton from vllm_ascend.platform import NPUPlatform from vllm_ascend.utils import (check_ascend_device_type, enable_sp, is_enable_nz, register_ascend_customop, - sleep_mode_enabled, try_register_lib) + try_register_lib) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402 @@ -129,7 +129,7 @@ class NPUWorker(WorkerBase): init_cached_hf_modules() self.profiler = self._init_profiler() - if sleep_mode_enabled(): + if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode: # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} @@ -140,10 +140,6 @@ class NPUWorker(WorkerBase): WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") def sleep(self, level: int = 1) -> None: - if not sleep_mode_enabled(): - raise ValueError( - "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1." - ) free_bytes_before_sleep = NPUPlatform.mem_get_info()[0] # Save the buffers before level 2 sleep if level == 2: @@ -164,11 +160,6 @@ class NPUWorker(WorkerBase): used_bytes / GiB_bytes) def wake_up(self, tags: Optional[list[str]] = None) -> None: - if not sleep_mode_enabled(): - raise ValueError( - "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1." - ) - if is_enable_nz(): raise ValueError( "FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "