Remove COMPILE_CUSTOM_KERNELS env (#4864)
With more and more custom ops merged, disabling `COMPILE_CUSTOM_KERNELS`
for vLLM Ascend seems useless now. Let's enable csrc compilation by default.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -36,12 +36,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# Release, Debug, RelWithDebugInfo. If not set, the default value is Release.
|
||||
"CMAKE_BUILD_TYPE":
|
||||
lambda: os.getenv("CMAKE_BUILD_TYPE"),
|
||||
# Whether to compile custom kernels. If not set, the default value is True.
|
||||
# If set to False, the custom kernels will not be compiled. Please note that
|
||||
# the sleep mode feature will be disabled as well if custom kernels are not
|
||||
# compiled.
|
||||
"COMPILE_CUSTOM_KERNELS":
|
||||
lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
|
||||
# The CXX compiler used for compiling the package. If not set, the default
|
||||
# value is None, which means the system default CXX compiler will be used.
|
||||
"CXX_COMPILER":
|
||||
|
||||
@@ -49,7 +49,6 @@ ACL_FORMAT_FRACTAL_ND = 2
|
||||
# Ascend ACL tensor-format identifier (numeric value defined by the CANN/ACL
# runtime; 29 denotes the FRACTAL_NZ layout).
ACL_FORMAT_FRACTAL_NZ = 29

# Lazily-initialized module-level caches; ``None`` means "not determined yet".
_CUSTOM_OP_ENABLED = None
# Populated on first call to sleep_mode_enabled() from the generated
# _build_info module.
_SLEEP_MODE_ENABLED = None
# NOTE(review): the three stream globals below are presumably cached NPU
# stream handles created on first use — confirm against their accessor
# functions, which are not visible in this chunk.
_CURRENT_STREAM = None
_PREFETCH_STREAM = None
_SHARED_EXPERTS_CALCULATION_STREAM = None
|
||||
@@ -125,14 +124,6 @@ def is_enable_nz():
|
||||
return envs_ascend.VLLM_ASCEND_ENABLE_NZ
|
||||
|
||||
|
||||
def sleep_mode_enabled():
    """Return True if vllm-ascend was compiled with sleep-mode support.

    The flag is read once from the build-time-generated ``_build_info``
    module and memoized in the module-level ``_SLEEP_MODE_ENABLED`` global,
    so the import cost is paid only on the first call.
    """
    global _SLEEP_MODE_ENABLED
    if _SLEEP_MODE_ENABLED is not None:
        return _SLEEP_MODE_ENABLED
    from vllm_ascend import _build_info  # type: ignore
    _SLEEP_MODE_ENABLED = _build_info.__sleep_mode_enabled__
    return _SLEEP_MODE_ENABLED
|
||||
|
||||
|
||||
def _round_up(x: int, align: int):
|
||||
# round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc.
|
||||
# input: 15, 16 -> output: 16
|
||||
|
||||
@@ -54,7 +54,7 @@ from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import (check_ascend_device_type, enable_sp,
|
||||
is_enable_nz, register_ascend_customop,
|
||||
sleep_mode_enabled, try_register_lib)
|
||||
try_register_lib)
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402
|
||||
@@ -129,7 +129,7 @@ class NPUWorker(WorkerBase):
|
||||
init_cached_hf_modules()
|
||||
|
||||
self.profiler = self._init_profiler()
|
||||
if sleep_mode_enabled():
|
||||
if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode:
|
||||
# Buffers saved before sleep
|
||||
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
||||
|
||||
@@ -140,10 +140,6 @@ class NPUWorker(WorkerBase):
|
||||
WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
|
||||
|
||||
def sleep(self, level: int = 1) -> None:
|
||||
if not sleep_mode_enabled():
|
||||
raise ValueError(
|
||||
"Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
|
||||
)
|
||||
free_bytes_before_sleep = NPUPlatform.mem_get_info()[0]
|
||||
# Save the buffers before level 2 sleep
|
||||
if level == 2:
|
||||
@@ -164,11 +160,6 @@ class NPUWorker(WorkerBase):
|
||||
used_bytes / GiB_bytes)
|
||||
|
||||
def wake_up(self, tags: Optional[list[str]] = None) -> None:
|
||||
if not sleep_mode_enabled():
|
||||
raise ValueError(
|
||||
"Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
|
||||
)
|
||||
|
||||
if is_enable_nz():
|
||||
raise ValueError(
|
||||
"FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "
|
||||
|
||||
Reference in New Issue
Block a user