diff --git a/.github/Dockerfile.buildwheel b/.github/Dockerfile.buildwheel
index 3374e8b9..cf5eaf2a 100644
--- a/.github/Dockerfile.buildwheel
+++ b/.github/Dockerfile.buildwheel
@@ -17,12 +17,10 @@
 ARG PY_VERSION=3.11
 FROM quay.io/ascend/manylinux:8.3.rc2-910b-manylinux_2_28-py${PY_VERSION}
 
-ARG COMPILE_CUSTOM_KERNELS=1
 ARG SOC_VERSION="ascend910b1"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION
 RUN yum update -y && \
     yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
diff --git a/.github/Dockerfile.nightly.a2 b/.github/Dockerfile.nightly.a2
index e7e395f6..a7945bdd 100644
--- a/.github/Dockerfile.nightly.a2
+++ b/.github/Dockerfile.nightly.a2
@@ -23,7 +23,6 @@ ARG AIS_BENCH_URL="https://gitee.com/aisbench/benchmark.git"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 
 WORKDIR /workspace
 
diff --git a/.github/Dockerfile.nightly.a3 b/.github/Dockerfile.nightly.a3
index 0012c544..70705b32 100644
--- a/.github/Dockerfile.nightly.a3
+++ b/.github/Dockerfile.nightly.a3
@@ -23,7 +23,6 @@ ARG AIS_BENCH_URL="https://gitee.com/aisbench/benchmark.git"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 
 WORKDIR /workspace
 
diff --git a/Dockerfile b/Dockerfile
index 9511698a..bff22191 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,13 +18,11 @@
 FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
-ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG="v0.3.7.post2"
 ARG SOC_VERSION="ascend910b1"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION
 
 WORKDIR /workspace
diff --git a/Dockerfile.310p b/Dockerfile.310p
index 0245bb87..44476d87 100644
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -18,12 +18,10 @@
 FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
-ARG COMPILE_CUSTOM_KERNELS=1
 ARG SOC_VERSION="ascend310p1"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION
 
 RUN apt-get update -y && \
diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
index 13e38df5..6d03629a 100644
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -18,10 +18,8 @@
 FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
-ARG COMPILE_CUSTOM_KERNELS=1
 ARG SOC_VERSION="ascend310p1"
 
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION
 
 RUN yum update -y && \
diff --git a/Dockerfile.a3 b/Dockerfile.a3
index d10d3374..59975d99 100644
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -18,14 +18,12 @@
 FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
-ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG=v0.3.7.post2
 ARG SOC_VERSION="ascend910_9391"
 
 COPY . /vllm-workspace/vllm-ascend/
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION
 
 RUN pip config set global.index-url ${PIP_INDEX_URL}
diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
index a270fbf7..80a965bd 100644
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -18,11 +18,9 @@
 FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
-ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG="v0.3.7.post2"
 ARG SOC_VERSION="ascend910_9391"
 
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION
 
 RUN pip config set global.index-url ${PIP_INDEX_URL}
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 746667ff..7d509c0a 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -18,11 +18,9 @@
 FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
-ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG="v0.3.7.post2"
 ARG SOC_VERSION="ascend910b1"
 
-ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION
 
 RUN pip config set global.index-url ${PIP_INDEX_URL}
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 375e8fac..aac8ee0e 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -162,12 +162,10 @@ pip install -v -e .
 cd ..
 ```
 
-vllm-ascend will build custom operators by default. If you don't want to build it, set `COMPILE_CUSTOM_KERNELS=0` environment to disable it.
 If you are building custom operators for Atlas A3, you should run `git submodule update --init --recursive` manually, or ensure your environment has Internet access.
 :::
 
 ```{note}
-If you are building from v0.7.3-dev and intend to use sleep mode feature, you should set `COMPILE_CUSTOM_KERNELS=1` manually.
 To build custom operators, gcc/g++ higher than 8 and c++ 17 or higher is required. If you're using `pip install -e .` and encounter a torch-npu version conflict, please install with `pip install --no-build-isolation -e .` to build on system env.
 If you encounter other problems during compiling, it is probably because unexpected compiler is being used, you may export `CXX_COMPILER` and `C_COMPILER` in environment to specify your g++ and gcc locations before compiling.
 ```
diff --git a/docs/source/user_guide/feature_guide/lora.md b/docs/source/user_guide/feature_guide/lora.md
index 4678c024..a2322180 100644
--- a/docs/source/user_guide/feature_guide/lora.md
+++ b/docs/source/user_guide/feature_guide/lora.md
@@ -19,5 +19,3 @@ vllm serve meta-llama/Llama-2-7b \
 ## Custom LoRA Operators
 
 We have implemented LoRA-related AscendC operators, such as bgmv_shrink, bgmv_expand, sgmv_shrink and sgmv_expand. You can find them under the "csrc/kernels" directory of [vllm-ascend repo](https://github.com/vllm-project/vllm-ascend.git).
-
-When you install vllm and vllm-ascend, those operators mentioned above will be compiled and installed automatically. If you do not want to use AscendC operators when you run vllm-ascend, you should set `COMPILE_CUSTOM_KERNELS=0` and reinstall vllm-ascend. For more instructions about installation and compilation, you can refer to [installation guide](../../installation.md).
diff --git a/docs/source/user_guide/feature_guide/sleep_mode.md b/docs/source/user_guide/feature_guide/sleep_mode.md
index 6fc36521..845e5a9b 100644
--- a/docs/source/user_guide/feature_guide/sleep_mode.md
+++ b/docs/source/user_guide/feature_guide/sleep_mode.md
@@ -23,7 +23,7 @@ The engine (v0/v1) supports two sleep levels to manage memory during idle period
     - Memory: The content of both the model weights and KV cache is forgotten.
     - Use Case: Ideal when switching to a different model or updating the current one.
 
-Since this feature uses the low-level API [AscendCL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html), in order to use sleep mode, you should follow the [installation guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) and build from source. If you are using v0.7.3, remember to set `export COMPILE_CUSTOM_KERNELS=1`. For the latest version (v0.9.x+), the environment variable `COMPILE_CUSTOM_KERNELS` will be set to 1 by default while building from source.
+Because this feature uses the low-level [AscendCL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html) API, you must follow the [installation guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) and build from source in order to use sleep mode. If you are using a version earlier than v0.12.0rc1, remember to set `export COMPILE_CUSTOM_KERNELS=1` before building.
 
 ## Usage
 
diff --git a/setup.py b/setup.py
index 890b5228..2f07b7c1 100644
--- a/setup.py
+++ b/setup.py
@@ -165,16 +165,10 @@ def gen_build_info():
     assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend."
     device_type = soc_to_device[soc_version]
 
-    if device_type == "_310P" and not envs.COMPILE_CUSTOM_KERNELS:
-        raise ValueError(
-            "device type 310P only supports custom kernels. Please set COMPILE_CUSTOM_KERNELS=1 to enable custom kernels."
-        )
-
     package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py")
     with open(package_dir, "w+") as f:
         f.write('# Auto-generated file\n')
         f.write(f"__device_type__ = '{device_type}'\n")
-        f.write(f"__sleep_mode_enabled__ = {envs.COMPILE_CUSTOM_KERNELS}\n")
     logging.info(f"Generated _build_info.py with SOC version: {soc_version}")
 
 
@@ -357,8 +351,6 @@ class cmake_build_ext(build_ext):
         )
 
     def build_extensions(self) -> None:
-        if not envs.COMPILE_CUSTOM_KERNELS:
-            return
         # Ensure that CMake is present and working
         try:
             subprocess.check_output(["cmake", "--version"])
@@ -450,9 +442,7 @@ except LookupError:
     # only checks out the commit. In this case, we set a dummy version.
     VERSION = "0.0.0"
 
-ext_modules = []
-if envs.COMPILE_CUSTOM_KERNELS:
-    ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]
+ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]
 
 
 def get_path(*filepath) -> str:
diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py
index 8ff1419e..31c01c23 100644
--- a/tests/ut/test_utils.py
+++ b/tests/ut/test_utils.py
@@ -43,16 +43,6 @@ class TestUtils(TestBase):
                         0):
             self.assertFalse(utils.is_enable_nz())
 
-    def test_sleep_mode_enabled(self):
-        utils._SLEEP_MODE_ENABLED = None
-        with mock.patch("vllm_ascend._build_info.__sleep_mode_enabled__",
-                        True):
-            self.assertTrue(utils.sleep_mode_enabled())
-        utils._SLEEP_MODE_ENABLED = None
-        with mock.patch("vllm_ascend._build_info.__sleep_mode_enabled__",
-                        False):
-            self.assertFalse(utils.sleep_mode_enabled())
-
     def test_nd_to_nz_2d(self):
         # can be divided by 16
         input_tensor = torch.randn(32, 64)
diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py
index 1b98b30d..fc4cc595 100644
--- a/tests/ut/worker/test_worker_v1.py
+++ b/tests/ut/worker/test_worker_v1.py
@@ -1,4 +1,3 @@
-import os
 import unittest
 from unittest.mock import MagicMock, patch
 
@@ -216,68 +215,13 @@ class TestNPUWorker(TestBase):
             self.assertEqual(worker.cache_config.num_gpu_blocks, 100)
             self.assertEqual(worker.cache_config.num_cpu_blocks, 50)
 
-    @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled")
-    @patch("vllm_ascend.worker.worker_v1.NPUPlatform")
-    @patch("vllm_ascend.worker.worker_v1.CaMemAllocator")
-    @patch("vllm_ascend.worker.worker_v1.logger")
-    def test_sleep_mode_enabled(self, mock_logger, mock_allocator_class,
-                                mock_platform, mock_sleep_mode_enabled):
-        """Test sleep method when sleep mode is enabled"""
-        from vllm_ascend.worker.worker_v1 import NPUWorker
-
-        # Setup mock
-        mock_sleep_mode_enabled.return_value = True
-        mock_platform.mem_get_info.side_effect = [
-            (1000, 2000),
-            (1200, 2000),
-        ]  # before, after
-        mock_allocator = MagicMock()
-        mock_allocator_class.get_instance.return_value = mock_allocator
-
-        # Create worker mock
-        with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
-            worker = NPUWorker()
-
-            # Test sleep method
-            worker.sleep(level=1)
-
-            # Verify calls
-            mock_sleep_mode_enabled.assert_called_once()
-            mock_allocator.sleep.assert_called_once_with(
-                offload_tags=("weights", ))
-            self.assertEqual(mock_platform.mem_get_info.call_count,
-                             2)  # Called 2 times in sleep method
-            # Verify log output
-            mock_logger.info.assert_called_once()
-
-    @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled")
-    def test_sleep_mode_disabled_raises_error(self, mock_sleep_mode_enabled):
-        """Test sleep method raises exception when sleep mode is disabled"""
-        from vllm_ascend.worker.worker_v1 import NPUWorker
-
-        # Set sleep mode disabled
-        mock_sleep_mode_enabled.return_value = False
-
-        # Create worker mock
-        with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
-            worker = NPUWorker()
-
-            # Test sleep method should raise exception
-            with self.assertRaises(ValueError) as cm:
-                worker.sleep()
-
-            self.assertIn("Sleep mode is not enabled", str(cm.exception))
-
-    @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled")
     @patch("vllm_ascend.worker.worker_v1.CaMemAllocator")
     @patch.dict("os.environ", {"VLLM_ASCEND_ENABLE_NZ": "0"})
-    def test_wake_up_mode_enabled(self, mock_allocator_class,
-                                  mock_sleep_mode_enabled):
+    def test_wake_up_mode_enabled(self, mock_allocator_class):
         """Test wake_up method when sleep mode is enabled"""
         from vllm_ascend.worker.worker_v1 import NPUWorker
 
         # Setup mock
-        mock_sleep_mode_enabled.return_value = True
         mock_allocator = MagicMock()
         mock_allocator_class.get_instance.return_value = mock_allocator
 
@@ -301,29 +245,8 @@ class TestNPUWorker(TestBase):
             # Test wake_up method
             worker.wake_up(tags=["test_tag"])
 
-            # Verify calls
-            mock_sleep_mode_enabled.assert_called_once()
             mock_allocator.wake_up.assert_called_once_with(tags=["test_tag"])
 
-    @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled")
-    @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-    def test_wake_up_mode_disabled_raises_error(self, mock_sleep_mode_enabled):
-        """Test wake_up method raises exception when sleep mode is disabled"""
-        from vllm_ascend.worker.worker_v1 import NPUWorker
-
-        # Set sleep mode disabled
-        mock_sleep_mode_enabled.return_value = False
-
-        # Create worker mock
-        with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
-            worker = NPUWorker()
-
-            # Test wake_up method should raise exception
-            with self.assertRaises(ValueError) as cm:
-                worker.wake_up()
-
-            self.assertIn("Sleep mode is not enabled", str(cm.exception))
-
     @patch(
         "vllm_ascend.worker.worker_v1.NPUWorker._init_worker_distributed_environment"
     )
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 60ccdf0d..d9376acc 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -36,12 +36,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     # Release, Debug, RelWithDebugInfo. If not set, the default value is Release.
     "CMAKE_BUILD_TYPE":
     lambda: os.getenv("CMAKE_BUILD_TYPE"),
-    # Whether to compile custom kernels. If not set, the default value is True.
-    # If set to False, the custom kernels will not be compiled. Please note that
-    # the sleep mode feature will be disabled as well if custom kernels are not
-    # compiled.
-    "COMPILE_CUSTOM_KERNELS":
-    lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
     # The CXX compiler used for compiling the package. If not set, the default
     # value is None, which means the system default CXX compiler will be used.
     "CXX_COMPILER":
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 7e207c7d..d55a2af7 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -49,7 +49,6 @@ ACL_FORMAT_FRACTAL_ND = 2
 ACL_FORMAT_FRACTAL_NZ = 29
 
 _CUSTOM_OP_ENABLED = None
-_SLEEP_MODE_ENABLED = None
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
 _SHARED_EXPERTS_CALCULATION_STREAM = None
@@ -125,14 +124,6 @@ def is_enable_nz():
     return envs_ascend.VLLM_ASCEND_ENABLE_NZ
 
 
-def sleep_mode_enabled():
-    global _SLEEP_MODE_ENABLED
-    if _SLEEP_MODE_ENABLED is None:
-        from vllm_ascend import _build_info  # type: ignore
-        _SLEEP_MODE_ENABLED = _build_info.__sleep_mode_enabled__
-    return _SLEEP_MODE_ENABLED
-
-
 def _round_up(x: int, align: int):
     # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc.
     # input: 15, 16 -> output: 16
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 265e5211..6e28a503 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -54,7 +54,7 @@ from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (check_ascend_device_type, enable_sp,
                                is_enable_nz, register_ascend_customop,
-                               sleep_mode_enabled, try_register_lib)
+                               try_register_lib)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 torch._dynamo.trace_rules.clear_lru_cache()  # noqa: E402
@@ -129,7 +129,7 @@ class NPUWorker(WorkerBase):
             init_cached_hf_modules()
 
         self.profiler = self._init_profiler()
-        if sleep_mode_enabled():
+        if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode:
             # Buffers saved before sleep
             self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
 
@@ -140,10 +140,6 @@ class NPUWorker(WorkerBase):
             WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
 
     def sleep(self, level: int = 1) -> None:
-        if not sleep_mode_enabled():
-            raise ValueError(
-                "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
-            )
         free_bytes_before_sleep = NPUPlatform.mem_get_info()[0]
         # Save the buffers before level 2 sleep
         if level == 2:
@@ -164,11 +160,6 @@ class NPUWorker(WorkerBase):
             used_bytes / GiB_bytes)
 
     def wake_up(self, tags: Optional[list[str]] = None) -> None:
-        if not sleep_mode_enabled():
-            raise ValueError(
-                "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
-            )
-
         if is_enable_nz():
             raise ValueError(
                 "FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "