diff --git a/.gitignore b/.gitignore
index e32c065..ef8fc87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -196,3 +196,5 @@ kernel_meta/
 
 # version file generated by setuptools-scm
 /vllm_ascend/_version.py
+# build info file generated by setup.py
+/vllm_ascend/_build_info.py
diff --git a/setup.py b/setup.py
index 327c2f8..0ceb2e4 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,7 @@ from typing import Dict, List
 
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
+from setuptools.command.build_py import build_py
 from setuptools.command.develop import develop
 from setuptools.command.install import install
 from setuptools_scm import get_version
@@ -78,6 +79,30 @@ class CMakeExtension(Extension):
         self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
 
 
+class custom_build_info(build_py):
+    """build_py step that writes vllm_ascend/_build_info.py before building."""
+
+    def run(self):
+        soc_version = envs.SOC_VERSION
+        if not soc_version:
+            raise ValueError(
+                "SOC version is not set. Please set SOC_VERSION environment variable."
+            )
+        if "310" in soc_version and not envs.COMPILE_CUSTOM_KERNELS:
+            raise ValueError(
+                "SOC version 310 only supports custom kernels. Please set COMPILE_CUSTOM_KERNELS=1 to enable custom kernels."
+            )
+        info_path = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py")
+        with open(info_path, "w", encoding="utf-8") as f:
+            f.write('# Auto-generated file\n')
+            # repr() keeps the module valid even if the value contains quotes.
+            f.write(f"__soc_version__ = {soc_version!r}\n")
+            f.write(f"__sleep_mode_enabled__ = {envs.COMPILE_CUSTOM_KERNELS}\n")
+        logging.info("Generated _build_info.py with SOC version: %s",
+                     soc_version)
+        super().run()
+
+
 class cmake_build_ext(build_ext):
     # A dict of extension directories that have been configured.
     did_config: Dict[str, bool] = {}
@@ -326,7 +351,11 @@ def get_requirements() -> List[str]:
     return requirements
 
 
-cmdclass = {"build_ext": cmake_build_ext, "install": custom_install}
+cmdclass = {
+    "build_py": custom_build_info,
+    "build_ext": cmake_build_ext,
+    "install": custom_install
+}
 
 setup(
     name="vllm_ascend",
diff --git a/tests/ut/base.py b/tests/ut/base.py
new file mode 100644
index 0000000..99c6d5b
--- /dev/null
+++ b/tests/ut/base.py
@@ -0,0 +1,12 @@
+import unittest
+
+from vllm_ascend.utils import adapt_patch
+
+
+class TestBase(unittest.TestCase):
+    # Shared base for unit tests: runs adapt_patch() before every test method.
+    def setUp(self):
+        # Apply with is_global_patch=True first, then with the default argument.
+        adapt_patch(True)
+        adapt_patch()
+        super().setUp()
diff --git a/tests/ut/patch/worker/patch_common/test_patch_distributed.py b/tests/ut/patch/worker/patch_common/test_patch_distributed.py
new file mode 100644
index 0000000..1ddc614
--- /dev/null
+++ b/tests/ut/patch/worker/patch_common/test_patch_distributed.py
@@ -0,0 +1,12 @@
+from tests.ut.base import TestBase
+
+
+class TestPatchDistributed(TestBase):
+    # Checks that vllm's GroupCoordinator name is bound to the Ascend patch class.
+    def test_GroupCoordinator_patched(self):
+        from vllm.distributed.parallel_state import GroupCoordinator
+
+        from vllm_ascend.patch.worker.patch_common.patch_distributed import \
+            GroupCoordinatorPatch
+        # Identity, not equality: both names must refer to the same class object.
+        self.assertIs(GroupCoordinator, GroupCoordinatorPatch)
diff --git a/tests/ut/worker/patch_common/test_patch_sampler.py b/tests/ut/patch/worker/patch_common/test_patch_sampler.py
similarity index 100%
rename from tests/ut/worker/patch_common/test_patch_sampler.py
rename to tests/ut/patch/worker/patch_common/test_patch_sampler.py
diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py
new file mode 100644
index 0000000..83e65b0
--- /dev/null
+++ b/tests/ut/test_utils.py
@@ -0,0 +1,251 @@
+import math
+import os
+import unittest
+from threading import Lock
+from unittest import mock
+
+import torch
+from vllm.config import (CompilationConfig, ModelConfig, ParallelConfig,
+                         VllmConfig)
+
+from vllm_ascend import utils
+
+
+class TestUtils(unittest.TestCase):
+
+    def test_is_310p(self):
+        # Restore the cached flag afterwards so the mocked value cannot leak.
+        self.addCleanup(setattr, utils, "_IS_310P", utils._IS_310P)
+        for soc_version, expected in (("Ascend310P3", True),
+                                      ("Ascend910P1", False)):
+            utils._IS_310P = None
+            with mock.patch("vllm_ascend._build_info.__soc_version__",
+                            soc_version):
+                self.assertEqual(expected, utils.is_310p())
+
+    def test_sleep_mode_enabled(self):
+        # Restore the cached flag afterwards so the mocked value cannot leak.
+        self.addCleanup(setattr, utils, "_SLEEP_MODE_ENABLED",
+                        utils._SLEEP_MODE_ENABLED)
+        for enabled in (True, False):
+            utils._SLEEP_MODE_ENABLED = None
+            with mock.patch("vllm_ascend._build_info.__sleep_mode_enabled__",
+                            enabled):
+                self.assertEqual(enabled, utils.sleep_mode_enabled())
+
+    def test_nd_to_nz_2d(self):
+        # both dimensions are already multiples of 16
+        input_tensor = torch.randn(32, 64)
+        output = utils.nd_to_nz_2d(input_tensor)
+        self.assertEqual(output.shape[0], 1)
+        self.assertEqual(output.shape[1], 64 // 16)
+        self.assertEqual(output.shape[2], 32)
+        self.assertEqual(output.shape[3], 16)
+
+        # neither dimension is a multiple of 16
+        input_tensor = torch.randn(30, 62)
+        output = utils.nd_to_nz_2d(input_tensor)
+        self.assertEqual(output.shape[0], 1)
+        self.assertEqual(output.shape[1], math.ceil(62 / 16))
+        self.assertEqual(output.shape[2], 32)
+        self.assertEqual(output.shape[3], 16)
+
+        # both dimensions are smaller than 16
+        input_tensor = torch.randn(8, 12)
+        output = utils.nd_to_nz_2d(input_tensor)
+        self.assertEqual(output.shape[0], 1)
+        self.assertEqual(output.shape[1], 1)  # 12->16, 16//16=1
+        self.assertEqual(output.shape[2], 16)  # 8->16
+        self.assertEqual(output.shape[3], 16)
+
+        # the result must be contiguous
+        input_tensor = torch.randn(32, 64)
+        output = utils.nd_to_nz_2d(input_tensor)
+        self.assertTrue(output.is_contiguous())
+
+        # input values are kept and every padded position is zero
+        input_tensor = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
+        output = utils.nd_to_nz_2d(input_tensor)
+        expected = torch.tensor(
+            [[[[1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]])
+        self.assertTrue(torch.allclose(output, expected))
+
+    def test_aligned_16(self):
+        # 15 rows: the first dimension grows to 16
+        input_tensor = torch.randn(15, 64)
+        output_tensor = utils.aligned_16(input_tensor)
+        self.assertEqual(output_tensor.shape[0], 16)
+
+        # 16 rows: already aligned, so shape and values are unchanged
+        input_tensor = torch.randn(16, 64)
+        output_tensor = utils.aligned_16(input_tensor)
+        self.assertEqual(output_tensor.shape[0], 16)
+        self.assertTrue(torch.equal(input_tensor, output_tensor))
+
+        # 17 rows: the first dimension grows to the next multiple, 32
+        input_tensor = torch.randn(17, 64)
+        output_tensor = utils.aligned_16(input_tensor)
+        self.assertEqual(output_tensor.shape[0], 32)
+
+    @mock.patch('importlib.util.find_spec')
+    @mock.patch('importlib.import_module')
+    def test_try_register_lib(self, mock_import_module, mock_find_spec):
+        # case 1: the spec is found and the import succeeds
+        mock_find_spec.return_value = mock.MagicMock()
+        mock_import_module.return_value = mock.MagicMock()
+        lib_name = "existing_lib"
+        lib_info = "Library found and imported successfully"
+        utils.try_register_lib(lib_name, lib_info)
+        mock_find_spec.assert_called_once_with(lib_name)
+        mock_import_module.assert_called_once_with(lib_name)
+
+        # case 2: no spec is found, so import_module is not called again
+        mock_find_spec.return_value = None
+        lib_name = "non_existing_lib"
+        utils.try_register_lib(lib_name)
+        self.assertEqual(2, mock_find_spec.call_count)
+        self.assertEqual(1, mock_import_module.call_count)
+
+        # case 3: the spec is found but import_module raises ImportError
+        mock_find_spec.return_value = mock.MagicMock()
+        mock_import_module.side_effect = ImportError("import error")
+        lib_name = "error_lib"
+        utils.try_register_lib(lib_name)
+        self.assertEqual(3, mock_find_spec.call_count)
+        self.assertEqual(2, mock_import_module.call_count)
+
+    def test_enable_custom_op(self):
+        # Restore the cached flag afterwards so the forced failure cannot leak.
+        self.addCleanup(setattr, utils, "_CUSTOM_OP_ENABLED",
+                        utils._CUSTOM_OP_ENABLED)
+        self.assertTrue(utils.enable_custom_op())
+        utils._CUSTOM_OP_ENABLED = None
+        with mock.patch('builtins.__import__',
+                        side_effect=ImportError("import error")):
+            self.assertFalse(utils.enable_custom_op())
+
+    def test_find_hccl_library(self):
+        with mock.patch.dict(os.environ,
+                             {"HCCL_SO_PATH": "/path/to/hccl/libhccl.so"}):
+            self.assertEqual(utils.find_hccl_library(),
+                             "/path/to/hccl/libhccl.so")
+        with mock.patch("torch.version.cann", None):
+            self.assertRaises(ValueError, utils.find_hccl_library)
+        with mock.patch("torch.version.cann", "Ascend910"):
+            self.assertEqual(utils.find_hccl_library(), "libhccl.so")
+
+    def test_current_stream(self):
+        with mock.patch("torch.npu.current_stream") as mock_current_stream:
+            self.assertEqual(utils.current_stream(), mock_current_stream())
+
+    def test_vllm_version_is(self):
+        with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}):
+            with mock.patch("vllm.__version__", "1.0.0"):
+                self.assertTrue(utils.vllm_version_is("1.0.0"))
+                self.assertFalse(utils.vllm_version_is("2.0.0"))
+            with mock.patch("vllm.__version__", "2.0.0"):
+                self.assertTrue(utils.vllm_version_is("1.0.0"))
+                self.assertFalse(utils.vllm_version_is("2.0.0"))
+        with mock.patch("vllm.__version__", "1.0.0"):
+            self.assertTrue(utils.vllm_version_is("1.0.0"))
+            self.assertFalse(utils.vllm_version_is("2.0.0"))
+        with mock.patch("vllm.__version__", "2.0.0"):
+            self.assertTrue(utils.vllm_version_is("2.0.0"))
+            self.assertFalse(utils.vllm_version_is("1.0.0"))
+
+    def test_update_aclgraph_sizes(self):
+        # max_num_batch_sizes < len(original_sizes)
+        test_compilation_config = CompilationConfig(
+            cudagraph_capture_sizes=[i for i in range(150)])
+        model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+        test_model_config = ModelConfig(model=model_path, enforce_eager=True)
+        test_parallel_config = ParallelConfig()
+        test_vllm_config = VllmConfig(
+            model_config=test_model_config,
+            compilation_config=test_compilation_config,
+            parallel_config=test_parallel_config,
+        )
+        utils.update_aclgraph_sizes(test_vllm_config)
+        self.assertEqual(
+            147,
+            len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
+        # max_num_batch_sizes >= len(original_sizes)
+        test_compilation_config = CompilationConfig(
+            cudagraph_capture_sizes=[1, 2, 3])
+        test_vllm_config = VllmConfig(
+            model_config=test_model_config,
+            compilation_config=test_compilation_config,
+            parallel_config=test_parallel_config,
+        )
+        utils.update_aclgraph_sizes(test_vllm_config)
+        self.assertEqual(
+            3,
+            len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
+
+
+class TestProfileExecuteDuration(unittest.TestCase):
+    # Tests ProfileExecuteDuration construction; setUp resets its class state.
+    def setUp(self):
+        utils.ProfileExecuteDuration._instance = None
+        utils.ProfileExecuteDuration._observations = []
+        utils.ProfileExecuteDuration._lock = Lock()
+
+    def test_singleton_creation(self):
+        instance1 = utils.ProfileExecuteDuration()
+        self.assertIsNotNone(instance1)
+        self.assertIs(instance1, utils.ProfileExecuteDuration._instance)
+        # a second construction must hand back the very same object
+        instance2 = utils.ProfileExecuteDuration()
+        self.assertIs(instance1, instance2)
+
+    def test_thread_safety(self):
+        from threading import Thread
+
+        instances = []
+
+        def create_instance():
+            instances.append(utils.ProfileExecuteDuration())
+        # construct the object from 10 separate threads
+        threads = [Thread(target=create_instance) for _ in range(10)]
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+        # every thread must have received the same object
+        first_instance = instances[0]
+        for instance in instances[1:]:
+            self.assertIs(first_instance, instance)
+
+    def test_atexit_registration(self):
+        with mock.patch('atexit.register') as mock_register:
+            instance = utils.ProfileExecuteDuration()
+            mock_register.assert_called_once_with(instance.destroy)
+
+    def test_lock_usage(self):
+        original_lock = utils.ProfileExecuteDuration._lock
+        # swap the lock for a wrapping mock so its enter/exit can be asserted
+        with mock.patch.object(utils.ProfileExecuteDuration,
+                               '_lock',
+                               wraps=original_lock) as mock_lock:
+            utils.ProfileExecuteDuration()
+            mock_lock.__enter__.assert_called()
+            mock_lock.__exit__.assert_called()
+
+    def test_observations_initialization(self):
+        instance = utils.ProfileExecuteDuration()
+        self.assertEqual(instance._observations, [])
diff --git a/vllm_ascend/device_allocator/camem.py b/vllm_ascend/device_allocator/camem.py
index 7156602..23dd77c 100644
--- a/vllm_ascend/device_allocator/camem.py
+++ b/vllm_ascend/device_allocator/camem.py
@@ -138,7 +138,6 @@ class CaMemAllocator:
         We cannot call the constructor directly.
         Call this method to get the instance.
         """
-        assert camem_available, "camem allocator is not available"
         if CaMemAllocator.instance is None:
             CaMemAllocator.instance = CaMemAllocator()
         return CaMemAllocator.instance
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index f8a4f5e..cad90a4 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -155,7 +155,6 @@ def fused_experts_with_mc2(
     kwargs_mc2.update(stage1_kwargs)
 
     output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
-    # comm_stream.wait_stream(torch.npu.current_stream())
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
 
diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py
index 1e9a7b0..b3db843 100644
--- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py
@@ -23,7 +23,7 @@ import vllm.distributed
 import vllm.envs as envs
 from vllm.config import ParallelConfig
 
-from vllm_ascend.utils import NullHandle, is_310p
+from vllm_ascend.utils import is_310p
 
 
 def ascend_destroy_model_parallel():
@@ -66,6 +66,15 @@ vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_pa
 ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
 
 
+class NullHandle:
+    # Stateless handle object whose wait() does nothing.
+    def __init__(self):
+        pass
+
+    def wait(self):
+        pass
+
+
 def communication_adaptation_310p():
 
     def broadcast310p(tensor, src, group=None, async_op=False):
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 4764526..a667b38 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -21,13 +21,11 @@ import atexit
 import math
 from contextlib import contextmanager, nullcontext
 from enum import Enum
-from functools import lru_cache
 from threading import Lock
 from typing import TYPE_CHECKING, List, Tuple
 
 import torch
 import torch_npu  # noqa: F401  # noqa: F401
-import torchair  # type: ignore[import]  # noqa: F401
 from packaging.version import InvalidVersion, Version
 from torch_npu.npu.streams import Event
 from vllm.logger import logger
@@ -55,75 +53,83 @@ else:
 MAX_CAPTURE_SIZE = 1920
 
 ASCEND_QUATIZATION_METHOD = "ascend"
-
-CUSTOM_OP_ENABLED = None
-
 SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"]
 
 ACL_FORMAT_FRACTAL_ND = 2
 ACL_FORMAT_FRACTAL_NZ = 29
 
-
-@lru_cache(maxsize=None)
-def _get_soc_version():
-    """Gets the SOC version and caches it."""
-    if not torch.npu.is_available():
-        return ""
-    device_count = torch.npu.device_count()
-    if device_count <= 0:
-        return ""
-    try:
-        return torch.npu.get_device_name(0)
-    except Exception:
-        return ""
-
-
-_SOC_VERSION = _get_soc_version()
+_CUSTOM_OP_ENABLED = None
+_IS_310P = None
+_SLEEP_MODE_ENABLED = None
+_CURRENT_STREAM = None
 
 
 def is_310p():
-    return _SOC_VERSION in SOC_VERSION_INFERENCE_SERIES
+    global _IS_310P
+    if _IS_310P is None:
+        from vllm_ascend import _build_info  # type: ignore
+        _IS_310P = _build_info.__soc_version__.lower().startswith("ascend310p")
+    return _IS_310P
 
 
-class NullHandle:
-
-    def __init__(self):
-        pass
-
-    def wait(self):
-        pass
+def sleep_mode_enabled():
+    global _SLEEP_MODE_ENABLED
+    if _SLEEP_MODE_ENABLED is None:
+        from vllm_ascend import _build_info  # type: ignore
+        _SLEEP_MODE_ENABLED = _build_info.__sleep_mode_enabled__
+    return _SLEEP_MODE_ENABLED
 
 
 def _round_up(x: int, align: int):
-    if align == 0:
-        return -1
+    # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc.
+    # input: 15, 16 -> output: 16
+    # input: 17, 16 -> output: 32
+    # input: 30, 16 -> output: 32
+    # input: 33, 16 -> output: 48
+    # ...
     return (x + align - 1) // align * align
 
 
 def _custom_pad(x, pad_dims):
+    # pad the input tensor to the shape of pad_dims
+    # input: (13, 30), pad_dims: [0, 2, 0, 3]
+    # output: (16, 32)
     return torch.nn.functional.pad(x, pad_dims)
 
 
 def _custom_reshape(x, target_shape):
+    # reshape the input tensor to the shape of target_shape
+    # input: (16, 32), target_shape: [1, 16, 2, 16]
+    # output: (1, 16, 2, 16)
     return x.reshape(target_shape)
 
 
 def _custom_transpose(x, dim1, dim2):
+    # transpose the input tensor
+    # input: (1, 16, 2, 16), dim1: 1, dim2: 2
+    # output: (1, 2, 16, 16)
     return x.transpose(dim1, dim2)
 
 
 def nd_to_nz_2d(in_tensor: torch.Tensor) -> torch.Tensor:
-    aux_dims = [0, 0, 0, 0]
-    aux_dims[0] = 1
+    # in_tensor: (13, 30)
+    aux_dims = [1, 0, 0, 16]
+    # aux_dims[1]: 16
     aux_dims[1] = _round_up(in_tensor.size(0), 16)
+    # aux_dims[2]: 2
+    aux_dims[2] = _round_up(in_tensor.size(1), 16) // 16
+
+    # after: aux_dims: [1, 16, 2, 16]
 
     pad_dims = [0, 0, 0, 0]
+    # pad_dims[1]: 2
+    pad_dims[1] = _round_up(in_tensor.size(1), 16) - in_tensor.size(1)
+    # pad_dims[3]: 3
     pad_dims[3] = _round_up(in_tensor.size(0), 16) - in_tensor.size(0)
 
-    aux_dims[2] = _round_up(in_tensor.size(1), 16) // 16
-    aux_dims[3] = 16
-    pad_dims[1] = _round_up(in_tensor.size(1), 16) - in_tensor.size(1)
+    # after: pad_dims: [0, 2, 0, 3]
 
+    # return: (1, 2, 16, 16)
     return _custom_transpose(
         _custom_reshape(_custom_pad(in_tensor, pad_dims), aux_dims), 1,
         2).contiguous()
@@ -187,24 +193,19 @@ def enable_custom_op():
     Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. 
     Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
     """
-    global CUSTOM_OP_ENABLED
-
-    if CUSTOM_OP_ENABLED is not None:
-        return CUSTOM_OP_ENABLED
-
-    else:
-        try:
-            # register custom ops into torch_library here
-            import vllm_ascend.vllm_ascend_C  # type: ignore  # noqa: F401
-            CUSTOM_OP_ENABLED = True
-
-        except ImportError:
-            CUSTOM_OP_ENABLED = False
-            logger.warning(
-                "Warning: Failed to register custom ops, all custom ops will be disabled"
-            )
-
-        return CUSTOM_OP_ENABLED
+    global _CUSTOM_OP_ENABLED
+    if _CUSTOM_OP_ENABLED is not None:
+        return _CUSTOM_OP_ENABLED
+    try:
+        # register custom ops into torch_library here
+        import vllm_ascend.vllm_ascend_C  # type: ignore  # noqa: F401
+        _CUSTOM_OP_ENABLED = True
+    except ImportError:
+        _CUSTOM_OP_ENABLED = False
+        logger.warning(
+            "Warning: Failed to register custom ops, all custom ops will be disabled"
+        )
+    return _CUSTOM_OP_ENABLED
 
 
 def find_hccl_library() -> str:
@@ -229,9 +230,6 @@ def find_hccl_library() -> str:
     return so_file
 
 
-_current_stream = None
-
-
 def current_stream() -> torch.npu.Stream:
     """
     replace `torch.npu.current_stream()` with `vllm.utils.current_stream()`.
@@ -241,12 +239,12 @@ def current_stream() -> torch.npu.Stream:
     directly, so that we can avoid calling `torch.npu.current_stream()`.
 
     """
-    global _current_stream
-    if _current_stream is None:
+    global _CURRENT_STREAM
+    if _CURRENT_STREAM is None:
         # when this function is called before any stream is set,
         # we return the default stream.
-        _current_stream = torch.npu.current_stream()
-    return _current_stream
+        _CURRENT_STREAM = torch.npu.current_stream()
+    return _CURRENT_STREAM
 
 
 def adapt_patch(is_global_patch: bool = False):
@@ -326,6 +324,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
             len(original_sizes))
 
 
+# TODO(wxy): Move to ops module
 def dispose_tensor(x: torch.Tensor):
     x.set_(torch.empty((0, ), device=x.device, dtype=x.dtype))
 
@@ -378,10 +377,12 @@ class ProfileExecuteDuration:
         return durations
 
 
+# TODO(wxy): Move to ops module
 def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True):
     return _npu_stream_switch(tag, priority) if enabled else nullcontext()
 
 
+# TODO(wxy): Move to ops module
 def npu_wait_tensor(self: torch.Tensor,
                     dependency: torch.Tensor,
                     *,
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 88d7c21..78e00ec 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -40,7 +40,7 @@ from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import try_register_lib
+from vllm_ascend.utils import sleep_mode_enabled, try_register_lib
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 
@@ -91,6 +91,10 @@ class NPUWorker(WorkerBase):
         self.profiler = self._init_profiler()
 
     def sleep(self, level: int = 1) -> None:
+        if not sleep_mode_enabled():
+            raise ValueError(
+                "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
+            )
         free_bytes_before_sleep = NPUPlatform.mem_get_info()[0]
         allocator = CaMemAllocator.get_instance()
         allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
@@ -104,6 +108,10 @@ class NPUWorker(WorkerBase):
             used_bytes / GiB_bytes)
 
     def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        if not sleep_mode_enabled():
+            raise ValueError(
+                "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
+            )
         allocator = CaMemAllocator.get_instance()
         allocator.wake_up(tags=tags)