Add sleep mode feature for Ascend NPU (#513)

### What this PR does / why we need it? This PR adds sleep mode feature for vllm-ascend, when sleeps, we do mainly two things: - offload model weights - discard kv cache RLHF tools(such as https://github.com/volcengine/verl and https://github.com/OpenRLHF/OpenRLHF) have a strong need of sleep mode to accelerate the training process. This PR may solve #375 and #320 . ### Does this PR introduce _any_ user-facing change? No existing user interfaces changed. Users will have two new methods(`sleep()` and `wake_up()`) to use. ### How was this patch tested? This PR is tested with Qwen/Qwen2.5-0.5B-Instruct. At first, we have free NPU memory M1. After `llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)` executed, we have free NPU memory M2. M2 < M1. Then we call `llm.sleep(level=1)`, we have free NPU memory M3. We have M3 > M2, M3 is very close to M1. Plus, we have the same output tokens before sleep and after wake up, with the config of `SamplingParams(temperature=0, max_tokens=10)` and with the same input tokens of course. This PR is utilizing the CMake procedure of #371 , thanks a lot. Signed-off-by: Shuqiao Li <celestialli@outlook.com>
2025-04-18 13:11:39 +08:00
parent 42c7fbb10e
commit 84563fc65d
13 changed files with 1020 additions and 9 deletions
--- a/vllm_ascend/init.py
+++ b/vllm_ascend/init.py
@@ -18,9 +18,6 @@

 def register():
    """Register the NPU platform."""
-    # Adapt the global patch here.
-    from vllm_ascend.utils import adapt_patch
-    adapt_patch(is_global_patch=True)

    return "vllm_ascend.platform.NPUPlatform"

--- a/vllm_ascend/device_allocator/init.py
+++ b/vllm_ascend/device_allocator/init.py
--- a/vllm_ascend/device_allocator/camem.py
+++ b/vllm_ascend/device_allocator/camem.py
@@ -0,0 +1,283 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# CANN-mem-based pytorch pluggable allocator to implement sleep mode.
+#
+import dataclasses
+import os
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+from acl.rt import memcpy  # type: ignore # noqa: F401
+from vllm.logger import logger
+
+try:
+    import torch_npu  # noqa: F401
+except ImportError:
+    print("Failed to import torch_npu.")
+
+from vllm.utils import is_pin_memory_available
+
+
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of the
+    a loaded library.
+    """ # noqa
+    found_line = None
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found_line = line
+                break
+    if found_line is None:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = found_line.index("/")
+    path = found_line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(lib_name), \
+        f"Unexpected filename: {filename} for library {lib_name}"
+    return path
+
+
+camem_available = False
+try:
+    from vllm_ascend.vllm_ascend_C import (  # type: ignore # noqa: F401
+        init_module, python_create_and_map, python_unmap_and_release)
+    lib_name = find_loaded_library("vllm_ascend_C")
+    camem_available = True
+except ModuleNotFoundError as e:
+    logger.error("Failed to import vllm_ascend_C:%s", e)
+    init_module = None
+    python_create_and_map = None
+    python_unmap_and_release = None
+    lib_name = None
+    libcudart = None
+
+# py_device, py_alignedSize, py_d_mem, py_p_memHandle
+HandleType = Tuple[int, int, int, int]
+
+
+@dataclasses.dataclass
+class AllocationData:
+    handle: HandleType
+    tag: str
+    cpu_backup_tensor: Optional[torch.Tensor] = None
+
+
+def create_and_map(allocation_handle: HandleType) -> None:
+    python_create_and_map(*allocation_handle)
+
+
+def unmap_and_release(allocation_handle: HandleType) -> None:
+    python_unmap_and_release(*allocation_handle)
+
+
+def get_pluggable_allocator(
+    python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
+    python_free_func: Callable[[int], tuple[int, int, int, int]]
+) -> torch_npu.npu.memory.NPUPluggableAllocator:
+    init_module(python_malloc_fn, python_free_func)
+    new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(
+        lib_name, 'my_malloc', 'my_free')
+    return new_alloc
+
+
+@contextmanager
+def use_memory_pool_with_allocator(
+        python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
+        python_free_func: Callable[[int], tuple[int, int, int, int]]):
+    new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
+    mem_pool = torch_npu.npu.memory.MemPool(new_alloc._allocator)
+    with torch_npu.npu.memory.use_mem_pool(mem_pool):
+        yield mem_pool, new_alloc
+
+
+class CaMemAllocator:
+    """
+    A singleton class that manages a memory pool for CANN tensors.
+    The memory in this pool can be offloaded or discarded when the
+    allocator sleeps.
+    Inside the `use_memory_pool(tag)` context, all tensors created will
+    be allocated in the memory pool, and has the same tag as the
+    tag passed to the context.
+    When we call `sleep`, all tensors with the specified tag will be
+    offloaded to CPU memory, and the rest of the tensors will be discarded.
+    When we call `wake_up`, all tensors that are previously offloaded
+    will be loaded back to GPU memory, and the rest of the tensors will
+    have empty memory.
+    Why it needs to be a singleton?
+    When allocated tensors are garbage collected, PyTorch will call
+    the free callback, which will call the `python_free_callback` method.
+    The C-extension uses a global variable to store the function of an
+    instance of this class. If we create multiple instances of this class,
+    the global variable will be overwritten and the free callback will
+    not work as expected.
+    """
+    instance = None
+    default_tag: str = "default"
+
+    @staticmethod
+    def get_instance() -> "CaMemAllocator":
+        """
+        CaMemAllocator is a singleton class.
+        We cannot call the constructor directly.
+        Call this method to get the instance.
+        """
+        assert camem_available, "camem allocator is not available"
+        if CaMemAllocator.instance is None:
+            CaMemAllocator.instance = CaMemAllocator()
+        return CaMemAllocator.instance
+
+    def __init__(self):
+        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        assert "expandable_segments:True" not in conf, \
+            ("Expandable segments are not compatible with memory pool. "
+            "Please track https://github.com/pytorch/pytorch/issues/147851 "
+            "for the latest updates.")
+
+        self.pointer_to_data: Dict[int, AllocationData] = {}
+        self.current_tag: str = CaMemAllocator.default_tag
+        self.allocator_and_pools: Dict[str, Any] = {}
+
+    def python_malloc_callback(self, allocation_handle: HandleType) -> None:
+        """
+        Internal method to store the allocation data
+        when memory is allocated in the memory pool."""
+        py_d_mem = allocation_handle[2]
+        self.pointer_to_data[py_d_mem] = AllocationData(
+            allocation_handle, self.current_tag)
+        return
+
+    def python_free_callback(self, ptr: int) -> HandleType:
+        """
+        Internal method to look up the allocation data
+        when memory is freed in the memory pool."""
+        data = self.pointer_to_data.pop(ptr)
+        if data.cpu_backup_tensor is not None:
+            data.cpu_backup_tensor = None
+        return data.handle
+
+    def sleep(
+            self,
+            offload_tags: Optional[Union[Tuple[str, ...],
+                                         str]] = None) -> None:
+        """
+        Put the allocator in sleep mode.
+        All data in the memory allocation with the specified tag will be 
+        offloaded to CPU memory, and others will be discarded.
+        :param offload_tags: The tags of the memory allocation that will be
+            offloaded. The rest of the memory allocation will be discarded.
+        """
+        if offload_tags is None:
+            # by default, allocated tensors are offloaded
+            # when the allocator sleeps
+            offload_tags = (CaMemAllocator.default_tag, )
+        elif isinstance(offload_tags, str):
+            offload_tags = (offload_tags, )
+
+        assert isinstance(offload_tags, tuple)
+
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            if data.tag in offload_tags:
+                size_in_bytes = handle[1]
+                cpu_backup_tensor = torch.empty(
+                    size_in_bytes,
+                    dtype=torch.uint8,
+                    device='cpu',
+                    pin_memory=is_pin_memory_available())
+                cpu_ptr = cpu_backup_tensor.data_ptr()
+                ACL_MEMCPY_DEVICE_TO_HOST = 2
+                dest_max = cpu_ptr + size_in_bytes * 2
+                memcpy(cpu_ptr, dest_max, ptr, size_in_bytes,
+                       ACL_MEMCPY_DEVICE_TO_HOST)
+                data.cpu_backup_tensor = cpu_backup_tensor
+            unmap_and_release(handle)
+
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        """
+        Wake up the allocator from sleep mode.
+        All data that is previously offloaded will be loaded back to GPU 
+        memory, and the rest of the data will have empty memory."""
+        for ptr, data in self.pointer_to_data.items():
+            if tags is None or data.tag in tags:
+                handle = data.handle
+                create_and_map(handle)
+                if data.cpu_backup_tensor is not None:
+                    cpu_backup_tensor = data.cpu_backup_tensor
+                    if cpu_backup_tensor is not None:
+                        size_in_bytes = cpu_backup_tensor.numel(
+                        ) * cpu_backup_tensor.element_size()
+                        cpu_ptr = cpu_backup_tensor.data_ptr()
+                        ACL_MEMCPY_HOST_TO_DEVICE = 1
+                        dest_max = ptr + size_in_bytes * 2
+                        memcpy(ptr, dest_max, cpu_ptr, size_in_bytes,
+                               ACL_MEMCPY_HOST_TO_DEVICE)
+                        data.cpu_backup_tensor = None
+
+    @contextmanager
+    def use_memory_pool(self, tag: Optional[str] = None):
+        """
+        A context manager to use the memory pool.
+        All memory allocation created inside the context will be allocated 
+        in the memory pool, and has the specified tag.
+        :param tag: The tag of the memory allocation. If None, the default tag
+            will be used.
+        """
+        if tag is None:
+            tag = CaMemAllocator.default_tag
+
+        assert isinstance(tag, str)
+
+        old_tag = self.current_tag
+        self.current_tag = tag
+        with use_memory_pool_with_allocator(self.python_malloc_callback,
+                                            self.python_free_callback) as data:
+            # start to hit another PyTorch bug in PyTorch 2.6,
+            # possibly because of gc-related issue w.r.t. the allocator and
+            # the memory pool.
+            # to avoid the issue, we keep a reference of the data.
+            # see https://github.com/pytorch/pytorch/issues/146431 .
+            self.allocator_and_pools[tag] = data
+            yield
+            # PyTorch's bug, calling torch.cuda.empty_cache() will error
+            # when using pluggable allocator, see
+            # https://github.com/pytorch/pytorch/issues/145168 .
+            # if we have some memory allocated and then freed,
+            # the memory will not be released.
+            # right now it is fine, because we only use this allocator
+            # during weight loading and kv cache creation, where we only
+            # allocate memory.
+            # TODO: we need to find a way to release the memory,
+            # i.e. calling torch.cuda.empty_cache()
+            self.current_tag = old_tag
+
+    def get_current_usage(self) -> int:
+        """
+        Get the total number of bytes allocated in the memory pool.
+        """
+        sum_bytes: int = 0
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            sum_bytes += handle[1]
+        return sum_bytes
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -56,6 +56,10 @@ env_variables: Dict[str, Callable[[], Any]] = {
    lambda: os.getenv("LLMDATADIST_COMM_PORT", "26000"),
    "LLMDATADIST_SYNC_CACHE_WAIT_TIME":
    lambda: os.getenv("LLMDATADIST_SYNC_CACHE_WAIT_TIME", "5000"),
+    "CXX_COMPILER":
+    lambda: os.getenv("CXX_COMPILER", None),
+    "C_COMPILER":
+    lambda: os.getenv("C_COMPILER", None)
 }

 # end-env-vars-definition
--- a/vllm_ascend/patch/platform/patch_0_8_4/init.py
+++ b/vllm_ascend/patch/platform/patch_0_8_4/init.py
@@ -13,4 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
+#
+
+import vllm_ascend.patch.platform.patch_0_8_4.patch_config  # noqa
--- a/vllm_ascend/patch/platform/patch_0_8_4/patch_config.py
+++ b/vllm_ascend/patch/platform/patch_0_8_4/patch_config.py
@@ -0,0 +1,243 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import json
+import warnings
+from importlib.util import find_spec
+from typing import Any, Final, Literal, Mapping, Optional, Union
+
+import torch
+import vllm.envs as envs
+from vllm.config import (HfOverrides, ModelConfig, ModelImpl, PoolerConfig,
+                         TaskOption, _get_and_verify_dtype,
+                         _get_and_verify_max_len, get_min_sliding_window,
+                         get_served_model_name, logger)
+from vllm.transformers_utils.config import (ConfigFormat, get_config,
+                                            get_hf_image_processor_config,
+                                            get_hf_text_config)
+from vllm.transformers_utils.utils import maybe_model_redirect
+
+
+def new_init(
+    self,
+    model: str,
+    task: Union[TaskOption, Literal["draft"]],
+    tokenizer: str,
+    tokenizer_mode: str,
+    trust_remote_code: bool,
+    dtype: Union[str, torch.dtype],
+    seed: int,
+    hf_config_path: Optional[str] = None,
+    allowed_local_media_path: str = "",
+    revision: Optional[str] = None,
+    code_revision: Optional[str] = None,
+    rope_scaling: Optional[dict[str, Any]] = None,
+    rope_theta: Optional[float] = None,
+    tokenizer_revision: Optional[str] = None,
+    max_model_len: Optional[int] = None,
+    spec_target_max_model_len: Optional[int] = None,
+    quantization: Optional[str] = None,
+    enforce_eager: Optional[bool] = None,
+    max_seq_len_to_capture: Optional[int] = None,
+    max_logprobs: int = 20,
+    disable_sliding_window: bool = False,
+    disable_cascade_attn: bool = False,
+    skip_tokenizer_init: bool = False,
+    served_model_name: Optional[Union[str, list[str]]] = None,
+    limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
+    use_async_output_proc: bool = True,
+    config_format: ConfigFormat = ConfigFormat.AUTO,
+    hf_token: Optional[Union[bool, str]] = None,
+    hf_overrides: Optional[HfOverrides] = None,
+    mm_processor_kwargs: Optional[dict[str, Any]] = None,
+    disable_mm_preprocessor_cache: bool = False,
+    override_neuron_config: Optional[dict[str, Any]] = None,
+    override_pooler_config: Optional["PoolerConfig"] = None,
+    logits_processor_pattern: Optional[str] = None,
+    generation_config: str = "auto",
+    enable_sleep_mode: bool = False,
+    override_generation_config: Optional[dict[str, Any]] = None,
+    model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
+) -> None:
+    self.model = maybe_model_redirect(model)
+    self.tokenizer = maybe_model_redirect(tokenizer)
+
+    self.hf_config_path = hf_config_path
+    if isinstance(hf_config_path, str):
+        self.hf_config_path = maybe_model_redirect(hf_config_path)
+
+    self.tokenizer_mode = tokenizer_mode
+    self.trust_remote_code = trust_remote_code
+    self.allowed_local_media_path = allowed_local_media_path
+    self.seed = seed
+    self.revision = revision
+    self.code_revision = code_revision
+    self.rope_scaling = rope_scaling
+    self.rope_theta = rope_theta
+    self.model_impl = model_impl
+
+    if hf_overrides is None:
+        hf_overrides = {}
+
+    if callable(hf_overrides):
+        hf_overrides_kw: dict[str, Any] = {}
+        hf_overrides_fn = hf_overrides
+    else:
+        hf_overrides_kw = hf_overrides
+        hf_overrides_fn = None
+
+    if rope_scaling is not None:
+        hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
+        hf_overrides_kw.update(hf_override)
+        hf_overrides_str = json.dumps(hf_overrides)
+        msg = ("`--rope-scaling` will be removed in a future release. "
+               f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
+        warnings.warn(DeprecationWarning(msg), stacklevel=2)
+    if rope_theta is not None:
+        hf_override = {"rope_theta": rope_theta}
+        hf_overrides_kw.update(hf_override)
+        hf_overrides_str = json.dumps(hf_overrides)
+        msg = ("`--rope-theta` will be removed in a future release. "
+               f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
+        warnings.warn(DeprecationWarning(msg), stacklevel=2)
+
+    self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
+
+    if (backend := envs.VLLM_ATTENTION_BACKEND
+        ) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
+        raise ValueError(
+            "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
+            "module was not found. See "
+            "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile "  # noqa: E501
+            "for instructions on how to install it.")
+
+    # The tokenizer version is consistent with the model version by default.
+    if tokenizer_revision is None:
+        self.tokenizer_revision = revision
+    else:
+        self.tokenizer_revision = tokenizer_revision
+    self.quantization = quantization
+    self.enforce_eager = enforce_eager
+    self.max_seq_len_to_capture = max_seq_len_to_capture
+    self.max_logprobs = max_logprobs
+    self.disable_sliding_window = disable_sliding_window
+    self.disable_cascade_attn = disable_cascade_attn
+    self.skip_tokenizer_init = skip_tokenizer_init
+    self.enable_sleep_mode = enable_sleep_mode
+
+    from vllm.platforms import current_platform
+
+    hf_config = get_config(self.hf_config_path or self.model,
+                           trust_remote_code, revision, code_revision,
+                           config_format)
+
+    if hf_overrides_kw:
+        logger.info("Overriding HF config with %s", hf_overrides_kw)
+        hf_config.update(hf_overrides_kw)
+    if hf_overrides_fn:
+        logger.info("Overriding HF config with %s", hf_overrides_fn)
+        hf_config = hf_overrides_fn(hf_config)
+
+    self.hf_config = hf_config
+
+    self.hf_text_config = get_hf_text_config(self.hf_config)
+    self.attention_chunk_size = getattr(self.hf_text_config,
+                                        "attention_chunk_size", None)
+    self.encoder_config = self._get_encoder_config()
+    self.hf_image_processor_config = get_hf_image_processor_config(
+        self.model, hf_token=hf_token, revision=revision)
+    self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+    self.use_async_output_proc = use_async_output_proc
+    self.mm_processor_kwargs = mm_processor_kwargs
+    self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
+
+    # Set enforce_eager to False if the value is unset.
+    if self.enforce_eager is None:
+        self.enforce_eager = False
+
+    interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
+    sliding_window = getattr(self.hf_text_config, "sliding_window", None)
+    has_interleaved_attention = (sliding_window is not None) and (
+        isinstance(sliding_window, list) or
+        (self.hf_text_config.model_type in interleaved_attn_models))
+
+    if (not self.disable_sliding_window and has_interleaved_attention):
+        if (backend :=
+                envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"):
+            sliding_window_len_min = get_min_sliding_window(
+                self.hf_text_config.sliding_window)
+
+            logger.warning_once(
+                f"{self.hf_text_config.model_type} has interleaved "
+                "attention, which is currently not supported by the "
+                f"{backend} backend. Disabling sliding window and capping "
+                "the max length to the sliding window size "
+                f"({sliding_window_len_min}).")
+            self.disable_sliding_window = True
+        else:
+            # for a model with interleaved attention,
+            # the scheduler and the model treat it as full attention
+            # (i.e., not dropping any tokens outside the window).
+            # only the attention layer itself is aware of the sliding
+            # window, and use the window size to compute the attention.
+            self.hf_text_config.interleaved_sliding_window = sliding_window
+            delattr(self.hf_text_config, "sliding_window")
+            sliding_window = None
+
+    self.max_model_len = _get_and_verify_max_len(
+        hf_config=self.hf_text_config,
+        max_model_len=max_model_len,
+        disable_sliding_window=self.disable_sliding_window,
+        sliding_window_len=self.get_hf_config_sliding_window(),
+        spec_target_max_model_len=spec_target_max_model_len,
+        encoder_config=self.encoder_config)
+    self.served_model_name = get_served_model_name(model, served_model_name)
+    self.multimodal_config = self._init_multimodal_config(limit_mm_per_prompt)
+    if not self.skip_tokenizer_init:
+        self._verify_tokenizer_mode()
+
+    self.is_attention_free = self._init_attention_free()
+    self.is_hybrid = self._init_is_hybrid()
+    self.has_noops = self._init_has_noops()
+    self.has_inner_state = self._init_has_inner_state()
+
+    if current_platform.is_neuron():
+        self.override_neuron_config = override_neuron_config
+    else:
+        self.override_neuron_config = None
+
+    supported_tasks, task = self._resolve_task(task)
+    self.supported_tasks = supported_tasks
+    self.task: Final = task  # type: ignore
+    if self.task in ("draft", "generate"):
+        self.truncation_side = "left"
+    else:
+        self.truncation_side = "right"
+
+    self.pooler_config = self._init_pooler_config(override_pooler_config)
+    self.logits_processor_pattern = logits_processor_pattern
+
+    self.generation_config = generation_config
+    self.override_generation_config = override_generation_config or {}
+
+    self._verify_quantization()
+    self._verify_cuda_graph()
+    self._verify_bnb_config()
+
+
+# The platform assertion is deleted to support the npu platform.
+ModelConfig.__init__ = new_init
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -63,10 +63,17 @@ class NPUPlatform(Platform):

    supported_quantization: list[str] = ["ascend"]

+    def is_sleep_mode_available(self) -> bool:
+        return True
+
    @classmethod
    def pre_register_and_update(cls,
                                parser: Optional[FlexibleArgumentParser] = None
                                ) -> None:
+        # Adapt the global patch here.
+        from vllm_ascend.utils import adapt_patch
+        adapt_patch(is_global_patch=True)
+
        from vllm_ascend.quantization.quant_config import \
            AscendQuantConfig  # noqa: F401

--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -36,13 +36,14 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
                           SequenceGroupMetadata, SequenceGroupMetadataDelta)
-from vllm.utils import bind_kv_cache
+from vllm.utils import GiB_bytes, bind_kv_cache
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner_base import ModelRunnerBase
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
                                     WorkerInput)

+from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import try_register_lib, vllm_version_is
 from vllm_ascend.worker.model_runner import NPUModelRunner
@@ -159,6 +160,24 @@ class NPUWorker(LocalOrDistributedWorkerBase):
        else:
            self.profiler = None

+    def sleep(self, level: int = 1) -> None:
+        NPUPlatform.set_device(self.device)
+        free_bytes_before_sleep = NPUPlatform.mem_get_info()[0]
+        allocator = CaMemAllocator.get_instance()
+        allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
+        free_bytes_after_sleep, total = NPUPlatform.mem_get_info()
+        freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
+        used_bytes = total - free_bytes_after_sleep
+        assert freed_bytes >= 0, "Memory usage increased after sleeping."
+        logger.info(
+            "Sleep mode freed %.2f GiB memory, "
+            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
+            used_bytes / GiB_bytes)
+
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.wake_up(tags=tags)
+
    def init_device(self) -> None:
        if self.device_config.device.type == "npu":
            self.device = torch.device(f"npu:{self.local_rank}")
@@ -176,7 +195,17 @@ class NPUWorker(LocalOrDistributedWorkerBase):
        set_random_seed(self.model_config.seed)

    def load_model(self):
-        self.model_runner.load_model()
+        if self.vllm_config.model_config.enable_sleep_mode:
+            allocator = CaMemAllocator.get_instance()
+            assert allocator.get_current_usage() == 0, (
+                "Sleep mode can only be "
+                "used for one instance per process.")
+            context = allocator.use_memory_pool(tag="weights")
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()  # type: ignore
+        with context:
+            self.model_runner.load_model()

    def start_profile(self):
        if self.profiler is None:
@@ -263,8 +292,14 @@ class NPUWorker(LocalOrDistributedWorkerBase):

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks
-
-        self._init_cache_engine()
+        if self.vllm_config.model_config.enable_sleep_mode:
+            allocator = CaMemAllocator.get_instance()
+            context = allocator.use_memory_pool(tag="kv_cache")
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()  # type: ignore
+        with context:
+            self._init_cache_engine()
        self._warm_up_model()

    def _init_cache_engine(self):
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -89,6 +89,12 @@ class NPUWorker(WorkerBase):

        self.profiler = self._init_profiler()

+    def sleep(self, level: int = 1) -> None:
+        logger.error("Sleep mode is only supported on v0")
+
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        logger.error("Sleep mode is only supported on v0")
+
    def init_device(self):
        if self.device_config.device.type == "npu":
            self.device = torch.device(f"npu:{self.local_rank}")