[Doc] Add sleep mode doc (#1295)

### What this PR does / why we need it?

Add sleep mode documentation and an example.

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-25 14:07:14 +08:00
parent e4e0b7af05
commit 15df8be937
4 changed files with 173 additions and 0 deletions
--- a/examples/offline_inference_sleep_mode_npu.py
+++ b/examples/offline_inference_sleep_mode_npu.py
@@ -0,0 +1,54 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+import os
+
+import torch
+from vllm import LLM, SamplingParams
+from vllm.utils import GiB_bytes
+
+os.environ["VLLM_USE_V1"] = "1"  # NOTE(review): set after importing vllm
+os.environ["VLLM_USE_MODELSCOPE"] = "True"  # presumably ModelScope hub
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # start method (assumed)
+
+if __name__ == "__main__":  # sleep / wake-up demo on an NPU
+    prompt = "How are you?"
+
+    free, total = torch.npu.mem_get_info()  # taken before the model loads
+    print(f"Free memory before sleep: {free / 1024 ** 3:.2f} GiB")
+    # Baseline: NPU memory already in use, e.g. by another process.
+    used_bytes_baseline = total - free
+    llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)  # reference output
+
+    llm.sleep(level=1)  # presumably frees NPU memory; see vLLM sleep docs
+
+    free_npu_bytes_after_sleep, total = torch.npu.mem_get_info()
+    print(
+        f"Free memory after sleep: {free_npu_bytes_after_sleep / 1024 ** 3:.2f} GiB"
+    )
+    used_bytes = total - free_npu_bytes_after_sleep - used_bytes_baseline
+    # Memory used beyond the baseline should now be less than the model
+    # weights (0.5B model, about 1 GiB of weights).
+    assert used_bytes < 1 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)  # same prompt and params
+    # With temperature=0 both generations are expected to match exactly.
+    assert output[0].outputs[0].text == output2[0].outputs[0].text