From 7c8bdc3a18ca70e1239c1b9c620e241e2739b90c Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Mon, 17 Feb 2025 22:11:04 +0800
Subject: [PATCH] [Doc] Update tutorials (#79)

### What this PR does / why we need it?
Update tutorials.

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
no.

---------

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
---
 docs/source/tutorials.md | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/docs/source/tutorials.md b/docs/source/tutorials.md
index 704d07b..01dedc8 100644
--- a/docs/source/tutorials.md
+++ b/docs/source/tutorials.md
@@ -28,7 +28,6 @@ Setup environment variables:
 ```bash
 # Use Modelscope mirror to speed up model download
 export VLLM_USE_MODELSCOPE=True
-export MODELSCOPE_CACHE=/root/.cache/
 
 # To avoid NPU out of memory, set `max_split_size_mb` to any value lower than you need to allocate for Qwen2.5-7B-Instruct
 export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
@@ -82,7 +81,6 @@ docker run \
 -v /root/.cache:/root/.cache \
 -p 8000:8000 \
 -e VLLM_USE_MODELSCOPE=True \
--e MODELSCOPE_CACHE=/root/.cache/ \
 -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
 -it quay.io/ascend/vllm-ascend:latest \
 vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
@@ -91,6 +89,14 @@ vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
 > [!NOTE]
 > Add `--max_model_len` option to avoid ValueError that the Qwen2.5-7B model's max seq len (32768) is larger than the maximum number of tokens that can be stored in KV cache (26240).
 
+If your service starts successfully, you can see the info shown below:
+
+```bash
+INFO: Started server process [6873]
+INFO: Waiting for application startup.
+INFO: Application startup complete.
+```
+
 Once your server is started, you can query the model with input prompts:
 
 ```bash
@@ -146,7 +152,6 @@ Setup environment variables:
 ```bash
 # Use Modelscope mirror to speed up model download
 export VLLM_USE_MODELSCOPE=True
-export MODELSCOPE_CACHE=/root/.cache/
 
 # To avoid NPU out of memory, set `max_split_size_mb` to any value lower than you need to allocate for Qwen2.5-7B-Instruct
 export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
@@ -155,7 +160,19 @@ export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
 Run the following script to execute offline inference on multi-NPU:
 
 ```python
+import gc
+
+import torch
+
 from vllm import LLM, SamplingParams
+from vllm.distributed.parallel_state import (destroy_distributed_environment,
+                                             destroy_model_parallel)
+
+def clean_up():
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    gc.collect()
+    torch.npu.empty_cache()
 
 prompts = [
     "Hello, my name is",
@@ -172,6 +189,9 @@ for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+del llm
+clean_up()
 ```
 
 If you run this script successfully, you can see the info shown below:
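For reference, the full offline multi-NPU inference script that the last two hunks modify looks roughly like the sketch below. Only the imports, the `clean_up()` helper, and the final `del llm` / `clean_up()` calls come from this patch; the `tensor_parallel_size=2` and `max_model_len=26240` arguments, the model name, and the sampling values are assumptions for a two-NPU setup, not part of the diff.

```python
import gc

import torch

from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (destroy_distributed_environment,
                                             destroy_model_parallel)


def clean_up():
    # Tear down tensor-parallel workers and the distributed process group,
    # then release cached NPU memory so the devices are left clean on exit.
    destroy_model_parallel()
    destroy_distributed_environment()
    gc.collect()
    torch.npu.empty_cache()


prompts = [
    "Hello, my name is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Assumed values: tensor_parallel_size=2 spreads the model across two NPUs;
# max_model_len=26240 mirrors the serving example above. Adjust to your setup.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct",
          tensor_parallel_size=2,
          max_model_len=26240)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

# Explicit cleanup added by this patch: drop the engine, then free NPU memory.
del llm
clean_up()
```

Saved as, say, `multi_npu.py` (a hypothetical file name) and run with `python multi_npu.py` on a node with two visible NPUs, this should print one generated completion per prompt before exiting cleanly.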