From f0c1f0c828bba2f65d4f386b27a3d7cbb940a2d3 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Sat, 2 Aug 2025 08:58:56 +0800 Subject: [PATCH] [Doc] Add qwen vl example in tutorials for 310I series (#2160) ### What this PR does / why we need it? Add qwen vl example in tutorials for 310I series. Model: Qwen2.5-VL-3B-Instruct Accuracy test result, dataset MMM-val: | | 910B3 | 310P3 | | --- | --- | --- | |Summary|0.455 | 0.46 | |--art_and_design| 0.558 | 0.566 | |--business| 0.373 | 0.366 | |--health_and_medicine|0.513 | 0.52 | |--science|0.333 | 0.333 | |--tech_and_engineering|0.362 | 0.380 | |--humanities_and_social_science|0.691 | 0.691 | Function test result: 1. On line: ![image](https://github.com/user-attachments/assets/d81bba61-df28-4676-a246-c5d094815ac7) ![image](https://github.com/user-attachments/assets/0be81628-9999-4ef2-93c1-898b3043e09e) 2. Offline: ![image](https://github.com/user-attachments/assets/603275c1-6ed6-4cfc-a6e2-7726156de087) - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad57f23f6a528ab01066998b41796a44340fd43d --------- Signed-off-by: leo-pony --- docs/source/tutorials/single_node_300i.md | 78 ++++++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/single_node_300i.md b/docs/source/tutorials/single_node_300i.md index 7d45bfd..270d002 100644 --- a/docs/source/tutorials/single_node_300i.md +++ b/docs/source/tutorials/single_node_300i.md @@ -1,7 +1,8 @@ # Single Node (Atlas 300I series) ```{note} -This Atlas 300I series is currently experimental. In future versions, there may be behavioral changes around model coverage, performance improvement. +1. This Atlas 300I series is currently experimental. In future versions, there may be behavioral changes around model coverage, performance improvement. +2. Currently, the 310I series only supports eager mode and the data type is float16. 
``` ## Run vLLM on Altlas 300I series @@ -83,7 +84,7 @@ curl http://localhost:8000/v1/completions \ :::: -::::{tab-item} Qwen/Qwen2.5-7B-Instruct +::::{tab-item} Qwen2.5-7B-Instruct :sync: qwen7b Run the following command to start the vLLM server: @@ -113,6 +114,36 @@ curl http://localhost:8000/v1/completions \ :::: +::::{tab-item} Qwen2.5-VL-3B-Instruct +:sync: qwen-vl-2.5-3b + +Run the following command to start the vLLM server: + +```{code-block} bash + :substitutions: +vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ + --tensor-parallel-size 1 \ + --enforce-eager \ + --dtype float16 \ + --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "The future of AI is", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 + }' +``` + +:::: + ::::{tab-item} Pangu-Pro-MoE-72B :sync: pangu @@ -251,6 +282,49 @@ clean_up() :::: +::::{tab-item} Qwen2.5-VL-3B-Instruct +:sync: qwen-vl-2.5-3b + +```{code-block} python + :substitutions: +from vllm import LLM, SamplingParams +import gc +import torch +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() +prompts = [ + "Hello, my name is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, top_p=0.95, top_k=50, temperature=0.6) +# Create an LLM. +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + tensor_parallel_size=1, + enforce_eager=True, # For 300I series, only eager mode is supported. 
+    dtype="float16", # IMPORTANT because some ATB ops cannot support bf16 on 300I series
+    compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series
+)
+# Generate texts from the prompts.
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+del llm
+clean_up()
+```
+
+::::
+
 ::::{tab-item} Pangu-Pro-MoE-72B
 :sync: pangu