### What this PR does / why we need it?
vLLM Ascend plugin (vllm-ascend) is a backend plugin for running vLLM on
the Ascend NPU.
This plugin is the recommended approach for supporting the Ascend
backend within the vLLM community. It adheres to the principles outlined
in the [RFC]: Hardware pluggable, providing a hardware-pluggable
interface that decouples the integration of the Ascend NPU with vLLM.
This patch also includes changes to make CI work and use caching to speed up
e2e tests, including:
1. Change push (post merge ci) and pull_request (pr ci) trigger branch
to main
2. Make mypy work by ignoring base_communicator and clearing unused deps
3. Several improvements for vllm_ascend_test:
- use caching (pip, ms, hf) to speed up e2e tests (25 mins --> 5 mins)
- switch the `git clone` command to `action/checkout` to speed up checkout
- Enable sv for pytest for better info dump
- Remove network host to resolve `docker: conflicting options: cannot
attach both user-defined and non-user-defined network-modes`, which is a
problem on docker 1.45 but not on 1.39.
4. Adapt MLA decode optimizations:
cabaf4eff3
### Does this PR introduce _any_ user-facing change?
Yes, init the PR.
### How was this patch tested?
- This is the first PR to make ascend NPU work on vLLM. All code is
tested on ascend with vLLM V0 Engine.
- CI passed
---------
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: wangshuai09 <391746016@qq.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
154 lines
5.0 KiB
Python
154 lines
5.0 KiB
Python
#
|
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
# This file is a part of the vllm-ascend project.
|
|
# Adapted from vllm-project/vllm/examples/offline_inference/audio_language.py
|
|
# Copyright 2023 The vLLM team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
"""
|
|
This example shows how to use vLLM for running offline inference
|
|
with the correct prompt format on audio language models.
|
|
|
|
For most models, the prompt format should follow corresponding examples
|
|
on HuggingFace model repository.
|
|
"""
|
|
|
|
from transformers import AutoTokenizer
|
|
from vllm import LLM, SamplingParams
|
|
from vllm.assets.audio import AudioAsset
|
|
from vllm.utils import FlexibleArgumentParser
|
|
|
|
# Sample audio clips bundled with vLLM, used as inputs for the examples below.
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
# Maps the number of audio clips attached to the prompt (0, 1, or 2) to a
# question that makes sense for that many clips.
question_per_audio_count = {
    0: "What is 1+1?",
    1: "What is recited in the audio?",
    2: "What sport and what nursery rhyme are referenced?"
}
|
|
|
|
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
|
# lower-end GPUs.
|
|
# Unless specified, these settings have been tested to work on a single L4.
|
|
|
|
|
|
# Ultravox 0.3
|
|
def run_ultravox(question: str, audio_count: int):
    """Build the engine, prompt, and stop tokens for Ultravox v0.3.

    Returns a ``(llm, prompt, stop_token_ids)`` triple ready to be passed
    to ``llm.generate``.
    """
    repo_id = "fixie-ai/ultravox-v0_3"

    # One "<|audio|>" placeholder per attached clip, then the question text.
    chat = [{
        'role': 'user',
        'content': "<|audio|>\n" * audio_count + question
    }]
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    prompt = tokenizer.apply_chat_template(chat,
                                           tokenize=False,
                                           add_generation_prompt=True)

    llm = LLM(model=repo_id,
              max_model_len=4096,
              max_num_seqs=5,
              trust_remote_code=True,
              limit_mm_per_prompt={"audio": audio_count})
    # No custom stop tokens are needed for this model.
    return llm, prompt, None
|
|
|
|
|
|
# Qwen2-Audio
|
|
def run_qwen2_audio(question: str, audio_count: int):
    """Build the engine, prompt, and stop tokens for Qwen2-Audio.

    Returns a ``(llm, prompt, stop_token_ids)`` triple ready to be passed
    to ``llm.generate``.
    """
    repo_id = "Qwen/Qwen2-Audio-7B-Instruct"

    llm = LLM(model=repo_id,
              max_model_len=4096,
              max_num_seqs=5,
              limit_mm_per_prompt={"audio": audio_count})

    # One numbered placeholder per clip, e.g. "Audio 1: <|audio_bos|>...".
    audio_in_prompt = "".join(
        f"Audio {idx+1}: "
        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n"
        for idx in range(audio_count))

    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n"
              f"{audio_in_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    # No custom stop tokens are needed for this model.
    stop_token_ids = None
    return llm, prompt, stop_token_ids
|
|
|
|
|
|
# TODO (cmq): test ultravox
|
|
# Maps the CLI `--model-type` value to the runner function that builds the
# engine and prompt for that model.
model_example_map = {
    # "ultravox": run_ultravox,
    "qwen2_audio": run_qwen2_audio
}
|
|
|
|
|
|
def main(args):
    """Run offline audio-language inference for the selected model.

    Builds the engine and prompt via ``model_example_map``, attaches up to
    ``args.num_audios`` audio clips as multi-modal data, generates
    ``args.num_prompts`` completions, and prints each generated text.

    Raises:
        ValueError: if ``args.model_type`` is unsupported or
            ``args.num_prompts`` is not positive.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    llm, prompt, stop_token_ids = model_example_map[model](
        question_per_audio_count[audio_count], audio_count)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
                asset.audio_and_sample_rate
                for asset in audio_assets[:audio_count]
            ]
        }

    # Validate explicitly: `assert` is stripped under `python -O`.
    if args.num_prompts <= 0:
        raise ValueError("num_prompts must be a positive integer.")

    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference: replicate the same request N times.
        inputs = [inputs] * args.num_prompts

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse arguments and dispatch to main().
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
    parser.add_argument(
        '--model-type', '-m',
        type=str,
        default="qwen2_audio",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".')
    parser.add_argument(
        '--num-prompts',
        type=int,
        default=1,
        help='Number of prompts to run.')
    parser.add_argument(
        "--num-audios",
        type=int,
        default=1,
        choices=[0, 1, 2],
        help="Number of audio items per prompt.")

    main(parser.parse_args())
|