[Feature] Support for cross-attention and whisper model (#5592)
### What this PR does / why we need it?
Resolves the problem reported in
issue https://github.com/vllm-project/vllm-ascend/issues/2262:
- support for cross-attention when the model is encoder-decoder
- support for whisper model
- vLLM version: v0.13.0
- vLLM main:
7157596103
Signed-off-by: gh924 <guihao2@huawei.com>
Co-authored-by: Aoxuan Chen <43376869+chenaoxuan@users.noreply.github.com>
This commit is contained in:
@@ -21,6 +21,8 @@ import os
|
||||
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
@@ -32,6 +34,10 @@ MINICPM_MODELS = [
|
||||
"OpenBMB/MiniCPM4-0.5B",
|
||||
]
|
||||
|
||||
# Whisper checkpoint(s) exercised by test_whisper below.
# NOTE(review): fetched via modelscope.snapshot_download — "openai-mirror"
# is presumably a ModelScope mirror of the openai org; confirm availability.
WHISPER_MODELS = [
    "openai-mirror/whisper-large-v3-turbo",
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MINICPM_MODELS)
|
||||
def test_minicpm(model) -> None:
|
||||
@@ -44,3 +50,26 @@ def test_minicpm(model) -> None:
|
||||
max_model_len=512,
|
||||
gpu_memory_utilization=0.7) as runner:
|
||||
runner.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", WHISPER_MODELS)
def test_whisper(model) -> None:
    """End-to-end smoke test for a Whisper encoder-decoder model.

    Feeds one bundled audio asset together with the standard Whisper
    transcription prompt through ``VllmRunner`` and checks that generation
    produces a non-empty result.
    """
    transcription_prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    audio_inputs = [AudioAsset("mary_had_lamb").audio_and_sample_rate]

    params = SamplingParams(
        temperature=0.2,
        max_tokens=10,
        stop_token_ids=None,
    )

    # max_model_len=448 — presumably Whisper's decoder context limit;
    # TODO(review): confirm against the model config.
    runner_kwargs = dict(
        max_model_len=448,
        max_num_seqs=5,
        dtype="bfloat16",
        block_size=128,
        gpu_memory_utilization=0.9,
    )
    with VllmRunner(snapshot_download(model), **runner_kwargs) as runner:
        results = runner.generate(
            prompts=[transcription_prompt],
            audios=audio_inputs,
            sampling_params=params,
        )

    assert results is not None, "Generated outputs should not be None."
    assert len(results) > 0, "Generated outputs should not be empty."
|
||||
|
||||
Reference in New Issue
Block a user