[Feature] Support for cross-attention and whisper model (#5592)
### What this PR does / why we need it?
Resolves the problem reported in
issue https://github.com/vllm-project/vllm-ascend/issues/2262:
- support for cross-attention when the model is encoder-decoder
- support for whisper model
- vLLM version: v0.13.0
- vLLM main:
7157596103
Signed-off-by: gh924 <guihao2@huawei.com>
Co-authored-by: Aoxuan Chen <43376869+chenaoxuan@users.noreply.github.com>
This commit is contained in:
@@ -21,6 +21,8 @@ import os
|
||||
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
@@ -32,6 +34,10 @@ MINICPM_MODELS = [
|
||||
"OpenBMB/MiniCPM4-0.5B",
|
||||
]
|
||||
|
||||
# Whisper checkpoint(s) exercised by test_whisper below.
# NOTE(review): fetched via modelscope.snapshot_download — "openai-mirror"
# is presumably a ModelScope mirror of the openai org; confirm availability.
WHISPER_MODELS = [
    "openai-mirror/whisper-large-v3-turbo",
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MINICPM_MODELS)
|
||||
def test_minicpm(model) -> None:
|
||||
@@ -44,3 +50,26 @@ def test_minicpm(model) -> None:
|
||||
max_model_len=512,
|
||||
gpu_memory_utilization=0.7) as runner:
|
||||
runner.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", WHISPER_MODELS)
def test_whisper(model) -> None:
    """End-to-end smoke test for a Whisper encoder-decoder model.

    Feeds one bundled audio asset together with the standard Whisper
    transcription prompt through ``VllmRunner`` and checks that generation
    produces a non-empty result.
    """
    transcription_prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    audio_inputs = [AudioAsset("mary_had_lamb").audio_and_sample_rate]

    params = SamplingParams(
        temperature=0.2,
        max_tokens=10,
        stop_token_ids=None,
    )

    # max_model_len=448 — presumably Whisper's decoder context limit;
    # TODO(review): confirm against the model config.
    runner_kwargs = dict(
        max_model_len=448,
        max_num_seqs=5,
        dtype="bfloat16",
        block_size=128,
        gpu_memory_utilization=0.9,
    )
    with VllmRunner(snapshot_download(model), **runner_kwargs) as runner:
        results = runner.generate(
            prompts=[transcription_prompt],
            audios=audio_inputs,
            sampling_params=params,
        )

    assert results is not None, "Generated outputs should not be None."
    assert len(results) > 0, "Generated outputs should not be empty."
|
||||
|
||||
Reference in New Issue
Block a user