Upgrade vLLM to v0.10.0 (#1927)
### What this PR does / why we need it? - Upgrade to v0.10.0 - Drop v0.9.2 version compatibility - Add patch for `vllm_ascend/patch/worker/patch_common/patch_sampler_gather_logprobs.py` as a workaround of f3a683b7c9 for v0.10.0, and also add e2e test `test_models_prompt_logprobs` - Pin transformers<4.54.0 as a workaround of https://github.com/vllm-project/vllm-ascend/issues/2034 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs` - CI passed - vLLM version: v0.9.2 - vLLM main: 7728dd77bb --------- Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
@@ -127,3 +127,19 @@ def test_models_topk() -> None:
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
def test_models_prompt_logprobs() -> None:
    """Smoke-test greedy decoding with prompt logprobs on a small Qwen model.

    Added alongside the v0.10.0 upgrade as an e2e regression check for the
    sampler gather-logprobs patch: the test only verifies that generation
    with prompt logprobs completes without error (no output values are
    asserted here).
    """
    prompts = ["Hello, my name is"]

    # Keep the resource footprint small: short context window, fp16 weights,
    # eager mode (no graph capture), and a capped memory fraction so the
    # test fits on a single card.
    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                    max_model_len=8192,
                    dtype="float16",
                    enforce_eager=True,
                    gpu_memory_utilization=0.7) as runner:
        runner.generate_greedy_logprobs(prompts,
                                        max_tokens=5,
                                        num_logprobs=1)
|
||||
|
||||
@@ -3,15 +3,12 @@ from unittest.mock import MagicMock, patch
|
||||
import torch
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.attention.attention_v1 import \
|
||||
AscendAttentionBackendImpl092 # isort: skip
|
||||
from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
|
||||
AscendAttentionBackendImpl,
|
||||
AscendAttentionMetadataBuilder,
|
||||
AscendAttentionState,
|
||||
AscendMetadata,
|
||||
CommonAttentionState)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class TestAscendAttentionBackend(TestBase):
|
||||
@@ -20,12 +17,8 @@ class TestAscendAttentionBackend(TestBase):
|
||||
self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
|
||||
|
||||
def test_get_impl_cls(self):
    """get_impl_cls() must return the sole AscendAttentionBackendImpl.

    v0.9.2 compatibility was dropped in the v0.10.0 upgrade, so the
    `vllm_version_is("0.9.2")` branch selecting
    AscendAttentionBackendImpl092 is dead code; the duplicated trailing
    assertion left behind by the compat removal is collapsed into this
    single check.
    """
    self.assertEqual(AscendAttentionBackend.get_impl_cls(),
                     AscendAttentionBackendImpl)
|
||||
|
||||
def test_get_metadata_cls(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
|
||||
|
||||
Reference in New Issue
Block a user