Upgrade vLLM to v0.10.0 (#1927)

### What this PR does / why we need it?

- Upgrade to v0.10.0
- Drop v0.9.2 version compatibility
- Add patch `vllm_ascend/patch/worker/patch_common/patch_sampler_gather_logprobs.py` as a workaround for f3a683b7c9 in v0.10.0, and also add the e2e test `test_models_prompt_logprobs`
- Pin transformers<4.54.0 as a workaround for https://github.com/vllm-project/vllm-ascend/issues/2034

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

- Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs`
- CI passed

- vLLM version: v0.9.2
- vLLM main: 7728dd77bb

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-26 15:43:29 +08:00
parent 2f50304c19
commit 17a430f7b8
29 changed files with 198 additions and 251 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -17,7 +17,7 @@

 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import torch
 import torch_npu
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
+                               nd_to_nz_2d, nd_to_nz_spec)


 class AscendAttentionBackend(AttentionBackend):
@@ -43,8 +43,6 @@ class AscendAttentionBackend(AttentionBackend):

    @staticmethod
    def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionBackendImpl092
        return AscendAttentionBackendImpl

    @staticmethod
@@ -440,38 +438,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
        return output.view(num_tokens, self.hidden_size)


-class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
-
-
 def unified_ascend_attention_with_output(
    query: torch.Tensor,
    key: torch.Tensor,