Upgrade vLLM to v0.10.0 (#1927)
### What this PR does / why we need it?
- Upgrade to v0.10.0
- Drop v0.9.2 version compatibility
- Add patch for `vllm_ascend/patch/worker/patch_common/patch_sampler_gather_logprobs.py` as workaround of f3a683b7c9 for v0.10.0, and add the e2e test `test_models_prompt_logprobs`
- Pin transformers<4.54.0 as workaround of https://github.com/vllm-project/vllm-ascend/issues/2034

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs`
- CI passed

- vLLM version: v0.9.2
- vLLM main: 7728dd77bb

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
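The v0.9.2 compatibility shims removed in the diff below all follow the same pattern: each attention backend's `get_impl_cls()` checks the installed vLLM version and dispatches to a `*Impl092` subclass. The following is a minimal, hypothetical sketch of that dispatch pattern, not code taken from the repository; `BaseImpl`, `LegacyImpl092`, and `CURRENT_VLLM_VERSION` are illustrative placeholders.

```python
from typing import Type

# Assumed stand-in for the installed vLLM version string.
CURRENT_VLLM_VERSION = "0.10.0"


def vllm_version_is(version: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is: true when the
    # installed vLLM matches the given version string.
    return CURRENT_VLLM_VERSION == version


class BaseImpl:
    """Placeholder for the current attention backend implementation."""


class LegacyImpl092(BaseImpl):
    """Placeholder for the v0.9.2-only subclass this PR deletes."""


def get_impl_cls() -> Type[BaseImpl]:
    # Before this PR: route to the legacy subclass when running on vLLM v0.9.2.
    if vllm_version_is("0.9.2"):
        return LegacyImpl092
    # After this PR the version check is gone and only the current
    # implementation remains.
    return BaseImpl
```

With v0.9.2 support dropped, the check and every `*Impl092` subclass become dead code, which is why the hunks below delete both the `vllm_version_is` imports and the legacy classes.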
@@ -17,7 +17,7 @@
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import torch
 import torch_npu
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
+                               nd_to_nz_2d, nd_to_nz_spec)


 class AscendAttentionBackend(AttentionBackend):
@@ -43,8 +43,6 @@ class AscendAttentionBackend(AttentionBackend):

     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionBackendImpl092
         return AscendAttentionBackendImpl

     @staticmethod
@@ -440,38 +438,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
         return output.view(num_tokens, self.hidden_size)


-class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
-
-
 def unified_ascend_attention_with_output(
     query: torch.Tensor,
     key: torch.Tensor,
@@ -16,7 +16,7 @@
 #

 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import numpy as np
 import torch
@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, vllm_version_is)
+                               nd_to_nz_2d)


 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,8 +41,6 @@ class AscendAttentionTorchairBackend(AttentionBackend):

     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionTorchairBackendImpl092
         return AscendAttentionTorchairBackendImpl

     @staticmethod
@@ -489,36 +487,3 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
                 "to use ascend scheduler.")

         return output.view(num_tokens, self.hidden_size)
-
-
-class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
-                                            ):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
@@ -1,12 +1,11 @@
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
-                    TypeVar)
+from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar

 import numpy as np
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata, AttentionType,
+                                              AttentionMetadata,
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import get_current_vllm_config
@@ -22,7 +21,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
-from vllm_ascend.utils import npu_prefetch, vllm_version_is
+from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch

 if TYPE_CHECKING:
@@ -54,8 +53,6 @@ class AscendMLABackend(AttentionBackend):

     @staticmethod
     def get_impl_cls() -> Type["MLAAttentionImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendMLAImpl092
         return AscendMLAImpl


@@ -1212,34 +1209,3 @@ class AscendMLAImpl(MLAAttentionImpl):
         output[:num_decode_tokens] = output_decode

         return output_padded
-
-
-class AscendMLAImpl092(AscendMLAImpl):
-
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 num_kv_heads: int,
-                 alibi_slopes: Optional[List[float]],
-                 sliding_window: Optional[int],
-                 kv_cache_dtype: str,
-                 blocksparse_params: Optional[Dict[str, Any]] = None,
-                 logits_soft_cap: Optional[float] = None,
-                 attn_type: str = AttentionType.DECODER,
-                 kv_sharing_target_layer_name: Optional[str] = None,
-                 use_irope: bool = False,
-                 **kwargs) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-            **kwargs)