Drop 0.10.2 (#3284)

Drop support for vLLM v0.10.2; vLLM v0.11.0rc3 is now the supported version. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-09 10:28:38 +08:00
parent 2dde1268c7
commit f12f76d7ba
17 changed files with 202 additions and 653 deletions
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
@@ -16,8 +16,6 @@ from vllm.model_executor.layers.quantization.base_config import \
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform

-from vllm_ascend.utils import vllm_version_is
-

 class AscendAttention(Attention, nn.Module, AttentionLayerBase):
    """Attention layer.
@@ -69,12 +67,10 @@ class AscendAttention(Attention, nn.Module, AttentionLayerBase):
        if cache_config is not None:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size
-            is_attention_free = cache_config.is_attention_free
            calculate_kv_scales = cache_config.calculate_kv_scales
        else:
            kv_cache_dtype = "auto"
            block_size = 16
-            is_attention_free = False
            calculate_kv_scales = False
        if num_kv_heads is None:
            num_kv_heads = num_heads
@@ -135,23 +131,13 @@ class AscendAttention(Attention, nn.Module, AttentionLayerBase):
        # weight and activation dtype.
        dtype = torch.get_default_dtype()
        if attn_backend is None:
-            if vllm_version_is("0.10.2"):
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     is_attention_free,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
-            else:
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
+            self.attn_backend = get_attn_backend(head_size,
+                                                 dtype,
+                                                 kv_cache_dtype,
+                                                 block_size,
+                                                 use_mla=use_mla,
+                                                 use_sfa=use_sfa,
+                                                 has_sink=self.has_sink)
        else:
            self.attn_backend = attn_backend

--- a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
@@ -27,154 +27,72 @@ from vllm.attention.selector import (backend_name_to_enum,
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import resolve_obj_by_qualname

-from vllm_ascend.utils import vllm_version_is

-if vllm_version_is("0.10.2"):
+def get_attn_backend(  # type: ignore[misc]
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    """Selects which attention backend to use and lazily imports it."""
+    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
+    # value to be returned from the cache if the value changes between calls.
+    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
+    # private function.
+    return _cached_get_attn_backend(
+        head_size=head_size,
+        dtype=dtype,
+        kv_cache_dtype=kv_cache_dtype,
+        block_size=block_size,
+        use_v1=envs.VLLM_USE_V1,
+        use_mla=use_mla,
+        use_sfa=use_sfa,
+        has_sink=has_sink,
+    )

-    def get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        is_attention_free: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            is_attention_free=is_attention_free,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )

-    @cache
-    def _cached_get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        is_attention_free: bool,
-        use_v1: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # If there are no attention layers (e.g. we are running Mamba),
-        # use the placeholder NO_ATTENTION
-        if is_attention_free:
-            from vllm.attention.backends.placeholder_attn import \
-                PlaceholderAttentionBackend
-            return PlaceholderAttentionBackend
+@cache
+def _cached_get_attn_backend(
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_v1: bool = False,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    # Check whether a particular choice of backend was
+    # previously forced.
+    #
+    # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
+    # ENVIRONMENT VARIABLE.
+    selected_backend = None
+    backend_by_global_setting: Optional[_Backend] = (
+        get_global_forced_attn_backend())
+    if backend_by_global_setting is not None:
+        selected_backend = backend_by_global_setting
+    else:
+        # Check the environment variable and override if specified
+        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        if backend_by_env_var is not None:
+            selected_backend = backend_name_to_enum(backend_by_env_var)
+            if selected_backend is None:
+                raise ValueError(
+                    f"Invalid attention backend: '{backend_by_env_var}'. "
+                    f"Valid backends are: {list(_Backend.__members__.keys())}")

-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
-else:
-
-    def get_attn_backend(  # type: ignore[misc]
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )
-
-    @cache
-    def _cached_get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        use_v1: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
+    # get device-specific attn_backend
+    attention_cls = current_platform.get_attn_backend_cls(
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+        use_mla, use_sfa, has_sink)
+    if not attention_cls:
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}")
+    return resolve_obj_by_qualname(attention_cls)


 vllm.attention.get_attn_backend = get_attn_backend
--- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
@@ -1,11 +1,10 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes

-from vllm_ascend.utils import vllm_version_is
-
 logger = init_logger(__name__)


@@ -39,6 +38,4 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
    set_weight_attrs(weight, extra_weight_attrs)


-if not vllm_version_is("0.10.2"):
-    from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-    UnquantizedLinearMethod.create_weights = create_weights
+UnquantizedLinearMethod.create_weights = create_weights