[Refactor] Adapt deepseek-v3.2 to vllm 0.11.0 (#3432)

### What this PR does / why we need it?

Adapt deepseek-v3.2 to vLLM 0.11.0 and remove the patch that is no longer needed. The final goal is to remove all patches and align the code architecture with vLLM, so the following work remains for follow-up PRs.

TODO:
- [x] remove patch on attention spec
- [ ] refactor the kvcache creation logic

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

1. CI passed with the existing tests.
2. Tests pass with deepseek-v3.2-exp.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 17:48:58 +08:00
parent 099255e933
commit 8abe517870
20 changed files with 143 additions and 262 deletions
--- a/vllm_ascend/patch/worker/patch_common/__init__.py
+++ b/vllm_ascend/patch/worker/patch_common/__init__.py
@@ -21,8 +21,6 @@ if HAS_TRITON:
    import vllm_ascend.patch.worker.patch_common.patch_triton

 # isort: off
-import vllm_ascend.patch.worker.patch_common.patch_attention_selector  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_attention_layer  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_roberta  # noqa
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
@@ -1,188 +0,0 @@
-from typing import List, Optional
-
-import torch
-import vllm
-import vllm.envs as envs
-from torch import nn
-from vllm.attention import Attention, AttentionType, get_attn_backend
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.selector import backend_name_to_enum
-from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
-from vllm.config import CacheConfig, get_current_vllm_config
-from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization.base_config import \
-    QuantizationConfig
-from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.platforms import current_platform
-
-
-class AscendAttention(Attention, nn.Module, AttentionLayerBase):
-    """Attention layer.
-
-    This class takes query, key, and value tensors as input. The input tensors
-    can either contain prompt tokens or generation tokens.
-    The class does the following:
-
-    1. Store the input key and value tensors in the KV cache.
-    2. Perform (multi-head/multi-query/grouped-query) attention.
-    3. Return the output tensor.
-    """
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[List[float]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        logits_soft_cap: Optional[float] = None,
-        per_layer_sliding_window: Optional[int] = None,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        prefix: str = "",
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        attn_backend: Optional[type[AttentionBackend]] = None,
-        **extra_impl_args,
-    ) -> None:
-        """
-        The KV cache is stored inside this class and is accessed via
-        `self.kv_cache`.
-        """
-        nn.Module.__init__(self)
-        AttentionLayerBase.__init__(self)
-
-        if per_layer_sliding_window is not None:
-            # per-layer sliding window
-            sliding_window = per_layer_sliding_window
-        elif cache_config is not None:
-            # model-level sliding window
-            sliding_window = cache_config.sliding_window
-        else:
-            sliding_window = None
-
-        if cache_config is not None:
-            kv_cache_dtype = cache_config.cache_dtype
-            block_size = cache_config.block_size
-            calculate_kv_scales = cache_config.calculate_kv_scales
-        else:
-            kv_cache_dtype = "auto"
-            block_size = 16
-            calculate_kv_scales = False
-        if num_kv_heads is None:
-            num_kv_heads = num_heads
-        assert num_heads % num_kv_heads == 0, \
-            f"num_heads ({num_heads}) is not " \
-            f"divisible by num_kv_heads ({num_kv_heads})"
-
-        # The default k/v_scale is set to 1.0. This is ignored
-        # when kv-cache is not fp8, and should be used with
-        # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
-        # expect the pre-quantized k/v_scale to be loaded along
-        # with the model weights.
-        self.kv_cache_dtype = kv_cache_dtype
-        self.calculate_kv_scales = calculate_kv_scales
-        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
-        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
-        # FlashAttn doesn't support quantizing the kv-cache only
-        # but requires q to be quantized as well.
-        self._q_scale = torch.tensor(1.0, dtype=torch.float32)
-        self._prob_scale = torch.tensor(1.0, dtype=torch.float32)
-
-        # We also keep q/k/v_scale on host (cpu) memory for attention
-        # backends that require the scales to be on host instead of on device.
-        # e.g. Flashinfer
-        self._q_scale_float = 1.0
-        self._k_scale_float = 1.0
-        self._v_scale_float = 1.0
-
-        # The output scale on host memory. This should be the input scale of
-        # the quant op after this attention layer.
-        self._o_scale_float: Optional[float] = None
-
-        self.use_mla = use_mla
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.num_kv_heads = num_kv_heads
-        self.sliding_window = sliding_window
-        self.has_sink = extra_impl_args.get("sinks") is not None
-
-        quant_method = quant_config.get_quant_method(
-            self, prefix=prefix) if quant_config else None
-        if quant_method is not None and not isinstance(
-                quant_method, UnquantizedLinearMethod):
-            assert isinstance(quant_method, BaseKVCacheMethod)
-            # TODO (mgoin): kv cache dtype should be specified in the FP8
-            # checkpoint config and become the "auto" behavior
-            if self.kv_cache_dtype == "fp8_e5m2":
-                raise ValueError("fp8_e5m2 kv-cache is not supported with "
-                                 "fp8 checkpoints.")
-            # If quantization is enabled, we make "k_scale" and "v_scale"
-            # parameters so that it can be loaded from the model checkpoint.
-            # The k/v_scale will then be converted back to native float32
-            # values after weight loading.
-            self.quant_method = quant_method
-            self.quant_method.create_weights(self)
-
-        # During model initialization, the default dtype is set as the model
-        # weight and activation dtype.
-        dtype = torch.get_default_dtype()
-        if attn_backend is None:
-            self.attn_backend = get_attn_backend(head_size,
-                                                 dtype,
-                                                 kv_cache_dtype,
-                                                 block_size,
-                                                 use_mla=use_mla,
-                                                 use_sfa=use_sfa,
-                                                 has_sink=self.has_sink)
-        else:
-            self.attn_backend = attn_backend
-
-        impl_cls = self.attn_backend.get_impl_cls()
-        self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
-                             alibi_slopes, sliding_window, kv_cache_dtype,
-                             logits_soft_cap, attn_type,
-                             kv_sharing_target_layer_name, **extra_impl_args)
-        self.backend = backend_name_to_enum(self.attn_backend.get_name())
-        self.dtype = dtype
-
-        # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
-        # torch.compile works by registering the attention as one giant
-        # opaque custom op. For other platforms, we directly call them
-        # and let torch.compile handle them.
-        self.use_direct_call = not current_platform.opaque_attention_op()
-
-        self.use_output = self.attn_backend.accept_output_buffer
-        compilation_config = get_current_vllm_config().compilation_config
-        if prefix in compilation_config.static_forward_context:
-            raise ValueError(f"Duplicate layer name: {prefix}")
-        compilation_config.static_forward_context[prefix] = self
-        self.layer_name = prefix
-        self.attn_type = attn_type
-
-        if kv_sharing_target_layer_name is not None:
-            validate_kv_sharing_target(
-                prefix,
-                kv_sharing_target_layer_name,
-                compilation_config.static_forward_context,
-            )
-        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
-
-        # use a placeholder kv cache tensor during init, which will be replaced
-        # by bind_kv_cache
-        # this variable will not be accessed if use_direct_call is True
-        self.kv_cache = [
-            torch.tensor([]) for _ in range(get_current_vllm_config(
-            ).parallel_config.pipeline_parallel_size)
-        ]
-
-        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
-        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
-        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
-        self.query_quant = None
-
-
-vllm.attention.Attention = AscendAttention