[Refactor] Adapt deepseek-v3.2 to vllm 0.11.0 (#3432)

### What this PR does / why we need it? Adapt deepseek-v3.2 to vLLM 0.11.0 and remove the patch that is no longer needed. The final goal is to remove all the patches and align the code architecture with vLLM, so the following work needs to be done in subsequent PRs. TODO: - [x] remove the patch on the attention spec - [ ] refactor the KV cache creation logic ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? 1. CI passed with the existing tests. 2. Tests pass with deepseek-v3.2-exp. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 17:48:58 +08:00
parent 099255e933
commit 8abe517870
20 changed files with 143 additions and 262 deletions
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -510,7 +510,6 @@ class AscendSFAImpl(MLAAttentionImpl):

        ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
-        self.enable_prefetch = ascend_config.enable_prefetch
        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz

        vllm_config = get_current_vllm_config()
@@ -690,6 +689,8 @@ class AscendSFAImpl(MLAAttentionImpl):
            topk_indices = self.indexer_select(hidden_states_decode,
                                               decode_q_c,
                                               attn_metadata=attn_metadata,
+                                               cos=cos,
+                                               sin=sin,
                                               kv_cache=kv_cache)

            query_states = (decode_q_nope, decode_q_pe)
@@ -778,6 +779,8 @@ class AscendSFAImpl(MLAAttentionImpl):
            topk_indices = self.indexer_select(x=hidden_states_prefill,
                                               qr=prefill_qr,
                                               kv_cache=kv_cache,
+                                               cos=cos,
+                                               sin=sin,
                                               attn_metadata=attn_metadata)
            query_states = (prefill_q_nope, prefill_q_pe)
            key_states = (prefill_k_nope, prefill_k_pe)
@@ -920,17 +923,15 @@ class AscendSFAImpl(MLAAttentionImpl):
        x: torch.Tensor,
        qr: torch.Tensor,
        kv_cache: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
+        cos,
+        sin,
        attn_metadata: M,
    ):
        if attn_metadata.prefill is not None:
-            cos = attn_metadata.prefill.cos
-            sin = attn_metadata.prefill.sin
            actual_seq_lengths_query = attn_metadata.prefill.query_lens
            actual_seq_lengths_key = attn_metadata.prefill.seq_lens
            block_table = attn_metadata.prefill.block_table
        elif attn_metadata.decode is not None:
-            cos = attn_metadata.decode.cos
-            sin = attn_metadata.decode.sin
            actual_seq_lengths_query = attn_metadata.decode.actual_seq_lengths_q
            actual_seq_lengths_key = attn_metadata.decode.seq_lens
            block_table = attn_metadata.decode.block_table