[Refactor] Adapt deepseek-v3.2 to vllm 0.11.0 (#3432)

### What this PR does / why we need it?

Adapt deepseek-v3.2 to vLLM 0.11.0, removing the patch that is no longer needed. The final goal is to remove all the patches and align the code architecture with vLLM, so the following work remains to be done in follow-up PRs.

TODO:
- [x] remove the patch on the attention spec
- [ ] refactor the KV cache creation logic

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

1. CI passed with the existing tests.
2. Tests pass with deepseek-v3.2-exp.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 17:48:58 +08:00
parent 099255e933
commit 8abe517870
20 changed files with 143 additions and 262 deletions
--- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py
@@ -791,7 +791,7 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            use_mla=True,
-            use_sfa=True,
+            use_sparse=True,
            # SFA Args
            q_lora_rank=self.q_lora_rank,
            kv_lora_rank=self.kv_lora_rank,
@@ -879,12 +879,12 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
        self.tp_rank = get_tp_group().rank_in_group
        ascend_config = get_ascend_config()
        self.use_mla = False
-        self.use_sfa = False
+        self.use_sparse = False
        # TODO: enable mla in vllm-ascend
        if model_config.use_mla:
-            if ascend_config.use_sfa:
+            if hasattr(model_config.hf_config, "index_topk"):
                attn_cls = TorchairDeepseekV2SFAAttention
-                self.use_sfa = True
+                self.use_sparse = True
            else:
                attn_cls = TorchairDeepseekV2MLAAttention  # type: ignore[assignment]
            self.use_mla = True
@@ -950,7 +950,7 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
        forward_context = get_forward_context()
        if attn_metadata is not None:
            decoding_condition_met = (
-                not attn_metadata.is_prefill if self.use_sfa else
+                not attn_metadata.is_prefill if self.use_sparse else
                not forward_context.with_prefill if self.use_mla else False)
            mla_moe_communication = decoding_condition_met and self.mla_moe_communication and replace_allreduce
        else:
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -376,7 +376,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
            npu_backend = torchair.get_npu_backend(compiler_config=config)
            self.torchair_compiled_model = torch.compile(
                self.model,
-                dynamic=not self.ascend_config.use_sfa,
+                dynamic=not self.use_sparse,
                fullgraph=True,
                backend=npu_backend)
            return self.torchair_compiled_model
@@ -399,7 +399,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
            self.torchair_compiled_models[
                batch_size] = torchair.inference.cache_compile(
                    self.model.__dict__[forward_proxy_name],
-                    dynamic=not self.ascend_config.use_sfa,
+                    dynamic=not self.use_sparse,
                    fullgraph=True,
                    cache_dir=TORCHAIR_CACHE_DIR,
                    config=config,
--- a/vllm_ascend/torchair/torchair_sfa.py
+++ b/vllm_ascend/torchair/torchair_sfa.py
@@ -738,7 +738,7 @@ class AscendSFATorchairImpl(MLAAttentionImpl):

        ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
-        self.enable_prefetch = ascend_config.enable_prefetch
+        self.enable_prefetch = ascend_config.weight_prefetch_config.enabled
        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
        if ascend_config.torchair_graph_config.enabled:
            self.graph_batch_size = ascend_config.torchair_graph_config.graph_batch_sizes[