[Model][3/N] Refactor sfa into mla and remove deepseek_v3_2.py (#3769)
This is the follow-up to PR #3189; it continues refactoring sfa into
mla and finally removes deepseek_v3_2.py. This is the last PR of the
DeepSeek modeling refactor: after it, all DeepSeek-specific model code
is removed from vllm_ascend.
Furthermore, after this PR DeepSeek V3.2 can run chunked prefill with
correct accuracy. (A short sketch of the resulting dispatch follows
after this message.)
- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19
---------
Signed-off-by: whx-sjtu <2952154980@qq.com>
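
For context, the diff below means the sparse-attention (SFA) path is no longer chosen at op-registration time; presumably the merged mla layer now makes that choice per model. A minimal illustrative sketch of that idea, not the actual vllm_ascend implementation (class, argument, and method names here are hypothetical; only the "index_topk" check is taken from the removed code):

# Illustrative sketch only: a single MLA wrapper that switches to the
# sparse-attention backend when the model's hf_config carries DeepSeek
# V3.2's "index_topk" field, instead of registering a separate SFA op.
import torch
from torch import nn

class UnifiedMLAWrapper(nn.Module):  # hypothetical name
    def __init__(self, hf_config, dense_backend: nn.Module, sparse_backend: nn.Module):
        super().__init__()
        # "index_topk" is the attribute the old registration code checked for.
        self.use_sparse = hasattr(hf_config, "index_topk")
        self.attn = sparse_backend if self.use_sparse else dense_backend

    def forward(self, positions: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
        # Both backends are assumed to share the same call signature here.
        return self.attn(positions, hidden_states)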
@@ -577,7 +577,6 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
     from vllm.model_executor.custom_op import CustomOp
 
     from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
-    from vllm_ascend.models.layers.sfa import AscendSparseFlashAttention
     from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
     from vllm_ascend.ops.fused_moe.fused_moe import (AscendFusedMoE,
                                                      AscendSharedFusedMoE)
@@ -625,10 +624,7 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
     mla_to_register = "MultiHeadLatentAttention" if vllm_version_is(
         "0.11.0") else "MultiHeadLatentAttentionWrapper"
     if vllm_config and vllm_config.model_config and vllm_config.model_config.use_mla:
-        AscendMLAAttentionWarrper = AscendSparseFlashAttention if hasattr(
-            vllm_config.model_config.hf_config,
-            "index_topk") else AscendMultiHeadLatentAttention
-        REGISTERED_ASCEND_OPS[mla_to_register] = AscendMLAAttentionWarrper
+        REGISTERED_ASCEND_OPS[mla_to_register] = AscendMultiHeadLatentAttention
 
     for name, op_cls in REGISTERED_ASCEND_OPS.items():
         CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)
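
For readers unfamiliar with the loop at the end of the hunk: CustomOp.register_oot is the mechanism vllm_ascend uses to override vLLM's built-in ops by name. A hedged usage sketch of that mechanism (the subclass and the registry name below are illustrative; only the register_oot call form is taken from the diff):

# Illustrative only: register an out-of-tree implementation so vLLM resolves
# the op name to the platform-specific class, mirroring the loop above.
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import SiluAndMul

class MyAscendSiluAndMul(SiluAndMul):  # hypothetical override
    pass

CustomOp.register_oot(_decorated_op_cls=MyAscendSiluAndMul, name="SiluAndMul")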