[1/N][Refactor] Refactor code to adapt with vllm main (#3612)

### What this PR does / why we need it? This is step 1 of refactoring the code to adapt to vllm main, and this PR is aligned with 17c540a993. 1. Refactor deepseek to the latest code architecture as of 17c540a993. 2. A bunch of fixes due to vllm changes: - Fix `AscendScheduler` `__post_init__`, caused by https://github.com/vllm-project/vllm/pull/25075 - Fix `AscendScheduler` init receiving an unexpected arg `block_size`, caused by https://github.com/vllm-project/vllm/pull/26296 - Fix `KVCacheManager` `get_num_common_prefix_blocks` arg, caused by https://github.com/vllm-project/vllm/pull/23485 - Fix `MLAAttention` import, caused by https://github.com/vllm-project/vllm/pull/25103 - Fix `SharedFusedMoE` import, caused by https://github.com/vllm-project/vllm/pull/26145 - Fix `LazyLoader` import, caused by https://github.com/vllm-project/vllm/pull/27022 - Fix `vllm.utils.swap_dict_values` import, caused by https://github.com/vllm-project/vllm/pull/26990 - Fix `Backend` enum import, caused by https://github.com/vllm-project/vllm/pull/25893 - Fix the `CompilationLevel` to `CompilationMode` renaming issue introduced by https://github.com/vllm-project/vllm/pull/26355 - Fix fused_moe ops, caused by https://github.com/vllm-project/vllm/pull/24097 - Fix bert model because of `inputs_embeds`, caused by https://github.com/vllm-project/vllm/pull/25922 - Fix MRope because `get_input_positions_tensor` changed to `get_mrope_input_positions`, caused by https://github.com/vllm-project/vllm/pull/24172 - Fix `splitting_ops` changes introduced by https://github.com/vllm-project/vllm/pull/25845 - Fix multi-modality changes introduced by https://github.com/vllm-project/vllm/issues/16229 - Fix lora bias dropping issue introduced by https://github.com/vllm-project/vllm/pull/25807 - Fix structured output breakage introduced by https://github.com/vllm-project/vllm/issues/26737 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? CI passed with existing tests.
- vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: Icey <1790571317@qq.com> Co-authored-by: Icey <1790571317@qq.com>
2025-10-24 16:55:08 +08:00
parent ec9ec78b53
commit cea0755b07
47 changed files with 1189 additions and 493 deletions
--- a/vllm_ascend/models/deepseek_v3_2.py
+++ b/vllm_ascend/models/deepseek_v3_2.py
@@ -42,6 +42,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED,
                                               ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
                                               ReplicatedLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -64,10 +65,15 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.models.layers.sfa import (AscendSFAModules,
-                                           AscendSparseFlashAttention, Indexer)
+from vllm_ascend.models.layers.sfa import AscendSFAModules, Indexer
 from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
 from vllm_ascend.ops.linear import AscendLinearBase
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.layers.mla import MultiHeadLatentAttention
+else:
+    from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper


@support_torch_compile
@@ -260,14 +266,6 @@ class CustomDeepseekV2SFAAttention(DeepseekV2MLAAttention):
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        if self.q_lora_rank is not None:
-            self.q_a_proj = ReplicatedLinear(
-                self.hidden_size,
-                self.q_lora_rank,
-                bias=False,
-                quant_config=quant_config,
-                prefix=f"{prefix}.q_a_proj",
-                return_bias=False,
-            )
            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
                                         eps=config.rms_norm_eps)
            self.q_b_proj = ColumnParallelLinear(
@@ -288,14 +286,6 @@ class CustomDeepseekV2SFAAttention(DeepseekV2MLAAttention):
                return_bias=False,
            )

-        self.kv_a_proj_with_mqa = ReplicatedLinear(
-            self.hidden_size,
-            self.kv_lora_rank + self.qk_rope_head_dim,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.kv_a_proj_with_mqa",
-            return_bias=False,
-        )
        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
                                      eps=config.rms_norm_eps)
        self.kv_b_proj = ColumnParallelLinear(
@@ -315,14 +305,33 @@ class CustomDeepseekV2SFAAttention(DeepseekV2MLAAttention):
            return_bias=False,
        )

+        if self.q_lora_rank is not None:
+            self.fused_qkv_a_proj = MergedColumnParallelLinear(
+                self.hidden_size,
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.fused_qkv_a_proj",
+                disable_tp=True)
+            self.kv_a_proj_with_mqa = None
+        else:
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.kv_a_proj_with_mqa")
+
        if rope_scaling:
            rope_scaling["rope_type"] = 'deepseek_yarn'
+
        self.rotary_emb = get_rope(qk_rope_head_dim,
                                   rotary_dim=qk_rope_head_dim,
                                   max_position=max_position_embeddings,
                                   base=rope_theta,
                                   rope_scaling=rope_scaling,
                                   is_neox_style=False)
+
        if rope_scaling:
            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
            scaling_factor = rope_scaling["factor"]
@@ -345,37 +354,51 @@ class CustomDeepseekV2SFAAttention(DeepseekV2MLAAttention):
        )

        sfa_modules = AscendSFAModules(
-            q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None,
            q_a_layernorm=self.q_a_layernorm
            if self.q_lora_rank is not None else None,
            q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
+            q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
+            fused_qkv_a_proj=self.fused_qkv_a_proj
+            if self.q_lora_rank is not None else None,
            kv_a_layernorm=self.kv_a_layernorm,
            kv_b_proj=self.kv_b_proj,
            o_proj=self.o_proj,
            rotary_emb=self.rotary_emb,
-            indexer=self.indexer)
+            indexer=self.indexer,
+            is_sparse=hasattr(config, "index_topk"),
+            topk_indices_buffer=None)

-        self.sfa_attn = AscendSparseFlashAttention(
-            self.hidden_size,
-            self.enable_shared_expert_dp,
-            self.debug_layer_idx,
-            self.first_k_dense_replace,
-            self.tp_size,
-            sfa_modules,
-            self.num_local_heads,
-            self.scaling,
-            self.layers,
-            self.kv_lora_rank,
-            self.qk_rope_head_dim,
-            self.q_lora_rank,
-            self.qk_nope_head_dim,
-            self.qk_head_dim,
-            self.v_head_dim,
-            cache_config,
-            quant_config,
-            prefix,
-        )
+        if vllm_version_is("0.11.0"):
+            self.sfa_attn = MultiHeadLatentAttention(
+                hidden_size=self.hidden_size,
+                num_heads=self.num_local_heads,
+                scale=self.scaling,
+                qk_nope_head_dim=self.qk_nope_head_dim,
+                qk_rope_head_dim=self.qk_rope_head_dim,
+                v_head_dim=self.v_head_dim,
+                q_lora_rank=self.q_lora_rank,
+                kv_lora_rank=self.kv_lora_rank,
+                mla_modules=sfa_modules,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+        else:
+            self.sfa_attn = MultiHeadLatentAttentionWrapper(
+                hidden_size=self.hidden_size,
+                num_heads=self.num_local_heads,
+                scale=self.scaling,
+                qk_nope_head_dim=self.qk_nope_head_dim,
+                qk_rope_head_dim=self.qk_rope_head_dim,
+                v_head_dim=self.v_head_dim,
+                q_lora_rank=self.q_lora_rank,
+                kv_lora_rank=self.kv_lora_rank,
+                mla_modules=sfa_modules,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
        self.prefix = prefix

    def forward(
@@ -540,6 +563,8 @@ class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
            # (param_name, shard_name, shard_id)
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
+            ("fused_qkv_a_proj", "q_a_proj", 0),
+            ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
        ]

        # Params for weights, fp8 weight scales, fp8 activation scales
--- a/vllm_ascend/models/layers/mla.py
+++ b/vllm_ascend/models/layers/mla.py
@@ -42,6 +42,14 @@ else:
    from vllm.attention.layer import MLAAttention
    from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper

+if vllm_version_is("0.11.0"):
+    from vllm.attention import Attention
+    from vllm.model_executor.layers.mla import \
+        MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+else:
+    from vllm.attention.layer import MLAAttention
+    from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+

 # TODO(whx): adapt v0.11.0 and DSA
 class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
@@ -107,22 +115,20 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
            )
        else:
            self.mla_attn = MLAAttention(
-                num_heads=self.num_heads,
+                num_heads=num_heads,
                scale=scale,
-                head_size=self.kv_lora_rank + self.qk_rope_head_dim,
                qk_nope_head_dim=self.qk_nope_head_dim,
                qk_rope_head_dim=self.qk_rope_head_dim,
                v_head_dim=self.v_head_dim,
                q_lora_rank=self.q_lora_rank,
                kv_lora_rank=self.kv_lora_rank,
+                kv_b_proj=mla_modules.kv_b_proj,
                cache_config=cache_config,
                quant_config=quant_config,
                prefix=f"{prefix}.attn",
-                kv_b_proj=mla_modules.kv_b_proj,
                use_sparse=mla_modules.is_sparse,
                indexer=mla_modules.indexer,
                # extra args
-                qk_head_dim=self.qk_head_dim,
                rotary_emb=mla_modules.rotary_emb,
                fused_qkv_a_proj=mla_modules.fused_qkv_a_proj,
                q_b_proj=mla_modules.q_b_proj,
--- a/vllm_ascend/models/layers/sfa.py
+++ b/vllm_ascend/models/layers/sfa.py
@@ -24,18 +24,29 @@ from typing import Optional

 import torch
 from torch import nn
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, get_current_vllm_config
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.linear import ReplicatedLinear
-from vllm.model_executor.layers.mla import MultiHeadLatentAttention
+from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils import direct_register_custom_op

+from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.attention import Attention
+    from vllm.model_executor.layers.mla import \
+        MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+else:
+    from vllm.attention.layer import MLAAttention
+    from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+

@dataclass
 class AscendSFAModules:
-    q_a_proj: Optional[torch.nn.Module]
    q_a_layernorm: Optional[torch.nn.Module]
    q_proj: Optional[torch.nn.Module]
    kv_a_proj_with_mqa: torch.nn.Module
@@ -44,73 +55,103 @@ class AscendSFAModules:
    o_proj: torch.nn.Module
    rotary_emb: torch.nn.Module
    indexer: torch.nn.Module
+    is_sparse: bool
+    fused_qkv_a_proj: Optional[torch.nn.Module]
+    q_b_proj: Optional[torch.nn.Module]
+    topk_indices_buffer: Optional[torch.Tensor]


-class AscendSparseFlashAttention(MultiHeadLatentAttention):
+class AscendSparseFlashAttention(MultiHeadLatentAttentionWrapper):

    def __init__(
        self,
        hidden_size: int,
-        enable_shared_expert_dp: bool,
-        debug_layer_idx: int,
-        first_k_dense_replace: int,
-        tp_size: int,
-        sfa_modules: AscendSFAModules,
-        num_local_heads: int,
-        scaling: float,
-        layers: int,
-        kv_lora_rank: int,
-        qk_rope_head_dim: int,
-        q_lora_rank: Optional[int],
+        num_heads: int,
+        scale: float,
        qk_nope_head_dim: int,
-        qk_head_dim: int,
+        qk_rope_head_dim: int,
        v_head_dim: int,
+        q_lora_rank: Optional[int],
+        kv_lora_rank: int,
+        mla_modules: MLAModules,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        nn.Module.__init__(self)
        self.hidden_size = hidden_size
-        self.enable_shared_expert_dp = enable_shared_expert_dp
-        self.debug_layer_idx = debug_layer_idx
-        self.first_k_dense_replace = first_k_dense_replace
-        self.tp_size = tp_size
-        self.num_local_heads = num_local_heads
-        self.layers = layers
        self.kv_lora_rank = kv_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.q_lora_rank = q_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
-        self.qk_head_dim = qk_head_dim
+        self.qk_head_dim = qk_rope_head_dim + qk_nope_head_dim
        self.v_head_dim = v_head_dim
        self.prefix = prefix
+        self.scaling = scale
+        self.indexer = mla_modules.indexer
+        self.is_sparse = mla_modules.is_sparse
+        hf_config = get_current_vllm_config().model_config.hf_config
+        self.enable_shared_expert_dp = get_ascend_config(
+        ).enable_shared_expert_dp
+        self.debug_layer_idx = int(self.prefix.split(".")[-2])
+        self.first_k_dense_replace = hf_config.first_k_dense_replace
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layers = hf_config.num_hidden_layers

-        self.sfa_attn = Attention(
-            num_heads=self.num_local_heads,
-            head_size=self.kv_lora_rank + self.qk_rope_head_dim,
-            scale=scaling,
-            num_kv_heads=1,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.attn",
-            use_mla=True,
-            use_sparse=True,
-            # SFA Args
-            q_lora_rank=self.q_lora_rank,
-            kv_lora_rank=self.kv_lora_rank,
-            qk_nope_head_dim=self.qk_nope_head_dim,
-            qk_rope_head_dim=self.qk_rope_head_dim,
-            qk_head_dim=self.qk_head_dim,
-            v_head_dim=self.v_head_dim,
-            rotary_emb=sfa_modules.rotary_emb,
-            q_a_proj=sfa_modules.q_a_proj,
-            q_a_layernorm=sfa_modules.q_a_layernorm,
-            q_proj=sfa_modules.q_proj,
-            kv_a_proj_with_mqa=sfa_modules.kv_a_proj_with_mqa,
-            kv_a_layernorm=sfa_modules.kv_a_layernorm,
-            kv_b_proj=sfa_modules.kv_b_proj,
-            o_proj=sfa_modules.o_proj,
-            indexer=sfa_modules.indexer)
+        if vllm_version_is("0.11.0"):
+            self.sfa_attn = Attention(
+                num_heads=num_heads,
+                head_size=self.kv_lora_rank + self.qk_rope_head_dim,
+                scale=scale,
+                num_kv_heads=1,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+                use_mla=True,
+                use_sparse=True,
+                indexer=self.indexer,
+                # SFA Args
+                q_lora_rank=self.q_lora_rank,
+                kv_lora_rank=self.kv_lora_rank,
+                qk_nope_head_dim=self.qk_nope_head_dim,
+                qk_rope_head_dim=self.qk_rope_head_dim,
+                v_head_dim=self.v_head_dim,
+                qk_head_dim=self.qk_head_dim,
+                rotary_emb=mla_modules.rotary_emb,
+                fused_qkv_a_proj=mla_modules.fused_qkv_a_proj,
+                q_b_proj=mla_modules.q_b_proj,
+                q_a_layernorm=mla_modules.q_a_layernorm,
+                q_proj=mla_modules.q_proj,
+                kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa,
+                kv_a_layernorm=mla_modules.kv_a_layernorm,
+                kv_b_proj=mla_modules.kv_b_proj,
+                o_proj=mla_modules.o_proj,
+            )
+        else:
+            self.sfa_attn = MLAAttention(
+                num_heads=num_heads,
+                scale=scale,
+                qk_nope_head_dim=self.qk_nope_head_dim,
+                qk_rope_head_dim=self.qk_rope_head_dim,
+                v_head_dim=self.v_head_dim,
+                q_lora_rank=self.q_lora_rank,
+                kv_lora_rank=self.kv_lora_rank,
+                kv_b_proj=mla_modules.kv_b_proj,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+                use_sparse=mla_modules.is_sparse,
+                indexer=mla_modules.indexer,
+                # extra args
+                rotary_emb=mla_modules.rotary_emb,
+                fused_qkv_a_proj=mla_modules.fused_qkv_a_proj,
+                q_b_proj=mla_modules.q_b_proj,
+                q_a_layernorm=mla_modules.q_a_layernorm,
+                q_proj=mla_modules.q_proj,
+                kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa,
+                kv_a_layernorm=mla_modules.kv_a_layernorm,
+                o_proj=mla_modules.o_proj,
+            )

        compilation_config = get_current_vllm_config().compilation_config
        if prefix in compilation_config.static_forward_context: