Add DeepSeek V3.2 support (#3270)

### What this PR does / why we need it? This PR adds initial DeepSeek V3.2 support on top of [vLLM v0.11.0](https://github.com/vllm-project/vllm/tree/releases/v0.11.0) (not released yet). We will complete the vLLM adaptation as soon as possible; the feature is expected to be ready within the next 1-2 days. Related doc: https://github.com/vllm-project/vllm-ascend/pull/3223 . ### Does this PR introduce _any_ user-facing change? Yes! ### How was this patch tested? CI passed; the DeepSeek doc will be run soon. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: zzzzwwjj <1183291235@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: wxsIcey <1790571317@qq.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: zzzzwwjj <1183291235@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: wxsIcey <1790571317@qq.com> Co-authored-by: MengqingCao <cmq0113@163.com>
2025-09-30 03:25:58 +08:00
parent 5503a3142f
commit 81bd6e4c99
27 changed files with 4354 additions and 70 deletions
--- a/vllm_ascend/patch/platform/patch_common/init.py
+++ b/vllm_ascend/patch/platform/patch_common/init.py
@@ -15,6 +15,10 @@
 # limitations under the License.
 #

+import vllm_ascend.patch.platform.patch_common.patch_config  # noqa
 import vllm_ascend.patch.platform.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_common.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_common.patch_multimodal_merge  # noqa
+import vllm_ascend.patch.platform.patch_common.patch_transformers_utils  # noqa
+import vllm_ascend.patch.worker.patch_common.patch_attention_selector  # noqa
+import vllm_ascend.patch.worker.patch_common.patch_attentionspec  # noqa
--- a/vllm_ascend/patch/platform/patch_common/patch_config.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_config.py
@@ -0,0 +1,313 @@
+import ast
+
+import vllm.envs as envs
+from transformers import PretrainedConfig
+from vllm.config import ModelConfig
+from vllm.config.speculative import SpeculativeConfig
+from vllm.logger import logger
+
+
+# mypy: ignore-errors
+@property
+def is_deepseek_mla(self: ModelConfig):
+    """Report whether this config describes a DeepSeek-style MLA model.
+
+    Returns ``True`` only when the text config's ``model_type`` is one of
+    the listed DeepSeek-family types (or an ``eagle`` wrapper around one of
+    them) and its ``kv_lora_rank`` is not ``None``; ``False`` otherwise.
+    """
+    text_config = self.hf_text_config
+    if not hasattr(text_config, "model_type"):
+        return False
+
+    model_type = text_config.model_type
+    if model_type in ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp',
+                      'kimi_k2', 'longcat_flash', 'deepseek_v32'):
+        return text_config.kv_lora_rank is not None
+
+    if model_type == 'eagle':
+        # An EAGLE module wraps another model, so the decision is made on
+        # the wrapped architecture rather than on the wrapper itself.
+        wrapped_type = text_config.model.model_type
+        return wrapped_type in ('deepseek_v2', 'deepseek_v3',
+                                'deepseek_v32') \
+            and text_config.kv_lora_rank is not None
+
+    return False
+
+
+@staticmethod
+def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
+    """Rewrite a target-model HF config so it describes the matching MTP draft.
+
+    For each supported family the ``model_type`` is switched to its MTP
+    variant, ``n_predict`` is taken from ``num_nextn_predict_layers`` and
+    ``architectures`` is replaced by the MTP model class name.
+
+    The checks run in order and later checks read the values written by
+    earlier ones (``model_type`` and ``architectures[0]``), so the statement
+    order matters.
+
+    Args:
+        hf_config: Config to rewrite. It is updated through attribute
+            assignment and ``hf_config.update(...)``.
+
+    Returns:
+        The same ``hf_config`` object that was passed in.
+    """
+    # DeepSeek V3 / V3.2 share the "deepseek_mtp" draft type.
+    if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
+        hf_config.model_type = "deepseek_mtp"
+    if hf_config.model_type == "deepseek_mtp":
+        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+        hf_config.update({
+            "n_predict": n_predict,
+            "architectures": ["DeepSeekMTPModel"]
+        })
+
+    # MiMo is matched on the architecture name and also gets
+    # num_hidden_layers forced to 0.
+    if hf_config.architectures[0] == "MiMoForCausalLM":
+        hf_config.model_type = "mimo_mtp"
+        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+        hf_config.update({
+            "num_hidden_layers": 0,
+            "n_predict": n_predict,
+            "architectures": ["MiMoMTPModel"]
+        })
+
+    # GLM-4 MoE: same treatment as MiMo (architecture match,
+    # num_hidden_layers set to 0).
+    if hf_config.architectures[0] == "Glm4MoeForCausalLM":
+        hf_config.model_type = "glm4_moe_mtp"
+        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+        hf_config.update({
+            "num_hidden_layers": 0,
+            "n_predict": n_predict,
+            "architectures": ["Glm4MoeMTPModel"]
+        })
+
+    if hf_config.model_type == "ernie4_5_moe":
+        hf_config.model_type = "ernie_mtp"
+    if hf_config.model_type == "ernie_mtp":
+        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+        hf_config.update({
+            "n_predict": n_predict,
+            "architectures": ["ErnieMTPModel"]
+        })
+
+    if hf_config.model_type == "qwen3_next":
+        hf_config.model_type = "qwen3_next_mtp"
+    if hf_config.model_type == "qwen3_next_mtp":
+        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+        hf_config.update({
+            "n_predict": n_predict,
+            "architectures": ["Qwen3NextMTP"]
+        })
+    # LongCat Flash is the only family whose n_predict falls back to 1
+    # (instead of None) when num_nextn_predict_layers is absent.
+    if hf_config.model_type == "longcat_flash":
+        hf_config.model_type = "longcat_flash_mtp"
+        n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
+        hf_config.update({
+            "n_predict": n_predict,
+            "architectures": ["LongCatFlashMTPModel"]
+        })
+
+    return hf_config
+
+
+def __post_init__(self):
+    """Resolve and validate the speculative-decoding configuration.
+
+    This function is installed as ``SpeculativeConfig.__post_init__`` at the
+    bottom of this module. In order, it:
+
+    1. Infers ``self.model`` when only ``num_speculative_tokens`` is given
+       (same-model MTP for the listed target model types, or ngram).
+    2. Normalizes the ngram method and validates the prompt-lookup window.
+    3. For model-based methods, builds ``draft_model_config``, detects the
+       method from the draft model name / ``model_type``, wraps EAGLE
+       configs, checks ``num_speculative_tokens`` against ``n_predict``,
+       normalizes ``speculative_token_tree`` and derives the draft tensor
+       parallel size, max model length and parallel config.
+
+    Raises:
+        ValueError: If ``num_speculative_tokens`` is given without a usable
+            model or method, if the prompt-lookup bounds are invalid, if
+            EAGLE is combined with chunked prefill on V0, or if
+            ``num_speculative_tokens`` exceeds ``n_predict`` without being
+            divisible by it.
+        NotImplementedError: If the method resolves to a plain draft model.
+    """
+
+    # Note: "method" is a new parameter that helps to extend the
+    # configuration of non-model-based proposers, and the "model" parameter
+    # will be used to set the draft model, eagle head, or additional weight
+    # when needed. If users do not specify "method", the speculative method
+    # will be detected automatically if possible. If the speculative method
+    # can not be detected, it will be considered as the "draft_model" by
+    # default.
+
+    if self.model is None and self.num_speculative_tokens is not None:
+        # TODO(Shangming): Refactor mtp configuration logic when supporting
+        if (self.target_model_config
+                and self.target_model_config.hf_text_config.model_type
+                in ("deepseek_v3", "deepseek_v32", "mimo", "ernie4_5_moe",
+                    "qwen3_next")):
+            # use the draft model from the same model:
+            self.model = self.target_model_config.model
+            # Align the quantization of draft model for cases such as
+            # --quantization fp8 with a bf16 checkpoint.
+            if not self.quantization:
+                self.quantization = self.target_model_config.quantization
+        elif self.method in ("ngram", "[ngram]"):
+            self.model = "ngram"
+        else:
+            raise ValueError("num_speculative_tokens was provided but without "
+                             "speculative model.")
+
+    # Automatically configure the method for ngram when "model" is used
+    # instead of "method"
+    if self.method is None and (self.model is not None
+                                and self.model in ("ngram", "[ngram]")):
+        self.method = "ngram"
+
+    if self.method in ("ngram", "[ngram]"):
+        # Unified to "ngram" internally
+        self.method = "ngram"
+        # Set default values if not provided
+        if (self.prompt_lookup_min is None and self.prompt_lookup_max is None):
+            # TODO(woosuk): Tune these values. They are arbitrarily chosen.
+            self.prompt_lookup_min = 5
+            self.prompt_lookup_max = 5
+        elif self.prompt_lookup_min is None:
+            assert self.prompt_lookup_max is not None
+            self.prompt_lookup_min = self.prompt_lookup_max
+        elif self.prompt_lookup_max is None:
+            assert self.prompt_lookup_min is not None
+            self.prompt_lookup_max = self.prompt_lookup_min
+
+        # Validate values
+        if self.prompt_lookup_min < 1:
+            raise ValueError(
+                f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
+        if self.prompt_lookup_max < 1:
+            raise ValueError(
+                f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
+        if self.prompt_lookup_min > self.prompt_lookup_max:
+            raise ValueError(
+                f"prompt_lookup_min={self.prompt_lookup_min} must "
+                f"be <= prompt_lookup_max={self.prompt_lookup_max}")
+
+        # TODO: current we still need extract vocab_size from target model
+        # config, in future, we may try refactor it out, and set
+        # draft related config as None here.
+        self.draft_model_config = self.target_model_config
+        self.draft_parallel_config = self.target_parallel_config
+    else:
+        self.prompt_lookup_max = 0
+        self.prompt_lookup_min = 0
+
+        if self.model is not None:
+            # TODO: Move this import to the top once `ModelConfig`
+            # lives in `vllm.config.model`.
+            from vllm.config import ModelConfig
+            self.draft_model_config = ModelConfig(
+                model=self.model,
+                runner="draft",
+                tokenizer=self.target_model_config.tokenizer,
+                tokenizer_mode=self.target_model_config.tokenizer_mode,
+                trust_remote_code=self.target_model_config.trust_remote_code,
+                allowed_local_media_path=self.target_model_config.
+                allowed_local_media_path,
+                allowed_media_domains=self.target_model_config.
+                allowed_media_domains,
+                dtype=self.target_model_config.dtype,
+                seed=self.target_model_config.seed,
+                revision=self.revision,
+                code_revision=self.code_revision,
+                tokenizer_revision=self.target_model_config.tokenizer_revision,
+                spec_target_max_model_len=self.target_model_config.
+                max_model_len,
+                quantization=self.quantization,
+                enforce_eager=self.target_model_config.enforce_eager,
+                max_logprobs=self.target_model_config.max_logprobs,
+                hf_overrides=SpeculativeConfig.hf_config_override,
+            )
+
+            # Automatically detect the method
+            if self.method in ('eagle', 'eagle3'):
+                pass
+            # examples:
+            # yuhuili/EAGLE-LLaMA3-Instruct-8B
+            # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
+            # AngelSlim/Qwen3-8B_eagle3
+            elif "eagle-" in self.draft_model_config.model.lower():
+                self.method = "eagle"
+            elif "eagle3" in self.draft_model_config.model.lower():
+                self.method = "eagle3"
+            elif self.draft_model_config.hf_config.model_type == "medusa":
+                self.method = "medusa"
+            elif (self.draft_model_config.hf_config.model_type ==
+                  "mlp_speculator"):
+                self.method = "mlp_speculator"
+            elif (self.draft_model_config.hf_config.model_type
+                  in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")):
+                self.method = "deepseek_mtp"
+                if self.num_speculative_tokens > 1:
+                    logger.warning(
+                            "All Deepseek MTP models only have " \
+                            "one layer. Might need some code changes " \
+                            "to support multiple layers."
+                        )
+            elif (self.draft_model_config.hf_config.model_type == "ernie_mtp"):
+                self.method = "ernie_mtp"
+                if self.num_speculative_tokens > 1:
+                    logger.warning(
+                            "All Ernie MTP models only have " \
+                            "one layer. Might need some code changes " \
+                            "to support multiple layers."
+                        )
+            elif (self.draft_model_config.hf_config.model_type ==
+                  "qwen3_next_mtp"):
+                self.method = "qwen3_next_mtp"
+                if self.num_speculative_tokens > 1:
+                    logger.warning(
+                            "All Qwen3Next MTP models only have " \
+                            "one layer. Might need some code changes " \
+                            "to support multiple layers."
+                        )
+            # Compare by equality: `in ("longcat_flash_mtp")` is a substring
+            # test against a plain string (the parentheses do not make a
+            # tuple), so unrelated model types such as "flash" would match.
+            elif (self.draft_model_config.hf_config.model_type ==
+                  "longcat_flash_mtp"):
+                self.method = "longcat_flash_mtp"
+                if self.num_speculative_tokens > 1:
+                    logger.warning(
+                            "LongCat MTP models only have " \
+                            "one layer. Might need some code changes " \
+                            "to support multiple layers."
+                        )
+            else:
+                self.method = "draft_model"
+                raise NotImplementedError(
+                    "Speculative decoding with draft model is not "
+                    "supported yet. Please consider using other "
+                    "speculative decoding methods such as ngram, medusa, "
+                    "eagle, or deepseek_mtp.")
+
+            # Replace hf_config for EAGLE draft_model
+            if self.method in ("eagle", "eagle3"):
+                if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
+                    raise ValueError(
+                        "Chunked prefill and EAGLE are not compatible "
+                        "when using V0.")
+
+                from vllm.transformers_utils.configs import SpeculatorsConfig
+                from vllm.transformers_utils.configs.eagle import EAGLEConfig
+
+                if isinstance(self.draft_model_config.hf_config,
+                              (EAGLEConfig, SpeculatorsConfig)):
+                    pass
+                else:
+                    eagle_config = EAGLEConfig(
+                        self.draft_model_config.hf_config,
+                        method=self.method,
+                        model_type="eagle")
+                    self.draft_model_config.hf_config = eagle_config
+
+            if (self.num_speculative_tokens is not None
+                    and hasattr(self.draft_model_config.hf_config,
+                                "num_lookahead_tokens")):
+                self.draft_model_config.hf_config.num_lookahead_tokens = \
+                self.num_speculative_tokens
+
+            n_predict = getattr(self.draft_model_config.hf_config, "n_predict",
+                                None)
+            if n_predict is not None:
+                if self.num_speculative_tokens is None:
+                    # Default to max value defined in draft model config.
+                    self.num_speculative_tokens = n_predict
+                elif self.num_speculative_tokens > n_predict and \
+                        self.num_speculative_tokens % n_predict != 0:
+                    # Ensure divisibility for MTP module reuse.
+                    raise ValueError(
+                        f"num_speculative_tokens:{self.num_speculative_tokens}"
+                        f" must be divisible by {n_predict=}")
+
+            if self.speculative_token_tree is None:
+                # Generate chain of tokens.
+                self.speculative_token_tree = str([
+                    (i + 1) * (0, ) for i in range(self.num_speculative_tokens)
+                ])
+            else:
+                # Sort the token tree breadth-first.
+                tree_choices = ast.literal_eval(self.speculative_token_tree)
+                self.speculative_token_tree = str(
+                    sorted(tree_choices, key=lambda t: (len(t), t)))
+
+            self.draft_tensor_parallel_size = \
+                SpeculativeConfig._verify_and_get_draft_tp(
+                    self.target_parallel_config,
+                    self.draft_tensor_parallel_size,
+                    self.draft_model_config.hf_config
+            )
+
+            self.draft_model_config.max_model_len = (
+                SpeculativeConfig._maybe_override_draft_max_model_len(
+                    self.max_model_len,
+                    self.draft_model_config.max_model_len,
+                    self.target_model_config.max_model_len,
+                ))
+
+            self.draft_parallel_config = (
+                SpeculativeConfig.create_draft_parallel_config(
+                    self.target_parallel_config,
+                    self.draft_tensor_parallel_size))
+
+
+# Monkey-patch the upstream vLLM classes with the functions defined above so
+# that the "deepseek_v32" model type is recognized by the MLA check and by
+# the speculative-decoding configuration.
+ModelConfig.is_deepseek_mla = is_deepseek_mla
+SpeculativeConfig.__post_init__ = __post_init__
+SpeculativeConfig.hf_config_override = hf_config_override
--- a/vllm_ascend/patch/platform/patch_common/patch_mamba_config.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_mamba_config.py
@@ -6,6 +6,8 @@ from vllm.model_executor.models.config import MambaModelConfig
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec

+from vllm_ascend.ascend_config import get_ascend_config
+

@classmethod
 def verify_and_update_config(cls, vllm_config) -> None:
@@ -22,6 +24,7 @@ def verify_and_update_config(cls, vllm_config) -> None:
    logger = init_logger(__name__)
    # Enable FULL_AND_PIECEWISE by default
    MambaModelConfig.verify_and_update_config(vllm_config)
+    ascend_config = get_ascend_config()

    cache_config = vllm_config.cache_config
    model_config = vllm_config.model_config
@@ -38,7 +41,7 @@ def verify_and_update_config(cls, vllm_config) -> None:
        num_kv_heads=model_config.get_num_kv_heads(parallel_config),
        head_size=model_config.get_head_size(),
        dtype=kv_cache_dtype,
-        use_mla=model_config.use_mla).page_size_bytes
+        use_mla=model_config.use_mla or ascend_config.use_sfa).page_size_bytes

    model_cls, _ = ModelRegistry.resolve_model_cls(
        model_config.architecture,
--- a/vllm_ascend/patch/platform/patch_common/patch_transformers_utils.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_transformers_utils.py
@@ -0,0 +1,200 @@
+import vllm.transformers_utils.configs
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from vllm.transformers_utils import config
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekV3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the DeepSeek-V3.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 129280):
+            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DeepseekV3Model`]
+        hidden_size (`int`, *optional*, defaults to 7168):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 18432):
+            Dimension of the MLP representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MoE representations.
+        num_hidden_layers (`int`, *optional*, defaults to 61):
+            Number of hidden layers in the Transformer decoder.
+        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
+            Number of nextn predict layers in the DeepSeekV3 Model.
+        num_attention_heads (`int`, *optional*, defaults to 128):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts, None means dense model.
+        n_routed_experts (`int`, *optional*, defaults to 256):
+            Number of routed experts, None means dense model.
+        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
+            Scaling factor for routed experts.
+        topk_method (`str`, *optional*, defaults to `noaux_tc`):
+            Topk method used in routed gate.
+        n_group (`int`, *optional*, defaults to 8):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 4):
+            Number of selected groups for each token (for each token, ensuring the selected experts are only within `topk_group` groups).
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of selected experts, None means dense model.
+        moe_layer_freq (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
+        first_k_dense_replace (`int`, *optional*, defaults to 3):
+            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                            \--k dense layers--/
+        norm_topk_prob (`bool`, *optional*, defaults to True):
+            Whether to normalize the weights of the routed experts.
+        scoring_func (`str`, *optional*, defaults to 'sigmoid'):
+            Method of computing expert weights.
+        aux_loss_alpha (`float`, *optional*):
+            Auxiliary loss weight coefficient. Not an explicit parameter of this class; if supplied it is
+            forwarded to `PretrainedConfig.__init__` through `**kwargs`.
+        seq_aux (`bool`, *optional*):
+            Whether to compute the auxiliary loss for each individual sample. Not an explicit parameter of this
+            class; if supplied it is forwarded to `PretrainedConfig.__init__` through `**kwargs`.
+        num_key_value_heads (`int`, *optional*, defaults to 128):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, it falls back to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    The following parameters are also accepted and stored unchanged as attributes of the same name; their
+    meaning is not described here (TODO: confirm against the model implementation):
+    `ep_size` (defaults to 1), `kv_lora_rank` (defaults to 512), `q_lora_rank` (defaults to 1536),
+    `qk_rope_head_dim` (defaults to 64), `v_head_dim` (defaults to 128), `qk_nope_head_dim` (defaults to 128).
+    ```python
+    >>> from transformers import DeepseekV3Model, DeepseekV3Config
+    >>> # Initializing a Deepseek-V3 style configuration
+    >>> configuration = DeepseekV3Config()
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deepseek_v3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=129280,
+        hidden_size=7168,
+        intermediate_size=18432,
+        moe_intermediate_size=2048,
+        num_hidden_layers=61,
+        num_nextn_predict_layers=1,
+        num_attention_heads=128,
+        num_key_value_heads=128,
+        n_shared_experts=1,
+        n_routed_experts=256,
+        ep_size=1,
+        routed_scaling_factor=2.5,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method='noaux_tc',
+        n_group=8,
+        topk_group=4,
+        num_experts_per_tok=8,
+        moe_layer_freq=1,
+        first_k_dense_replace=3,
+        norm_topk_prob=True,
+        scoring_func='sigmoid',
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=0,
+        eos_token_id=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        # Every argument is stored verbatim as an attribute of the same name;
+        # the only derived value is num_key_value_heads (see below).
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        # Token ids, weight tying and any extra keyword arguments are handled
+        # by the PretrainedConfig base class.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+# Expose DeepseekV3Config from vllm.transformers_utils.configs and register
+# its name under the "deepseek_v32" key of vLLM's config registry.
+# NOTE(review): the class itself declares model_type = "deepseek_v3" while it
+# is registered here for "deepseek_v32" -- confirm this mismatch is intended.
+vllm.transformers_utils.configs.__all__.append("DeepseekV3Config")
+vllm.transformers_utils.configs.DeepseekV3Config = DeepseekV3Config
+config._CONFIG_REGISTRY["deepseek_v32"] = "DeepseekV3Config"
--- a/vllm_ascend/patch/worker/patch_common/init.py
+++ b/vllm_ascend/patch/worker/patch_common/init.py
@@ -20,6 +20,10 @@ from vllm.triton_utils import HAS_TRITON
 if HAS_TRITON:
    import vllm_ascend.patch.worker.patch_common.patch_triton

+# isort: off
+import vllm_ascend.patch.worker.patch_common.patch_attention_selector  # noqa
+import vllm_ascend.patch.worker.patch_common.patch_attentionspec  # noqa
+import vllm_ascend.patch.worker.patch_common.patch_attention_layer  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_weight_loader  # noqa
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
@@ -0,0 +1,202 @@
+from typing import List, Optional
+
+import torch
+import vllm
+import vllm.envs as envs
+from torch import nn
+from vllm.attention import Attention, AttentionType, get_attn_backend
+from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.selector import backend_name_to_enum
+from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
+from vllm.config import CacheConfig, get_current_vllm_config
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizationConfig
+from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.platforms import current_platform
+
+from vllm_ascend.utils import vllm_version_is
+
+
+class AscendAttention(Attention, nn.Module, AttentionLayerBase):
+    """Attention layer whose constructor also accepts ``use_sfa``.
+
+    This class takes query, key, and value tensors as input. The input tensors
+    can either contain prompt tokens or generation tokens.
+    The class does the following:
+
+    1. Store the input key and value tensors in the KV cache.
+    2. Perform (multi-head/multi-query/grouped-query) attention.
+    3. Return the output tensor.
+
+    Only ``__init__`` is overridden here; all other behaviour comes from the
+    base classes.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+        alibi_slopes: Optional[List[float]] = None,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        logits_soft_cap: Optional[float] = None,
+        per_layer_sliding_window: Optional[int] = None,
+        use_mla: bool = False,
+        use_sfa: bool = False,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        attn_backend: Optional[type[AttentionBackend]] = None,
+        **extra_impl_args,
+    ) -> None:
+        """Build the layer state and instantiate the backend implementation.
+
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+
+        Args:
+            num_heads: Number of query heads; must be divisible by
+                ``num_kv_heads``.
+            head_size: Per-head size, used for backend selection and passed
+                to the implementation.
+            scale: Passed through to the implementation.
+                NOTE(review): presumably the softmax scaling factor - confirm.
+            num_kv_heads: Number of KV heads; defaults to ``num_heads``.
+            alibi_slopes: Passed through to the implementation.
+            cache_config: Supplies the KV cache dtype, block size,
+                ``is_attention_free``, ``calculate_kv_scales`` and the
+                model-level sliding window. When ``None`` the values
+                ``"auto"``, ``16``, ``False``, ``False`` and ``None`` are used.
+            quant_config: If given, its ``get_quant_method`` is queried for
+                this layer; a result other than ``None`` or an
+                ``UnquantizedLinearMethod`` must be a ``BaseKVCacheMethod``
+                and is used to create the scale weights.
+            logits_soft_cap: Passed through to the implementation.
+            per_layer_sliding_window: Takes precedence over
+                ``cache_config.sliding_window`` when not ``None``.
+            use_mla: Stored on the layer and, when no backend is supplied,
+                forwarded to ``get_attn_backend``.
+            use_sfa: Forwarded to ``get_attn_backend`` when no backend is
+                supplied; it is not stored on the layer.
+            prefix: Layer name; used as the key in
+                ``compilation_config.static_forward_context``.
+            attn_type: Passed to the implementation and stored as
+                ``self.attn_type``.
+            kv_sharing_target_layer_name: When set, validated with
+                ``validate_kv_sharing_target`` and passed to the
+                implementation.
+            attn_backend: Backend class to use; when ``None`` one is chosen
+                via ``get_attn_backend``.
+            **extra_impl_args: Forwarded to the implementation constructor.
+                A non-``None`` ``"sinks"`` entry sets ``self.has_sink``.
+
+        Raises:
+            AssertionError: If ``num_heads`` is not divisible by
+                ``num_kv_heads``, or the quant method has an unexpected type.
+            ValueError: If a KV-cache quant method is used together with the
+                ``fp8_e5m2`` KV cache dtype, or if ``prefix`` is already
+                registered in the static forward context.
+        """
+        # ``Attention.__init__`` is deliberately not invoked; only these two
+        # base initialisers run before the state below is set up.
+        nn.Module.__init__(self)
+        AttentionLayerBase.__init__(self)
+
+        if per_layer_sliding_window is not None:
+            # per-layer sliding window
+            sliding_window = per_layer_sliding_window
+        elif cache_config is not None:
+            # model-level sliding window
+            sliding_window = cache_config.sliding_window
+        else:
+            sliding_window = None
+
+        if cache_config is not None:
+            kv_cache_dtype = cache_config.cache_dtype
+            block_size = cache_config.block_size
+            is_attention_free = cache_config.is_attention_free
+            calculate_kv_scales = cache_config.calculate_kv_scales
+        else:
+            # Fallbacks used when no cache configuration is supplied.
+            kv_cache_dtype = "auto"
+            block_size = 16
+            is_attention_free = False
+            calculate_kv_scales = False
+        if num_kv_heads is None:
+            num_kv_heads = num_heads
+        assert num_heads % num_kv_heads == 0, \
+            f"num_heads ({num_heads}) is not " \
+            f"divisible by num_kv_heads ({num_kv_heads})"
+
+        # The default k/v_scale is set to 1.0. This is ignored
+        # when kv-cache is not fp8, and should be used with
+        # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
+        # expect the pre-quantized k/v_scale to be loaded along
+        # with the model weights.
+        self.kv_cache_dtype = kv_cache_dtype
+        self.calculate_kv_scales = calculate_kv_scales
+        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
+        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
+        # FlashAttn doesn't support quantizing the kv-cache only
+        # but requires q to be quantized as well.
+        self._q_scale = torch.tensor(1.0, dtype=torch.float32)
+        self._prob_scale = torch.tensor(1.0, dtype=torch.float32)
+
+        # We also keep q/k/v_scale on host (cpu) memory for attention
+        # backends that require the scales to be on host instead of on device.
+        # e.g. Flashinfer
+        self._q_scale_float = 1.0
+        self._k_scale_float = 1.0
+        self._v_scale_float = 1.0
+
+        # The output scale on host memory. This should be the input scale of
+        # the quant op after this attention layer.
+        self._o_scale_float: Optional[float] = None
+
+        self.use_mla = use_mla
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.num_kv_heads = num_kv_heads
+        self.sliding_window = sliding_window
+        # True when the caller supplied a non-None "sinks" extra argument.
+        self.has_sink = extra_impl_args.get("sinks") is not None
+
+        quant_method = quant_config.get_quant_method(
+            self, prefix=prefix) if quant_config else None
+        if quant_method is not None and not isinstance(
+                quant_method, UnquantizedLinearMethod):
+            assert isinstance(quant_method, BaseKVCacheMethod)
+            # TODO (mgoin): kv cache dtype should be specified in the FP8
+            # checkpoint config and become the "auto" behavior
+            if self.kv_cache_dtype == "fp8_e5m2":
+                raise ValueError("fp8_e5m2 kv-cache is not supported with "
+                                 "fp8 checkpoints.")
+            # If quantization is enabled, we make "k_scale" and "v_scale"
+            # parameters so that it can be loaded from the model checkpoint.
+            # The k/v_scale will then be converted back to native float32
+            # values after weight loading.
+            self.quant_method = quant_method
+            self.quant_method.create_weights(self)
+
+        # During model initialization, the default dtype is set as the model
+        # weight and activation dtype.
+        dtype = torch.get_default_dtype()
+        if attn_backend is None:
+            # The two branches differ only in whether ``is_attention_free``
+            # is passed to ``get_attn_backend``; it is passed for 0.10.2 only.
+            if vllm_version_is("0.10.2"):
+                self.attn_backend = get_attn_backend(head_size,
+                                                     dtype,
+                                                     kv_cache_dtype,
+                                                     block_size,
+                                                     is_attention_free,
+                                                     use_mla=use_mla,
+                                                     use_sfa=use_sfa,
+                                                     has_sink=self.has_sink)
+            else:
+                self.attn_backend = get_attn_backend(head_size,
+                                                     dtype,
+                                                     kv_cache_dtype,
+                                                     block_size,
+                                                     use_mla=use_mla,
+                                                     use_sfa=use_sfa,
+                                                     has_sink=self.has_sink)
+        else:
+            # An explicitly supplied backend bypasses backend selection.
+            self.attn_backend = attn_backend
+
+        impl_cls = self.attn_backend.get_impl_cls()
+        self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
+                             alibi_slopes, sliding_window, kv_cache_dtype,
+                             logits_soft_cap, attn_type,
+                             kv_sharing_target_layer_name, **extra_impl_args)
+        self.backend = backend_name_to_enum(self.attn_backend.get_name())
+        self.dtype = dtype
+
+        # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
+        # torch.compile works by registering the attention as one giant
+        # opaque custom op. For other platforms, we directly call them
+        # and let torch.compile handle them.
+        self.use_direct_call = not current_platform.opaque_attention_op()
+
+        self.use_output = self.attn_backend.accept_output_buffer
+        # Register this layer under its prefix; a prefix may be used only once.
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
+        self.layer_name = prefix
+        self.attn_type = attn_type
+
+        if kv_sharing_target_layer_name is not None:
+            validate_kv_sharing_target(
+                prefix,
+                kv_sharing_target_layer_name,
+                compilation_config.static_forward_context,
+            )
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
+
+        # use a placeholder kv cache tensor during init, which will be replaced
+        # by bind_kv_cache
+        # this variable will not be accessed if use_direct_call is True
+        # One empty placeholder tensor per pipeline-parallel rank.
+        self.kv_cache = [
+            torch.tensor([]) for _ in range(get_current_vllm_config(
+            ).parallel_config.pipeline_parallel_size)
+        ]
+
+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
+        self.query_quant = None
+
+
+# Rebind ``vllm.attention.Attention`` to the AscendAttention subclass. Only
+# lookups of that attribute made after this assignment see the new class;
+# names already bound through ``from vllm.attention import Attention`` keep
+# referring to the original one.
+vllm.attention.Attention = AscendAttention
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
@@ -0,0 +1,181 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# mypy: ignore-errors
+from functools import cache
+from typing import Optional
+
+import torch
+import vllm
+import vllm.envs as envs
+from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.selector import (backend_name_to_enum,
+                                     get_global_forced_attn_backend)
+from vllm.platforms import _Backend, current_platform
+from vllm.utils import resolve_obj_by_qualname
+
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.10.2"):
+
+    def get_attn_backend(
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: Optional[str],
+        block_size: int,
+        is_attention_free: bool = False,
+        use_mla: bool = False,
+        use_sfa: bool = False,
+        has_sink: bool = False,
+    ) -> type[AttentionBackend]:
+        """Selects which attention backend to use and lazily imports it.
+
+        Variant defined when ``vllm_version_is("0.10.2")`` is true. Every
+        argument is forwarded by keyword to ``_cached_get_attn_backend``,
+        together with the current value of ``envs.VLLM_USE_V1`` as ``use_v1``.
+
+        Returns:
+            The backend class returned by ``_cached_get_attn_backend``.
+        """
+        # Accessing envs.* behind a caching decorator can cause the wrong
+        # value to be returned from the cache if the value changes between calls.
+        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
+        # private function, so it becomes part of the cache key.
+        return _cached_get_attn_backend(
+            head_size=head_size,
+            dtype=dtype,
+            kv_cache_dtype=kv_cache_dtype,
+            block_size=block_size,
+            is_attention_free=is_attention_free,
+            use_v1=envs.VLLM_USE_V1,
+            use_mla=use_mla,
+            use_sfa=use_sfa,
+            has_sink=has_sink,
+        )
+
+    @cache
+    def _cached_get_attn_backend(
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: Optional[str],
+        block_size: int,
+        is_attention_free: bool,
+        use_v1: bool = False,
+        use_mla: bool = False,
+        use_sfa: bool = False,
+        has_sink: bool = False,
+    ) -> type[AttentionBackend]:
+        """Resolve the attention backend class; results are memoised.
+
+        Selection order: the placeholder backend for attention-free models,
+        then a globally forced backend, then ``envs.VLLM_ATTENTION_BACKEND``.
+        The resulting choice (possibly ``None``) is handed to
+        ``current_platform.get_attn_backend_cls`` and the qualified name it
+        returns is resolved to a class.
+
+        Raises:
+            ValueError: If the environment variable names an unknown backend
+                or the platform returns no backend.
+        """
+        # If there are no attention layers (e.g. we are running Mamba),
+        # use the placeholder NO_ATTENTION
+        if is_attention_free:
+            from vllm.attention.backends.placeholder_attn import \
+                PlaceholderAttentionBackend
+            return PlaceholderAttentionBackend
+
+        # A globally forced backend OVERRIDES the VLLM_ATTENTION_BACKEND
+        # environment variable, so the variable is consulted only when
+        # nothing was forced.
+        chosen: Optional[_Backend] = get_global_forced_attn_backend()
+        if chosen is None:
+            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+            if backend_by_env_var is not None:
+                chosen = backend_name_to_enum(backend_by_env_var)
+                if chosen is None:
+                    raise ValueError(
+                        f"Invalid attention backend: '{backend_by_env_var}'. "
+                        f"Valid backends are: {list(_Backend.__members__.keys())}"
+                    )
+
+        # Ask the platform for the qualified name of its backend class.
+        backend_qualname = current_platform.get_attn_backend_cls(
+            chosen, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+            use_mla, use_sfa, has_sink)
+        if backend_qualname:
+            return resolve_obj_by_qualname(backend_qualname)
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}"
+        )
+else:
+
+    def get_attn_backend(  # type: ignore[misc]
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: Optional[str],
+        block_size: int,
+        use_mla: bool = False,
+        use_sfa: bool = False,
+        has_sink: bool = False,
+    ) -> type[AttentionBackend]:
+        """Selects which attention backend to use and lazily imports it.
+
+        Variant defined when ``vllm_version_is("0.10.2")`` is false; it has
+        no ``is_attention_free`` parameter. Every argument is forwarded by
+        keyword to ``_cached_get_attn_backend``, together with the current
+        value of ``envs.VLLM_USE_V1`` as ``use_v1``.
+
+        Returns:
+            The backend class returned by ``_cached_get_attn_backend``.
+        """
+        # Accessing envs.* behind a caching decorator can cause the wrong
+        # value to be returned from the cache if the value changes between calls.
+        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
+        # private function, so it becomes part of the cache key.
+        return _cached_get_attn_backend(
+            head_size=head_size,
+            dtype=dtype,
+            kv_cache_dtype=kv_cache_dtype,
+            block_size=block_size,
+            use_v1=envs.VLLM_USE_V1,
+            use_mla=use_mla,
+            use_sfa=use_sfa,
+            has_sink=has_sink,
+        )
+
+    @cache
+    def _cached_get_attn_backend(
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: Optional[str],
+        block_size: int,
+        use_v1: bool = False,
+        use_mla: bool = False,
+        use_sfa: bool = False,
+        has_sink: bool = False,
+    ) -> type[AttentionBackend]:
+        """Resolve the attention backend class; results are memoised.
+
+        Selection order: a globally forced backend, then
+        ``envs.VLLM_ATTENTION_BACKEND``. The resulting choice (possibly
+        ``None``) is handed to ``current_platform.get_attn_backend_cls`` and
+        the qualified name it returns is resolved to a class.
+
+        Raises:
+            ValueError: If the environment variable names an unknown backend
+                or the platform returns no backend.
+        """
+        # A globally forced backend OVERRIDES the VLLM_ATTENTION_BACKEND
+        # environment variable, so the variable is consulted only when
+        # nothing was forced.
+        chosen: Optional[_Backend] = get_global_forced_attn_backend()
+        if chosen is None:
+            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+            if backend_by_env_var is not None:
+                chosen = backend_name_to_enum(backend_by_env_var)
+                if chosen is None:
+                    raise ValueError(
+                        f"Invalid attention backend: '{backend_by_env_var}'. "
+                        f"Valid backends are: {list(_Backend.__members__.keys())}"
+                    )
+
+        # Ask the platform for the qualified name of its backend class.
+        backend_qualname = current_platform.get_attn_backend_cls(
+            chosen, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+            use_mla, use_sfa, has_sink)
+        if backend_qualname:
+            return resolve_obj_by_qualname(backend_qualname)
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}"
+        )
+
+
+# Install whichever pair of functions was defined by the version check:
+# the public entry point is rebound on the ``vllm.attention`` package and the
+# memoised helper on ``vllm.attention.selector``. Names imported from those
+# modules before this point keep referring to the previous objects.
+vllm.attention.get_attn_backend = get_attn_backend
+vllm.attention.selector._cached_get_attn_backend = _cached_get_attn_backend
--- a/vllm_ascend/patch/worker/patch_common/patch_attentionspec.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attentionspec.py
@@ -0,0 +1,110 @@
+from dataclasses import dataclass, fields
+from typing import Optional
+
+import torch
+import vllm
+from typing_extensions import Self
+from vllm.config import VllmConfig
+from vllm.utils import cdiv, get_dtype_size
+from vllm.v1.core.single_type_kv_cache_manager import (FullAttentionManager,
+                                                       spec_manager_map)
+from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheSpec
+
+
+@dataclass(frozen=True)
+class AttentionSpec(KVCacheSpec):
+    """KV cache spec for attention layers, including a ``use_sfa`` flag."""
+    num_kv_heads: int
+    head_size: int
+    dtype: torch.dtype
+    use_mla: bool
+    use_sfa: bool
+
+    @property
+    def page_size_bytes(self) -> int:
+        """Return the number of bytes taken by one block of this spec."""
+        element_bytes = get_dtype_size(self.dtype)
+        # For MLA we only store a single latent vector
+        tensors_per_token = 1 if self.use_mla else 2
+        kv_bytes = (tensors_per_token * self.block_size * self.num_kv_heads *
+                    self.head_size * element_bytes)
+        if not self.use_sfa:
+            return kv_bytes
+        # With SFA, 128 additional elements per token slot are accounted for.
+        return kv_bytes + 128 * self.block_size * element_bytes
+
+
+# Rebind the ``AttentionSpec`` attribute of vllm.v1.kv_cache_interface to the
+# class above. Classes that were already created with the previous
+# ``AttentionSpec`` as a base keep that base; only later lookups change.
+vllm.v1.kv_cache_interface.AttentionSpec = AttentionSpec
+
+
+@dataclass(frozen=True)
+class AscendFullAttentionSpec(FullAttentionSpec, AttentionSpec):
+    """``FullAttentionSpec`` extended with the patched ``AttentionSpec`` fields.
+
+    When hybrid allocator is disabled and the model contains both full
+    attention layers and sliding window attention layers, sliding
+    window attention are regarded as full attention in KV cache manager
+    (blocks are allocated for all tokens), while computed as sliding window
+    attention in model runner.
+    In this case, we use FullAttentionSpec and record the sliding window size.
+    ``sliding_window`` defaults to None for not using sliding window attention.
+    """
+    sliding_window: Optional[int] = None
+    attention_chunk_size: Optional[int] = None
+
+    # ``FullAttentionSpec`` comes first in the bases, so anything it inherits
+    # is found before the patched ``AttentionSpec`` during attribute lookup.
+    # Binding the property here guarantees that the ``use_sfa`` share of the
+    # page size is the one in effect for this class.
+    # NOTE(review): assumes vllm's FullAttentionSpec gets ``page_size_bytes``
+    # from vllm's original AttentionSpec, which ignores ``use_sfa`` -- confirm.
+    page_size_bytes = AttentionSpec.page_size_bytes
+
+    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
+        """Return the bytes needed to hold ``max_model_len`` tokens.
+
+        Args:
+            vllm_config: Provides ``model_config.max_model_len`` and
+                ``parallel_config.decode_context_parallel_size``.
+
+        Returns:
+            Number of blocks (rounded up) times ``page_size_bytes``.
+        """
+        max_model_len = vllm_config.model_config.max_model_len
+        dcp_world_size = \
+            vllm_config.parallel_config.decode_context_parallel_size
+        # Note(hc): each dcp rank only need save
+        # (max_model_len//dcp_world_size) tokens locally.
+        if dcp_world_size > 1:
+            max_model_len = cdiv(max_model_len, dcp_world_size)
+        return cdiv(max_model_len, self.block_size) * self.page_size_bytes
+
+    @classmethod
+    def merge_window_sizes(cls, window_sizes: set[int]) -> Optional[int]:
+        """Collapse a set of window sizes into a single optional value.
+
+        Args:
+            window_sizes: Distinct sizes seen across layers. The single
+                element is removed from the set when it is returned.
+
+        Returns:
+            ``None`` for an empty set, otherwise the only element.
+
+        Raises:
+            ValueError: If the set holds more than one size.
+        """
+        if not window_sizes:
+            return None
+        if len(window_sizes) > 1:
+            raise ValueError(
+                "All attention layers in the same KV cache group must have the "
+                "same window size.")
+        return window_sizes.pop()
+
+    @classmethod
+    def merge(cls, specs: list[Self]) -> Self:
+        """
+        Merge a list of FullAttentionSpec objects into a single
+        FullAttentionSpec object.
+
+        The merged spec copies the ``AttentionSpec`` fields of ``specs[0]``
+        and the unique sliding window / chunk size found across ``specs``.
+
+        Raises:
+            AssertionError: If an element is not a ``FullAttentionSpec``, if
+                any ``AttentionSpec`` field differs between the specs, or if
+                both a sliding window and a chunk size are present.
+            ValueError: If the specs disagree on the sliding window or on the
+                attention chunk size.
+        """
+        assert all(isinstance(spec, FullAttentionSpec) for spec in specs), (
+            "All attention layers in the same KV cache group must be "
+            "FullAttentionSpec.")
+
+        sliding_window = {
+            spec.sliding_window
+            for spec in specs if spec.sliding_window is not None
+        }
+        attention_chunk_size = {
+            spec.attention_chunk_size
+            for spec in specs if spec.attention_chunk_size is not None
+        }
+        first = specs[0]
+        merged_spec = cls(
+            block_size=first.block_size,
+            num_kv_heads=first.num_kv_heads,
+            head_size=first.head_size,
+            dtype=first.dtype,
+            use_mla=first.use_mla,
+            use_sfa=first.use_sfa,
+            sliding_window=cls.merge_window_sizes(sliding_window),
+            attention_chunk_size=cls.merge_window_sizes(attention_chunk_size),
+        )
+        # Every spec must agree with the merged one on all AttentionSpec fields.
+        for spec in specs:
+            for f in fields(AttentionSpec):
+                assert getattr(spec, f.name) == getattr(merged_spec, f.name), (
+                    "All attention layers in the same KV cache group must have "
+                    "the same attention spec.")
+        assert (
+            (merged_spec.sliding_window is not None) +
+            (merged_spec.attention_chunk_size is not None) <= 1
+        ), ("Model with both sliding window layers and chunked local attention "
+            "layers is not supported.")
+        return merged_spec
+
+
+# Map the Ascend spec class to FullAttentionManager in vllm's
+# spec-to-manager table, so specs of this exact type have a manager entry.
+spec_manager_map.update({AscendFullAttentionSpec: FullAttentionManager})
+
+# Rebind ``FullAttentionSpec`` on vllm.v1.kv_cache_interface so that later
+# lookups of that attribute yield the Ascend subclass.
+vllm.v1.kv_cache_interface.FullAttentionSpec = AscendFullAttentionSpec