init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Type, TypeVar
+from typing import (TYPE_CHECKING, ClassVar, NamedTuple, Optional, Tuple, Type,
+                    TypeVar)

 import torch
 import torch_npu
@@ -12,15 +13,17 @@ from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from vllm.model_executor.layers.linear import (LinearBase,
                                               UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
+from vllm.v1.attention.backends.utils import AttentionCGSupport

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
-                                         split_decodes_and_prefills)
+                                         maybe_save_kv_layer_to_connector,
+                                         split_decodes_and_prefills,
+                                         wait_for_kv_layer_from_connector)
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
-from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch

@@ -164,6 +167,9 @@ M = TypeVar("M", bound=AscendMLAMetadata)


 class AscendMLAMetadataBuilder:
+    # Does this backend/builder support ACL Graphs for attention (default: no).
+    aclgraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.NEVER
    """
    NOTE: Please read the comment at the top of the file before trying to
    understand this class
@@ -171,6 +177,8 @@ class AscendMLAMetadataBuilder:

    # _attn_mask_builder = None
    def __init__(self,
+                 kv_cache_spec,
+                 layer_names,
                 vllm_config: VllmConfig,
                 device: torch.device,
                 metadata_cls: Optional[AscendMLAMetadata] = None):
@@ -185,7 +193,16 @@ class AscendMLAMetadataBuilder:
                           self.block_size - 1) // self.block_size
        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled

+        self.speculative_config = vllm_config.speculative_config
        self.decode_threshold = 1
+        if self.speculative_config:
+            spec_token_num = self.speculative_config.num_speculative_tokens
+            self.decode_threshold += spec_token_num
+            assert self.decode_threshold <= 16, f"decode_threshold exceeded \
+                npu_fused_infer_attention_score TND layout's limit of 16, \
+                got {self.decode_threshold}"
+
+        self.reorder_batch_threshold = self.decode_threshold

        if self.chunked_prefill_enabled:
            self.chunked_prefill_workspace_size = min(
@@ -265,6 +282,7 @@ class AscendMLAMetadataBuilder:

    def build(
        self,
+        common_prefix_len: int,
        common_attn_metadata: AscendCommonAttentionMetadata,
        model: nn.Module,
    ) -> AscendMLAMetadata:
@@ -272,7 +290,6 @@ class AscendMLAMetadataBuilder:
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        query_start_loc = common_attn_metadata.query_start_loc
        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
-        # TODO(xyx): remove the if condition after mla supports torch mode speculative decoding
        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
            split_decodes_and_prefills(common_attn_metadata, decode_threshold=self.decode_threshold)
        assert num_decodes + num_prefills == num_reqs
@@ -284,11 +301,7 @@ class AscendMLAMetadataBuilder:
        device = self.device

        block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
-        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
-                                                             num_actual_tokens].to(
-                                                                 device,
-                                                                 non_blocking=
-                                                                 True)
+        slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens]
        input_positions = common_attn_metadata.positions[:
                                                         num_actual_tokens].long(
                                                         )
@@ -376,11 +389,12 @@ class AscendMLAMetadataBuilder:

        decode_metadata = None
        if num_decodes > 0:
+            # Note: with speculative decoding a decode request may carry more than one token, so num_decodes != num_decode_tokens.
            actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist()
            max_seq_lens = seq_lens[:num_decodes].max().item()
-            seq_lens = seq_lens[:num_decode_tokens]
+            seq_lens = seq_lens[:num_decodes]
            input_positions = input_positions[:num_decode_tokens]
-            block_table = block_table[:num_decode_tokens, ...]
+            block_table = block_table[:num_decodes, ...]
            seq_lens_list = seq_lens.tolist()

            cos = self.cos_cache[input_positions].unsqueeze(  # type: ignore
@@ -481,17 +495,12 @@ class AscendMLAImpl(MLAAttentionImpl):
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
        self.enable_prefetch = ascend_config.enable_prefetch
        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
-        self.chunked_prefill_for_mla = ascend_config.chunked_prefill_for_mla

        vllm_config = get_current_vllm_config()
        self.ring_mla_mask_size = 512
        self.prefill_mask = None

-        # Adapt torch air graph mode with spec decoding.
-        speculative_config = vllm_config.speculative_config
-        if speculative_config is not None:
-            self.spec_token_num = speculative_config.num_speculative_tokens
-            assert self.spec_token_num > 0
+        self.speculative_config = vllm_config.speculative_config

    def _v_up_proj(self, x):
        # Convert from (B, N, L) to (N, B, L)
@@ -663,84 +672,47 @@ class AscendMLAImpl(MLAAttentionImpl):
                                  self.v_head_dim,
                                  dtype=q_nope.dtype,
                                  device=q_nope.device)
-        if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
-            query = torch.cat((q_nope, q_pe), dim=-1)
-            key = torch.cat((k_nope, k_pe), dim=-1)
-            torch_npu._npu_flash_attention(
-                query=query,
-                key=key,
-                value=value,
-                mask=attn_metadata.attn_mask,
-                seq_len=attn_metadata.prefill.context_lens,
-                scale_value=self.scale,
-                num_heads=self.num_heads,
-                num_kv_heads=self.num_heads,
-                out=attn_output)
-        elif self.chunked_prefill_for_mla:
-            attn_lse = torch.empty(self.num_heads,
-                                   num_tokens,
-                                   dtype=torch.float32,
-                                   device=q_nope.device)
-            if self.prefill_mask is None:
-                self.prefill_mask = torch.triu(
-                    torch.ones(self.ring_mla_mask_size,
-                               self.ring_mla_mask_size,
-                               device=q_nope.device,
-                               dtype=q_nope.dtype), 1)
-            torch_npu.atb.npu_ring_mla(
-                q_nope=q_nope,
-                q_rope=q_pe,
-                k_nope=k_nope,
-                k_rope=k_pe,
-                value=value,
-                mask=self.prefill_mask,
-                seqlen=torch.tensor(attn_metadata.prefill.query_lens,
-                                    dtype=torch.int32),
-                head_num=self.num_heads,
-                kv_head_num=self.num_heads,
-                pre_out=None,
-                prev_lse=None,
-                qk_scale=self.scale,
-                kernel_type="kernel_type_high_precision",
-                mask_type="mask_type_triu",
-                input_layout="type_bsnd",
-                calc_type="calc_type_first_ring",
-                output=attn_output,
-                softmax_lse=attn_lse)
-            attn_output, attn_lse = self._compute_prefill_context( \
-                q_nope, q_pe, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)
-        else:
-            query = torch.cat((q_nope, q_pe), dim=-1)
-            attn_output_torch = torch.empty(num_tokens,
-                                            self.num_heads * self.v_head_dim,
-                                            dtype=query.dtype,
-                                            device=query.device)
-            # current requests is chunked in prefill, disable flash attention with chunked prefill
-            vanilla_chunked_prefill_mla(
-                output=attn_output_torch,
-                query=query,
-                kv_cache=kv_c_and_k_pe_cache,
-                block_tables=attn_metadata.prefill.block_table,
-                query_lens=attn_metadata.prefill.query_lens,
-                context_lens=attn_metadata.prefill.context_lens,
-                kv_b_proj=self.kv_b_proj,
-                max_query_len=attn_metadata.prefill.max_query_len,
-                max_context_len=attn_metadata.prefill.max_seq_lens,
-                nope_dim=self.qk_nope_head_dim,
-                rope_dim=self.qk_rope_head_dim,
-                v_head_dim=self.v_head_dim,
-                scale=self.scale,
-                alibi_slopes=None,
-                causal=True)
+        attn_lse = torch.empty(self.num_heads,
+                               num_tokens,
+                               dtype=torch.float32,
+                               device=q_nope.device)
+        if self.prefill_mask is None:
+            if q_nope.dtype == torch.float16:
+                mask_value = torch.finfo(torch.float32).min
+            else:
+                mask_value = 1
+            prefill_mask = torch.triu(
+                torch.ones(self.ring_mla_mask_size,
+                           self.ring_mla_mask_size,
+                           device=q_nope.device,
+                           dtype=q_nope.dtype), 1)
+            self.prefill_mask = torch.where(prefill_mask == 1, mask_value,
+                                            0).to(q_nope.dtype)
+        torch_npu.atb.npu_ring_mla(q_nope=q_nope,
+                                   q_rope=q_pe,
+                                   k_nope=k_nope,
+                                   k_rope=k_pe,
+                                   value=value,
+                                   mask=self.prefill_mask,
+                                   seqlen=torch.tensor(
+                                       attn_metadata.prefill.query_lens,
+                                       dtype=torch.int32),
+                                   head_num=self.num_heads,
+                                   kv_head_num=self.num_heads,
+                                   pre_out=None,
+                                   prev_lse=None,
+                                   qk_scale=self.scale,
+                                   kernel_type="kernel_type_high_precision",
+                                   mask_type="mask_type_triu",
+                                   input_layout="type_bsnd",
+                                   calc_type="calc_type_first_ring",
+                                   output=attn_output,
+                                   softmax_lse=attn_lse)
+        attn_output, attn_lse = self._compute_prefill_context( \
+            q_nope, q_pe, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)

        attn_output = attn_output.reshape(
            [num_tokens, self.num_heads * self.v_head_dim])
-        if attn_metadata.attn_state in [
-                AscendAttentionState.ChunkedPrefill,
-                AscendAttentionState.SpecDecoding,
-                AscendAttentionState.PrefillCacheHit
-        ] and not self.chunked_prefill_for_mla:
-            attn_output = attn_output_torch
        return attn_output

    def exec_kv_decode(
@@ -785,7 +757,7 @@ class AscendMLAImpl(MLAAttentionImpl):
        # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
        kv_no_split = kv_no_split.view(
            B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
-        cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA"
+        cache_mode = "PA_NZ" if self.enable_kv_nz else "PA"
        _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache(
            kv_no_split,
            self.kv_a_layernorm.weight,
@@ -840,8 +812,11 @@ class AscendMLAImpl(MLAAttentionImpl):
                             self.qk_rope_head_dim)
            input_layout = "BNSD"

-        if attn_metadata.attn_state == AscendAttentionState.SpecDecoding:
-            assert num_tokens % self.spec_token_num == 0
+        if attn_metadata.attn_state in [
+                AscendAttentionState.SpecDecoding,
+                AscendAttentionState.ChunkedPrefill
+        ] and self.speculative_config is not None:
+            # Use the TND layout for pure SpecDecoding and for SpecDecoding within ChunkedPrefill.
            input_layout = "TND"
            # [bs * q_seq_len, num_heads_per_rank, dim]
            q_nope = q_nope.view(num_tokens, self.num_heads, -1)
@@ -887,8 +862,8 @@ class AscendMLAImpl(MLAAttentionImpl):
                current_ms_metadata.before_comm_event.wait()
                return self._v_up_proj(attn_output)

-    def _mla_preprocess(self, hidden_states, kv_cache, attn_metadata,
-                        need_gather_q_kv):
+    def _mla_preprocess(self, layer_name, hidden_states, kv_cache,
+                        attn_metadata, need_gather_q_kv):
        # MLA Preprocess:
        # 1. Perform q_a_proj and q_a_layernorm to obtain q_c
        # 2. Perform kv_a_proj_with_mqa to obtain kv_no_split
@@ -917,6 +892,8 @@ class AscendMLAImpl(MLAAttentionImpl):
            kv_no_split = get_tp_group().all_gather(kv_no_split, 0)
        decode_preprocess_res = None
        prefill_preprocess_res = None
+        if has_prefill:
+            wait_for_kv_layer_from_connector(layer_name)
        # Preprocess for decode tokens
        if has_decode:
            decode_q_c = q_c[:num_decode_tokens]
@@ -963,6 +940,7 @@ class AscendMLAImpl(MLAAttentionImpl):

    def forward(
        self,
+        layer_name,
        hidden_states: torch.Tensor,  # query in unified attn
        kv_cache: Tuple[torch.Tensor],
        attn_metadata: M,
@@ -989,7 +967,8 @@ class AscendMLAImpl(MLAAttentionImpl):

        # MLA Preprocess
        decode_preprocess_res, prefill_preprocess_res = self._mla_preprocess(
-            hidden_states, kv_cache, attn_metadata, need_gather_q_kv)
+            layer_name, hidden_states, kv_cache, attn_metadata,
+            need_gather_q_kv)

        if decode_preprocess_res is not None:
            # MLA Preprocess for decoding
@@ -1047,4 +1026,8 @@ class AscendMLAImpl(MLAAttentionImpl):
                    is_force_scatter=self.enable_shared_expert_dp)[0]
                current_ms_metadata.after_comm_event.record()
        del o_proj_input
+
+        has_prefill = attn_metadata.num_prefills > 0
+        if has_prefill:
+            maybe_save_kv_layer_to_connector(layer_name, list(kv_cache))
        return output_padded