port deepseekv2 and mtp to main branch (#429)

### What this PR does / why we need it? This PR ports all the deepseek graph mode code and mtp code from v0.7.3 to the main branch --------- Signed-off-by: SidaoY <1024863041@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com> Signed-off-by: mengwei805 <mengwei25@huawei.com> Signed-off-by: libaokui <libaokui@huawei.com> Signed-off-by: q00832892 <qiaoyang19@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: SidaoY <1024863041@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com> Co-authored-by: mengwei805 <mengwei25@huawei.com> Co-authored-by: libaokui <libaokui@huawei.com>
2025-04-19 17:38:18 +08:00
parent 086423dc35
commit 1a1f9a6d89
33 changed files with 3361 additions and 315 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -24,6 +24,10 @@ import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.worker.gpu_input_batch import InputBatch
+
+from vllm_ascend.ops.attention import vanilla_chunked_prefill


 class AscendAttentionBackend(AttentionBackend):
@@ -44,6 +48,10 @@ class AscendAttentionBackend(AttentionBackend):
    def get_state_cls() -> Type["CommonAttentionState"]:
        return CommonAttentionState

+    @staticmethod
+    def get_builder_cls() -> type["AscendAttentionMetadataBuilder"]:
+        return AscendAttentionMetadataBuilder
+
    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
@@ -94,11 +102,11 @@ class AscendAttentionState(Enum):
 class AscendMetadata:
    # (batch_size, max_blocks_per_seq).
    # Block addresses per sequence. (Seq id -> list of physical block)
-    block_tables: Optional[torch.Tensor]
+    block_tables: torch.Tensor
    # (batch_size,). The sequence length per sequence. Sequence length means
    # the computed tokens + new tokens None if it is a decoding.
-    seq_lens: Optional[List[int]] = None
-    context_lens: Optional[List[int]] = None
+    query_lens: torch.Tensor
+    seq_lens: torch.Tensor
    # Maximum query length in the batch. None for decoding.
    max_query_len: Optional[int] = None
    # (num_tokens,). The indices of the token slots that input tokens will be
@@ -117,6 +125,36 @@ class AscendMetadata:
    attn_mask: Optional[torch.Tensor] = None


+class AscendAttentionMetadataBuilder:
+
+    def __init__(self, runner):
+        self.runner = runner
+
+    def reorder_batch(self, input_batch: "InputBatch",
+                      scheduler_output: "SchedulerOutput") -> bool:
+        return False
+
+    def build(self, num_reqs, num_actual_tokens, max_query_len,
+              common_prefix_len):
+        block_table = (
+            self.runner.input_batch.block_table.get_device_tensor()[:num_reqs])
+        query_lens = self.runner.query_lens
+        seq_lens = self.runner.seq_lens_cpu[:num_reqs]
+        slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
+            self.runner.device, non_blocking=True)
+        attn_mask = self.runner.attn_mask
+        attn_state = self.runner.attn_state
+
+        attn_metadata = AscendMetadata(block_tables=block_table,
+                                       query_lens=query_lens,
+                                       seq_lens=seq_lens,
+                                       max_query_len=max_query_len,
+                                       slot_mapping=slot_mapping,
+                                       attn_mask=attn_mask,
+                                       attn_state=attn_state)
+        return attn_metadata
+
+
 class AscendAttentionBackendImpl(AttentionImpl):

    def __init__(
@@ -229,29 +267,46 @@ class AscendAttentionBackendImpl(AttentionImpl):
                                           out=output)
        elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
            block_tables = attn_metadata.block_tables
-            torch_npu._npu_paged_attention(
-                query=query,
-                key_cache=self.key_cache,
-                value_cache=self.value_cache,
-                num_kv_heads=self.num_kv_heads,
-                num_heads=self.num_heads,
-                scale_value=self.scale,
-                block_table=block_tables,
-                context_lens=attn_metadata.context_lens,
-                out=output)
+            torch_npu._npu_paged_attention(query=query,
+                                           key_cache=self.key_cache,
+                                           value_cache=self.value_cache,
+                                           num_kv_heads=self.num_kv_heads,
+                                           num_heads=self.num_heads,
+                                           scale_value=self.scale,
+                                           block_table=block_tables,
+                                           context_lens=attn_metadata.seq_lens,
+                                           out=output)
        # Normal V1 situation.
        else:
-            # use paged attention
-            torch_npu._npu_paged_attention_splitfuse(
-                query=query,
-                key_cache=self.key_cache,
-                value_cache=self.value_cache,
-                mask=attn_metadata.attn_mask,
-                block_table=attn_metadata.block_tables,
-                seq_len=attn_metadata.seq_lens,
-                context_lens=attn_metadata.context_lens,
-                num_kv_heads=self.num_kv_heads,
-                num_heads=self.num_heads,
-                scale_value=self.scale,
-                out=output)
+            # use chunked prefill for head size 192 scenario, like deepseek
+            # paged_attention_splitfuse maybe crash at such scenario
+            # TODO: vanilla path will be removed after the kernel support
+            # head_size 192 scenario
+            if self.head_size == 192:
+                cu_seqlen_q = [0] + attn_metadata.query_lens.tolist()
+                cu_seqlen_k = [0] + attn_metadata.seq_lens.tolist()
+                cu_seqlen_q = torch.tensor(cu_seqlen_q, device="npu")
+                cu_seqlen_k = torch.tensor(cu_seqlen_k, device="npu")
+                cu_seqlen_q = torch.cumsum(cu_seqlen_q, dim=0)
+                cu_seqlen_k = torch.cumsum(cu_seqlen_k, dim=0)
+                max_seqlen_q = torch.max(attn_metadata.query_lens)
+                max_seqlen_k = torch.max(attn_metadata.seq_lens)
+                vanilla_chunked_prefill(output, query, self.key_cache,
+                                        self.value_cache,
+                                        attn_metadata.block_tables,
+                                        cu_seqlen_q, cu_seqlen_k, max_seqlen_q,
+                                        max_seqlen_k, self.scale, None, True)
+            else:
+                torch_npu._npu_paged_attention_splitfuse(
+                    query=query,
+                    key_cache=self.key_cache,
+                    value_cache=self.value_cache,
+                    mask=attn_metadata.attn_mask,
+                    block_table=attn_metadata.block_tables,
+                    seq_len=attn_metadata.query_lens,
+                    context_lens=attn_metadata.seq_lens,
+                    num_kv_heads=self.num_kv_heads,
+                    num_heads=self.num_heads,
+                    scale_value=self.scale,
+                    out=output)
        return output.view(num_tokens, self.hidden_size)