Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -7,7 +7,6 @@ from typing import Any
import torch

from vllm.config import VllmConfig
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backend import (
    AttentionBackend,
    CommonAttentionMetadata,
@@ -105,14 +104,6 @@ class Mamba2AttentionMetadata(BaseMambaAttentionMetadata):

    # Chunk-related metadata (only for prefill)
    seq_idx_p: torch.Tensor | None = None
    # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for
    # each chunk, its offsets into the varlen sequence dimension. It is defined
    # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to
    # cu_chunk_seqlen_p[i+1].
    cu_chunk_seqlen_p: torch.Tensor | None = None
    # last_chunk_indices_p is a tensor of shape (batch,) that contains the
    # index of the last chunk for every sequence in the (prefill) batch.
    last_chunk_indices_p: torch.Tensor | None = None


class Mamba2AttentionMetadataBuilder(
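For orientation between hunks: a minimal sketch of how the `cu_chunk_seqlen_p` offsets described above are consumed, using made-up toy values (7 varlen prefill tokens split into 3 chunks). Only the slicing convention comes from the comments in the diff; the tensor contents are illustrative.

```python
import torch

# Toy values for illustration: 7 varlen prefill tokens in 3 chunks.
cu_chunk_seqlen_p = torch.tensor([0, 2, 6, 7], dtype=torch.int32)  # (nchunks+1,)
hidden = torch.randn(7, 16)  # (total_prefill_tokens, hidden_dim)

# Per the comment above: chunk i spans cu_chunk_seqlen_p[i]:cu_chunk_seqlen_p[i+1].
for i in range(cu_chunk_seqlen_p.numel() - 1):
    chunk = hidden[cu_chunk_seqlen_p[i] : cu_chunk_seqlen_p[i + 1]]
    print(f"chunk {i}: {tuple(chunk.shape)}")
```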
@@ -134,68 +125,6 @@ class Mamba2AttentionMetadataBuilder(
        )
        self.chunk_size: int = chunk_size

    def _compute_chunk_metadata(
        self,
        num_prefills: int,
        num_computed_tokens_p_cpu: torch.Tensor,
        query_start_loc_p_cpu: torch.Tensor,
    ) -> tuple[list[int], list[int], list[int]]:
        """
        Compute chunk-specific metadata for Mamba2.

        The code below carefully constructs the chunks such that:
        1. Chunks contain tokens from a *single* sequence only.
        2. For every sequence, we are guaranteed that we can
           retrieve the mamba state *every* chunk_size tokens.
        Constraint (1) dramatically simplifies the mamba2 kernels.
        Constraint (2) dramatically simplifies the implementation
        of prefix caching for mamba2 (wip). We need to take care
        of the interaction with chunked prefill in order to
        satisfy constraint (2).
        """
        # TODO (tdoublep): This code could probably be optimized.
        cu_chunk_seqlen = []
        seq_idx = []
        last_chunk_indices = []
        seqlen_pos = 0

        for req_idx in range(num_prefills):
            this_num_computed = num_computed_tokens_p_cpu[req_idx].item()
            this_new_tokens = (
                query_start_loc_p_cpu[req_idx + 1].item()
                - query_start_loc_p_cpu[req_idx].item()
            )

            # if computed tokens are not chunk-aligned, use the first
            # chunk to finish it off
            if this_num_computed % self.chunk_size != 0:
                seq_idx.append(req_idx)
                cu_chunk_seqlen.append(seqlen_pos)
                # how many tokens to finish the chunk?
                chunk_len = (
                    cdiv(this_num_computed, self.chunk_size) * self.chunk_size
                    - this_num_computed
                )
                # we can only use at most this_new_tokens
                chunk_len = min(chunk_len, this_new_tokens)
                seqlen_pos += chunk_len
                this_new_tokens -= chunk_len

            n_chunks = cdiv(this_new_tokens, self.chunk_size)
            for chunk in range(n_chunks):
                seq_idx.append(req_idx)
                cu_chunk_seqlen.append(seqlen_pos)
                chunk_len = min(self.chunk_size, this_new_tokens)
                seqlen_pos += chunk_len
                this_new_tokens -= chunk_len

            assert this_new_tokens == 0
            last_chunk_indices.append(len(cu_chunk_seqlen) - 1)

        cu_chunk_seqlen.append(seqlen_pos)

        return cu_chunk_seqlen, seq_idx, last_chunk_indices

    def build(
        self,
        common_prefix_len: int,
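The chunking loop removed above is easiest to check on a toy case. The sketch below re-traces the same logic for a single request as a free function (`trace_chunks` is a hypothetical name, and `cdiv` is re-derived as ceiling division, matching `vllm.utils.math_utils.cdiv`): with `chunk_size=4`, 2 tokens already computed and 9 new ones, the first 2-token chunk re-aligns the sequence to a chunk boundary (constraint 2), and the remainder splits into chunks of 4 and 3.

```python
def cdiv(a: int, b: int) -> int:
    # Ceiling division, as in vllm.utils.math_utils.cdiv.
    return -(-a // b)

def trace_chunks(chunk_size: int, num_computed: int, num_new: int):
    """Single-request re-trace of the deleted loop (illustrative only)."""
    cu_chunk_seqlen, seq_idx, last_chunk_indices = [], [], []
    pos = 0
    if num_computed % chunk_size != 0:
        # Alignment chunk: top the sequence up to the next chunk boundary.
        seq_idx.append(0)
        cu_chunk_seqlen.append(pos)
        fill = min(cdiv(num_computed, chunk_size) * chunk_size - num_computed, num_new)
        pos += fill
        num_new -= fill
    for _ in range(cdiv(num_new, chunk_size)):
        seq_idx.append(0)
        cu_chunk_seqlen.append(pos)
        step = min(chunk_size, num_new)
        pos += step
        num_new -= step
    last_chunk_indices.append(len(cu_chunk_seqlen) - 1)
    cu_chunk_seqlen.append(pos)
    return cu_chunk_seqlen, seq_idx, last_chunk_indices

# -> ([0, 2, 6, 9], [0, 0, 0], [2]): a 2-token alignment chunk, then 4 + 3.
print(trace_chunks(chunk_size=4, num_computed=2, num_new=9))
```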
@@ -220,41 +149,12 @@ class Mamba2AttentionMetadataBuilder(
            else False
        )

        num_reqs = common.num_reqs
        num_prefills = common.num_prefills
        num_decode_tokens = common.num_decode_tokens

        num_computed_tokens_cpu = (
            common_attn_metadata.compute_num_computed_tokens().cpu()
        )
        num_computed_tokens_p_cpu = num_computed_tokens_cpu[
            num_reqs - num_prefills : num_reqs
        ]
        query_start_loc_p_cpu = (
            common_attn_metadata.query_start_loc_cpu[-num_prefills - 1 :]
            - num_decode_tokens
        )

        cu_chunk_seqlen, seq_idx, last_chunk_indices = self._compute_chunk_metadata(
            num_prefills,
            num_computed_tokens_p_cpu,
            query_start_loc_p_cpu,
        )

        seq_idx_p = torch.as_tensor(
            seq_idx,
            device=common_attn_metadata.query_start_loc.device,
            dtype=torch.int32,
        )
        cu_chunk_seqlen_p = torch.as_tensor(
            cu_chunk_seqlen,
            device=common_attn_metadata.query_start_loc.device,
            dtype=torch.int32,
        )
        last_chunk_indices_p = torch.as_tensor(
            last_chunk_indices,
            device=common_attn_metadata.query_start_loc.device,
            dtype=torch.int32,
        )
        cu_chunk_seqlen_p, seq_idx_p, last_chunk_indices_p = (
            self._build_chunk_metadata_tensors(
                self.chunk_size,
                common,
                common_attn_metadata,
            )
        )

        return replace(
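On the refactor in the final hunk: the three per-list `torch.as_tensor` conversions are collapsed into one call to `self._build_chunk_metadata_tensors`, which presumably lives on the base builder and is not shown in this diff. A minimal sketch of an equivalent consolidation, assuming the helper returns the same three int32 tensors on the `query_start_loc` device; the names `to_int32` and `build_chunk_metadata_tensors` and the free-function form are illustrative, not the upstream API.

```python
import torch

def to_int32(values: list[int], device: torch.device) -> torch.Tensor:
    # Same conversion the old inline code applied to each list.
    return torch.as_tensor(values, device=device, dtype=torch.int32)

def build_chunk_metadata_tensors(cu_chunk_seqlen, seq_idx, last_chunk_indices, device):
    # One helper in place of three inline conversions, as in the new code path.
    return (
        to_int32(cu_chunk_seqlen, device),
        to_int32(seq_idx, device),
        to_int32(last_chunk_indices, device),
    )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cu_p, seq_p, last_p = build_chunk_metadata_tensors([0, 2, 6, 9], [0, 0, 0], [2], device)
```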