first commit

2026-03-10 13:31:25 +08:00
parent ba974cecfa
commit b62b889355
2604 changed files with 438977 additions and 0 deletions
--- a/vllm_br/v0/init.py
+++ b/vllm_br/v0/init.py
--- a/vllm_br/v0/pycache/init.cpython-310.pyc
+++ b/vllm_br/v0/pycache/init.cpython-310.pyc
--- a/vllm_br/v0/attention/init.py
+++ b/vllm_br/v0/attention/init.py
@@ -0,0 +1,15 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
--- a/vllm_br/v0/attention/pycache/init.cpython-310.pyc
+++ b/vllm_br/v0/attention/pycache/init.cpython-310.pyc
--- a/vllm_br/v0/attention/backends/init.py
+++ b/vllm_br/v0/attention/backends/init.py
@@ -0,0 +1,15 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
--- a/vllm_br/v0/attention/backends/pycache/init.cpython-310.pyc
+++ b/vllm_br/v0/attention/backends/pycache/init.cpython-310.pyc
--- a/vllm_br/v0/attention/backends/pycache/attention_v0.cpython-310.pyc
+++ b/vllm_br/v0/attention/backends/pycache/attention_v0.cpython-310.pyc
--- a/vllm_br/v0/attention/backends/attention_v0.py
+++ b/vllm_br/v0/attention/backends/attention_v0.py
@@ -0,0 +1,570 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+"""Attention layer with FlashAttention."""
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+
+import torch
+import torch_br
+
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.attention.backends.utils import CommonAttentionState
+from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
+                                           get_flash_attn_version)
+from vllm.logger import logger
+
+if TYPE_CHECKING:
+    from vllm.worker.model_runner import (ModelInputForGPUBuilder)
+
+from collections import defaultdict
+from itertools import accumulate
+
+from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
+                                           compute_slot_mapping,
+                                           compute_slot_mapping_start_idx,
+                                           is_block_tables_empty)
+from vllm.multimodal import MultiModalPlaceholderMap
+from vllm.utils import async_tensor_h2d, make_tensor_with_pad
+
+
+class SUPAFlashAttentionBackend(AttentionBackend):
+
+    # NOTE: When piecewise cudagraph is enabled, this
+    # makes sure the output tensor is allocated inside the cudagraph.
+    # NOTE: currently, we do not support accept_output_buffer=True
+    accept_output_buffer: bool = False
+
+    @staticmethod
+    def get_supported_head_sizes() -> list[int]:
+        return [32, 64, 96, 128, 160, 192, 224, 256]
+
+    @staticmethod
+    def get_name() -> str:
+        return "SUPAFLASH_ATTN_VLLM_V0"
+
+    @staticmethod
+    def get_impl_cls() -> type["SUPAFlashAttentionImpl"]:
+        return SUPAFlashAttentionImpl
+
+    @staticmethod
+    def get_metadata_cls() -> type["SUPAFlashAttentionMetadata"]:
+        return SUPAFlashAttentionMetadata
+
+    @staticmethod
+    def get_builder_cls() -> type["SUPAFlashAttentionMetadataBuilder"]:
+        return SUPAFlashAttentionMetadataBuilder
+
+    @staticmethod
+    def get_state_cls() -> Type["CommonAttentionState"]:
+        return CommonAttentionState
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> tuple[int, ...]:
+        if block_size % 16 != 0:
+            raise ValueError("Block size must be a multiple of 16.")
+        return (2, num_blocks, block_size, num_kv_heads, head_size)
+
+    @staticmethod
+    def get_kv_cache_usharp_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        th_gran = SUPAFlashAttentionBackend.get_kv_cache_usharp_alignment(
+            block_size)
+        n_block = max(1, (num_blocks + th_gran - 1) // th_gran)
+        logger.debug(
+            f'Origin kv cache shape is [2, {num_blocks}, {block_size}, {num_kv_heads}, {head_size}, For SUPA Speed up, use [2, {n_block}, {th_gran * block_size}, {num_kv_heads * head_size}]'  # noqa: G004
+        )
+        return (2, n_block, th_gran * block_size, num_kv_heads * head_size)
+
+    @staticmethod
+    def get_kv_cache_usharp_alignment(block_size: int) -> int:
+        max_h_limit = 2048
+        return max_h_limit // block_size
+
+
+@dataclass
+class SUPAFlashAttentionMetadata:
+    # NOTE(sang): Definition of context_len, query_len, and seq_len.
+    # |---------- N-1 iteration --------|
+    # |---------------- N iteration ---------------------|
+    # |- tokenA -|......................|-- newTokens ---|
+    # |---------- context_len ----------|
+    # |-------------------- seq_len ---------------------|
+    #                                   |-- query_len ---|
+
+    num_actual_tokens: int  # Number of tokens excluding padding.
+    max_query_len: int
+    query_start_loc: torch.Tensor
+    max_seq_len: int
+    seq_lens: torch.Tensor
+    seq_lens_tensor: torch.Tensor
+    block_table: torch.Tensor
+    slot_mapping: torch.Tensor
+
+    # BIREN Attention Params
+    seq_start_loc: torch.Tensor
+    context_lens: torch.Tensor
+    max_decode_seq_len: int
+    num_prefills: int
+    num_decodes: int
+    num_prefills_tokens: int
+    do_cache: bool  # when use attentionsplit, do cache = False
+
+    # For cascade attention.
+    use_cascade: bool
+    common_prefix_len: int
+    cu_prefix_query_lens: Optional[torch.Tensor]
+    prefix_kv_lens: Optional[torch.Tensor]
+    suffix_kv_lens: Optional[torch.Tensor]
+
+    # Optional aot scheduling
+    scheduler_metadata: Optional[torch.Tensor] = None
+    prefix_scheduler_metadata: Optional[torch.Tensor] = None
+    _cached_prefill_metadata: Optional["SUPAFlashAttentionMetadata"] = None
+    _cached_decode_metadata: Optional["SUPAFlashAttentionMetadata"] = None
+
+    # for local attention
+    @dataclass
+    class LocalAttentionMetadata:
+        local_query_start_loc: torch.Tensor
+        local_seqused_k: torch.Tensor
+        local_block_table: torch.Tensor
+        local_max_query_len: int
+        local_max_seq_len: int
+        local_scheduler_metadata: Optional[torch.Tensor]
+
+    local_attn_metadata: Optional[LocalAttentionMetadata] = None
+
+    @property
+    def do_prefill(self) -> bool:
+        return self.num_prefills > 0
+
+    @property
+    def do_decode(self) -> bool:
+        return self.num_decodes > 0
+
+    @property
+    def prefill_metadata(self) -> Optional["SUPAFlashAttentionMetadata"]:
+        if self.num_prefills == 0:
+            return None
+
+        if self._cached_prefill_metadata is not None:
+            return self._cached_prefill_metadata
+        else:
+            return None
+
+
+class SUPAFlashAttentionMetadataBuilder:
+
+    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+        self.input_builder = input_builder
+        self.runner = input_builder.runner
+        self.sliding_window = input_builder.sliding_window
+        self.block_size = input_builder.block_size
+
+    def prepare(self):
+        self.slot_mapping: List[int] = []
+        self.prefill_seq_lens: List[int] = []
+        self.context_lens: List[int] = []
+        self.block_tables: List[List[int]] = []
+        self.curr_seq_lens: List[int] = []
+        self.multimodal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
+        self.num_prefills = 0
+        self.num_prefill_tokens = 0
+        self.num_decode_tokens = 0
+        self.has_prefix_cache_hit = False
+
+    def _add_seq_group(
+            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
+            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
+        """Add a sequence group to the metadata. Specifically update/append
+        1. context length.
+        2. block table.
+        3. slot mapping.
+        """
+        is_prompt = inter_data.is_prompt
+        block_tables = inter_data.block_tables
+
+        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
+             curr_sliding_window_block) in zip(
+                 inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
+                 inter_data.orig_seq_lens,
+                 inter_data.seq_lens,
+                 inter_data.query_lens,
+                 inter_data.context_lens,
+                 inter_data.curr_sliding_window_blocks,
+                 strict=False):
+            self.context_lens.append(context_len)
+
+            if is_prompt:
+                mm_maps = inter_data.multi_modal_placeholder_maps
+                if mm_maps:
+                    for modality, placeholders in mm_maps.items():
+                        self.multimodal_placeholder_maps[modality].extend(
+                            placeholders)
+
+                self.num_prefills += 1
+                self.num_prefill_tokens += token_len
+                self.prefill_seq_lens.append(seq_len)
+            else:
+                self.num_decode_tokens += query_len
+                self.curr_seq_lens.append(curr_seq_len)
+
+            # Compute block table.
+            # TODO(sang): Combine chunked prefill and prefix caching by
+            # only allowing multiple of block_size chunk size.
+            # NOTE: This only works for oooooooxxx style attention.
+            block_table = []
+            if prefix_cache_hit:
+                # NOTE(woosuk): For flash-attn, the block table should
+                # include the entries for the incoming prefill tokens.
+                block_table = block_tables[seq_id]
+            elif ((chunked_prefill_enabled or not is_prompt)
+                  and block_tables is not None):
+                if curr_sliding_window_block == 0:
+                    block_table = block_tables[seq_id]
+                else:
+                    block_table = block_tables[seq_id][
+                        -curr_sliding_window_block:]
+            self.block_tables.append(block_table)
+
+            # Compute slot mapping.
+            is_profile_run = is_block_tables_empty(block_tables)
+            start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
+                                                       context_len,
+                                                       self.sliding_window)
+            compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id,
+                                 seq_len, context_len, start_idx,
+                                 self.block_size, inter_data.block_tables)
+
+    def _get_graph_runner_block_tables(
+            self, num_seqs: int,
+            block_tables: List[List[int]]) -> torch.Tensor:
+        # The shape of graph_block_tables is
+        # [max batch size, max context len // block size].
+        max_batch_size, max_blocks = self.runner.graph_block_tables.shape
+        assert max_batch_size >= num_seqs
+
+        graph_block_tables = self.runner.graph_block_tables[:num_seqs]
+        for i, block_table in enumerate(block_tables):
+            if block_table:
+                num_blocks = len(block_table)
+                if num_blocks <= max_blocks:
+                    graph_block_tables[i, :num_blocks] = block_table
+                else:
+                    # It may be possible to have more blocks allocated due
+                    # to lookahead slots of multi-step, however, they are
+                    # not used anyway, so can be safely ignored.
+                    graph_block_tables[
+                        i, :max_blocks] = block_table[:max_blocks]
+
+        return torch.from_numpy(graph_block_tables).to(
+            device=self.runner.device, non_blocking=True)
+
+    def build(self, seq_lens: List[int], query_lens: List[int],
+              cuda_graph_pad_size: int, batch_size: int):
+        """Build attention metadata with on-device tensors.
+
+        Args:
+            seq_lens: The maybe padded sequence lengths of the input sequences.
+            query_lens: The query lengths of the input sequences.
+            cuda_graph_pad_size: The padding size for cuda graph.
+                                 -1 if cuda graph is not used.
+            batch_size: The maybe padded batch size.
+        """
+        prefix_cache_hit = any([
+            inter_data.prefix_cache_hit
+            for inter_data in self.input_builder.inter_data_list
+        ])
+        for inter_data in self.input_builder.inter_data_list:
+            self._add_seq_group(inter_data,
+                                self.input_builder.chunked_prefill_enabled,
+                                prefix_cache_hit)
+
+        device = self.runner.device
+        use_captured_graph = cuda_graph_pad_size != -1
+
+        max_query_len = max(query_lens)
+        # decode_query_lens = query_lens[self.num_prefills:]
+        # if len(decode_query_lens) > 0:
+        #     max_decode_query_len = max(decode_query_lens)
+        # else:
+        #     max_decode_query_len = 1
+        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
+        max_decode_seq_len = max(self.curr_seq_lens, default=0)
+        num_decode_tokens = self.num_decode_tokens
+        query_start_loc = list(accumulate(query_lens, initial=0))
+        seq_start_loc = list(accumulate(seq_lens, initial=0))
+
+        num_seqs = len(seq_lens)
+        if use_captured_graph:
+            self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
+            self.block_tables.extend([] * cuda_graph_pad_size)
+            num_decode_tokens = batch_size - self.num_prefill_tokens
+            block_tables = self._get_graph_runner_block_tables(
+                num_seqs, self.block_tables)
+        else:
+            block_tables = make_tensor_with_pad(
+                self.block_tables,
+                pad=0,
+                dtype=torch.int,
+                device=device,
+            )
+        assert max_query_len > 0, ("query_lens: {}".format(query_lens))
+
+        assert device is not None
+        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
+                                               device, self.runner.pin_memory)
+        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
+                                           self.runner.pin_memory)
+        slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
+                                               device, self.runner.pin_memory)
+        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
+                                                  device,
+                                                  self.runner.pin_memory)
+        return SUPAFlashAttentionMetadata(
+            num_actual_tokens=batch_size,
+            max_query_len=max_query_len,
+            query_start_loc=query_start_loc_tensor,
+            max_seq_len=max_prefill_seq_len,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
+            block_table=block_tables,
+            slot_mapping=slot_mapping_tensor,
+            use_cascade=False,
+            common_prefix_len=0,
+            scheduler_metadata=0,
+            cu_prefix_query_lens=None,
+            prefix_kv_lens=None,
+            suffix_kv_lens=None,
+            local_attn_metadata=None,
+            prefix_scheduler_metadata=None,
+            # Biren Attention Params
+            seq_start_loc=seq_start_loc,
+            context_lens=context_lens_tensor,
+            max_decode_seq_len=max_decode_seq_len,
+            num_prefills=self.num_prefills,
+            num_decodes=num_decode_tokens,
+            num_prefills_tokens=self.num_prefill_tokens,
+            do_cache=False)
+
+
+class SUPAFlashAttentionImpl(AttentionImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[list[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        use_irope: bool = False,
+    ) -> None:
+        if blocksparse_params is not None:
+            raise ValueError(
+                "FlashAttention does not support block-sparse attention.")
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_kv_heads
+        self.attn_type = attn_type
+        if alibi_slopes is not None:
+            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
+        self.alibi_slopes = alibi_slopes
+        if sliding_window is None:
+            self.sliding_window = (-1, -1)
+        else:
+            self.sliding_window = (sliding_window - 1, 0)
+        self.kv_cache_dtype = kv_cache_dtype
+        if logits_soft_cap is None:
+            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
+            logits_soft_cap = 0
+        self.logits_soft_cap = logits_soft_cap
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        support_head_sizes = SUPAFlashAttentionBackend.get_supported_head_sizes(
+        )
+        if head_size not in support_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by FlashAttention. "
+                f"Supported head sizes are: {support_head_sizes}. "
+                "Set VLLM_USE_V1=1 to use another attention backend.")
+
+        self.use_irope = use_irope
+        self.vllm_flash_attn_version = get_flash_attn_version()
+        if is_quantized_kv_cache(self.kv_cache_dtype) \
+            and not flash_attn_supports_fp8():
+            raise NotImplementedError(
+                "FlashAttention does not support fp8 kv-cache on this device.")
+
+    def forward(
+        self,
+        layer: torch.nn.Module,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: SUPAFlashAttentionMetadata,
+        output: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward pass with FlashAttention.
+
+        Args:
+            query: shape = [num_tokens, num_heads, head_size]
+            key: shape = [num_tokens, num_kv_heads, head_size]
+            value: shape = [num_tokens, num_kv_heads, head_size]
+            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        NOTE: FP8 quantization, flash-attn expect the size of
+              {q,k,v}_descale to be (num_sequences, num_kv_heads).
+              We use torch's .expand() to avoid duplicating values
+        """
+        assert output is None, "Output tensor should not provided."
+        if attn_metadata is None:
+            # FIXME: this may lead to wrong block estimatation
+            # Profiling run.
+            return query
+
+        # NOTE: supa attn use [batch_size, num_tokens, num_heads * head_size] as shape
+        if kv_cache is not None and attn_metadata.do_cache:
+            torch_br.supa_kvcache_store_infer_v2(
+                kv_cache,
+                key,
+                value,  # type: ignore
+                attn_metadata.slot_mapping,
+                self.head_size)
+
+        output_prefill = output_decode = None
+        output = torch.empty_like(query)
+
+        if attn_metadata.do_prefill and attn_metadata.do_decode:
+            # chunked
+            decode_query = query[:, attn_metadata.num_prefills_tokens:]
+            query = query[:, :attn_metadata.num_prefills_tokens]
+
+            key = key[:, :attn_metadata.num_prefills_tokens]
+            value = value[:, :attn_metadata.num_prefills_tokens]
+        elif attn_metadata.do_decode:
+            decode_query = query
+
+        if attn_metadata.do_prefill:
+            if (kv_cache is None or attn_metadata.block_table.numel() == 0):
+                # has do_decode should go into prefix-enabled branch
+                assert not attn_metadata.do_decode
+
+                # in this branch, query_start_loc = seq_start_loc
+                if os.getenv('USE_BR_SUEAGER_SDPA',
+                             'False').lower() not in {'false', '0', ''}:
+                    output_prefill, inter_mediate = torch_br.sueager_scaled_dot_product_attention_fwd(
+                        query=query,
+                        key=key,
+                        value=value,
+                        mask=None,
+                        dropout_prob=0.0,
+                        is_causal=_get_causal_option(self.attn_type),
+                        scale=self.scale,
+                        algorithm="FMHA",
+                    )
+                    output_prefill = torch_br.supa_shape_transform_qkv(
+                        output_prefill, 1, query.shape[1], self.num_kv_heads,
+                        self.head_size)
+                else:
+                    output_prefill = torch_br.supa_flash_attention_infer(  # type: ignore
+                        query,
+                        key,
+                        value,
+                        attn_metadata.query_start_loc,
+                        self.head_size,
+                        len(attn_metadata.query_start_loc),  # type: ignore
+                        self.alibi_slopes,
+                        softmax_scale=self.scale,
+                        is_causal=_get_causal_option(self.attn_type))
+            else:
+                # prefix-enabled attention
+                output_prefill = torch_br.supa_flash_attn_cache_infer(  # type: ignore
+                    query,
+                    kv_cache,
+                    attn_metadata.query_start_loc,
+                    attn_metadata.seq_start_loc,
+                    attn_metadata.block_table,
+                    attn_metadata.context_lens,
+                    attn_metadata.slot_mapping,
+                    attn_metadata.max_seq_len,
+                    self.head_size,
+                    self.alibi_slopes,
+                    softmax_scale=self.scale)
+
+        if attn_metadata.do_decode:
+            output_decode = torch_br.supa_attention_decoder_infer_v2(  # type: ignore
+                decode_query,  # type: ignore
+                kv_cache,
+                attn_metadata.block_table,
+                attn_metadata.seq_lens,
+                attn_metadata.max_decode_seq_len,
+                self.head_size,
+                attn_metadata.num_prefills,
+                self.alibi_slopes,
+                softmax_scale=self.scale)
+
+        if attn_metadata.do_prefill and attn_metadata.do_decode:
+            output[:, :attn_metadata.num_prefills_tokens] = output_prefill
+            output[:, attn_metadata.num_prefills_tokens:] = output_decode
+        elif attn_metadata.do_prefill:
+            output = output_prefill
+        else:
+            output = output_decode
+
+        return output
+
+
+def _get_causal_option(attn_type: str) -> bool:
+    """
+    Determine whether the given attention type is suitable for causal
+    attention mechanisms.
+
+    Args:
+        attn_type (AttentionType): The type of attention being evaluated
+
+    Returns:
+        bool: Returns `True` if the attention type is suitable for causal
+        attention (i.e., not encoder, encoder-only, or encoder-decoder),
+        otherwise returns `False`.
+    """
+    return not (attn_type == AttentionType.ENCODER
+                or attn_type == AttentionType.ENCODER_ONLY
+                or attn_type == AttentionType.ENCODER_DECODER)
--- a/vllm_br/v0/worker/init.py
+++ b/vllm_br/v0/worker/init.py
@@ -0,0 +1,15 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
--- a/vllm_br/v0/worker/pycache/init.cpython-310.pyc
+++ b/vllm_br/v0/worker/pycache/init.cpython-310.pyc
--- a/vllm_br/v0/worker/pycache/pooling_model_runner.cpython-310.pyc
+++ b/vllm_br/v0/worker/pycache/pooling_model_runner.cpython-310.pyc
--- a/vllm_br/v0/worker/pycache/worker.cpython-310.pyc
+++ b/vllm_br/v0/worker/pycache/worker.cpython-310.pyc
--- a/vllm_br/v0/worker/pooling_model_runner.py
+++ b/vllm_br/v0/worker/pooling_model_runner.py
@@ -0,0 +1,255 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+import torch
+
+from vllm.config import VllmConfig
+from vllm.distributed import get_pp_group
+from vllm.forward_context import set_forward_context
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.multimodal import MultiModalKwargs
+from vllm.pooling_params import PoolingParams
+from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
+                           SequenceGroupMetadata)
+from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPU,
+                                      ModelInputForGPUBuilder)
+
+
+@dataclasses.dataclass(frozen=True)
+class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU):
+    """
+    Model input used by the PoolingModelRunner.
+
+    Extends `ModelInputForGPU` with one extra, optional field that carries
+    the pooling metadata handed to the model's pooler.
+    """
+    # Pooling metadata for this batch; defaults to None when not supplied.
+    pooling_metadata: Optional["PoolingMetadata"] = None
+
+
+class PoolingModelRunner(
+        GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]):
+    _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = (
+        ModelInputForGPUWithPoolingMetadata)
+    _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        kv_cache_dtype: Optional[str] = "auto",
+        is_driver_worker: bool = False,
+    ):
+        super().__init__(vllm_config=vllm_config,
+                         kv_cache_dtype=kv_cache_dtype,
+                         is_driver_worker=is_driver_worker)
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForGPUWithPoolingMetadata,
+        kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        num_steps: int = 1,
+    ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]:
+        if num_steps > 1:
+            raise ValueError(
+                "PoolingModelRunner does not support multi-step execution.")
+
+        if self.lora_config:
+            assert model_input.lora_requests is not None
+            assert model_input.lora_mapping is not None
+            self.set_active_loras(model_input.lora_requests,
+                                  model_input.lora_mapping)
+
+        if self.prompt_adapter_config:
+            assert model_input.prompt_adapter_requests is not None
+            assert model_input.prompt_adapter_mapping is not None
+            self.set_active_prompt_adapters(
+                model_input.prompt_adapter_requests,
+                model_input.prompt_adapter_mapping)
+
+        # Currently cuda graph is only supported by the decode phase.
+        assert model_input.attn_metadata is not None
+        prefill_meta = model_input.attn_metadata.prefill_metadata if hasattr(
+            model_input.attn_metadata, 'prefill_metadata') else None
+        decode_meta = model_input.attn_metadata.decode_metadata if hasattr(
+            model_input.attn_metadata, 'decode_metadata') else None
+        virtual_engine = model_input.virtual_engine
+        # Pooling models are (ab-)used also to integrate non text models that
+        # are not autoregressive (PrithviGeosaptialMAE).
+        # These model might not use attention and do not really have a prefill
+        # and decode phase. The model input is processed in one shot and both
+        # decode_metadata and prefill_metadata would be None for such models.
+        # See the PlaceholderAttentionMetadata class.
+        # TODO: Figure out if cuda_graph is of any use for these models and
+        #  explore how to leverage it.
+        if (prefill_meta is None and decode_meta is not None
+                and decode_meta.use_cuda_graph):
+            if model_input.inputs_embeds is None:
+                assert model_input.input_tokens is not None
+                graph_batch_size = model_input.input_tokens.shape[0]
+                model_executable = (
+                    self.graph_runners[model_input.virtual_engine][(
+                        graph_batch_size, False)])
+            else:
+                graph_batch_size = model_input.inputs_embeds.shape[0]
+                model_executable = (
+                    self.graph_runners[model_input.virtual_engine][(
+                        graph_batch_size, True)])
+        else:
+            model_executable = self.model
+
+        multi_modal_kwargs = model_input.multi_modal_kwargs or {}
+        seqlen_agnostic_kwargs = {
+            "finished_requests_ids": model_input.finished_requests_ids,
+            "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
+        } if self.has_inner_state else {}
+        if (self.observability_config is not None
+                and self.observability_config.collect_model_forward_time):
+            model_forward_start = torch.cuda.Event(enable_timing=True)
+            model_forward_end = torch.cuda.Event(enable_timing=True)
+            model_forward_start.record()
+
+        cross_enc_kwargs = {}
+        if model_input.token_types is not None:
+            cross_enc_kwargs["token_type_ids"] = model_input.token_types
+
+        import os
+        use_graph = bool(
+            os.getenv('ENABLE_VLLM_BR_GRAPH_MODE',
+                      'False').lower() not in {'false', '0', ''}
+            and model_input.input_tokens.shape[0] % 256 == 0)
+        if use_graph:
+            batch_size = int(model_input.input_tokens.shape[0] / 256)
+            self.model_input_in = self.graph_inputs.get(batch_size)
+            graph = self.graphs.get(batch_size)
+            if graph is None or self.model_input_in is None:
+                use_graph = False
+                # logger.info(f"!!! No graph captured for batch_size={batch_size}, fallback to normal execution")
+        if use_graph:
+            # logger.info(f"use graph captured for batch_size={batch_size}")
+            # Copy the input tensors to the input buffers.
+            self.model_input_in.input_tokens.copy_(model_input.input_tokens,
+                                                   non_blocking=True)
+            self.model_input_in.input_positions.copy_(
+                model_input.input_positions, non_blocking=True)
+            # self.intermediate_tensors.copy_(intermediate_tensors) if intermediate_tensors is not None else None
+            self.default_stream.record_event(self.copy_done_event)
+
+            with torch.supa.stream(self.graph_stream):
+                self.graph_stream.wait_event(self.copy_done_event)
+                graph.replay()
+                self.graph_stream.record_event(self.graph_done_event)
+            self.default_stream.wait_event(self.graph_done_event)
+            hidden_or_intermediate_states = self.graph_outputs.get(batch_size)
+        else:
+            with set_forward_context(model_input.attn_metadata,
+                                     self.vllm_config, virtual_engine):
+                hidden_or_intermediate_states = model_executable(
+                    input_ids=model_input.input_tokens,
+                    positions=model_input.input_positions,
+                    intermediate_tensors=intermediate_tensors,
+                    **MultiModalKwargs.as_kwargs(
+                        multi_modal_kwargs,
+                        dtype=self.model_config.dtype,
+                        device=self.device,
+                    ),
+                    **cross_enc_kwargs,
+                    **seqlen_agnostic_kwargs,
+                )
+
+        if (self.observability_config is not None
+                and self.observability_config.collect_model_forward_time):
+            model_forward_end.record()
+
+        # Only perform pooling in the last pipeline stage.
+        if not get_pp_group().is_last_rank:
+            if (self.is_driver_worker
+                    and hidden_or_intermediate_states is not None
+                    and isinstance(hidden_or_intermediate_states,
+                                   IntermediateTensors)
+                    and self.observability_config is not None
+                    and self.observability_config.collect_model_forward_time):
+                model_forward_end.synchronize()
+                model_forward_time = model_forward_start.elapsed_time(
+                    model_forward_end)
+                orig_model_forward_time = 0.0
+                if intermediate_tensors is not None:
+                    orig_model_forward_time = intermediate_tensors.tensors.get(
+                        "model_forward_time", torch.tensor(0.0)).item()
+                hidden_or_intermediate_states.tensors["model_forward_time"] = (
+                    torch.tensor(model_forward_time + orig_model_forward_time))
+            return hidden_or_intermediate_states
+
+        # Only perform pooling in the driver worker.
+        if not self.is_driver_worker:
+            return []
+
+        return [
+            self.model.pooler(hidden_states=hidden_or_intermediate_states,
+                              pooling_metadata=model_input.pooling_metadata)
+        ]
+
+    def make_model_input_from_broadcasted_tensor_dict(
+            self,
+            tensor_dict: Dict[str,
+                              Any]) -> ModelInputForGPUWithPoolingMetadata:
+        return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        )
+
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForGPUWithPoolingMetadata:
+        assert seq_group_metadata_list is not None
+        model_input = self._prepare_model_input_tensors(
+            seq_group_metadata_list, finished_requests_ids)
+        # Prepare PoolingMetadata.
+        assert model_input.seq_lens is not None
+        pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
+                                                 model_input.seq_lens)
+
+        return dataclasses.replace(model_input,
+                                   pooling_metadata=pooling_metadata)
+
+    def _prepare_pooling(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        prompt_lens: List[int],
+    ) -> PoolingMetadata:
+        """Prepare PoolingMetadata for the sequence group metadata list."""
+        seq_groups: List[Tuple[List[int], PoolingParams]] = []
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            pooling_params = seq_group_metadata.pooling_params
+            seq_groups.append((seq_ids, pooling_params))
+
+        seq_data: Dict[int, SequenceData] = {}
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_data.update(seq_group_metadata.seq_data)
+
+        pooling_metadata = PoolingMetadata(
+            seq_groups=seq_groups,
+            seq_data=seq_data,
+            prompt_lens=prompt_lens,
+        )
+
+        return pooling_metadata
--- a/vllm_br/v0/worker/worker.py
+++ b/vllm_br/v0/worker/worker.py
@@ -0,0 +1,720 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+"""A GPU worker class."""
+import gc
+import os
+from typing import Optional  # SPDX-License-Identifier: Apache-2.0
+from typing import Dict, List, Set, Tuple, Type, Union
+
+import torch
+import torch_br
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment,
+                              set_custom_all_reduce)
+from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
+from vllm.distributed.parallel_state import get_world_group
+from vllm.forward_context import set_forward_context
+from vllm.logger import logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+from vllm.multimodal import MultiModalKwargs
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
+                           SequenceGroupMetadata, SequenceGroupMetadataDelta)
+from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
+                        memory_profiling)
+from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
+from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
+                                     WorkerInput)
+from vllm_br.platform import SUPAPlatform
+from vllm_br.v0.attention.backends.attention_v0 import (
+    SUPAFlashAttentionMetadata)
+from vllm_br.v0.worker.pooling_model_runner import (
+    ModelInputForGPUWithPoolingMetadata, PoolingModelRunner)
+
+# Number of un-captured forward passes SUPAWorker.load_model runs for each
+# batch size before it captures a graph.
+_NUM_WARMUP_ITERS = 2
+
+
+def build_batch_input(batch_size, seq_len=256, device="supa"):
+    input_tokens = torch.cat([
+        torch.randint(0, 200, (seq_len, ), device=device)
+        for _ in range(batch_size)
+    ])
+    input_positions = torch.arange(seq_len, device=device).repeat(batch_size)
+    seq_lens = [seq_len] * batch_size
+    query_lens = [seq_len] * batch_size
+    query_start_loc = torch.tensor(
+        [i * seq_len for i in range(batch_size + 1)],
+        dtype=torch.int32,
+        device=device)
+    seq_start_loc = [i * seq_len for i in range(batch_size + 1)]
+    context_lens = torch.zeros(batch_size, dtype=torch.int32, device=device)
+    slot_mapping = torch.full((batch_size * seq_len, ),
+                              -1,
+                              dtype=torch.int32,
+                              device=device)
+
+    attn_metadata = SUPAFlashAttentionMetadata(
+        num_actual_tokens=batch_size * seq_len,
+        max_query_len=seq_len,
+        query_start_loc=query_start_loc,
+        max_seq_len=seq_len,
+        seq_lens=seq_lens,
+        seq_lens_tensor=torch.tensor(seq_lens,
+                                     dtype=torch.int32,
+                                     device=device),
+        block_table=torch.empty((batch_size, 0), dtype=torch.int32),
+        slot_mapping=slot_mapping,
+        seq_start_loc=seq_start_loc,
+        context_lens=context_lens,
+        max_decode_seq_len=0,
+        num_prefills=batch_size,
+        num_decodes=0,
+        num_prefills_tokens=batch_size * seq_len,
+        do_cache=False,
+        use_cascade=False,
+        common_prefix_len=0,
+        cu_prefix_query_lens=None,
+        prefix_kv_lens=None,
+        suffix_kv_lens=None,
+        scheduler_metadata=0,
+        prefix_scheduler_metadata=None,
+        _cached_prefill_metadata=None,
+        _cached_decode_metadata=None,
+        local_attn_metadata=None)
+
+    # build ModelInputForGPUWithPoolingMetadata
+    model_input = ModelInputForGPUWithPoolingMetadata(
+        input_tokens=input_tokens,
+        inputs_embeds=None,
+        input_positions=input_positions,
+        token_types=None,
+        seq_lens=seq_lens,
+        query_lens=query_lens,
+        lora_mapping=None,
+        lora_requests=set(),
+        attn_metadata=attn_metadata,
+        prompt_adapter_mapping=None,
+        prompt_adapter_requests=set(),
+        multi_modal_kwargs={},
+        request_ids_to_seq_ids={f'embd-{i}': [i]
+                                for i in range(batch_size)},
+        finished_requests_ids=[],
+        virtual_engine=0,
+        async_callback=None,
+        scheduler_outputs=None,
+        previous_hidden_states=None,
+        pooling_metadata=None)
+    return model_input
+
+
+class SUPAWorker(LocalOrDistributedWorkerBase):
+    """A worker class that executes (a partition of) the model on a GPU.
+
+    Each worker is associated with a single GPU. The worker is responsible for
+    maintaining the KV cache and executing the model on the GPU. In case of
+    distributed inference, each worker is assigned a partition of the model.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        is_driver_worker: bool = False,
+        model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
+    ) -> None:
+        WorkerBase.__init__(self, vllm_config)
+        self.parallel_config.rank = rank
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+        self.is_driver_worker = is_driver_worker
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
+
+        # Return hidden states from target model if the draft model is an
+        # mlp_speculator
+        speculative_config = self.speculative_config
+        model_config = self.model_config
+        speculative_args = {} if speculative_config is None \
+            or (speculative_config.draft_model_config.hf_config.model_type ==
+                model_config.hf_config.model_type) \
+            or (speculative_config.draft_model_config.hf_config.model_type
+                not in ("medusa",
+                        "mlp_speculator",
+                        "eagle",
+                        "deepseek_mtp",
+                         "mimo_mtp")) \
+                    else {"return_hidden_states": True}
+
+        ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
+        if model_config.runner_type == "pooling":
+            ModelRunnerClass = PoolingModelRunner
+        elif self.model_config.is_encoder_decoder:
+            ModelRunnerClass = EncoderDecoderModelRunner
+        self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
+            vllm_config=self.vllm_config,
+            kv_cache_dtype=self.cache_config.cache_dtype,
+            is_driver_worker=is_driver_worker,
+            **speculative_args,
+        )
+        if model_runner_cls is not None:
+            self.model_runner = model_runner_cls(self.model_runner)
+
+        # Uninitialized cache engine. Will be initialized by
+        # initialize_cache.
+        self.cache_engine: List[CacheEngine]
+        # Initialize gpu_cache as pooling models don't initialize kv_caches
+        self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
+        self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}
+
+        # Buffers saved before sleep
+        self._sleep_saved_buffers: Dict[str, torch.Tensor] = {}
+
+        # Torch profiler. Enabled and configured through env vars:
+        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
+        if envs.VLLM_TORCH_PROFILER_DIR:
+            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                torch_profiler_trace_dir,
+            )
+            self.profiler = torch.profiler.profile(
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    torch_profiler_trace_dir, use_gzip=True),
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.SUPA,  # type: ignore
+                ],
+                schedule=torch.profiler.schedule(wait=0,
+                                                 warmup=0,
+                                                 active=1,
+                                                 repeat=1),
+                profile_memory=False,
+                record_shapes=True,
+                with_stack=False,
+                use_supa_simple=True,  # type: ignore
+            )
+        else:
+            self.profiler = None
+
+    def start_profile(self):
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.start()
+
+    def stop_profile(self):
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.stop()
+
+    def sleep(self, level: int = 1) -> None:
+        raise NotImplementedError
+
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        raise NotImplementedError
+
+    def init_device(self):
+        if self.device_config.device.type == "supa":
+            self.device = torch.device(f"supa:{self.local_rank}")
+            SUPAPlatform.set_device(self.device)
+
+            _check_if_gpu_supports_dtype(self.model_config.dtype)
+            gc.collect()
+            SUPAPlatform.empty_cache()
+            self.init_gpu_memory = SUPAPlatform.mem_get_info()[0]
+            self.baseline_snapshot = MemorySnapshot()
+        else:
+            raise RuntimeError(
+                f"Not support device type: {self.device_config.device}")
+        # Initialize the distributed environment.
+        init_worker_distributed_environment(self.vllm_config, self.rank,
+                                            self.distributed_init_method,
+                                            self.local_rank)
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self):
+        """Load the model and optionally capture SUPA graphs.
+
+        After the model runner has loaded the model, and only when the
+        `ENABLE_VLLM_BR_GRAPH_MODE` environment variable is set to something
+        other than 'false', '0' or '' (case-insensitive), one graph is
+        captured for each batch size from 1 to 8 using a synthetic input of
+        256 tokens per sequence. The graphs, their input buffers and their
+        outputs are stored on the model runner keyed by batch size.
+
+        Raises:
+            NotImplementedError: If sleep mode is enabled in the model
+                config.
+        """
+        if self.vllm_config.model_config.enable_sleep_mode:
+            raise NotImplementedError('SUPA do not support sleep mode')
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()
+        with context:
+            self.model_runner.load_model()
+
+        ### capture graphs ###
+        if os.getenv('ENABLE_VLLM_BR_GRAPH_MODE',
+                     'False').lower() not in {'false', '0', ''}:
+            logger.info("Start capturing graphs...")
+            # Capture at most once per model runner.
+            if not hasattr(self.model_runner, "graph_captured"):
+                self.model_runner.graph_captured = False
+            if not self.model_runner.graph_captured:
+                # Support capturing graphs under multiple batch sizes.
+                batch_sizes = [1, 2, 3, 4, 5, 6, 7, 8]
+                # All three dictionaries are keyed by batch size.
+                self.model_runner.graphs = {}
+                self.model_runner.graph_inputs = {}
+                self.model_runner.graph_outputs = {}
+                for bs in batch_sizes:
+                    # NOTE(review): streams and events are re-assigned on
+                    # every iteration, so after the loop the runner holds
+                    # the ones created for the last batch size.
+                    if self.model_runner.parallel_config.world_size != 1:
+                        # prevent SCCL capturing by using the same stream with SCCL
+                        self.model_runner.graph_stream = torch.distributed.get_group_stream(
+                            get_world_group().device_group)
+                    else:
+                        self.model_runner.graph_stream = torch_br.supa.Stream()
+
+                    self.model_runner.default_stream = torch.supa.default_stream(
+                    )
+                    self.model_runner.copy_done_event = torch_br.supa.Event()
+                    self.model_runner.graph_done_event = torch_br.supa.Event()
+                    graph = torch.supa.SUPAGraph()
+                    # Synthetic input whose tensors serve as the graph's
+                    # input buffers for this batch size.
+                    self.model_runner.model_input_in = build_batch_input(
+                        bs, seq_len=256, device=self.device)
+                    self.model_runner.intermediate_tensors = None
+
+                    model_executable = self.model_runner.model
+                    multi_modal_kwargs = self.model_runner.model_input_in.multi_modal_kwargs or {}
+                    seqlen_agnostic_kwargs = {
+                        "finished_requests_ids":
+                        self.model_runner.model_input_in.finished_requests_ids,
+                        "request_ids_to_seq_ids":
+                        self.model_runner.model_input_in.
+                        request_ids_to_seq_ids,
+                    } if self.model_runner.has_inner_state else {}
+
+                    cross_enc_kwargs = {}
+                    if self.model_runner.model_input_in.token_types is not None:
+                        cross_enc_kwargs[
+                            "token_type_ids"] = self.model_runner.model_input_in.token_types
+
+                    # Run the model a few times without capturing the graph.
+                    # This is to make sure that the captured graph does not include the
+                    # kernel launches for initial benchmarking (e.g., Triton autotune).
+                    # Note one iteration is not enough for torch.compile
+                    for _ in range(_NUM_WARMUP_ITERS):
+                        with set_forward_context(
+                                self.model_runner.model_input_in.attn_metadata,
+                                self.model_runner.vllm_config, self.
+                                model_runner.model_input_in.virtual_engine):
+                            model_executable(
+                                input_ids=self.model_runner.model_input_in.
+                                input_tokens,
+                                positions=self.model_runner.model_input_in.
+                                input_positions,
+                                intermediate_tensors=None,
+                                **MultiModalKwargs.as_kwargs(
+                                    multi_modal_kwargs,
+                                    dtype=self.model_runner.model_config.dtype,
+                                    device=self.model_runner.device,
+                                ),
+                                **cross_enc_kwargs,
+                                **seqlen_agnostic_kwargs,
+                            )
+                    # Wait for the warm up operations to finish before proceeding with
+                    # Graph Capture.
+                    torch.supa.synchronize()
+
+                    # Capture one forward pass on the graph stream.
+                    with torch.supa.graph(
+                            graph, stream=self.model_runner.graph_stream), \
+                         set_forward_context(
+                            self.model_runner.model_input_in.attn_metadata,
+                            self.model_runner.vllm_config, self.
+                            model_runner.model_input_in.virtual_engine):
+                        hidden_or_intermediate_states = model_executable(
+                            input_ids=self.model_runner.model_input_in.
+                            input_tokens,
+                            positions=self.model_runner.model_input_in.
+                            input_positions,
+                            intermediate_tensors=self.model_runner.
+                            intermediate_tensors,
+                            **MultiModalKwargs.as_kwargs(
+                                multi_modal_kwargs,
+                                dtype=self.model_runner.model_config.dtype,
+                                device=self.model_runner.device,
+                            ),
+                            **cross_enc_kwargs,
+                            **seqlen_agnostic_kwargs,
+                        )
+                    torch.supa.synchronize()
+                    # Store the graph with its input buffers and its output
+                    # so they can be looked up by batch size later.
+                    self.model_runner.graphs[bs] = graph
+                    self.model_runner.graph_inputs[
+                        bs] = self.model_runner.model_input_in
+                    self.model_runner.graph_outputs[
+                        bs] = hidden_or_intermediate_states
+                self.model_runner.graph_captured = True
+                logger.info("capturing graphs Done.")
+
+    def save_sharded_state(
+        self,
+        path: str,
+        pattern: Optional[str] = None,
+        max_size: Optional[int] = None,
+    ) -> None:
+        self.model_runner.save_sharded_state(
+            path,
+            pattern=pattern,
+            max_size=max_size,
+        )
+
+    def save_tensorized_model(
+        self,
+        tensorizer_config: TensorizerConfig,
+    ) -> None:
+        self.model_runner.save_tensorized_model(
+            tensorizer_config=tensorizer_config, )
+
+    @torch.inference_mode()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model to determine how many
+        KV blocks may be allocated without OOMs.
+
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculate the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
+        # Profile the memory usage of the model and get the maximum number of
+        # cache blocks that can be allocated with the remaining free memory.
+        SUPAPlatform.empty_cache()
+
+        _, total_gpu_memory = SUPAPlatform.mem_get_info()
+
+        # Execute a forward pass with dummy inputs to profile the memory usage
+        # of the model.
+        with memory_profiling(
+                self.baseline_snapshot,
+                weights_memory=self.model_runner.model_memory_usage) as result:
+            self.model_runner.profile_run()
+
+        self._assert_memory_footprint_increased_during_profiling()
+
+        memory_for_current_instance = total_gpu_memory * \
+            self.cache_config.gpu_memory_utilization
+        available_kv_cache_memory = (memory_for_current_instance -
+                                     result.non_kv_cache_memory)
+
+        # Calculate the number of blocks that can be allocated with the
+        # profiled peak memory.
+        cache_block_size = self.get_cache_block_size_bytes()
+        if cache_block_size == 0:
+            num_gpu_blocks = 0
+            num_cpu_blocks = 0
+        else:
+            num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
+            num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                                 cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
+
+        msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n"
+               "the current vLLM instance can use "
+               "total_gpu_memory "
+               f"({(total_gpu_memory / GiB_bytes):.2f}GiB)"
+               " x gpu_memory_utilization "
+               f"({self.cache_config.gpu_memory_utilization:.2f})"
+               f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n"
+               "model weights take "
+               f"{(result.weights_memory / GiB_bytes):.2f}GiB;"
+               " non_torch_memory takes "
+               f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;"
+               " PyTorch activation peak memory takes "
+               f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;"
+               " the rest of the memory reserved for KV Cache is "
+               f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.")
+
+        logger.info(msg)
+        # Final cleanup
+        gc.collect()
+
+        return num_gpu_blocks, num_cpu_blocks
+
+    def _assert_memory_footprint_increased_during_profiling(self):
+        # NOTE(woosuk): Here we assume that the other processes using the same
+        # GPU did not change their memory usage during the profiling.
+        free_gpu_memory, total = SUPAPlatform.mem_get_info()
+        supa_memory = total - free_gpu_memory
+        assert self.baseline_snapshot.supa_memory < supa_memory, (
+            "Error in memory profiling. "
+            f"Initial used memory {self.baseline_snapshot.supa_memory}, "
+            f"currently used memory {supa_memory}. "
+            f"This happens when the GPU memory was "
+            "not properly cleaned up before initializing the vLLM instance.")
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Allocate GPU and CPU KV cache with the specified number of blocks.
+
+        This also warms up the model, which may record CUDA graphs.
+        """
+        raise_if_cache_size_invalid(
+            num_gpu_blocks, self.cache_config.block_size,
+            self.cache_config.is_attention_free,
+            self.model_config.max_model_len,
+            self.parallel_config.pipeline_parallel_size)
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        if self.vllm_config.model_config.enable_sleep_mode:
+            raise NotImplementedError('SUPA do not support sleep mode')
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()
+        with context:
+            self._init_cache_engine()
+        self._warm_up_model()
+
+    def _init_cache_engine(self):
+        assert self.cache_config.num_gpu_blocks is not None
+        self.cache_engine = [
+            CacheEngine(self.cache_config, self.model_config,
+                        self.parallel_config, self.device_config)
+            for _ in range(self.parallel_config.pipeline_parallel_size)
+        ]
+        self.gpu_cache = [
+            self.cache_engine[ve].gpu_cache
+            for ve in range(self.parallel_config.pipeline_parallel_size)
+        ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.gpu_cache)
+
+    def _warm_up_model(self) -> None:
+        # warm up sizes that are not in cudagraph capture sizes,
+        # but users still want to compile for better performance,
+        # e.g. for the max-num-batched token size in chunked prefill.
+        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
+        if not self.model_config.enforce_eager:
+            warmup_sizes = [
+                x for x in warmup_sizes
+                if x not in self.vllm_config.cuda_graph_sizes
+            ]
+        for size in sorted(warmup_sizes, reverse=True):
+            logger.info("Compile and warming up model for size %d", size)
+            self.model_runner._dummy_run(size)
+        if not self.model_config.enforce_eager:
+            self.model_runner.capture_model(self.gpu_cache)
+        # Reset the seed to ensure that the random state is not affected by
+        # the model initialization and profiling.
+        set_random_seed(self.model_config.seed)
+
+    @property
+    def do_metadata_broadcast(self) -> bool:
+        return self.parallel_config.tensor_parallel_size > 1
+
+    @property
+    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
+        return self.gpu_cache
+
+    @torch.inference_mode()
+    def prepare_worker_input(
+            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        virtual_engine = execute_model_req.virtual_engine
+        num_steps = execute_model_req.num_steps
+        num_seq_groups = len(execute_model_req.seq_group_metadata_list)
+        # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors.
+        # they contain parameters to launch cudamemcpyasync.
+        blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in,
+                                         device="cpu",
+                                         dtype=torch.int64).view(-1, 2)
+        blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out,
+                                          device="cpu",
+                                          dtype=torch.int64).view(-1, 2)
+        # `blocks_to_copy` is a gpu tensor. The src and tgt of
+        # blocks to copy are in the same device, and `blocks_to_copy`
+        # can be used directly within cuda kernels.
+        blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
+                                      device=self.device,
+                                      dtype=torch.int64).view(-1, 2)
+
+        return WorkerInput(
+            num_seq_groups=num_seq_groups,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+            virtual_engine=virtual_engine,
+            num_steps=num_steps,
+        )
+
+    @torch.inference_mode()
+    def execute_worker(self, worker_input: WorkerInput) -> None:
+        virtual_engine = worker_input.virtual_engine
+        # Issue cache operations.
+        if (worker_input.blocks_to_swap_in is not None
+                and worker_input.blocks_to_swap_in.numel() > 0):
+            self.cache_engine[virtual_engine].swap_in(
+                worker_input.blocks_to_swap_in)
+        if (worker_input.blocks_to_swap_out is not None
+                and worker_input.blocks_to_swap_out.numel() > 0):
+            self.cache_engine[virtual_engine].swap_out(
+                worker_input.blocks_to_swap_out)
+        if (worker_input.blocks_to_copy is not None
+                and worker_input.blocks_to_copy.numel() > 0):
+            self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy)
+
+    def _get_cached_seq_group_metadata(
+            self,
+            seq_group_metadata_list: List[Union[SequenceGroupMetadata,
+                                                SequenceGroupMetadataDelta]],
+            finished_request_ids: List[str]) -> List[SequenceGroupMetadata]:
+        """Return a list of cached Sequence Group Metadata after updating its
+        state.
+
+        It is used because scheduler only sends delta to workers to reduce
+        the data payload size. The function also cleans up cache based on
+        a given `finished_request_ids`.
+        """
+        new_seq_group_metadata_list = []
+        for metadata_or_delta in seq_group_metadata_list:
+            request_id = metadata_or_delta.request_id
+            if request_id not in self._seq_group_metadata_cache:
+                # The first prefill.
+                assert isinstance(metadata_or_delta, SequenceGroupMetadata)
+                self._seq_group_metadata_cache[request_id] = metadata_or_delta
+            else:
+                # The first prefill is already cached.
+                if isinstance(metadata_or_delta, SequenceGroupMetadataDelta):
+                    self._seq_group_metadata_cache[request_id].apply_delta(
+                        metadata_or_delta)
+                else:
+                    # If metadata snapshot is sent again, it is
+                    # preempted. Reset the cache because we need to start
+                    # from scratch.
+                    assert isinstance(metadata_or_delta, SequenceGroupMetadata)
+                    self._seq_group_metadata_cache[
+                        request_id] = metadata_or_delta
+
+            new_seq_group_metadata_list.append(
+                self._seq_group_metadata_cache[request_id])
+
+        # Clean up finished ids
+        for finished_id in finished_request_ids:
+            del self._seq_group_metadata_cache[finished_id]
+
+        return new_seq_group_metadata_list
+
+    def _execute_model_spmd(
+        self,
+        execute_model_req: ExecuteModelRequest,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+    ) -> Optional[List[SamplerOutput]]:
+        if execute_model_req is not None:
+            new_seq_group_metadata_list = self._get_cached_seq_group_metadata(
+                execute_model_req.seq_group_metadata_list,
+                execute_model_req.finished_requests_ids)
+
+            execute_model_req.seq_group_metadata_list = (
+                new_seq_group_metadata_list)
+        output = super()._execute_model_spmd(execute_model_req,
+                                             intermediate_tensors)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.model_runner.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.model_runner.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_runner.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.model_runner.list_loras()
+
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        return self.model_runner.add_prompt_adapter(prompt_adapter_request)
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.model_runner.remove_lora(prompt_adapter_id)
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.model_runner.pin_prompt_adapter(prompt_adapter_id)
+
+    def list_prompt_adapters(self) -> Set[int]:
+        return self.model_runner.list_prompt_adapters()
+
+    @property
+    def max_model_len(self) -> int:
+        return self.model_config.max_model_len
+
+    @property
+    def vocab_size(self) -> int:
+        return self.model_runner.vocab_size
+
+    def get_cache_block_size_bytes(self) -> int:
+        """Get the size of the KV cache block size in bytes.
+        """
+        return CacheEngine.get_cache_block_size(self.cache_config,
+                                                self.model_config,
+                                                self.parallel_config)
+
+
+def init_worker_distributed_environment(
+    vllm_config: VllmConfig,
+    rank: int,
+    distributed_init_method: Optional[str] = None,
+    local_rank: int = -1,
+) -> None:
+    """Initialize the distributed environment."""
+    parallel_config = vllm_config.parallel_config
+    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
+
+    init_distributed_environment(parallel_config.world_size, rank,
+                                 distributed_init_method, local_rank, "sccl")
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
+    ensure_kv_transfer_initialized(vllm_config)
+
+
+def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
+    # Check if the GPU supports the dtype.
+    # TODO: add checkers
+    return
+
+
+def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
+                                max_model_len, pipeline_parallel_size) -> None:
+    if is_attention_free and num_gpu_blocks != 0:
+        raise ValueError("No memory should be allocated for the cache blocks "
+                         f"for an attention-free model, but {num_gpu_blocks} "
+                         "blocks are allocated.")
+    if not is_attention_free and num_gpu_blocks <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+    max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
+    if not is_attention_free and max_model_len > max_seq_len:
+        raise ValueError(
+            f"The model's max seq len ({max_model_len}) "
+            "is larger than the maximum number of tokens that can be "
+            f"stored in KV cache ({max_seq_len}). Try increasing "
+            "`gpu_memory_utilization` or decreasing `max_model_len` when "
+            "initializing the engine.")