# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from dataclasses import dataclass from typing import TYPE_CHECKING, Any, ClassVar, Optional import numpy as np import torch from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType, is_quantized_kv_cache) from vllm.attention.layer import Attention from vllm.attention.ops.merge_attn_states import merge_attn_states # from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, # get_flash_attn_version) from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout) from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import cdiv from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: from vllm.v1.worker.gpu_model_runner import GPUModelRunner if current_platform.is_cuda(): from flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) logger = init_logger(__name__) def flash_attn_supports_fp8() -> bool: return False def get_flash_attn_version(): return None class FlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @classmethod def get_supported_dtypes(cls) -> list[torch.dtype]: return [torch.float16, torch.bfloat16] @classmethod def get_supported_head_sizes(cls) -> list[int]: return [32, 64, 80, 96, 112, 128, 160, 192, 224, 256] @classmethod def validate_head_size(cls, head_size: int) -> None: supported_head_sizes = cls.get_supported_head_sizes() if head_size not in supported_head_sizes: attn_type = cls.__name__.removesuffix("Backend") raise ValueError( f"Head size {head_size} is not supported by {attn_type}. " f"Supported head sizes are: {supported_head_sizes}. " "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " "FlexAttention backend which supports all head sizes.") @staticmethod def get_name() -> str: return "FLASH_ATTN_VLLM_V1" @staticmethod def get_impl_cls() -> type["FlashAttentionImpl"]: return FlashAttentionImpl @staticmethod def get_metadata_cls() -> type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]: return FlashAttentionMetadataBuilder @staticmethod def get_kv_cache_shape( num_blocks: int, block_size: int, num_kv_heads: int, head_size: int, ) -> tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod def get_kv_cache_stride_order() -> tuple[int, ...]: # NOTE When running disaggregated PD with NIXL, HND layout is used for # faster transfer. `stride_order` indicates the permutation that gets # us from `get_kv_cache_shape` to the actual memory layout we want. cache_layout = get_kv_connector_cache_layout() if cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: raise ValueError("Unknown cache layout format %s.", cache_layout) return stride_order @dataclass class FlashAttentionMetadata: # NOTE(sang): Definition of context_len, query_len, and seq_len. # |---------- N-1 iteration --------| # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| # |---------- context_len ----------| # |-------------------- seq_len ---------------------| # |-- query_len ---| num_actual_tokens: int # Number of tokens excluding padding. max_query_len: int query_start_loc: torch.Tensor max_seq_len: int seq_lens: torch.Tensor # For handling prefill decode split num_decodes: int num_decode_tokens: int decode_query_start_loc: torch.Tensor decode_max_seq_len: int decode_seq_lens: torch.Tensor decode_block_table: torch.Tensor num_prefills: int num_prefill_tokens: int prefill_query_start_loc: torch.Tensor prefill_max_seq_len: int prefill_seq_lens: torch.Tensor prefill_block_table: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor # For cascade attention. use_cascade: bool common_prefix_len: int cu_prefix_query_lens: Optional[torch.Tensor] prefix_kv_lens: Optional[torch.Tensor] suffix_kv_lens: Optional[torch.Tensor] # Optional aot scheduling scheduler_metadata: Optional[torch.Tensor] = None prefix_scheduler_metadata: Optional[torch.Tensor] = None # for local attention @dataclass class LocalAttentionMetadata: local_query_start_loc: torch.Tensor local_seqused_k: torch.Tensor local_block_table: torch.Tensor local_max_query_len: int local_max_seq_len: int local_scheduler_metadata: Optional[torch.Tensor] local_attn_metadata: Optional[LocalAttentionMetadata] = None # # Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into # local attention blocks, where each block is passed to the attention kernel # as an independent local ("virtual") batch item. # # For example, if are performing a chunked prefill a batch of 3 sequences: # q_seqlens = [4, 10, 5] # kv_seqlens = [6, 17, 9] # Then normally for regular attention we would compute with an attention mask # for batch idx 0 (q_seqlens = 4, kv_seqlens = 6) like: # batch idx: 0 (q_seqlens = 4, kv_seqlens = 6) # k_toks > 0 1 2 3 4 5 # q_toks v _____________ # 0 | 1 1 1 # 1 | 1 1 1 1 # 2 | 1 1 1 1 1 # 3 | 1 1 1 1 1 1 # # for local attention (with attn_chunk_size = 4) we would compute with an # attention mask like: # batch idx: 0 (q_seqlens = 4, kv_seqlens = 6, attn_chunk_size = 4) # k_toks > 0 1 2 3 4 5 # q_toks v _____________ # 0 | 1 1 1 # 1 | 1 1 1 1 # 2 | 1 # 3 | 1 1 # # We can simulate this mask using standard flash-attention by breaking the # sequences into local ("virtual") batches, where each local batch item is a # local attention block, so in this case batch idx 0 would be broken up into: # # local-batch idx: 0 (q_seqlens = 2, kv_seqlens = 4) (batch 0) # k_toks > 0 1 2 3 # q_toks v _____________ # 0 | 1 1 1 # 1 | 1 1 1 1 # local-batch idx: 1 (q_seqlens = 2, kv_seqlens = 2) (batch 0) # k_toks > 4 5 # q_toks v _____________ # 2 | 1 # 3 | 1 1 # # e.g. if we have: # attn_chunk_size = 4 # query_start_loc_np = [0, 4, 14, 19] (q_seqlens = [4, 10, 5]) # Then this function would return: # __b0__ ______b1______ __b2__ < orig batch indices # q_seqlens_local = [ 2, 2, 1, 4, 4, 1, 4, 1] # cu_seqlens_q_local = [0, 4, 6, 10, 14, 18, 19, 23, 24] # seqlens_k_local = [ 4, 2, 4, 4, 4, 1, 4, 1] # block_table_local : shape[local_virtual_batches, pages_per_local_batch] def make_local_attention_virtual_batches( attn_chunk_size: int, query_start_loc_np: np.ndarray, seq_lens_np: np.ndarray, block_table: torch.Tensor, block_size: int = 0, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, torch.Tensor]: q_seqlens = query_start_loc_np[1:] - query_start_loc_np[:-1] actual_batch_size = seq_lens_np.shape[0] # Handle if we are starting in the middle of a local attention block, # we assume q_seqlens > 0 (for all elements), for each batch idx we compute # the number of tokens that are not in the first local attention block and # then we can simply use a cdiv for the rest. # For example if we have: # attn_chunk_size = 4 # q_seqlens = [4, 10, 5] # k_seqlens = [6, 17, 9] # Then we would get: # new_tokens_in_first_block = [2, 1, 4] # local_blocks = [2, 4, 2] q_tokens_in_first_block = np.minimum( attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size), q_seqlens).astype(np.int32) tokens_in_last_block = attn_chunk_size + (seq_lens_np % -attn_chunk_size) local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block, attn_chunk_size) # Once we know the number of local blocks we can compute the request spans # for each batch idx, we can figure out the number of "virtual" requests we # have to make, # For the above example we would get: # seqlens_q_local = [2, 2, 1, 4, 4, 1, 4, 1] # # First Get batched arange. (E.g., [2, 4, 2] -> [0, 1, 0, 1, 2, 3, 0, 1]) # (TODO: max a utility to share this code with _prepare_inputs) # arange step 1. [2, 4, 2] -> [2, 6, 8] cu_num_blocks = np.cumsum(local_blocks) virtual_batches = cu_num_blocks[-1] # arange step 2. [2, 6, 8] -> [0, 0, 2, 2, 2, 2, 6, 6] block_offsets = np.repeat(cu_num_blocks - local_blocks, local_blocks) # arange step 3. [0, 1, 0, 1, 2, 3, 0, 1] arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets # also compute reverse arange (i.e. [1, 0, 3, 2, 1, 0, 1, 0]) rarange = np.repeat(local_blocks, local_blocks) - arange - 1 # Then we can compute the seqlens_q_local, handling the fact that the # first and last blocks could be partial seqlens_q_local = \ np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks) # set the first block since this may be a partial block seqlens_q_local[arange == 0] = q_tokens_in_first_block # set the remaining blocks seqlens_q_local[arange > 0] = np.minimum( seqlens_q_local - attn_chunk_size * (arange - 1), attn_chunk_size)[arange > 0] # convert from q_seqlens to cu_seqlens_q cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0))\ .astype(np.int32) # compute the seqlens_k_local, # basically a full local attention block for all but the last block in each # batch # For our example this will be: # seqlens_k_local = [4, 2, 4, 4, 4, 1, 4, 1] seqlens_k_local = np.full(cu_num_blocks[-1], attn_chunk_size, dtype=np.int32) seqlens_k_local[cu_num_blocks - 1] = tokens_in_last_block k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - \ (rarange * attn_chunk_size + \ np.repeat(tokens_in_last_block, local_blocks)) # For the example the local attention blocks start at: # _b0_ _____b1_____ _b2_ # k_seqstarts_absolute = [0, 4, 4, 8, 12, 16, 4, 8] block_starts = k_seqstarts_absolute // block_size assert attn_chunk_size % block_size == 0, \ f"attn_chunk_size {attn_chunk_size} is not " \ f"divisible by block_size {block_size}" pages_per_local_batch = attn_chunk_size // block_size # Create a block_table for the local attention blocks # For out example if we have a block-table like (assuming block_size=2): # block_table = [ # [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], < batch 0 # [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], < batch 1 # [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], < batch 2 # ] # Then for the local batches we would want a block-table like # block_table_local = [ # [ 0, 1 ], < local-batch 0, (batch 0, starting from k[0]) # [ 2, 3 ], < local-batch 1, (batch 0, starting from k[4]) # [ 12, 13 ], < local-batch 2, (batch 1, starting from k[4]) # [ 14, 15 ], < local-batch 3, (batch 1, starting from k[8]) # [ 16, 17 ], < local-batch 4, (batch 1, starting from k[12]) # [ 18, 19 ], < local-batch 5, (batch 1, starting from k[16]) # [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4]) # [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8]) # ] block_indices= np.broadcast_to( np.arange(pages_per_local_batch, dtype=np.int32), (virtual_batches, pages_per_local_batch)) \ + np.expand_dims(block_starts, axis=1) block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1) batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch) block_table_local = block_table[batch_indices, block_indices]\ .view(virtual_batches, -1) return seqlens_q_local, cu_seqlens_q_local, seqlens_k_local, \ block_table_local def _get_sliding_window_configs( vllm_config: VllmConfig) -> set[Optional[tuple[int, int]]]: """Get the set of all sliding window configs used in the model.""" sliding_window_configs: set[Optional[tuple[int, int]]] = set() layers = get_layers_from_vllm_config(vllm_config, Attention) for layer in layers.values(): assert isinstance(layer.impl, FlashAttentionImpl) sliding_window_configs.add(layer.impl.sliding_window) return sliding_window_configs class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): full_cudagraph_supported: ClassVar[bool] = True # Decode-only def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec, block_table: BlockTable): model_config = runner.model_config compilation_config = runner.vllm_config.compilation_config self.runner = runner self.num_heads_q = model_config.get_num_attention_heads( runner.parallel_config) self.num_heads_kv = model_config.get_num_kv_heads( runner.parallel_config) self.headdim = model_config.get_head_size() self.block_size = kv_cache_spec.block_size self.kv_cache_spec = kv_cache_spec self.block_table = block_table self.aot_schedule = (get_flash_attn_version() == 3) self.use_full_cuda_graph = compilation_config.full_cuda_graph # if self.use_full_cuda_graph and not self.aot_schedule: # raise ValueError("Full CUDA graph mode requires AOT scheduling, " # "which requires FlashAttention 3.") self.scheduler_metadata = torch.zeros(self.runner.max_num_reqs + 1, dtype=torch.int32, device=self.runner.device) # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. self.aot_sliding_window: Optional[tuple[int, int]] = None def reorder_batch(self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput") -> bool: # We now want to reorder the batch so that the "decode" requests are and # the front and the "prefill" requests are at the using the least amount # swaps possible. (NOTE for now we loosely use "decode" to mean requests # where attention is likely memory-bound and "prefill" to mean requests # where attention is likely compute-bound, TODO(lucas): figure out a # better naming here) decodes = [] prefills = [] num_decode_tokens = 0 num_prefill_tokens = 0 for i, req_id in enumerate(input_batch.req_ids): num_tokens = scheduler_output.num_scheduled_tokens[req_id] # for now treat 1 scheduled token as "decode" even if its not, # we should update this to something like < 8 in the future but # currently the decode run only supports num_tokens = 1 if num_tokens == 1: decodes.append(i) num_decode_tokens += num_tokens else: prefills.append(i) num_prefill_tokens += num_tokens # We hope that this is fairly minimal since decodes # should be around for a number of iterations so hopefully they are # relatively stationary (and new request are generally appended to the # persistent batch so already should be at the back) # To achieve this we loop over the decodes in descending order and # the prefills in ascending order. We swap decodes from the "back" # i.e. past where the last decode should be in the reodorered with # prefills from the front of the batch. # `decodes` and `prefills` are already in ascending order just based on # the above loop num_decodes = len(decodes) num_prefills = len(prefills) modified_batch = False for i in range(1, min(num_decodes, num_prefills) + 1): # If the decode is at the "back" of the batch, i, we can swap it # with the prefill closest to the front of the batch decode_idx = decodes[num_decodes - i] if decode_idx < num_decodes: break input_batch.swap_states(prefills[i - 1], decode_idx) modified_batch = True # Save for next `build` call # TODO(lucas): this is a bit of a hack, we should probably have a # better way of doing this self._num_decodes = num_decodes self._num_prefills = num_prefills self._num_decode_tokens = num_decode_tokens self._num_prefill_tokens = num_prefill_tokens return modified_batch def build_for_cudagraph_capture( self, common_attn_metadata: CommonAttentionMetadata) -> FlashAttentionMetadata: """ This method builds the metadata for full cudagraph capture. Currently, only decode is supported for full cudagraphs with MLA. """ m = common_attn_metadata assert m.num_reqs == m.num_actual_tokens, \ "MLA only supports decode-only full CUDAGraph capture. " \ "Make sure all cudagraph capture sizes <= max_num_seq." m.max_query_len = 1 # decode-only # Update state usually set in reorder_batch. self._num_decodes = m.num_reqs self._num_decode_tokens = m.num_actual_tokens self._num_prefills = 0 self._num_prefill_tokens = 0 return self.build(0, m) def build( self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata ) -> FlashAttentionMetadata: num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len assert self._num_decodes + self._num_prefills == num_reqs assert (self._num_decode_tokens + self._num_prefill_tokens == num_actual_tokens) max_seq_len = int(self.runner.seq_lens_np[:num_reqs].max()) query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table = self.block_table block_table_tensor = block_table.get_device_tensor()[:num_reqs] block_table.slot_mapping[:num_actual_tokens].copy_( block_table.slot_mapping_cpu[:num_actual_tokens], non_blocking=True) # Fill unused with -1. Needed for reshape_and_cache in full cuda graph # mode. block_table.slot_mapping[num_actual_tokens:].fill_(-1) slot_mapping = block_table.slot_mapping[:num_actual_tokens] # For handling prefill decode split if self._num_decodes > 0: decode_max_seq_len = int(self.runner.seq_lens_np[:self._num_decodes].max()) decode_query_start_loc = common_attn_metadata.query_start_loc[:self._num_decodes + 1] decode_seq_lens = common_attn_metadata.seq_lens[:self._num_decodes] decode_block_table_tensor = block_table.get_device_tensor()[:self._num_decodes] else: decode_max_seq_len = 0 decode_query_start_loc = None decode_seq_lens = None decode_block_table_tensor = None if self._num_prefills > 0: prefill_max_seq_len = int(self.runner.seq_lens_np[self._num_decodes:num_reqs].max()) prefill_query_start_loc = (common_attn_metadata.query_start_loc[self._num_decodes:num_reqs + 1] - common_attn_metadata.query_start_loc[self._num_decodes]) prefill_seq_lens = common_attn_metadata.seq_lens[self._num_decodes:num_reqs] prefill_block_table_tensor = block_table.get_device_tensor()[self._num_decodes:num_reqs] else: prefill_max_seq_len = 0 prefill_query_start_loc = None prefill_seq_lens = None prefill_block_table_tensor = None if self.aot_sliding_window is None: self.aot_sliding_window = (-1, -1) # For the AOT scheduler we need the sliding window value to be # constant for all layers to. We have to populate this on the first # build() call so the layers are constructed (cannot populate) # in __init__. if self.aot_schedule: sliding_window_configs = _get_sliding_window_configs( self.runner.vllm_config) if len(sliding_window_configs) == 1: sliding_window_config = sliding_window_configs.pop() if sliding_window_config is not None: self.aot_sliding_window = sliding_window_config elif len(sliding_window_configs) > 1: self.aot_schedule = False def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): # if self.aot_schedule: # return get_scheduler_metadata( # batch_size=batch_size, # max_seqlen_q=max_query_len, # max_seqlen_k=max_seq_len, # cache_seqlens=seqlens, # num_heads_q=self.num_heads_q, # num_heads_kv=self.num_heads_kv, # headdim=self.headdim, # page_size=self.block_size, # cu_seqlens_q=cu_query_lens, # causal=causal, # window_size=self.aot_sliding_window, # ) return None # for local attention local_attn_metadata = None if self.runner.attention_chunk_size is not None: seqlens_q_local_np, virt_q_cu_seqlens_np, virt_k_seqlens_np, \ virt_block_table_tensor = make_local_attention_virtual_batches( self.runner.attention_chunk_size, self.runner.query_start_loc_np[:num_reqs + 1], self.runner.seq_lens_np[:num_reqs], block_table_tensor, self.block_size, ) local_query_start_loc = torch.from_numpy(virt_q_cu_seqlens_np).to( self.runner.device, non_blocking=True) local_seqused_k = torch.from_numpy(virt_k_seqlens_np).to( self.runner.device, non_blocking=True) local_max_query_len = seqlens_q_local_np.max() local_max_seq_len = virt_k_seqlens_np.max() local_scheduler_metadata = schedule( batch_size=local_query_start_loc.shape[0] - 1, cu_query_lens=local_query_start_loc, max_query_len=local_max_query_len, seqlens=local_seqused_k, max_seq_len=local_max_seq_len, causal=True) local_attn_metadata = FlashAttentionMetadata.LocalAttentionMetadata( local_query_start_loc=local_query_start_loc, local_seqused_k=local_seqused_k, local_block_table=virt_block_table_tensor, local_max_query_len=local_max_query_len, local_max_seq_len=local_max_seq_len, local_scheduler_metadata=local_scheduler_metadata, ) use_cascade = common_prefix_len > 0 if use_cascade: cu_prefix_query_lens = torch.tensor([0, num_actual_tokens], dtype=torch.int32, device=self.runner.device) prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32, device=self.runner.device) suffix_kv_lens = (self.runner.seq_lens_np[:num_reqs] - common_prefix_len) suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to( self.runner.device) prefix_scheduler_metadata = schedule( batch_size=1, cu_query_lens=cu_prefix_query_lens, max_query_len=num_actual_tokens, seqlens=prefix_kv_lens, max_seq_len=common_prefix_len, causal=False) scheduler_metadata = schedule(batch_size=num_reqs, cu_query_lens=query_start_loc, max_query_len=max_query_len, seqlens=suffix_kv_lens, max_seq_len=max_seq_len - common_prefix_len, causal=True) else: cu_prefix_query_lens = None prefix_kv_lens = None suffix_kv_lens = None prefix_scheduler_metadata = None scheduler_metadata = schedule(batch_size=num_reqs, cu_query_lens=query_start_loc, max_query_len=max_query_len, seqlens=seq_lens, max_seq_len=max_seq_len, causal=True) # if self.use_full_cuda_graph: # assert scheduler_metadata is not None # n = scheduler_metadata.shape[0] # self.scheduler_metadata[:n].copy_(scheduler_metadata, # non_blocking=True) # # NOTE(woosuk): We should zero out the rest of the scheduler # # metadata to guarantee the correctness. Otherwise, some thread # # blocks may use the invalid scheduler metadata and overwrite the # # output buffer. # self.scheduler_metadata[n:] = 0 # scheduler_metadata = self.scheduler_metadata[:n] attn_metadata = FlashAttentionMetadata( num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, query_start_loc=query_start_loc, max_seq_len=max_seq_len, seq_lens=seq_lens, # For handling prefill decode split num_decodes=self._num_decodes, num_decode_tokens=self._num_decode_tokens, decode_query_start_loc=decode_query_start_loc, decode_max_seq_len=decode_max_seq_len, decode_seq_lens=decode_seq_lens, decode_block_table=decode_block_table_tensor, num_prefills=self._num_prefills, num_prefill_tokens=self._num_prefill_tokens, prefill_query_start_loc=prefill_query_start_loc, prefill_max_seq_len=prefill_max_seq_len, prefill_seq_lens=prefill_seq_lens, prefill_block_table=prefill_block_table_tensor, block_table=block_table_tensor, slot_mapping=slot_mapping, use_cascade=use_cascade, common_prefix_len=common_prefix_len, scheduler_metadata=scheduler_metadata, cu_prefix_query_lens=cu_prefix_query_lens, prefix_kv_lens=prefix_kv_lens, suffix_kv_lens=suffix_kv_lens, local_attn_metadata=local_attn_metadata, prefix_scheduler_metadata=prefix_scheduler_metadata, ) return attn_metadata def can_run_in_cudagraph( self, common_attn_metadata: CommonAttentionMetadata) -> bool: return common_attn_metadata.max_query_len == 1 def use_cascade_attention(self, *args, **kwargs) -> bool: return use_cascade_attention(*args, **kwargs) class FlashAttentionImpl(AttentionImpl): def __init__( self, num_heads: int, head_size: int, scale: float, num_kv_heads: int, alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, sinks: Optional[torch.Tensor] = None, use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) self.num_kv_heads = num_kv_heads if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes if sliding_window is None: self.sliding_window = (-1, -1) elif attn_type == AttentionType.ENCODER_ONLY: self.sliding_window = (sliding_window - 1, sliding_window - 1) else: self.sliding_window = (sliding_window - 1, 0) self.kv_cache_dtype = kv_cache_dtype if logits_soft_cap is None: # In flash-attn, setting logits_soft_cap as 0 means no soft cap. logits_soft_cap = 0 self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name self.num_queries_per_kv = self.num_heads // self.num_kv_heads FlashAttentionBackend.validate_head_size(head_size) if attn_type not in [ AttentionType.DECODER, AttentionType.ENCODER_ONLY ]: raise NotImplementedError("Encoder/decoder cross-attention " "is not implemented for " "FlashAttentionImpl") self.use_irope = use_irope self.attn_type = attn_type self.vllm_flash_attn_version = get_flash_attn_version() if is_quantized_kv_cache(self.kv_cache_dtype) \ and not flash_attn_supports_fp8(): raise NotImplementedError( "FlashAttention does not support fp8 kv-cache on this device.") self.sinks = sinks if self.sinks is not None: assert self.vllm_flash_attn_version == 3, ( "Sinks are only supported in FlashAttention 3") assert self.sinks.shape[0] == num_heads, ( "Sinks must have the same number of heads as the number of " "heads in the layer") def forward( self, layer: torch.nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, output: Optional[torch.Tensor] = None, output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. Args: query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] NOTE: FP8 quantization, flash-attn expect the size of {q,k,v}_descale to be (num_sequences, num_kv_heads). We use torch's .expand() to avoid duplicating values """ assert output is not None, "Output tensor must be provided." if output_scale is not None: raise NotImplementedError( "fused output quantization is not yet supported" " for FlashAttentionImpl") if attn_metadata is None: # Profiling run. return output # IMPORTANT! # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead # in this method. For example, `view` and `slice` (or `[:n]`) operations # are surprisingly slow even in the case they do not invoke any GPU ops. # Minimize the PyTorch ops in this method as much as possible. # Whenever making a change in this method, please benchmark the # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens key_cache, value_cache = kv_cache.unbind(0) if self.kv_sharing_target_layer_name is None: # Reshape the input keys and values and store them in the cache. # Skip this if sharing KV cache with an earlier attention layer. # NOTE(woosuk): Here, key and value are padded while slot_mapping is # not padded. However, we don't need to do key[:num_actual_tokens] # and value[:num_actual_tokens] because the reshape_and_cache_flash # op uses the slot_mapping's shape to determine the number of # actual tokens. torch.ops._C_cache_ops.reshape_and_cache_flash( key, value, key_cache, value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, layer._k_scale, layer._v_scale, ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(torch.float8_e4m3fn) value_cache = value_cache.view(torch.float8_e4m3fn) num_tokens, num_heads, head_size = query.shape query, _ = ops.scaled_fp8_quant( query.reshape( (num_tokens, num_heads * head_size)).contiguous(), layer._q_scale) query = query.reshape((num_tokens, num_heads, head_size)) # Compute attention and update output up to `num_actual_tokens`. use_local_attn = \ (self.use_irope and attn_metadata.local_attn_metadata is not None) # For handling prefill decode split if not attn_metadata.use_cascade and not use_local_attn: num_decode_tokens = attn_metadata.num_decode_tokens if attn_metadata.num_prefills > 0: cu_prefix_kv_lens = torch.tensor([0] + attn_metadata.prefill_seq_lens.tolist(), device=attn_metadata.prefill_seq_lens.device, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) output[num_decode_tokens:num_actual_tokens] = flash_attn_varlen_func( q=query[num_decode_tokens:num_actual_tokens], k=key_cache, v=value_cache, block_table=attn_metadata.prefill_block_table, cu_seqlens_q=attn_metadata.prefill_query_start_loc, cu_seqlens_k=cu_prefix_kv_lens, max_seqlen_q=attn_metadata.max_query_len, max_seqlen_k=attn_metadata.prefill_max_seq_len, softmax_scale=self.scale, causal=True, window_size=self.sliding_window, alibi_slopes=self.alibi_slopes, softcap=self.logits_soft_cap, ) if attn_metadata.num_decodes > 0: # Use flash_attn_with_kvcache for normal decoding. decode_query = query[:num_decode_tokens] output[:num_decode_tokens] = flash_attn_with_kvcache( q=decode_query.unsqueeze(1), k_cache=key_cache, v_cache=value_cache, block_table=attn_metadata.decode_block_table, cache_seqlens=attn_metadata.decode_seq_lens, softmax_scale=self.scale, causal=True, window_size=self.sliding_window, alibi_slopes=self.alibi_slopes, softcap=self.logits_soft_cap, ).squeeze(1) return output if not attn_metadata.use_cascade or use_local_attn: if use_local_attn: assert attn_metadata.local_attn_metadata is not None local_metadata = attn_metadata.local_attn_metadata cu_seqlens_q = local_metadata.local_query_start_loc seqused_k = local_metadata.local_seqused_k max_seqlen_q = local_metadata.local_max_query_len max_seqlen_k = local_metadata.local_max_seq_len block_table = local_metadata.local_block_table scheduler_metadata = local_metadata.local_scheduler_metadata else: cu_seqlens_q = attn_metadata.query_start_loc seqused_k = attn_metadata.seq_lens max_seqlen_q = attn_metadata.max_query_len max_seqlen_k = attn_metadata.max_seq_len block_table = attn_metadata.block_table scheduler_metadata = attn_metadata.scheduler_metadata # descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) cu_prefix_kv_lens = torch.tensor([0] + seqused_k.tolist(), seqused_k.device, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) output[:num_actual_tokens] = flash_attn_varlen_func( q=query[:num_actual_tokens], k=key_cache, v=value_cache, # out=output[:num_actual_tokens], cu_seqlens_q=cu_seqlens_q, max_seqlen_q=max_seqlen_q, # seqused_k=seqused_k, cu_seqlens_k=cu_prefix_kv_lens, max_seqlen_k=max_seqlen_k, softmax_scale=self.scale, causal=True, alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=block_table, softcap=self.logits_soft_cap, # scheduler_metadata=scheduler_metadata, # fa_version=self.vllm_flash_attn_version, # q_descale=layer._q_scale.expand(descale_shape), # k_descale=layer._k_scale.expand(descale_shape), # v_descale=layer._v_scale.expand(descale_shape), ) return output assert not use_local_attn, ( "Cascade attention does not support local attention.") # Cascade attention (rare case). cascade_attention( output[:num_actual_tokens], query[:num_actual_tokens], key_cache, value_cache, cu_query_lens=attn_metadata.query_start_loc, max_query_len=attn_metadata.max_query_len, cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens, prefix_kv_lens=attn_metadata.prefix_kv_lens, suffix_kv_lens=attn_metadata.suffix_kv_lens, max_kv_len=attn_metadata.max_seq_len, softmax_scale=self.scale, alibi_slopes=self.alibi_slopes, sliding_window=self.sliding_window, logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, fa_version=self.vllm_flash_attn_version, prefix_scheduler_metadata=attn_metadata.prefix_scheduler_metadata, suffix_scheduler_metadata=attn_metadata.scheduler_metadata, q_descale=layer._q_scale, k_descale=layer._k_scale, v_descale=layer._v_scale, ) return output def use_cascade_attention( common_prefix_len: int, query_lens: np.ndarray, num_query_heads: int, num_kv_heads: int, use_alibi: bool, use_sliding_window: bool, num_sms: int, ) -> bool: """Decide whether to use cascade attention. This function 1) checks whether cascade attention is supported with the given configuration, and 2) heuristically decides whether using cascade attention can improve performance. """ # Too short common prefix. Probably not worth using cascade attention. # We use an arbitrary threshold of 256 tokens. TODO: Tune this threshold. # NOTE(woosuk): This is the common case. We should return False as soon as # possible to avoid any unnecessary computation. if common_prefix_len < 256: return False # Cascade attention is currently not supported with these variants. if use_alibi or use_sliding_window: return False # Too few queries. Probably not worth using cascade attention. # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold. num_reqs = len(query_lens) if num_reqs < 8: return False # Heuristics to decide whether using cascade attention is beneficial. # 1. When FlashDecoding is not used for normal attention, cascade attention # is likely to be faster since it saves memory bandwidth. num_queries_per_kv = num_query_heads // num_kv_heads # The criteria for using FlashDecoding can be found in the following link: # https://github.com/vllm-project/flash-attention/blob/96266b1111111f3d11aabefaf3bacbab6a89d03c/csrc/flash_attn/flash_api.cpp#L535 use_flash_decoding = (num_queries_per_kv > 1 and not use_sliding_window and not use_alibi and np.all(query_lens == 1)) if not use_flash_decoding: # Use cascade attention. return True # 2. When FlashDecoding is used for normal attention, it is not clear # whether cascade attention is beneficial, because FlashDecoding can # launch more CTAs than cascade attention. # We use a simple performance model to compare the two methods. # NOTE(woosuk): The performance model is very rough and may not be # accurate. num_tokens = num_reqs # NOTE(woosuk): These are default tile sizes. flash-attn might use # different tile sizes (e.g., 64 or 256) depending on the configuration. q_tile_size = 128 kv_tile_size = 128 num_prefix_tiles = cdiv(common_prefix_len, kv_tile_size) cascade_ctas = num_query_heads * cdiv(num_tokens, q_tile_size) cascade_waves = cdiv(cascade_ctas, num_sms) cascade_time = cascade_waves * num_prefix_tiles flash_decoding_ctas = (num_reqs * num_kv_heads * cdiv(num_queries_per_kv, q_tile_size)) flash_decoding_ctas *= num_prefix_tiles flash_decoding_time = cdiv(flash_decoding_ctas, num_sms) # Use cascade attention if it is faster than FlashDecoding. return cascade_time < flash_decoding_time def cascade_attention( output: torch.Tensor, query: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, cu_query_lens: torch.Tensor, max_query_len: int, cu_prefix_query_lens: torch.Tensor, prefix_kv_lens: torch.Tensor, suffix_kv_lens: torch.Tensor, max_kv_len: int, softmax_scale: float, alibi_slopes: Optional[torch.Tensor], sliding_window: tuple[int, int], logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, fa_version: int, prefix_scheduler_metadata: Optional[torch.Tensor] = None, suffix_scheduler_metadata: Optional[torch.Tensor] = None, q_descale: Optional[torch.Tensor] = None, k_descale: Optional[torch.Tensor] = None, v_descale: Optional[torch.Tensor] = None, ) -> torch.Tensor: assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") # TODO: Support sliding window. assert sliding_window == (-1, -1), ( "Cascade attention does not support sliding window.") num_tokens = query.shape[0] block_size = key_cache.shape[-3] assert common_prefix_len % block_size == 0 num_common_kv_blocks = common_prefix_len // block_size assert num_common_kv_blocks > 0 descale_shape = (cu_prefix_query_lens.shape[0] - 1, key_cache.shape[-2]) # Process shared prefix. cu_prefix_kv_lens = torch.tensor([0] + prefix_kv_lens.tolist(), device=prefix_kv_lens.device, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) prefix_output, prefix_lse = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, cu_seqlens_q=cu_prefix_query_lens, cu_seqlens_k=cu_prefix_kv_lens, max_seqlen_q=num_tokens, max_seqlen_k=common_prefix_len, softmax_scale=softmax_scale, causal=False, window_size=sliding_window, block_table=block_table[:1], softcap=logits_soft_cap, return_softmax_lse=True, # scheduler_metadata=prefix_scheduler_metadata, # fa_version=fa_version, # q_descale=q_descale.expand(descale_shape) # if q_descale is not None else None, # k_descale=k_descale.expand(descale_shape) # if k_descale is not None else None, # v_descale=v_descale.expand(descale_shape) # if v_descale is not None else None, ) descale_shape = (cu_query_lens.shape[0] - 1, key_cache.shape[-2]) # Process suffix per query. cu_suffix_kv_lens = torch.tensor([0] + suffix_kv_lens.tolist(), device=suffix_kv_lens.device, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) suffix_output, suffix_lse = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, cu_seqlens_q=cu_query_lens, cu_seqlens_k=cu_suffix_kv_lens, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len - common_prefix_len, softmax_scale=softmax_scale, causal=True, window_size=sliding_window, block_table=block_table[:, num_common_kv_blocks:], softcap=logits_soft_cap, return_softmax_lse=True, # scheduler_metadata=suffix_scheduler_metadata, # fa_version=fa_version, # q_descale=q_descale.expand(descale_shape) # if q_descale is not None else None, # k_descale=k_descale.expand(descale_shape) # if k_descale is not None else None, # v_descale=v_descale.expand(descale_shape) # if v_descale is not None else None, ) # Merge prefix and suffix outputs, and store the result in output. merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse)