[Fix] avoid stream sync and torch compile in prefill for fa3 backend (#4932)

This commit is contained in:
Baizhou Zhang
2025-03-30 13:53:44 -07:00
committed by GitHub
parent 032f8faaab
commit e62d60fe6d
7 changed files with 30 additions and 35 deletions

View File

@@ -79,7 +79,7 @@ class FlashAttentionBackend(AttentionBackend):
torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
)
# Precompute maximum sequence length
-        metadata.max_seq_len_k = seqlens_in_batch.max().item()
+        metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
# Precompute page table
metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
forward_batch.req_pool_indices, : metadata.max_seq_len_k