[feat] Support different attention backends for prefill and decode (#6338)

Co-authored-by: tianqilin.99 <tianqilin.99@bytedance.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
Qiaolin Yu
2025-07-27 20:42:29 -07:00
committed by GitHub
parent fe6a445d1e
commit 2810338401
9 changed files with 350 additions and 29 deletions

View File

@@ -491,6 +491,8 @@ class SRTRunner:
lora_paths: List[str] = None,
max_loras_per_batch: int = 4,
attention_backend: Optional[str] = None,
prefill_attention_backend: Optional[str] = None,
decode_attention_backend: Optional[str] = None,
lora_backend: str = "triton",
disable_cuda_graph: bool = False,
disable_radix_cache: bool = False,
@@ -540,6 +542,8 @@ class SRTRunner:
max_loras_per_batch=max_loras_per_batch,
lora_backend=lora_backend,
attention_backend=attention_backend,
prefill_attention_backend=prefill_attention_backend,
decode_attention_backend=decode_attention_backend,
disable_cuda_graph=disable_cuda_graph,
disable_radix_cache=disable_radix_cache,
chunked_prefill_size=chunked_prefill_size,