[feat] Support different attention backends for prefill and decode (#6338)
Co-authored-by: tianqilin.99 <tianqilin.99@bytedance.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
@@ -491,6 +491,8 @@ class SRTRunner:
|
||||
lora_paths: List[str] = None,
|
||||
max_loras_per_batch: int = 4,
|
||||
attention_backend: Optional[str] = None,
|
||||
prefill_attention_backend: Optional[str] = None,
|
||||
decode_attention_backend: Optional[str] = None,
|
||||
lora_backend: str = "triton",
|
||||
disable_cuda_graph: bool = False,
|
||||
disable_radix_cache: bool = False,
|
||||
@@ -540,6 +542,8 @@ class SRTRunner:
|
||||
max_loras_per_batch=max_loras_per_batch,
|
||||
lora_backend=lora_backend,
|
||||
attention_backend=attention_backend,
|
||||
prefill_attention_backend=prefill_attention_backend,
|
||||
decode_attention_backend=decode_attention_backend,
|
||||
disable_cuda_graph=disable_cuda_graph,
|
||||
disable_radix_cache=disable_radix_cache,
|
||||
chunked_prefill_size=chunked_prefill_size,
|
||||
|
||||
Reference in New Issue
Block a user