[feat] Support different attention backends for prefill and decode (#6338)

Co-authored-by: tianqilin.99 <tianqilin.99@bytedance.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
2025-07-27 20:42:29 -07:00
parent fe6a445d1e
commit 2810338401
9 changed files with 350 additions and 29 deletions
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -491,6 +491,8 @@ class SRTRunner:
        lora_paths: List[str] = None,
        max_loras_per_batch: int = 4,
        attention_backend: Optional[str] = None,
+        prefill_attention_backend: Optional[str] = None,
+        decode_attention_backend: Optional[str] = None,
        lora_backend: str = "triton",
        disable_cuda_graph: bool = False,
        disable_radix_cache: bool = False,
@@ -540,6 +542,8 @@ class SRTRunner:
            max_loras_per_batch=max_loras_per_batch,
            lora_backend=lora_backend,
            attention_backend=attention_backend,
+            prefill_attention_backend=prefill_attention_backend,
+            decode_attention_backend=decode_attention_backend,
            disable_cuda_graph=disable_cuda_graph,
            disable_radix_cache=disable_radix_cache,
            chunked_prefill_size=chunked_prefill_size,