Allow use of TRTLLM_MHA backend for hybrid attention on Blackwell (#11138)

2025-10-03 00:04:58 +01:00
parent 963175d5c0
commit e810077488
2 changed files with 3 additions and 2 deletions
--- a/python/sglang/srt/layers/attention/attention_registry.py
+++ b/python/sglang/srt/layers/attention/attention_registry.py
@@ -178,7 +178,8 @@ def attn_backend_wrapper(runner, full_attn_backend):
        if is_blackwell():
            assert (
                runner.server_args.attention_backend == "triton"
-            ), "triton backend is the only supported backend on Blackwell GPUs for hybrid GDN models, use --attention-backend triton to specify the backend."
+                or runner.server_args.attention_backend == "trtllm_mha"
+            ), "triton or trtllm_mha backend are the only supported backends on Blackwell GPUs for hybrid GDN models, use --attention-backend triton or --attention-backend trtllm_mha to specify the backend."
        if is_npu():
            assert (
                runner.server_args.attention_backend == "ascend"