From e810077488e190c07e98022f1f16ece9f96d8e8b Mon Sep 17 00:00:00 2001
From: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Date: Fri, 3 Oct 2025 00:04:58 +0100
Subject: [PATCH] Allow use of TRTLLM_MHA backend for hybrid attention on Blackwell (#11138)

---
 python/sglang/srt/layers/attention/attention_registry.py | 3 ++-
 python/sglang/srt/model_executor/model_runner.py         | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/attention/attention_registry.py b/python/sglang/srt/layers/attention/attention_registry.py
index 658ad1f0f..aa843685a 100644
--- a/python/sglang/srt/layers/attention/attention_registry.py
+++ b/python/sglang/srt/layers/attention/attention_registry.py
@@ -178,7 +178,8 @@ def attn_backend_wrapper(runner, full_attn_backend):
         if is_blackwell():
             assert (
                 runner.server_args.attention_backend == "triton"
-            ), "triton backend is the only supported backend on Blackwell GPUs for hybrid GDN models, use --attention-backend triton to specify the backend."
+                or runner.server_args.attention_backend == "trtllm_mha"
+            ), "triton or trtllm_mha backend are the only supported backends on Blackwell GPUs for hybrid GDN models, use --attention-backend triton or --attention-backend trtllm_mha to specify the backend."
         if is_npu():
             assert (
                 runner.server_args.attention_backend == "ascend"
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 0126cd180..3c5946578 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1620,7 +1620,7 @@ class ModelRunner:
                 )
             elif self.is_hybrid_gdn:
                 self.token_to_kv_pool = HybridLinearKVPool(
-                    page_size=self.page_size if _is_npu else 1,
+                    page_size=self.page_size,
                     size=self.max_total_num_tokens,
                     dtype=self.kv_cache_dtype,
                     head_num=self.model_config.get_num_kv_heads(