From e810077488e190c07e98022f1f16ece9f96d8e8b Mon Sep 17 00:00:00 2001
From: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Date: Fri, 3 Oct 2025 00:04:58 +0100
Subject: [PATCH] Allow use of TRTLLM_MHA backend for hybrid attention on
 Blackwell (#11138)

---
 python/sglang/srt/layers/attention/attention_registry.py | 3 ++-
 python/sglang/srt/model_executor/model_runner.py         | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/attention/attention_registry.py b/python/sglang/srt/layers/attention/attention_registry.py
index 658ad1f0f..aa843685a 100644
--- a/python/sglang/srt/layers/attention/attention_registry.py
+++ b/python/sglang/srt/layers/attention/attention_registry.py
@@ -178,7 +178,8 @@ def attn_backend_wrapper(runner, full_attn_backend):
         if is_blackwell():
             assert (
                 runner.server_args.attention_backend == "triton"
-            ), "triton backend is the only supported backend on Blackwell GPUs for hybrid GDN models, use --attention-backend triton to specify the backend."
+                or runner.server_args.attention_backend == "trtllm_mha"
+            ), "triton or trtllm_mha backend are the only supported backends on Blackwell GPUs for hybrid GDN models, use --attention-backend triton or --attention-backend trtllm_mha to specify the backend."
         if is_npu():
             assert (
                 runner.server_args.attention_backend == "ascend"
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 0126cd180..3c5946578 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1620,7 +1620,7 @@ class ModelRunner:
                 )
             elif self.is_hybrid_gdn:
                 self.token_to_kv_pool = HybridLinearKVPool(
-                    page_size=self.page_size if _is_npu else 1,
+                    page_size=self.page_size,
                     size=self.max_total_num_tokens,
                     dtype=self.kv_cache_dtype,
                     head_num=self.model_config.get_num_kv_heads(