feat: add trtllm-gen mha from direct call (#8782)

Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
2025-08-05 06:28:39 -04:00
parent 75df31b60e
commit 40e3b2beeb
4 changed files with 351 additions and 0 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -441,6 +441,23 @@ class ServerArgs:
                    "trtllm_mla backend does not support speculative decoding yet."
                )

+        if self.attention_backend == "trtllm_mha":
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MHA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [16, 32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mha backend does not support speculative decoding yet."
+                )
+
        # Set page size
        if self.page_size is None:
            self.page_size = 1
@@ -1275,6 +1292,7 @@ class ServerArgs:
                "ascend",
                "triton",
                "trtllm_mla",
+                "trtllm_mha",
            ],
            default=ServerArgs.attention_backend,
            help="Choose the kernels for attention layers.",