TRTLLM Gen MLA Decode Kernel Integration (same as #7938) (#8632)

Signed-off-by: Faraz Khoubsirat <58580514+farazkh80@users.noreply.github.com>
2025-07-31 19:03:40 -04:00
parent 3dde86194a
commit 4b04998d38
8 changed files with 1361 additions and 4 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -24,6 +24,7 @@ import tempfile
 from typing import List, Literal, Optional, Union

 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -402,6 +403,22 @@ class ServerArgs:
            )
            self.page_size = 128

+        if self.attention_backend == "trtllm_mla":
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mla backend does not support speculative decoding yet."
+                )
+
        # Set page size
        if self.page_size is None:
            self.page_size = 1
@@ -1225,6 +1242,7 @@ class ServerArgs:
                "torch_native",
                "ascend",
                "triton",
+                "trtllm_mla",
            ],
            default=ServerArgs.attention_backend,
            help="Choose the kernels for attention layers.",