TRTLLM Gen MLA Decode Kernel Integration (same as #7938) (#8632)

Signed-off-by: Faraz Khoubsirat <58580514+farazkh80@users.noreply.github.com>
This commit is contained in:
Faraz
2025-07-31 19:03:40 -04:00
committed by GitHub
parent 3dde86194a
commit 4b04998d38
8 changed files with 1361 additions and 4 deletions

View File

@@ -436,6 +436,7 @@ class ModelRunner:
"triton",
"flashmla",
"cutlass_mla",
"trtllm_mla",
"ascend",
]:
logger.info(
@@ -1437,6 +1438,12 @@ class ModelRunner:
)
return CutlassMLABackend(self)
elif self.server_args.attention_backend == "trtllm_mla":
if not self.use_mla_backend:
raise ValueError("trtllm_mla backend can only be used with MLA models.")
from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend
return TRTLLMMLABackend(self)
elif self.server_args.attention_backend == "intel_amx":
from sglang.srt.layers.attention.intel_amx_backend import (
IntelAMXAttnBackend,