TRTLLM-MLA FP8 path (#8638)

Signed-off-by: Faraz Khoubsirat <58580514+farazkh80@users.noreply.github.com>
This commit is contained in:
Faraz
2025-08-11 17:02:13 -04:00
committed by GitHub
parent 44e86480e8
commit f508cd3cb7
5 changed files with 347 additions and 62 deletions

View File

@@ -432,7 +432,10 @@ class ServerArgs:
)
self.page_size = 128
-if self.attention_backend == "trtllm_mla":
+if (
+self.attention_backend == "trtllm_mla"
+or self.decode_attention_backend == "trtllm_mla"
+):
if not is_sm100_supported():
raise ValueError(
"TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
@@ -443,11 +446,17 @@ class ServerArgs:
f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
)
self.page_size = 64
if self.speculative_algorithm is not None:
raise ValueError(
"trtllm_mla backend does not support speculative decoding yet."
)
if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
raise ValueError(
"TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
)
if (
self.attention_backend == "trtllm_mha"
or self.decode_attention_backend == "trtllm_mha"