TRTLLM-MLA FP8 path (#8638)
Signed-off-by: Faraz Khoubsirat <58580514+farazkh80@users.noreply.github.com>
This commit is contained in:
@@ -432,7 +432,10 @@ class ServerArgs:
|
||||
)
|
||||
self.page_size = 128
|
||||
|
||||
if self.attention_backend == "trtllm_mla":
|
||||
if (
|
||||
self.attention_backend == "trtllm_mla"
|
||||
or self.decode_attention_backend == "trtllm_mla"
|
||||
):
|
||||
if not is_sm100_supported():
|
||||
raise ValueError(
|
||||
"TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
|
||||
@@ -443,11 +446,17 @@ class ServerArgs:
|
||||
f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
|
||||
)
|
||||
self.page_size = 64
|
||||
|
||||
if self.speculative_algorithm is not None:
|
||||
raise ValueError(
|
||||
"trtllm_mla backend does not support speculative decoding yet."
|
||||
)
|
||||
|
||||
if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
|
||||
raise ValueError(
|
||||
"TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
|
||||
)
|
||||
|
||||
if (
|
||||
self.attention_backend == "trtllm_mha"
|
||||
or self.decode_attention_backend == "trtllm_mha"
|
||||
|
||||
Reference in New Issue
Block a user