Enables speculative decoding for the trtllm_mla attention backend (#9238)

This commit is contained in:
pranavm-nvidia
2025-08-21 01:18:21 -07:00
committed by GitHub
parent 18da2c96ec
commit 64574ef8c0
3 changed files with 60 additions and 21 deletions

View File

@@ -479,11 +479,6 @@ class ServerArgs:
)
self.page_size = 64
if self.speculative_algorithm is not None:
raise ValueError(
"trtllm_mla backend does not support speculative decoding yet."
)
if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
raise ValueError(
"TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."