Enables speculative decoding for the trtllm_mla attention backend (#9238)

2025-08-21 01:18:21 -07:00
parent 18da2c96ec
commit 64574ef8c0
3 changed files with 60 additions and 21 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -479,11 +479,6 @@ class ServerArgs:
                )
                self.page_size = 64

-            if self.speculative_algorithm is not None:
-                raise ValueError(
-                    "trtllm_mla backend does not support speculative decoding yet."
-                )
-
            if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
                raise ValueError(
                    "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."