diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 135f5e240..68f7db4a3 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -747,6 +747,15 @@ class ServerArgs: ) self.speculative_num_draft_tokens = self.speculative_num_steps + 1 + if ( + self.speculative_eagle_topk > 1 + and self.page_size > 1 + and self.attention_backend != "flashinfer" + ): + raise ValueError( + "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend." + ) + # The token generated from the verify step is counted. # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded. # assert self.speculative_num_steps < self.speculative_num_draft_tokens