Raise error when topk>1 and page>1 for paged attention backends. (#9784)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -747,6 +747,15 @@ class ServerArgs:
|
|||||||
)
|
)
|
||||||
self.speculative_num_draft_tokens = self.speculative_num_steps + 1
|
self.speculative_num_draft_tokens = self.speculative_num_steps + 1
|
||||||
|
|
||||||
|
if (
|
||||||
|
self.speculative_eagle_topk > 1
|
||||||
|
and self.page_size > 1
|
||||||
|
and self.attention_backend != "flashinfer"
|
||||||
|
):
|
||||||
|
raise ValueError(
|
||||||
|
"speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
|
||||||
|
)
|
||||||
|
|
||||||
# The token generated from the verify step is counted.
|
# The token generated from the verify step is counted.
|
||||||
# If speculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
|
# If speculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
|
||||||
# assert self.speculative_num_steps < self.speculative_num_draft_tokens
|
# assert self.speculative_num_steps < self.speculative_num_draft_tokens
|
||||||
|
|||||||
Reference in New Issue
Block a user