Standalone speculative decoding (#10090)
This commit is contained in:
@@ -473,9 +473,14 @@ class ServerArgs:
|
||||
# B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
|
||||
reserved_mem = 32 * 1024
|
||||
|
||||
# draft model and larger cuda graph buffers
|
||||
if self.speculative_algorithm is not None:
|
||||
# draft model and larger cuda graph buffers
|
||||
reserved_mem += 2 * 1024
|
||||
if self.speculative_algorithm == "STANDALONE":
|
||||
# Standalone speculative decoding needs more memory than other speculative
|
||||
# decoding algorithms since the draft model is typically larger.
|
||||
reserved_mem += 6 * 1024
|
||||
else:
|
||||
reserved_mem += 2 * 1024
|
||||
if self.enable_dp_attention:
|
||||
reserved_mem += 4 * 1024
|
||||
|
||||
@@ -704,7 +709,12 @@ class ServerArgs:
|
||||
# NEXTN shares the same implementation as EAGLE
|
||||
self.speculative_algorithm = "EAGLE"
|
||||
|
||||
if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
|
||||
if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
|
||||
if self.speculative_algorithm == "STANDALONE":
|
||||
# TODO: support dp attention for standalone speculative decoding
|
||||
assert (
|
||||
self.enable_dp_attention is False
|
||||
), "Currently standalone speculative decoding does not support dp attention."
|
||||
if self.max_running_requests is None:
|
||||
self.max_running_requests = 48
|
||||
self.disable_overlap_schedule = True
|
||||
@@ -1499,7 +1509,7 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--speculative-algorithm",
|
||||
type=str,
|
||||
choices=["EAGLE", "EAGLE3", "NEXTN"],
|
||||
choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
|
||||
help="Speculative algorithm.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -2635,7 +2645,9 @@ def auto_choose_speculative_params(self: ServerArgs):
|
||||
"""
|
||||
hf_config = self.get_hf_config()
|
||||
arch = hf_config.architectures[0]
|
||||
|
||||
if self.speculative_algorithm == "STANDALONE":
|
||||
# The default value for standalone speculative decoding
|
||||
return (3, 1, 4)
|
||||
if arch in ["LlamaForCausalLM"]:
|
||||
# The default value for llama
|
||||
return (5, 4, 8)
|
||||
|
||||
Reference in New Issue
Block a user