Sync the changes on CUDA graph runners (#6932)

This commit is contained in:
Lianmin Zheng
2025-06-06 18:23:52 -07:00
committed by GitHub
parent 61ce91ed28
commit 60fdad7cf3
7 changed files with 63 additions and 70 deletions

View File

@@ -447,7 +447,7 @@ class ServerArgs:
self.speculative_num_steps,
self.speculative_eagle_topk,
self.speculative_num_draft_tokens,
) = auto_choose_speculative_params(model_arch)
) = auto_choose_speculative_params(self)
if self.page_size > 1 and self.speculative_eagle_topk > 1:
self.speculative_eagle_topk = 1
@@ -1655,12 +1655,23 @@ def get_model_arch(args: ServerArgs):
return hf_config.architectures[0]
def auto_choose_speculative_params(arch: str):
def auto_choose_speculative_params(self: ServerArgs):
"""
Automatically choose the parameters for speculative decoding.
You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
"""
kwargs = {}
hf_config = get_config(
self.model_path,
trust_remote_code=self.trust_remote_code,
revision=self.revision,
model_override_args=json.loads(self.json_model_override_args),
**kwargs,
)
arch = hf_config.architectures[0]
if arch in ["LlamaForCausalLM"]:
# The default value for llama
return (5, 4, 8)