diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index ffe60b33e..dba70dff2 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -124,8 +124,8 @@ class ServerArgs:
     speculative_draft_model_path: Optional[str] = None
     speculative_algorithm: Optional[str] = None
     speculative_num_steps: int = 5
-    speculative_num_draft_tokens: int = 64
     speculative_eagle_topk: int = 8
+    speculative_num_draft_tokens: int = 64
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -719,12 +719,6 @@ class ServerArgs:
             help="The number of steps sampled from draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_steps,
         )
-        parser.add_argument(
-            "--speculative-num-draft-tokens",
-            type=int,
-            help="The number of token sampled from draft model in Speculative Decoding.",
-            default=ServerArgs.speculative_num_draft_tokens,
-        )
         parser.add_argument(
             "--speculative-eagle-topk",
             type=int,
@@ -732,6 +726,12 @@ class ServerArgs:
             choices=[1, 2, 4, 8],
             default=ServerArgs.speculative_eagle_topk,
         )
+        parser.add_argument(
+            "--speculative-num-draft-tokens",
+            type=int,
+            help="The number of tokens sampled from draft model in Speculative Decoding.",
+            default=ServerArgs.speculative_num_draft_tokens,
+        )
 
         # Double Sparsity
         parser.add_argument(
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index d514d8633..d5c09751b 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -20,7 +20,7 @@ pip install flashinfer_python==0.2.1.post2 --find-links ${FLASHINFER_REPO} --for
 
 pip install torch_memory_saver --force-reinstall
 
-pip install transformers==4.45.2 sentence_transformers accelerate peft
+pip install transformers==4.45.2 sentence_transformers accelerate peft pandas datasets
 
 # For compling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12