diff --git a/benchmark/gpt_oss/README.md b/benchmark/gpt_oss/README.md index baf164e10..4d1b00e91 100644 --- a/benchmark/gpt_oss/README.md +++ b/benchmark/gpt_oss/README.md @@ -132,8 +132,8 @@ python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algori # On Blackwell: # - Chain decoding (topk = 1) is supported on TRTLLM-MHA backend. Tree decoding (topk > 1) is in progress, stay tuned! # - Both tree decoding (topk > 1) and chain decoding (topk = 1) are supported on the Triton backend. -python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4 -python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --attention-backend triton --tp 4 +python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4 +python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --attention-backend triton --tp 4 ``` Benchmark Command diff --git a/benchmark/mtbench/README.md b/benchmark/mtbench/README.md index e6babf96e..fc37caee9 100644 --- a/benchmark/mtbench/README.md +++ b/benchmark/mtbench/README.md @@ -18,7 +18,7 @@ python3 bench_sglang.py --num-questions 80 ### Benchmark sglang EAGLE ``` python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algo EAGLE \ - --speculative-draft lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \ + --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \ --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --dtype float16 --port 30000 ``` diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index caf1f2abc..fb8c2501b 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -302,11 +302,16 @@ class ModelConfig: ) or getattr(self.hf_config, "image_token_index", None) @staticmethod - def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs): + def from_server_args( + server_args: ServerArgs, + model_path: str = None, + model_revision: str = None, + **kwargs, + ): return ModelConfig( model_path=model_path or server_args.model_path, trust_remote_code=server_args.trust_remote_code, - revision=server_args.revision, + revision=model_revision or server_args.revision, context_length=server_args.context_length, model_override_args=server_args.json_model_override_args, is_embedding=server_args.is_embedding, diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 968be171d..fbc12e5b0 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -78,6 +78,11 @@ class TpModelWorker: if not is_draft_worker else server_args.speculative_draft_model_path ), + model_revision=( + server_args.revision + if not is_draft_worker + else server_args.speculative_draft_model_revision + ), is_draft_model=is_draft_worker, ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 779fb5be0..0af2ad693 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -249,6 +249,7 @@ class ServerArgs: # Speculative decoding speculative_algorithm: Optional[str] = None speculative_draft_model_path: Optional[str] = None + speculative_draft_model_revision: Optional[str] = None speculative_num_steps: Optional[int] = None speculative_eagle_topk: Optional[int] = None speculative_num_draft_tokens: Optional[int] = None @@ -1498,6 +1499,14 @@ class ServerArgs: type=str, help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.", ) + parser.add_argument( + "--speculative-draft-model-revision", + type=str, + default=None, + help="The specific draft model version to use. It can be a branch " + "name, a tag name, or a commit id. If unspecified, will use " + "the default version.", + ) parser.add_argument( "--speculative-num-steps", type=int, diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 96081b2c3..8ce2e2e20 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -505,6 +505,7 @@ class SRTRunner: mem_fraction_static: float = 0.65, trust_remote_code: bool = False, speculative_draft_model_path: Optional[str] = None, + speculative_draft_model_revision: Optional[str] = None, speculative_algorithm: Optional[str] = None, speculative_num_steps: Optional[int] = None, speculative_eagle_topk: Optional[int] = None, @@ -526,6 +527,9 @@ class SRTRunner: spec_kwargs = {} if speculative_draft_model_path: spec_kwargs["speculative_draft_model_path"] = speculative_draft_model_path + spec_kwargs["speculative_draft_model_revision"] = ( + speculative_draft_model_revision + ) spec_kwargs["speculative_algorithm"] = speculative_algorithm spec_kwargs["speculative_num_steps"] = speculative_num_steps spec_kwargs["speculative_eagle_topk"] = speculative_eagle_topk diff --git a/test/srt/ep/test_deepep_small.py b/test/srt/ep/test_deepep_small.py index b2dfe9fc9..05aefe79a 100644 --- a/test/srt/ep/test_deepep_small.py +++ b/test/srt/ep/test_deepep_small.py @@ -268,7 +268,7 @@ class TestMTP(CustomTestCase): "deepep", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--speculative-num-steps", "2", @@ -343,7 +343,7 @@ class TestMTPWithTBO(CustomTestCase): "3", "--speculative-num-draft-tokens", "3", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--chunked-prefill-size", "256", diff --git a/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py b/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py index e583eebbf..65fbad428 100644 --- a/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py +++ b/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py @@ -1225,7 +1225,7 @@ class Test30(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1272,7 +1272,7 @@ class Test31(CustomTestCase): "4", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1319,7 +1319,7 @@ class Test32(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1365,7 +1365,7 @@ class Test33(CustomTestCase): "1", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1414,7 +1414,7 @@ class Test34(CustomTestCase): "1", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1463,7 +1463,7 @@ class Test35(CustomTestCase): "1", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1511,7 +1511,7 @@ class Test36(CustomTestCase): "--enable-dp-lm-head", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1559,7 +1559,7 @@ class Test37(CustomTestCase): "--enable-dp-lm-head", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1609,7 +1609,7 @@ class Test38(CustomTestCase): "--enable-dp-lm-head", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1659,7 +1659,7 @@ class Test39(CustomTestCase): "--enable-dp-lm-head", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1709,7 +1709,7 @@ class Test40(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1762,7 +1762,7 @@ class Test41(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1815,7 +1815,7 @@ class Test42(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1867,7 +1867,7 @@ class Test43(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1922,7 +1922,7 @@ class Test44(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -1977,7 +1977,7 @@ class Test45(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2031,7 +2031,7 @@ class Test46(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2085,7 +2085,7 @@ class Test47(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2141,7 +2141,7 @@ class Test48(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2197,7 +2197,7 @@ class Test49(CustomTestCase): "32", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2243,7 +2243,7 @@ class Test50(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2292,7 +2292,7 @@ class Test51(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2341,7 +2341,7 @@ class Test52(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2389,7 +2389,7 @@ class Test53(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2440,7 +2440,7 @@ class Test54(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2491,7 +2491,7 @@ class Test55(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2541,7 +2541,7 @@ class Test56(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2591,7 +2591,7 @@ class Test57(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2643,7 +2643,7 @@ class Test58(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", @@ -2695,7 +2695,7 @@ class Test59(CustomTestCase): "8", "--speculative-algo", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", diff --git a/test/srt/test_dp_attention.py b/test/srt/test_dp_attention.py index 37b89c445..4486dc16e 100644 --- a/test/srt/test_dp_attention.py +++ b/test/srt/test_dp_attention.py @@ -74,7 +74,7 @@ class TestDPAttentionDP2TP2DeepseekV3MTP(CustomTestCase): "4", "--speculative-num-draft-tokens", "4", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--tp-size", "2", diff --git a/test/srt/test_fa3.py b/test/srt/test_fa3.py index 45ad87e7d..c9f286fca 100644 --- a/test/srt/test_fa3.py +++ b/test/srt/test_fa3.py @@ -146,7 +146,7 @@ class TestFlashAttention3SpeculativeDecode(BaseFlashAttentionTest): "4", "--speculative-algorithm", "EAGLE3", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3, "--speculative-num-steps", "3", @@ -180,7 +180,7 @@ class TestFlashAttention3SpeculativeDecodeTopk(BaseFlashAttentionTest): "4", "--speculative-algorithm", "EAGLE3", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3, "--speculative-num-steps", "5", @@ -212,7 +212,7 @@ class TestFlashAttention3MLASpeculativeDecode(BaseFlashAttentionTest): "4", "--speculative-algorithm", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--speculative-num-steps", "3", @@ -244,7 +244,7 @@ class TestFlashAttention3MLASpeculativeDecodeTopk(BaseFlashAttentionTest): "4", "--speculative-algorithm", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--speculative-num-steps", "5", diff --git a/test/srt/test_flashmla.py b/test/srt/test_flashmla.py index 184e20ff2..681c9b8eb 100644 --- a/test/srt/test_flashmla.py +++ b/test/srt/test_flashmla.py @@ -100,7 +100,7 @@ class TestFlashMLAMTP(CustomTestCase): "1", "--speculative-algorithm", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "lmsys/sglang-ci-dsv3-test-NextN", "--speculative-num-steps", "1", diff --git a/test/srt/test_hybrid_attn_backend.py b/test/srt/test_hybrid_attn_backend.py index a527818fd..9251f34dc 100644 --- a/test/srt/test_hybrid_attn_backend.py +++ b/test/srt/test_hybrid_attn_backend.py @@ -121,7 +121,7 @@ class TestHybridAttnBackendSpeculativeDecoding(TestHybridAttnBackendBase): return DEFAULT_SERVER_ARGS + [ "--speculative-algorithm", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, "--speculative-num-steps", "3", diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py index a528a64be..519cb0554 100644 --- a/test/srt/test_mla_int8_deepseek_v3.py +++ b/test/srt/test_mla_int8_deepseek_v3.py @@ -67,7 +67,7 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase): "1", "--speculative-algorithm", "EAGLE", - "--speculative-draft", + "--speculative-draft-model-path", "sgl-project/sglang-ci-dsv3-channel-int8-test-NextN", "--speculative-num-steps", "2",