From ebaba8565586623e274552e7848822575b5ce58a Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Fri, 2 May 2025 00:30:27 +0800 Subject: [PATCH] Update ci test and doc for MTP api change (#5952) --- docs/references/deepseek.md | 6 +-- python/sglang/srt/server_args.py | 11 ++++-- test/srt/test_full_deepseek_v3.py | 2 - test/srt/test_mla_deepseek_v3.py | 57 +++++++++++++++++++++++++++ test/srt/test_mla_flashinfer.py | 2 - test/srt/test_mla_int8_deepseek_v3.py | 2 - 6 files changed, 66 insertions(+), 14 deletions(-) diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index 1b6b40edc..80de28ca8 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -153,12 +153,10 @@ The precompilation process typically takes around 10 minutes to complete. **Description**: SGLang implements DeepSeek V3 Multi-Token Prediction (MTP) based on [EAGLE speculative decoding](https://docs.sglang.ai/backend/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved by **1.8x** for batch size 1 and **1.5x** for batch size 32 respectively on H200 TP8 setting. **Usage**: -Add arguments `--speculative-algorithm`, `--speculative-draft-model-path`, -`--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example: +Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example: ``` -python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --speculative-algorithm EAGLE --speculative-draft-model-path lmsys/DeepSeek-V3-0324-NextN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --trust-remote-code --tp 8 +python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --trust-remote-code --tp 8 ``` -- The draft model are available at huggingface: [lmsys/DeepSeek-V3-0324-NextN](https://huggingface.co/lmsys/DeepSeek-V3-0324-NextN), [lmsys/DeepSeek-R1-NextN](https://huggingface.co/lmsys/DeepSeek-R1-NextN). It can also be exported from original DeepSeek-V3/R1 model with [export_deepseek_nextn.py](https://github.com/sgl-project/sglang/blob/main/scripts/export_deepseek_nextn.py) script. - The best configuration for `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` can be searched with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py) script for given batch size. The minimum configuration is `--speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2`, which can achieve speedup for larger batch sizes. When using FlashInfer MLA wrapper (`--attention-backend flashinfer`) with speculative decoding, set the `--speculative-eagle-topk` parameter to `1`. The FlashAttention 3 backend also only supports `--speculative-eagle-topk 1`. - To enable DeepSeek MTP for large batch sizes (>32), there are some parameters should be changed (Reference [this discussion](https://github.com/sgl-project/sglang/issues/4543#issuecomment-2737413756)): diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 1d7c2aa1a..5a0b10a44 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -347,10 +347,13 @@ class ServerArgs: model_arch = get_model_arch(self) # Auto set draft_model_path DeepSeek-V3/R1 - if self.speculative_draft_model_path is None and model_arch in [ - "DeepseekV3ForCausalLM" - ]: - self.speculative_draft_model_path = self.model_path + if model_arch == "DeepseekV3ForCausalLM": + if self.speculative_draft_model_path is None: + self.speculative_draft_model_path = self.model_path + else: + logger.warning( + "DeepSeek MTP does not require setting speculative_draft_model_path." + ) # Auto choose parameters if self.speculative_num_steps is None: diff --git a/test/srt/test_full_deepseek_v3.py b/test/srt/test_full_deepseek_v3.py index 7b29787b1..a223cdc3e 100644 --- a/test/srt/test_full_deepseek_v3.py +++ b/test/srt/test_full_deepseek_v3.py @@ -80,8 +80,6 @@ class TestDeepseekV3MTP(CustomTestCase): "--trust-remote-code", "--speculative-algorithm", "EAGLE", - "--speculative-draft", - "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "3", "--speculative-eagle-topk", diff --git a/test/srt/test_mla_deepseek_v3.py b/test/srt/test_mla_deepseek_v3.py index 863756a1c..2502e2bd9 100644 --- a/test/srt/test_mla_deepseek_v3.py +++ b/test/srt/test_mla_deepseek_v3.py @@ -50,6 +50,63 @@ class TestMLADeepseekV3(CustomTestCase): class TestDeepseekV3MTP(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = "lmsys/sglang-ci-dsv3-test" + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = [ + "--trust-remote-code", + "--cuda-graph-max-bs", + "2", + "--disable-radix", + "--enable-torch-compile", + "--torch-compile-max-bs", + "1", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "2", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "4", + ] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + requests.get(self.base_url + "/flush_cache") + + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.60) + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] + print(f"{avg_spec_accept_length=}") + self.assertGreater(avg_spec_accept_length, 2.5) + + +# compatible with old APIs +class TestDeepseekV3MTPWithDraft(CustomTestCase): @classmethod def setUpClass(cls): cls.model = "lmsys/sglang-ci-dsv3-test" diff --git a/test/srt/test_mla_flashinfer.py b/test/srt/test_mla_flashinfer.py index 4f0953e6a..be53a16f9 100644 --- a/test/srt/test_mla_flashinfer.py +++ b/test/srt/test_mla_flashinfer.py @@ -118,8 +118,6 @@ class TestFlashinferMLAMTP(CustomTestCase): "1", "--speculative-algorithm", "EAGLE", - "--speculative-draft", - "lmsys/sglang-ci-dsv3-test-NextN", "--speculative-num-steps", "3", "--speculative-eagle-topk", diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py index 643700020..5e6dc62a4 100644 --- a/test/srt/test_mla_int8_deepseek_v3.py +++ b/test/srt/test_mla_int8_deepseek_v3.py @@ -162,8 +162,6 @@ class TestDeepseekV3MTPBlockInt8(CustomTestCase): "1", "--speculative-algorithm", "EAGLE", - "--speculative-draft", - "sgl-project/sglang-ci-dsv3-block-int8-test-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk",