Eagle speculative decoding part 3: small modifications to the general scheduler (#2709)

Co-authored-by: kavioyu <kavioyu@tencent.com>
2025-01-02 02:09:08 -08:00
parent 9183c23eca
commit ad20b7957e
13 changed files with 224 additions and 69 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -23,6 +23,7 @@ from typing import List, Optional
 import torch

 from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
    get_amdgpu_memory_capacity,
    get_hpu_memory_capacity,
@@ -247,6 +248,17 @@ class ServerArgs:
                "Overlap scheduler is disabled."
            )

+        # Speculative Decoding
+        if self.speculative_algorithm == "EAGLE":
+            self.prefill_only_one_req = True
+            self.disable_cuda_graph_padding = True
+            self.disable_radix_cache = True
+            self.disable_overlap_schedule = True
+            self.chunked_prefill_size = -1
+            logger.info(
+                "The radix cache, chunked prefill, and overlap scheduler are disabled because of using eagle speculative decoding."
+            )
+
        # GGUF
        if (
            self.load_format == "auto" or self.load_format == "gguf"