Eagle speculative decoding part 3: small modifications to the general scheduler (#2709)

Co-authored-by: kavioyu <kavioyu@tencent.com>
This commit is contained in:
Lianmin Zheng
2025-01-02 02:09:08 -08:00
committed by GitHub
parent 9183c23eca
commit ad20b7957e
13 changed files with 224 additions and 69 deletions

View File

@@ -23,6 +23,7 @@ from typing import List, Optional
import torch
from sglang.srt.hf_transformers_utils import check_gguf_file
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.utils import (
get_amdgpu_memory_capacity,
get_hpu_memory_capacity,
@@ -247,6 +248,17 @@ class ServerArgs:
"Overlap scheduler is disabled."
)
# Speculative Decoding
if self.speculative_algorithm == "EAGLE":
self.prefill_only_one_req = True
self.disable_cuda_graph_padding = True
self.disable_radix_cache = True
self.disable_overlap_schedule = True
self.chunked_prefill_size = -1
logger.info(
"The radix cache, chunked prefill, and overlap scheduler are disabled because of using eagle speculative decoding."
)
# GGUF
if (
self.load_format == "auto" or self.load_format == "gguf"