Fix 1-step draft model forward (#11653)

Signed-off-by: Shangming Cai <csmthu@gmail.com>
Co-authored-by: Liangsheng Yin <lsyincs@gmail.com>
2025-10-15 19:11:33 +08:00
parent 8221f9ae8b
commit 6d0364681c
4 changed files with 22 additions and 23 deletions
--- a/python/sglang/srt/speculative/draft_utils.py
+++ b/python/sglang/srt/speculative/draft_utils.py
@@ -33,15 +33,7 @@ class DraftBackendFactory:
    def create_decode_backend(self):
        if self.speculative_num_steps == 1:
-
+            return None
-            class DummyAttnBackend:
-                def __init__(self):
-                    pass
-                def init_forward_metadata(*args, **kwargs):
-                    pass
-            return DummyAttnBackend()
        backend_map = {
            "flashinfer": self._create_flashinfer_decode_backend,
--- a/python/sglang/srt/speculative/eagle_info_v2.py
+++ b/python/sglang/srt/speculative/eagle_info_v2.py
@@ -276,7 +276,7 @@ class EagleVerifyInputV2Mixin:
                accept_length=accept_length,  # mutable
                simulate_acc_len=SIMULATE_ACC_LEN,
                bs=bs,
-                spec_steps=self.draft_token_num,
+                spec_steps=self.spec_steps,
            )
        # Include the bonus token
--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -218,16 +218,17 @@ class EAGLEWorker(TpModelWorker):
            return
        # Capture draft
-        tic = time.perf_counter()
+        if self.speculative_num_steps > 1:
-        before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+            tic = time.perf_counter()
-        logger.info(
+            before_mem = get_available_gpu_memory(self.device, self.gpu_id)
-            f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+            logger.info(
-        )
+                f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
-        self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
+            )
-        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+            self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
-        logger.info(
+            after_mem = get_available_gpu_memory(self.device, self.gpu_id)
-            f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
+            logger.info(
-        )
+                f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
+            )
        # Capture extend
        if self.draft_extend_attn_backend:
@@ -500,8 +501,11 @@ class EAGLEWorker(TpModelWorker):
            )
        else:
            forward_batch.can_run_dp_cuda_graph = False
-            if not forward_batch.forward_mode.is_idle():
+            if (
-                # Initialize attention backend
+                not forward_batch.forward_mode.is_idle()
+                and self.speculative_num_steps > 1
+            ):
+                # Skip attention backend init for idle mode or 1-step draft
                self.draft_attn_backend.init_forward_metadata(forward_batch)
            # Run forward steps
            parent_list, top_scores_index, draft_tokens = self.draft_forward(
--- a/python/sglang/srt/speculative/eagle_worker_v2.py
+++ b/python/sglang/srt/speculative/eagle_worker_v2.py
@@ -97,7 +97,10 @@ class EAGLEWorkerV2(EAGLEWorker):
                forward_batch,
            )
        else:
-            self.draft_attn_backend.init_forward_metadata(forward_batch)
+            if self.speculative_num_steps > 1:
+                # Skip attention backend init for 1-step draft,
+                # `draft_forward` only does sample in this case.
+                self.draft_attn_backend.init_forward_metadata(forward_batch)
            parent_list, top_scores_index, draft_tokens = self.draft_forward(
                forward_batch
            )