Fix 1-step draft model forward (#11653)

Signed-off-by: Shangming Cai <csmthu@gmail.com> Co-authored-by: Liangsheng Yin <lsyincs@gmail.com>
2025-10-15 19:11:33 +08:00
parent 8221f9ae8b
commit 6d0364681c
4 changed files with 22 additions and 23 deletions
--- a/python/sglang/srt/speculative/draft_utils.py
+++ b/python/sglang/srt/speculative/draft_utils.py
@@ -33,15 +33,7 @@ class DraftBackendFactory:

    def create_decode_backend(self):
        if self.speculative_num_steps == 1:
-
-            class DummyAttnBackend:
-                def __init__(self):
-                    pass
-
-                def init_forward_metadata(*args, **kwargs):
-                    pass
-
-            return DummyAttnBackend()
+            return None

        backend_map = {
            "flashinfer": self._create_flashinfer_decode_backend,
--- a/python/sglang/srt/speculative/eagle_info_v2.py
+++ b/python/sglang/srt/speculative/eagle_info_v2.py
@@ -276,7 +276,7 @@ class EagleVerifyInputV2Mixin:
                accept_length=accept_length,  # mutable
                simulate_acc_len=SIMULATE_ACC_LEN,
                bs=bs,
-                spec_steps=self.draft_token_num,
+                spec_steps=self.spec_steps,
            )

        # Include the bonus token
--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -218,6 +218,7 @@ class EAGLEWorker(TpModelWorker):
            return

        # Capture draft
+        if self.speculative_num_steps > 1:
            tic = time.perf_counter()
            before_mem = get_available_gpu_memory(self.device, self.gpu_id)
            logger.info(
@@ -500,8 +501,11 @@ class EAGLEWorker(TpModelWorker):
            )
        else:
            forward_batch.can_run_dp_cuda_graph = False
-            if not forward_batch.forward_mode.is_idle():
-                # Initialize attention backend
+            if (
+                not forward_batch.forward_mode.is_idle()
+                and self.speculative_num_steps > 1
+            ):
+                # Initialize attention backend (skipped for idle mode or 1-step draft)
                self.draft_attn_backend.init_forward_metadata(forward_batch)
            # Run forward steps
            parent_list, top_scores_index, draft_tokens = self.draft_forward(
--- a/python/sglang/srt/speculative/eagle_worker_v2.py
+++ b/python/sglang/srt/speculative/eagle_worker_v2.py
@@ -97,6 +97,9 @@ class EAGLEWorkerV2(EAGLEWorker):
                forward_batch,
            )
        else:
+            if self.speculative_num_steps > 1:
+                # Attention backend init is skipped for a 1-step draft:
+                # `draft_forward` only performs sampling in that case.
                self.draft_attn_backend.init_forward_metadata(forward_batch)
            parent_list, top_scores_index, draft_tokens = self.draft_forward(
                forward_batch