Support page size > 1 + eagle (#4908)

2025-03-30 00:46:23 -07:00
parent 5ec5eaf760
commit b26bc86b36
16 changed files with 374 additions and 71 deletions
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -14,7 +14,6 @@ from functools import partial
 from typing import TYPE_CHECKING, Callable, List, Optional, Union

 import torch
-import triton

 from sglang.global_config import global_config
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
@@ -22,7 +21,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
+from sglang.srt.utils import is_flashinfer_available, next_power_of_2

 if TYPE_CHECKING:
    from sglang.srt.layers.radix_attention import RadixAttention
@@ -932,6 +931,7 @@ class FlashInferMultiStepDraftBackend:
        self.topk = topk
        self.speculative_num_steps = speculative_num_steps
        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+        self.page_size = model_runner.page_size

        max_bs = model_runner.req_to_token_pool.size * self.topk
        self.kv_indptr = torch.zeros(
@@ -985,9 +985,9 @@ class FlashInferMultiStepDraftBackend:
            self.pool_len,
            kv_indices_buffer.shape[1],
            self.kv_indptr.shape[1],
-            triton.next_power_of_2(num_seqs),
-            triton.next_power_of_2(self.speculative_num_steps),
-            triton.next_power_of_2(bs),
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
        )

        assert forward_batch.spec_info is not None
@@ -1018,8 +1018,6 @@ class FlashInferMultiStepDraftBackend:
        )

        def call_fn(i, forward_batch):
-            assert forward_batch.spec_info is not None
-            assert isinstance(forward_batch.spec_info, EagleDraftInput)
            forward_batch.spec_info.kv_indptr = (
                forward_batch.spec_info.kv_indptr.clone()
            )