Minor style and doc fix (#7228)

2025-06-16 01:03:13 -07:00
parent 0ae1e9a755
commit 21615cc3fe
4 changed files with 10 additions and 14 deletions
--- a/docs/backend/attention_backend.md
+++ b/docs/backend/attention_backend.md
@@ -3,13 +3,16 @@
 ## Supporting matrix for different attention backends

 | **Backend**              | **Page Size > 1** | **Spec Decoding** | **MLA** | **Sliding Window** | **MultiModal** |
-|--------------------------|-------------------|-------------------|--------|--------------------|------------|
-| **FlashInfer** | ✅                | ✅                | ✅     | ✅                 | ✅ |
-| **FA3**                  | ✅                | ✅                | ✅     | ✅                 | ✅ |
-| **Triton**               | ❌                | ✅                | ✅     | ❌                 | ❌ |
-| **Torch Native**         | ❌                | ❌                | ❌     | ❌                 | ❌ |
-| **FlashMLA**                  | ✅                | ✅                | ✅     | ❌                 | ❌ |
+|--------------------------|-------------------|-------------------|---------|--------------------|----------------|
+| **FlashInfer**           | ❌                | ✅                 | ✅      | ✅                 | ✅              |
+| **FA3**                  | ✅                | ✅                 | ✅      | ✅                 | ✅              |
+| **Triton**               | ❌                | ✅                 | ✅      | ✅                 | ❌              |
+| **Torch Native**         | ❌                | ❌                 | ❌      | ❌                 | ❌              |
+| **FlashMLA**             | ✅                | ✅                 | ✅      | ❌                 | ❌              |

+Note: Every kernel backend can be used with a page size > 1 by passing an argument such as `--page-size 16`.
+This works because the kernel backend can convert a page size of 16 to a page size of 1.
+The "❌" and "✅" symbols under "Page Size > 1" in the table above indicate whether the kernel actually operates with a page size greater than 1, rather than treating a page size of 16 as a page size of 1.

 ## User guide

--- a/python/sglang/srt/layers/attention/cutlass_mla_backend.py
+++ b/python/sglang/srt/layers/attention/cutlass_mla_backend.py
@@ -11,8 +11,6 @@ from typing import TYPE_CHECKING, Optional, Union
 import torch
 import triton

-from sglang.global_config import global_config
-from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
 from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
@@ -22,7 +20,6 @@ from sglang.srt.utils import is_cuda
 if TYPE_CHECKING:
    from sglang.srt.layers.radix_attention import RadixAttention
    from sglang.srt.model_executor.model_runner import ModelRunner
-    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
    from sglang.srt.speculative.spec_info import SpecInfo

 _is_cuda = is_cuda()
--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -11,7 +11,6 @@ from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import get_compiler_backend

 if TYPE_CHECKING:
    from sglang.srt.layers.radix_attention import RadixAttention
--- a/python/sglang/srt/layers/attention/flashmla_backend.py
+++ b/python/sglang/srt/layers/attention/flashmla_backend.py
@@ -14,8 +14,6 @@ import torch
 import triton
 from flash_mla import flash_mla_with_kvcache, get_mla_metadata

-from sglang.global_config import global_config
-from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
 from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
@@ -24,7 +22,6 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMo
 if TYPE_CHECKING:
    from sglang.srt.layers.radix_attention import RadixAttention
    from sglang.srt.model_executor.model_runner import ModelRunner
-    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
    from sglang.srt.speculative.spec_info import SpecInfo


@@ -330,7 +327,7 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
            )

    def get_cuda_graph_seq_len_fill_value(self):
-        return 1024
+        return 1

    def forward_decode(
        self,