Minor style and doc fix (#7228)
This commit is contained in:
@@ -3,13 +3,16 @@
|
||||
## Supporting matrix for different attention backends
|
||||
|
||||
| **Backend** | **Page Size > 1** | **Spec Decoding** | **MLA** | **Sliding Window** | **MultiModal** |
|
||||
|--------------------------|-------------------|-------------------|--------|--------------------|------------|
|
||||
| **FlashInfer** | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| **FA3** | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| **Triton** | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| **Torch Native** | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| **FlashMLA** | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
|--------------------------|-------------------|-------------------|---------|--------------------|----------------|
|
||||
| **FlashInfer** | ❌ | ✅ | ✅ | ✅ | ✅ |
|
||||
| **FA3** | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| **Triton** | ❌ | ✅ | ✅ | ✅ | ❌ |
|
||||
| **Torch Native** | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| **FlashMLA** | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
Note: Every kernel backend accepts a page size > 1 when you pass an argument such as `--page-size 16`.
|
||||
This is because the kernel backend can internally treat a page size of 16 as a page size of 1.
|
||||
The "❌" and "✅" symbols in the table above under "Page Size > 1" indicate whether the kernel actually operates with a page size greater than 1, rather than treating a page size of 16 as a page size of 1.
|
||||
|
||||
## User guide
|
||||
|
||||
|
||||
@@ -11,8 +11,6 @@ from typing import TYPE_CHECKING, Optional, Union
|
||||
import torch
|
||||
import triton
|
||||
|
||||
from sglang.global_config import global_config
|
||||
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
||||
from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
|
||||
from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
|
||||
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
||||
@@ -22,7 +20,6 @@ from sglang.srt.utils import is_cuda
|
||||
if TYPE_CHECKING:
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
||||
from sglang.srt.speculative.spec_info import SpecInfo
|
||||
|
||||
_is_cuda = is_cuda()
|
||||
|
||||
@@ -11,7 +11,6 @@ from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
|
||||
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
||||
from sglang.srt.utils import get_compiler_backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
|
||||
@@ -14,8 +14,6 @@ import torch
|
||||
import triton
|
||||
from flash_mla import flash_mla_with_kvcache, get_mla_metadata
|
||||
|
||||
from sglang.global_config import global_config
|
||||
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
||||
from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
|
||||
from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
|
||||
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
||||
@@ -24,7 +22,6 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMo
|
||||
if TYPE_CHECKING:
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
||||
from sglang.srt.speculative.spec_info import SpecInfo
|
||||
|
||||
|
||||
@@ -330,7 +327,7 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
|
||||
)
|
||||
|
||||
def get_cuda_graph_seq_len_fill_value(self):
|
||||
return 1024
|
||||
return 1
|
||||
|
||||
def forward_decode(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user