Minor style and doc fix (#7228)
This commit is contained in:
@@ -3,13 +3,16 @@
|
|||||||
## Supporting matrix for different attention backends
|
## Supporting matrix for different attention backends
|
||||||
|
|
||||||
| **Backend** | **Page Size > 1** | **Spec Decoding** | **MLA** | **Sliding Window** | **MultiModal** |
|
| **Backend** | **Page Size > 1** | **Spec Decoding** | **MLA** | **Sliding Window** | **MultiModal** |
|
||||||
|--------------------------|-------------------|-------------------|--------|--------------------|------------|
|
|--------------------------|-------------------|-------------------|---------|--------------------|----------------|
|
||||||
| **FlashInfer** | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| **FlashInfer** | ❌ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| **FA3** | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| **FA3** | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| **Triton** | ❌ | ✅ | ✅ | ❌ | ❌ |
|
| **Triton** | ❌ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| **Torch Native** | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| **Torch Native** | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| **FlashMLA** | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| **FlashMLA** | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
|
|
||||||
|
Note: Every kernel backend can be used with a page size > 1 by passing an argument such as `--page-size 16`.
|
||||||
|
This works because a kernel backend can treat a page size of 16 as a page size of 1 internally.
|
||||||
|
The "❌" and "✅" symbols in the table above under "Page Size > 1" indicate whether the kernel actually operates with a page size greater than 1, rather than treating a page size of 16 as a page size of 1.
|
||||||
|
|
||||||
## User guide
|
## User guide
|
||||||
|
|
||||||
|
|||||||
@@ -11,8 +11,6 @@ from typing import TYPE_CHECKING, Optional, Union
|
|||||||
import torch
|
import torch
|
||||||
import triton
|
import triton
|
||||||
|
|
||||||
from sglang.global_config import global_config
|
|
||||||
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
|
||||||
from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
|
from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
|
||||||
from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
|
from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
|
||||||
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
||||||
@@ -22,7 +20,6 @@ from sglang.srt.utils import is_cuda
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from sglang.srt.layers.radix_attention import RadixAttention
|
from sglang.srt.layers.radix_attention import RadixAttention
|
||||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||||
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
|
||||||
from sglang.srt.speculative.spec_info import SpecInfo
|
from sglang.srt.speculative.spec_info import SpecInfo
|
||||||
|
|
||||||
_is_cuda = is_cuda()
|
_is_cuda = is_cuda()
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
|||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
|
||||||
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
||||||
from sglang.srt.utils import get_compiler_backend
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from sglang.srt.layers.radix_attention import RadixAttention
|
from sglang.srt.layers.radix_attention import RadixAttention
|
||||||
|
|||||||
@@ -14,8 +14,6 @@ import torch
|
|||||||
import triton
|
import triton
|
||||||
from flash_mla import flash_mla_with_kvcache, get_mla_metadata
|
from flash_mla import flash_mla_with_kvcache, get_mla_metadata
|
||||||
|
|
||||||
from sglang.global_config import global_config
|
|
||||||
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
|
||||||
from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
|
from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
|
||||||
from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
|
from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
|
||||||
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
||||||
@@ -24,7 +22,6 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMo
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from sglang.srt.layers.radix_attention import RadixAttention
|
from sglang.srt.layers.radix_attention import RadixAttention
|
||||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||||
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
|
||||||
from sglang.srt.speculative.spec_info import SpecInfo
|
from sglang.srt.speculative.spec_info import SpecInfo
|
||||||
|
|
||||||
|
|
||||||
@@ -330,7 +327,7 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def get_cuda_graph_seq_len_fill_value(self):
|
def get_cuda_graph_seq_len_fill_value(self):
|
||||||
return 1024
|
return 1
|
||||||
|
|
||||||
def forward_decode(
|
def forward_decode(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user