use fa3 in sgl-kernel (#4954)

2025-03-31 16:14:49 -07:00
parent ee47a6c1c3
commit 1c63e79756
3 changed files with 3 additions and 3 deletions
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -47,7 +47,7 @@ runtime_common = [

 srt = [
    "sglang[runtime_common]",
-    "sgl-kernel==0.0.5.post4",
+    "sgl-kernel==0.0.6",
    "flashinfer_python==0.2.3",
    "torch==2.5.1",
    "cuda-python",
--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -22,7 +22,7 @@ if TYPE_CHECKING:
    from sglang.srt.layers.radix_attention import RadixAttention
    from sglang.srt.model_executor.model_runner import ModelRunner

-from flash_attn_interface import flash_attn_with_kvcache
+from sgl_kernel.flash_attn import flash_attn_with_kvcache


@dataclass
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -15,7 +15,7 @@ pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2
 rm -rf /root/.cache/flashinfer
 # Force reinstall flashinfer and torch_memory_saver
 pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
-pip install sgl-kernel==0.0.5.post4 --force-reinstall
+pip install sgl-kernel==0.0.6 --force-reinstall

 pip install torch_memory_saver
 pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets timm torchaudio