use fa3 in sgl-kernel (#4954)
This commit is contained in:
@@ -47,7 +47,7 @@ runtime_common = [
|
|||||||
|
|
||||||
srt = [
|
srt = [
|
||||||
"sglang[runtime_common]",
|
"sglang[runtime_common]",
|
||||||
"sgl-kernel==0.0.5.post4",
|
"sgl-kernel==0.0.6",
|
||||||
"flashinfer_python==0.2.3",
|
"flashinfer_python==0.2.3",
|
||||||
"torch==2.5.1",
|
"torch==2.5.1",
|
||||||
"cuda-python",
|
"cuda-python",
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ if TYPE_CHECKING:
|
|||||||
from sglang.srt.layers.radix_attention import RadixAttention
|
from sglang.srt.layers.radix_attention import RadixAttention
|
||||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||||
|
|
||||||
from flash_attn_interface import flash_attn_with_kvcache
|
from sgl_kernel.flash_attn import flash_attn_with_kvcache
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2
|
|||||||
rm -rf /root/.cache/flashinfer
|
rm -rf /root/.cache/flashinfer
|
||||||
# Force reinstall flashinfer and torch_memory_saver
|
# Force reinstall flashinfer and torch_memory_saver
|
||||||
pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
|
pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
|
||||||
pip install sgl-kernel==0.0.5.post4 --force-reinstall
|
pip install sgl-kernel==0.0.6 --force-reinstall
|
||||||
|
|
||||||
pip install torch_memory_saver
|
pip install torch_memory_saver
|
||||||
pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets timm torchaudio
|
pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets timm torchaudio
|
||||||
|
|||||||
Reference in New Issue
Block a user