diff --git a/.github/workflows/pr-test-sgl-kernel.yml b/.github/workflows/pr-test-sgl-kernel.yml index c9481ec50..0ab80bde6 100644 --- a/.github/workflows/pr-test-sgl-kernel.yml +++ b/.github/workflows/pr-test-sgl-kernel.yml @@ -88,7 +88,7 @@ jobs: - name: Install run: | bash scripts/ci_install_dependency.sh - pip3 install torch==2.5.1 && pip3 install pytest + pip3 install torch==2.6.0 && pip3 install pytest pip3 uninstall sgl-kernel -y || true pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps pip3 list | grep sgl-kernel diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index cc9f4588c..d792fee21 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -33,7 +33,7 @@ Add [performance optimization options](#performance-optimization-options) as nee ```bash # Installation -pip install "sglang[all]>=0.4.3" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python +pip install "sglang[all]>=0.4.5.post2" # Launch python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code diff --git a/docker/Dockerfile b/docker/Dockerfile index 075b1e8d9..25b3e44c9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -43,6 +43,6 @@ RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ fi \ && python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu${CUINDEX} \ && cd sglang \ - && python3 -m pip --no-cache-dir install -e "python[${BUILD_TYPE}]" --find-links https://flashinfer.ai/whl/cu${CUINDEX}/torch2.5/flashinfer-python + && python3 -m pip --no-cache-dir install -e "python[${BUILD_TYPE}]" --find-links https://flashinfer.ai/whl/cu${CUINDEX}/torch2.6/flashinfer-python ENV DEBIAN_FRONTEND=interactive diff --git a/docs/start/install.md b/docs/start/install.md index 787b99053..f2c19e2e8 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -164,4 +164,4 @@ sky status --endpoint 30000 sglang - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub. - If you only need to use OpenAI models with the frontend language, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. - The language frontend operates independently of the backend runtime. You can install the frontend locally without needing a GPU, while the backend can be set up on a GPU-enabled machine. To install the frontend, run `pip install sglang`, and for the backend, use `pip install sglang[srt]`. `srt` is the abbreviation of SGLang runtime. -- To reinstall flashinfer locally, use the following command: `pip install "flashinfer-python>=0.2.3" -i https://flashinfer.ai/whl/cu124/torch2.5 --force-reinstall --no-deps` and then delete the cache with `rm -rf ~/.cache/flashinfer`. +- To reinstall flashinfer locally, use the following command: `pip install "flashinfer-python==0.2.3" -i https://flashinfer.ai/whl/cu124/torch2.6 --force-reinstall --no-deps` and then delete the cache with `rm -rf ~/.cache/flashinfer`. diff --git a/python/pyproject.toml b/python/pyproject.toml index 37f6b6346..c86288a8e 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -49,8 +49,8 @@ srt = [ "sglang[runtime_common]", "sgl-kernel==0.0.9.post2", "flashinfer_python==0.2.3", - "torch==2.5.1", - "torchvision==0.20.1", + "torch==2.6.0", + "torchvision==0.21.0", "cuda-python", "outlines>=0.0.44,<=0.1.11", "partial_json_parser", diff --git a/python/sglang/srt/layers/dp_attention.py b/python/sglang/srt/layers/dp_attention.py index c1b9e05ec..5f140a3df 100644 --- a/python/sglang/srt/layers/dp_attention.py +++ b/python/sglang/srt/layers/dp_attention.py @@ -143,7 +143,7 @@ def memcpy_triton_kernel( src_ptr, offset_ptr, sz_ptr, - offset_src, + offset_src: tl.constexpr, chunk_size, # multiplied for offset and sz BLOCK_SIZE: tl.constexpr, ): diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index 2f854573d..e4ab36f9e 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -23,7 +23,7 @@ pip install -e "python[all]" # Install additional dependencies pip install torch_memory_saver -pip install transformers==4.51.0 sentence_transformers accelerate==1.4.0 peft pandas datasets timm torchaudio +pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio # For compling xgrammar kernels pip install cuda-python nvidia-cuda-nvrtc-cu12