diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml index f3e05dd84..7298d80ec 100644 --- a/.github/workflows/execute-notebook.yml +++ b/.github/workflows/execute-notebook.yml @@ -24,7 +24,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh pip install -r docs/requirements.txt apt-get update && apt-get install -y pandoc parallel retry ln -sf "$(which python3)" /usr/bin/python diff --git a/.github/workflows/experiment-runner.yml b/.github/workflows/experiment-runner.yml index f3382320b..487ed9ba3 100644 --- a/.github/workflows/experiment-runner.yml +++ b/.github/workflows/experiment-runner.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Test experiment runner timeout-minutes: 120 diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 2cdb55ef1..096e876de 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -28,14 +28,14 @@ jobs: - name: Setup docker run: | touch github_summary.md - bash scripts/amd_ci_start_container.sh + bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Nightly Test run: | - bash scripts/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200 + bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200 echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/nightly-test.yml b/.github/workflows/nightly-test.yml index 5d6cf34c3..a32c1dbea 100644 ---
a/.github/workflows/nightly-test.yml +++ b/.github/workflows/nightly-test.yml @@ -24,7 +24,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 120 diff --git a/.github/workflows/pr-benchmark-rust.yml b/.github/workflows/pr-benchmark-rust.yml index b2f16bf4d..e34454c19 100644 --- a/.github/workflows/pr-benchmark-rust.yml +++ b/.github/workflows/pr-benchmark-rust.yml @@ -31,7 +31,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_rust.sh + bash scripts/ci/ci_install_rust.sh - name: Cache Rust dependencies uses: actions/cache@v4 @@ -78,7 +78,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_rust.sh + bash scripts/ci/ci_install_rust.sh - name: Cache Rust dependencies uses: actions/cache@v4 diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index aba17ccb9..9756356bb 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -36,19 +36,19 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Evaluate Accuracy timeout-minutes: 30 run: | - bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py - bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py - bash scripts/amd_ci_exec.sh python3 models/test_qwen_models.py + bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py + bash scripts/ci/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py + bash scripts/ci/amd_ci_exec.sh python3 models/test_qwen_models.py accuracy-test-2-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 
'pull_request') && @@ -62,17 +62,17 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Evaluate accuracy (TP=2) timeout-minutes: 30 run: | - bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py + bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py mla-test-1-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -86,17 +86,17 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: MLA TEST timeout-minutes: 30 run: | - bash scripts/amd_ci_exec.sh python3 test_mla.py + bash scripts/ci/amd_ci_exec.sh python3 test_mla.py performance-test-1-gpu-part-1-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -110,33 +110,33 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Benchmark single latency timeout-minutes: 20 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default + bash 
scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default - name: Benchmark online latency timeout-minutes: 15 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default - name: Benchmark offline throughput timeout-minutes: 15 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default - name: Benchmark offline throughput (Non-streaming, small batch size) timeout-minutes: 15 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size performance-test-1-gpu-part-2-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -150,27 +150,27 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Benchmark offline throughput (w/o RadixAttention) timeout-minutes: 15 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache + bash scripts/ci/amd_ci_exec.sh python3 -m unittest 
test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache - name: Benchmark offline throughput (w/ Triton) timeout-minutes: 15 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend - name: Benchmark offline throughput (w/ FP8) timeout-minutes: 15 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 bench-test-2-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -184,37 +184,37 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Benchmark dummy grok (TP=2) timeout-minutes: 30 run: | - bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py + bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py - name: Benchmark single latency (TP=2) timeout-minutes: 25 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - name: Benchmark single latency + torch.compile (TP=2) timeout-minutes: 25 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 + bash scripts/ci/amd_ci_exec.sh python3 -m unittest 
test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 - name: Benchmark offline throughput (TP=2) timeout-minutes: 25 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) timeout-minutes: 25 run: | - bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache + bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache unit-test-backend-1-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -230,17 +230,17 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 50 run: | - bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7 + bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7 unit-test-backend-2-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -254,17 +254,17 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash 
scripts/ci/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 40 run: | - bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd + bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd unit-test-backend-8-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -278,22 +278,22 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/amd_ci_install_dependency.sh + run: bash scripts/ci/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 60 run: | - bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600 + bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600 - name: Run CustomAllReduce test timeout-minutes: 20 run: | - bash scripts/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce + bash scripts/ci/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce unit-test-sgl-kernel-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -308,13 +308,13 @@ jobs: uses: actions/checkout@v4 - name: Start CI container - run: bash scripts/amd_ci_start_container.sh + run: bash scripts/ci/amd_ci_start_container.sh env: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies run: | - bash scripts/amd_ci_install_dependency.sh + bash scripts/ci/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 10 diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 777d75435..fe03a0db1 100644 --- a/.github/workflows/pr-test-npu.yml +++ 
b/.github/workflows/pr-test-npu.yml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | - bash scripts/npu_ci_install_dependency.sh + bash scripts/ci/npu_ci_install_dependency.sh # copy required file from our daily cache cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp # copy download through proxy @@ -63,7 +63,7 @@ jobs: - name: Install dependencies run: | - bash scripts/npu_ci_install_dependency.sh + bash scripts/ci/npu_ci_install_dependency.sh # copy required file from our daily cache cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp # copy download through proxy @@ -92,7 +92,7 @@ jobs: - name: Install dependencies run: | - bash scripts/npu_ci_install_dependency.sh + bash scripts/ci/npu_ci_install_dependency.sh # copy required file from our daily cache cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp # copy download through proxy diff --git a/.github/workflows/pr-test-pd-router.yml b/.github/workflows/pr-test-pd-router.yml index acae4c610..caca5c94e 100644 --- a/.github/workflows/pr-test-pd-router.yml +++ b/.github/workflows/pr-test-pd-router.yml @@ -5,13 +5,13 @@ on: branches: [ main ] paths: - 'python/sglang/srt/disaggregation/**' - - 'scripts/ci_start_disaggregation_servers.sh' + - 'scripts/ci/ci_start_disaggregation_servers.sh' - 'sgl-router/**' pull_request: branches: [ main ] paths: - 'python/sglang/srt/disaggregation/**' - - 'scripts/ci_start_disaggregation_servers.sh' + - 'scripts/ci/ci_start_disaggregation_servers.sh' - 'sgl-router/**' workflow_dispatch: @@ -44,7 +44,7 @@ jobs: - name: Setup Rust run: | - bash scripts/ci_install_rust.sh + bash scripts/ci/ci_install_rust.sh - name: Cache Rust dependencies uses: actions/cache@v4 @@ -132,7 +132,7 @@ jobs: id: start_servers run: | echo "Starting disaggregation servers..." 
- bash scripts/ci_start_disaggregation_servers.sh & + bash scripts/ci/ci_start_disaggregation_servers.sh & SERVER_PID=$! echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml index b5c3cd01d..cc44192cb 100644 --- a/.github/workflows/pr-test-rust.yml +++ b/.github/workflows/pr-test-rust.yml @@ -25,7 +25,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_rust.sh + bash scripts/ci/ci_install_rust.sh - name: Run fmt run: | @@ -64,7 +64,7 @@ jobs: - name: Install rust dependencies run: | - bash scripts/ci_install_rust.sh + bash scripts/ci/ci_install_rust.sh - name: Build python binding run: | diff --git a/.github/workflows/pr-test-sgl-kernel.yml b/.github/workflows/pr-test-sgl-kernel.yml index a3c84a539..624d9ed32 100644 --- a/.github/workflows/pr-test-sgl-kernel.yml +++ b/.github/workflows/pr-test-sgl-kernel.yml @@ -84,7 +84,7 @@ jobs: - name: Install run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 && pip3 install pytest pip3 uninstall sgl-kernel -y || true pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps @@ -116,7 +116,7 @@ jobs: - name: Install run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 pip3 uninstall sgl-kernel -y || true pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 65cf23bfa..7f76b02bf 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -52,7 +52,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 10 @@ -76,7 +76,7 @@ jobs: 
- name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 30 @@ -96,7 +96,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 30 @@ -120,7 +120,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 20 @@ -144,7 +144,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 20 @@ -164,7 +164,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Benchmark single latency timeout-minutes: 10 @@ -216,7 +216,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Benchmark offline throughput (w/o RadixAttention) timeout-minutes: 10 @@ -260,7 +260,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh - name: Benchmark single latency (TP=2) timeout-minutes: 10 @@ -310,7 +310,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh git clone https://github.com/merrymercy/human-eval.git cd human-eval pip install -e . @@ -333,7 +333,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh git clone https://github.com/merrymercy/human-eval.git cd human-eval pip install -e . 
@@ -356,7 +356,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_deepep.sh + bash scripts/ci/ci_install_deepep.sh - name: Run test timeout-minutes: 20 @@ -376,7 +376,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_deepep.sh + bash scripts/ci/ci_install_deepep.sh - name: Run test timeout-minutes: 20 @@ -398,7 +398,7 @@ jobs: - name: Install dependencies run: | - IS_BLACKWELL=1 bash scripts/ci_install_dependency.sh + IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 20 diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index 591cd5bdf..f4ae7ae3e 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -24,29 +24,28 @@ jobs: - name: Install dependencies run: | - find /public_sglang_ci/runner-a-gpu-1/_work/_tool/Python/3.10.13/x64/lib/python3.10/site-packages -name "sgl-kernel*" -exec rm -rf {} + || true - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh pip install -r docs/requirements.txt - apt-get update - apt-get install -y pandoc - apt-get update && apt-get install -y parallel retry - + apt-get update && apt-get install -y pandoc parallel retry ln -sf "$(which python3)" /usr/bin/python - name: Setup Jupyter Kernel run: | python -m ipykernel install --user --name python3 --display-name "Python 3" - - name: Execute notebooks and push to documents - env: - GITHUB_TOKEN: ${{ secrets.DOCUMENTATION_PAT_TOKEN }} + - name: Execute notebooks + timeout-minutes: 40 run: | cd docs make clean make compile + - name: Push HTML to sgl-project.github.io + run: | + cd docs make html python3 wrap_run_llm.py + cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml index 7dc6a8ba6..5bb1392e1 100644 --- 
a/.github/workflows/vllm-dependency-test.yml +++ b/.github/workflows/vllm-dependency-test.yml @@ -29,7 +29,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash scripts/ci/ci_install_dependency.sh pip install "vllm==0.10.0" pip install "openai==1.99.1" pip install "bitsandbytes>=0.44.0" diff --git a/README.md b/README.md index 3b3a226b9..63a8952c6 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ | [**Documentation**](https://docs.sglang.ai/) | [**Join Slack**](https://slack.sglang.ai/) | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/) -| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042) +| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) | ## News diff --git a/scripts/amd_ci_exec.sh b/scripts/ci/amd_ci_exec.sh similarity index 100% rename from scripts/amd_ci_exec.sh rename to scripts/ci/amd_ci_exec.sh diff --git a/scripts/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh similarity index 100% rename from scripts/amd_ci_install_dependency.sh rename to scripts/ci/amd_ci_install_dependency.sh diff --git a/scripts/amd_ci_start_container.sh b/scripts/ci/amd_ci_start_container.sh similarity index 98% rename from scripts/amd_ci_start_container.sh rename to scripts/ci/amd_ci_start_container.sh index ebb41debf..5d1e6cfe1 100755 --- a/scripts/amd_ci_start_container.sh +++ b/scripts/ci/amd_ci_start_container.sh @@ -3,7 +3,7 @@ set -euo pipefail # Get version from SGLang version.py file FALLBACK_SGLANG_VERSION="v0.4.10.post2" -SGLANG_VERSION_FILE="$(dirname "$0")/../python/sglang/version.py" +SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py" if [ -f "$SGLANG_VERSION_FILE" ]; then SGLANG_VERSION=$(python3 -c ' diff --git a/scripts/ci_install_deepep.sh b/scripts/ci/ci_install_deepep.sh similarity index 98% rename from scripts/ci_install_deepep.sh 
rename to scripts/ci/ci_install_deepep.sh index e743bddaf..d82dca935 100755 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci/ci_install_deepep.sh @@ -2,7 +2,7 @@ # Install the dependency in CI. set -euxo pipefail -bash scripts/ci_install_dependency.sh +bash scripts/ci/ci_install_dependency.sh export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ export NVSHMEM_DIR=/opt/nvshmem/install diff --git a/scripts/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh similarity index 98% rename from scripts/ci_install_dependency.sh rename to scripts/ci/ci_install_dependency.sh index 0ad51c7a3..83108a0e1 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci/ci_install_dependency.sh @@ -12,7 +12,7 @@ fi # Kill existing processes SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -bash "${SCRIPT_DIR}/killall_sglang.sh" +bash "${SCRIPT_DIR}/../killall_sglang.sh" # Install apt packages apt install -y git libnuma-dev diff --git a/scripts/ci_install_rust.sh b/scripts/ci/ci_install_rust.sh similarity index 100% rename from scripts/ci_install_rust.sh rename to scripts/ci/ci_install_rust.sh diff --git a/scripts/ci_start_disaggregation_servers.sh b/scripts/ci/ci_start_disaggregation_servers.sh similarity index 100% rename from scripts/ci_start_disaggregation_servers.sh rename to scripts/ci/ci_start_disaggregation_servers.sh diff --git a/scripts/npu_ci_install_dependency.sh b/scripts/ci/npu_ci_install_dependency.sh similarity index 100% rename from scripts/npu_ci_install_dependency.sh rename to scripts/ci/npu_ci_install_dependency.sh diff --git a/scripts/ci_cache_models.sh b/scripts/ci_cache_models.sh deleted file mode 100755 index 0ebe6c055..000000000 --- a/scripts/ci_cache_models.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -set -euxo pipefail - -mapfile -t models < <(python3 -c "from sglang.test.test_utils import _get_default_models; print(_get_default_models())" | jq -r '.[]') - -if [ ${#models[@]} -eq 0 ]; then - echo "Failed to get default models." 
- exit 1 -fi - -cache_dir="${DEFAULT_MODEL_CACHE_DIR:-}" - -if [ -z "$cache_dir" ]; then - echo "DEFAULT_MODEL_CACHE_DIR environment variable is not set." - exit 1 -fi - -failed_models=() -for model in "${models[@]}"; do - local_model_dir="$cache_dir/$model" - echo "Caching model: $model to $local_model_dir" - mkdir -p "$local_model_dir" - - if ! huggingface-cli download "$model" \ - --local-dir "$local_model_dir" \ - --local-dir-use-symlinks False 2>/dev/null; then - echo "WARNING: Failed to cache model: $model" - rm -rf "$local_model_dir" - failed_models+=("$model") - continue - fi - echo "Successfully cached model: $model" -done - -if [ ${#failed_models[@]} -gt 0 ]; then - echo -e "\n[Summary] Failed to cache following models:" - printf ' - %s\n' "${failed_models[@]}" -else - echo -e "\n[Summary] All models cached successfully" -fi diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index aeefd3371..4fa98e436 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -87,6 +87,7 @@ FetchContent_Declare( GIT_SHALLOW OFF ) FetchContent_Populate(repo-flashinfer) + # flash-attention FetchContent_Declare( repo-flash-attention @@ -95,6 +96,7 @@ FetchContent_Declare( GIT_SHALLOW OFF ) FetchContent_Populate(repo-flash-attention) + # mscclpp FetchContent_Declare( repo-mscclpp @@ -232,6 +234,7 @@ set(SOURCES "csrc/elementwise/activation.cu" "csrc/elementwise/fused_add_rms_norm_kernel.cu" "csrc/elementwise/rope.cu" + "csrc/common_extension.cc" "csrc/gemm/awq_kernel.cu" "csrc/gemm/bmm_fp8.cu" "csrc/gemm/dsv3_fused_a_gemm.cu" @@ -251,24 +254,10 @@ set(SOURCES "csrc/gemm/per_token_quant_fp8.cu" "csrc/gemm/qserve_w4a8_per_chn_gemm.cu" "csrc/gemm/qserve_w4a8_per_group_gemm.cu" - "csrc/moe/moe_align_kernel.cu" - "csrc/moe/moe_fused_gate.cu" - "csrc/moe/moe_topk_softmax_kernels.cu" - "csrc/moe/nvfp4_blockwise_moe.cu" - "csrc/moe/fp8_blockwise_moe_kernel.cu" - "csrc/moe/prepare_moe_input.cu" - "csrc/moe/ep_moe_reorder_kernel.cu" - 
"csrc/moe/ep_moe_silu_and_mul_kernel.cu" - "csrc/speculative/eagle_utils.cu" - "csrc/speculative/packbit.cu" - "csrc/spatial/greenctx_stream.cu" - "csrc/speculative/speculative_sampling.cu" "csrc/grammar/apply_token_bitmask_inplace_cuda.cu" - "csrc/kvcacheio/transfer.cu" "csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu" "csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu" "csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu" - "csrc/common_extension.cc" "csrc/moe/marlin_moe_wna16/ops.cu" "csrc/moe/marlin_moe_wna16/gptq_marlin_repack.cu" "csrc/moe/marlin_moe_wna16/awq_marlin_repack.cu" @@ -278,6 +267,19 @@ set(SOURCES "csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cu" "csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cu" "csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cu" + "csrc/moe/moe_align_kernel.cu" + "csrc/moe/moe_fused_gate.cu" + "csrc/moe/moe_topk_softmax_kernels.cu" + "csrc/moe/nvfp4_blockwise_moe.cu" + "csrc/moe/fp8_blockwise_moe_kernel.cu" + "csrc/moe/prepare_moe_input.cu" + "csrc/moe/ep_moe_reorder_kernel.cu" + "csrc/moe/ep_moe_silu_and_mul_kernel.cu" + "csrc/kvcacheio/transfer.cu" + "csrc/speculative/eagle_utils.cu" + "csrc/speculative/packbit.cu" + "csrc/spatial/greenctx_stream.cu" + "csrc/speculative/speculative_sampling.cu" "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu" "${repo-flashinfer_SOURCE_DIR}/csrc/renorm.cu" "${repo-flashinfer_SOURCE_DIR}/csrc/sampling.cu" @@ -312,12 +314,15 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") endif() + +# mscclpp set(MSCCLPP_USE_CUDA ON) set(MSCCLPP_BYPASS_GPU_CHECK ON) set(MSCCLPP_BUILD_TESTS OFF) add_subdirectory(${repo-mscclpp_SOURCE_DIR}) target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static) +# flash attention target_compile_definitions(common_ops PRIVATE FLASHATTENTION_DISABLE_BACKWARD FLASHATTENTION_DISABLE_DROPOUT diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md index 
72491433a..c81a2af0b 100644 --- a/sgl-kernel/README.md +++ b/sgl-kernel/README.md @@ -5,6 +5,11 @@ [![PyPI](https://img.shields.io/pypi/v/sgl-kernel)](https://pypi.org/project/sgl-kernel) ## Installation +For CUDA 12.1 and above: + +```bash +pip3 install sgl-kernel +``` For CUDA 11.8: @@ -12,11 +17,6 @@ For CUDA 11.8: pip3 install sgl-kernel -i https://docs.sglang.ai/whl/cu118 ``` -For CUDA 12.1 or CUDA 12.4: - -```bash -pip3 install sgl-kernel -``` ## Build from source Development build: diff --git a/test/srt/test_ascend_mla_w8a8int8.py b/test/srt/ascend/test_ascend_mla_w8a8int8.py similarity index 100% rename from test/srt/test_ascend_mla_w8a8int8.py rename to test/srt/ascend/test_ascend_mla_w8a8int8.py diff --git a/test/srt/test_ascend_tp1_bf16.py b/test/srt/ascend/test_ascend_tp1_bf16.py similarity index 100% rename from test/srt/test_ascend_tp1_bf16.py rename to test/srt/ascend/test_ascend_tp1_bf16.py diff --git a/test/srt/test_ascend_tp2_bf16.py b/test/srt/ascend/test_ascend_tp2_bf16.py similarity index 100% rename from test/srt/test_ascend_tp2_bf16.py rename to test/srt/ascend/test_ascend_tp2_bf16.py diff --git a/test/srt/test_deepep_internode.py b/test/srt/ep/test_deepep_internode.py similarity index 100% rename from test/srt/test_deepep_internode.py rename to test/srt/ep/test_deepep_internode.py diff --git a/test/srt/test_deepep_intranode.py b/test/srt/ep/test_deepep_intranode.py similarity index 100% rename from test/srt/test_deepep_intranode.py rename to test/srt/ep/test_deepep_intranode.py diff --git a/test/srt/test_deepep_large.py b/test/srt/ep/test_deepep_large.py similarity index 100% rename from test/srt/test_deepep_large.py rename to test/srt/ep/test_deepep_large.py diff --git a/test/srt/test_deepep_low_latency.py b/test/srt/ep/test_deepep_low_latency.py similarity index 100% rename from test/srt/test_deepep_low_latency.py rename to test/srt/ep/test_deepep_low_latency.py diff --git a/test/srt/test_deepep_small.py b/test/srt/ep/test_deepep_small.py 
similarity index 100% rename from test/srt/test_deepep_small.py rename to test/srt/ep/test_deepep_small.py diff --git a/test/srt/test_eplb.py b/test/srt/ep/test_eplb.py similarity index 100% rename from test/srt/test_eplb.py rename to test/srt/ep/test_eplb.py diff --git a/test/srt/test_hybrid_dp_ep_tp_mtp.py b/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py similarity index 100% rename from test/srt/test_hybrid_dp_ep_tp_mtp.py rename to test/srt/ep/test_hybrid_dp_ep_tp_mtp.py diff --git a/test/srt/test_moe_deepep.py b/test/srt/ep/test_moe_deepep.py similarity index 100% rename from test/srt/test_moe_deepep.py rename to test/srt/ep/test_moe_deepep.py diff --git a/test/srt/test_moe_deepep_eval_accuracy_large.py b/test/srt/ep/test_moe_deepep_eval_accuracy_large.py similarity index 100% rename from test/srt/test_moe_deepep_eval_accuracy_large.py rename to test/srt/ep/test_moe_deepep_eval_accuracy_large.py diff --git a/test/srt/test_moe_ep.py b/test/srt/ep/test_moe_ep.py similarity index 100% rename from test/srt/test_moe_ep.py rename to test/srt/ep/test_moe_ep.py diff --git a/test/srt/test_hicache.py b/test/srt/hicache/test_hicache.py similarity index 100% rename from test/srt/test_hicache.py rename to test/srt/hicache/test_hicache.py diff --git a/test/srt/test_hicache_mla.py b/test/srt/hicache/test_hicache_mla.py similarity index 100% rename from test/srt/test_hicache_mla.py rename to test/srt/hicache/test_hicache_mla.py diff --git a/test/srt/test_hicache_page.py b/test/srt/hicache/test_hicache_page.py similarity index 100% rename from test/srt/test_hicache_page.py rename to test/srt/hicache/test_hicache_page.py diff --git a/test/srt/test_hicache_storage.py b/test/srt/hicache/test_hicache_storage.py similarity index 100% rename from test/srt/test_hicache_storage.py rename to test/srt/hicache/test_hicache_storage.py diff --git a/test/srt/models/lora/test_lora.py b/test/srt/lora/test_lora.py similarity index 100% rename from test/srt/models/lora/test_lora.py rename to 
test/srt/lora/test_lora.py diff --git a/test/srt/models/lora/test_lora_backend.py b/test/srt/lora/test_lora_backend.py similarity index 100% rename from test/srt/models/lora/test_lora_backend.py rename to test/srt/lora/test_lora_backend.py diff --git a/test/srt/models/lora/test_lora_cuda_graph.py b/test/srt/lora/test_lora_cuda_graph.py similarity index 100% rename from test/srt/models/lora/test_lora_cuda_graph.py rename to test/srt/lora/test_lora_cuda_graph.py diff --git a/test/srt/models/lora/test_lora_eviction.py b/test/srt/lora/test_lora_eviction.py similarity index 100% rename from test/srt/models/lora/test_lora_eviction.py rename to test/srt/lora/test_lora_eviction.py diff --git a/test/srt/models/lora/test_lora_qwen3.py b/test/srt/lora/test_lora_qwen3.py similarity index 100% rename from test/srt/models/lora/test_lora_qwen3.py rename to test/srt/lora/test_lora_qwen3.py diff --git a/test/srt/models/lora/test_lora_tp.py b/test/srt/lora/test_lora_tp.py similarity index 100% rename from test/srt/models/lora/test_lora_tp.py rename to test/srt/lora/test_lora_tp.py diff --git a/test/srt/models/lora/test_lora_update.py b/test/srt/lora/test_lora_update.py similarity index 100% rename from test/srt/models/lora/test_lora_update.py rename to test/srt/lora/test_lora_update.py diff --git a/test/srt/models/lora/test_multi_lora_backend.py b/test/srt/lora/test_multi_lora_backend.py similarity index 100% rename from test/srt/models/lora/test_multi_lora_backend.py rename to test/srt/lora/test_multi_lora_backend.py diff --git a/test/srt/models/lora/utils.py b/test/srt/lora/utils.py similarity index 100% rename from test/srt/models/lora/utils.py rename to test/srt/lora/utils.py diff --git a/test/srt/test_awq.py b/test/srt/quant/test_awq.py similarity index 100% rename from test/srt/test_awq.py rename to test/srt/quant/test_awq.py diff --git a/test/srt/test_awq_dequant.py b/test/srt/quant/test_awq_dequant.py similarity index 100% rename from test/srt/test_awq_dequant.py rename to 
test/srt/quant/test_awq_dequant.py diff --git a/test/srt/test_block_int8.py b/test/srt/quant/test_block_int8.py similarity index 100% rename from test/srt/test_block_int8.py rename to test/srt/quant/test_block_int8.py diff --git a/test/srt/test_fp8_kernel.py b/test/srt/quant/test_fp8_kernel.py similarity index 100% rename from test/srt/test_fp8_kernel.py rename to test/srt/quant/test_fp8_kernel.py diff --git a/test/srt/test_fp8_kvcache.py b/test/srt/quant/test_fp8_kvcache.py similarity index 100% rename from test/srt/test_fp8_kvcache.py rename to test/srt/quant/test_fp8_kvcache.py diff --git a/test/srt/test_int8_kernel.py b/test/srt/quant/test_int8_kernel.py similarity index 100% rename from test/srt/test_int8_kernel.py rename to test/srt/quant/test_int8_kernel.py diff --git a/test/srt/test_w8a8_quantization.py b/test/srt/quant/test_w8a8_quantization.py similarity index 97% rename from test/srt/test_w8a8_quantization.py rename to test/srt/quant/test_w8a8_quantization.py index 3d4ce1afa..acb7f5c7d 100644 --- a/test/srt/test_w8a8_quantization.py +++ b/test/srt/quant/test_w8a8_quantization.py @@ -43,7 +43,7 @@ class TestW8A8(CustomTestCase): metrics = run_eval(args) print(metrics) - self.assertGreater(metrics["accuracy"], 0.7) + self.assertGreater(metrics["accuracy"], 0.69) def run_decode(self, max_new_tokens): response = requests.post( diff --git a/test/srt/test_update_weights_from_disk.py b/test/srt/rl/test_update_weights_from_disk.py similarity index 100% rename from test/srt/test_update_weights_from_disk.py rename to test/srt/rl/test_update_weights_from_disk.py diff --git a/test/srt/test_update_weights_from_distributed.py b/test/srt/rl/test_update_weights_from_distributed.py similarity index 100% rename from test/srt/test_update_weights_from_distributed.py rename to test/srt/rl/test_update_weights_from_distributed.py diff --git a/test/srt/test_update_weights_from_tensor.py b/test/srt/rl/test_update_weights_from_tensor.py similarity index 100% rename from 
test/srt/test_update_weights_from_tensor.py rename to test/srt/rl/test_update_weights_from_tensor.py diff --git a/test/srt/test_verl_engine_2_gpu.py b/test/srt/rl/test_verl_engine_2_gpu.py similarity index 100% rename from test/srt/test_verl_engine_2_gpu.py rename to test/srt/rl/test_verl_engine_2_gpu.py diff --git a/test/srt/test_verl_engine_4_gpu.py b/test/srt/rl/test_verl_engine_4_gpu.py similarity index 100% rename from test/srt/test_verl_engine_4_gpu.py rename to test/srt/rl/test_verl_engine_4_gpu.py diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index fa265e698..0eab9537e 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -13,13 +13,16 @@ class TestFile: suites = { "per-commit": [ - TestFile("models/lora/test_lora.py", 200), - TestFile("models/lora/test_lora_eviction.py", 200), - TestFile("models/lora/test_lora_backend.py", 99), - TestFile("models/lora/test_multi_lora_backend.py", 60), - TestFile("models/lora/test_lora_cuda_graph.py", 250), - TestFile("models/lora/test_lora_update.py", 800), - TestFile("models/lora/test_lora_qwen3.py", 97), + TestFile("hicache/test_hicache.py", 116), + TestFile("hicache/test_hicache_mla.py", 127), + TestFile("hicache/test_hicache_storage.py", 127), + TestFile("lora/test_lora.py", 200), + TestFile("lora/test_lora_eviction.py", 200), + TestFile("lora/test_lora_backend.py", 99), + TestFile("lora/test_multi_lora_backend.py", 60), + TestFile("lora/test_lora_cuda_graph.py", 250), + TestFile("lora/test_lora_update.py", 800), + TestFile("lora/test_lora_qwen3.py", 97), TestFile("models/test_embedding_models.py", 73), # TestFile("models/test_clip_models.py", 52), TestFile("models/test_encoder_embedding_models.py", 100), @@ -50,8 +53,13 @@ suites = { TestFile("openai_server/validation/test_matched_stop.py", 60), TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85), TestFile("openai_server/validation/test_request_length_validation.py", 31), + TestFile("quant/test_block_int8.py", 22), + 
TestFile("quant/test_fp8_kernel.py", 8), + TestFile("quant/test_int8_kernel.py", 8), + TestFile("quant/test_w8a8_quantization.py", 46), + TestFile("rl/test_update_weights_from_disk.py", 114), + TestFile("rl/test_update_weights_from_tensor.py", 48), TestFile("test_abort.py", 51), - TestFile("test_block_int8.py", 22), TestFile("test_create_kvindices.py", 2), TestFile("test_chunked_prefill.py", 313), TestFile("test_eagle_infer_a.py", 370), @@ -60,15 +68,11 @@ suites = { TestFile("test_eval_fp8_accuracy.py", 303), TestFile("test_fa3.py", 376), # TestFile("test_flashmla.py", 352), - TestFile("test_fp8_kernel.py", 8), TestFile("test_function_call_parser.py", 10), TestFile("test_fused_moe.py", 30), TestFile("test_gpt_oss_1gpu.py", 600), - TestFile("test_hicache.py", 116), - TestFile("test_hicache_mla.py", 127), - TestFile("test_hicache_storage.py", 127), TestFile("test_hidden_states.py", 55), - TestFile("test_int8_kernel.py", 8), + TestFile("test_hybrid_attn_backend.py", 100), TestFile("test_input_embeddings.py", 38), TestFile("test_io_struct.py", 8), TestFile("test_jinja_template_utils.py", 1), @@ -85,6 +89,7 @@ suites = { TestFile("test_pytorch_sampling_backend.py", 66), TestFile("test_radix_attention.py", 105), TestFile("test_regex_constrained.py", 64), + TestFile("test_reasoning_parser.py", 5), TestFile("test_retract_decode.py", 54), TestFile("test_request_queue_validation.py", 30), TestFile("test_server_args.py", 1), @@ -100,23 +105,18 @@ suites = { TestFile("test_triton_attention_backend.py", 150), TestFile("test_triton_moe_channel_fp8_kernel.py", 25), TestFile("test_triton_sliding_window.py", 250), - TestFile("test_update_weights_from_disk.py", 114), - TestFile("test_update_weights_from_tensor.py", 48), TestFile("test_utils_update_weights.py", 48), TestFile("test_vision_chunked_prefill.py", 175), TestFile("test_vlm_input_format.py", 300), TestFile("test_vision_openai_server_a.py", 989), TestFile("test_vision_openai_server_b.py", 620), - 
TestFile("test_w8a8_quantization.py", 46), - TestFile("test_reasoning_parser.py", 5), - TestFile("test_hybrid_attn_backend.py", 100), ], "per-commit-2-gpu": [ - TestFile("models/lora/test_lora_tp.py", 116), + TestFile("lora/test_lora_tp.py", 116), + TestFile("rl/test_update_weights_from_distributed.py", 103), TestFile("test_data_parallelism.py", 73), TestFile("test_dp_attention.py", 277), TestFile("test_patch_torch.py", 19), - TestFile("test_update_weights_from_distributed.py", 103), TestFile("test_release_memory_occupation.py", 127), ], "per-commit-4-gpu": [ @@ -127,7 +127,7 @@ suites = { ], "per-commit-8-gpu": [ # Disabled because it hangs on the CI. - # TestFile("test_moe_ep.py", 181), + # TestFile("ep/test_moe_ep.py", 181), TestFile("test_disaggregation.py", 499), TestFile("test_disaggregation_different_tp.py", 155), TestFile("test_full_deepseek_v3.py", 333), @@ -136,16 +136,16 @@ suites = { # add more here ], "per-commit-4-gpu-deepep": [ - TestFile("test_deepep_small.py", 531), + TestFile("ep/test_deepep_small.py", 531), ], "per-commit-8-gpu-deepep": [ - TestFile("test_deepep_large.py", 338), + TestFile("ep/test_deepep_large.py", 338), ], "nightly": [ TestFile("test_nightly_gsm8k_eval.py"), ], "vllm_dependency_test": [ - TestFile("test_awq.py", 163), + TestFile("quant/test_awq.py", 163), TestFile("test_bnb.py", 5), TestFile("test_gguf.py", 96), TestFile("test_gptqmodel_dynamic.py", 102), @@ -156,13 +156,9 @@ suites = { # Add AMD tests suite_amd = { "per-commit-amd": [ - TestFile("models/lora/test_lora_backend.py", 99), - TestFile("models/lora/test_multi_lora_backend.py", 60), - TestFile("models/lora/test_lora_cuda_graph.py", 250), - TestFile("test_mla.py", 242), - TestFile("test_mla_deepseek_v3.py", 221), - TestFile("test_torch_compile.py", 76), - TestFile("test_torch_compile_moe.py", 172), + TestFile("lora/test_lora_backend.py", 99), + TestFile("lora/test_multi_lora_backend.py", 60), + TestFile("lora/test_lora_cuda_graph.py", 250), 
TestFile("models/test_qwen_models.py", 82), TestFile("models/test_reward_models.py", 132), TestFile("openai_server/basic/test_openai_embedding.py", 141), @@ -170,14 +166,18 @@ suite_amd = { TestFile("openai_server/features/test_reasoning_content.py", 89), TestFile("openai_server/validation/test_large_max_new_tokens.py", 41), TestFile("openai_server/validation/test_request_length_validation.py", 31), + TestFile("quant/test_block_int8.py", 22), + TestFile("quant/test_awq_dequant.py", 2), + TestFile("rl/test_update_weights_from_disk.py", 114), TestFile("test_abort.py", 51), - TestFile("test_block_int8.py", 22), TestFile("test_create_kvindices.py", 2), TestFile("test_chunked_prefill.py", 313), TestFile("test_eval_fp8_accuracy.py", 303), TestFile("test_function_call_parser.py", 10), TestFile("test_fused_moe.py", 30), TestFile("test_input_embeddings.py", 38), + TestFile("test_mla.py", 242), + TestFile("test_mla_deepseek_v3.py", 221), TestFile("test_metrics.py", 32), TestFile("test_no_chunked_prefill.py", 108), # TestFile("test_no_overlap_scheduler.py", 234), # Disabled temporarily and track in #7703 @@ -186,22 +186,21 @@ suite_amd = { TestFile("test_pytorch_sampling_backend.py", 66), TestFile("test_radix_attention.py", 105), TestFile("test_retract_decode.py", 54), - TestFile("test_server_args.py", 1), - TestFile("test_skip_tokenizer_init.py", 117), - TestFile("test_torch_native_attention_backend.py", 123), - TestFile("test_triton_attention_backend.py", 150), - TestFile("test_update_weights_from_disk.py", 114), - TestFile("test_vertex_endpoint.py", 31), - # TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701 TestFile("test_reasoning_parser.py", 5), TestFile("test_rope_rocm.py", 3), - TestFile("test_awq_dequant.py", 2), + TestFile("test_server_args.py", 1), + TestFile("test_skip_tokenizer_init.py", 117), + TestFile("test_torch_compile.py", 76), + TestFile("test_torch_compile_moe.py", 172), + 
TestFile("test_torch_native_attention_backend.py", 123), + TestFile("test_triton_attention_backend.py", 150), + # TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701 ], "per-commit-2-gpu-amd": [ - TestFile("models/lora/test_lora_tp.py", 116), + TestFile("lora/test_lora_tp.py", 116), + TestFile("rl/test_update_weights_from_distributed.py", 103), TestFile("test_data_parallelism.py", 73), TestFile("test_patch_torch.py", 19), - TestFile("test_update_weights_from_distributed.py", 103), ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150), @@ -236,13 +235,13 @@ suite_xeon = { # Add Ascend NPU tests suite_ascend = { "per-commit-1-ascend-npu": [ - TestFile("test_ascend_tp1_bf16.py", 400), + TestFile("ascend/test_ascend_tp1_bf16.py", 400), ], "per-commit-2-ascend-npu": [ - TestFile("test_ascend_tp2_bf16.py", 400), + TestFile("ascend/test_ascend_tp2_bf16.py", 400), ], "per-commit-4-ascend-npu": [ - TestFile("test_ascend_mla_w8a8int8.py", 400), + TestFile("ascend/test_ascend_mla_w8a8int8.py", 400), ], } diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 30e1fab50..608595b95 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -56,7 +56,10 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_non_stream_small_batch_size\n" f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) - self.assertGreater(res["output_throughput"], 1045) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 1000) + else: + self.assertGreater(res["output_throughput"], 1050) def test_offline_throughput_without_radix_cache(self): res = run_bench_serving( diff --git a/test/srt/test_intel_amx_attention_backend.py b/test/srt/test_intel_amx_attention_backend.py index 4c2bc130e..0b49c8af7 100644 --- a/test/srt/test_intel_amx_attention_backend.py +++ b/test/srt/test_intel_amx_attention_backend.py @@ -70,7 +70,7 @@ class 
TestIntelAMXAttnBackend(CustomTestCase): ) metrics = run_eval(args) - self.assertGreater(metrics["score"], 0.5) + self.assertGreater(metrics["score"], 0.45) finally: kill_process_tree(process.pid)