Reorganize CI and test files (#9027)
This commit is contained in:
2
.github/workflows/execute-notebook.yml
vendored
2
.github/workflows/execute-notebook.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
pip install -r docs/requirements.txt
|
||||
apt-get update && apt-get install -y pandoc parallel retry
|
||||
ln -sf "$(which python3)" /usr/bin/python
|
||||
|
||||
2
.github/workflows/experiment-runner.yml
vendored
2
.github/workflows/experiment-runner.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Test experiment runner
|
||||
timeout-minutes: 120
|
||||
|
||||
6
.github/workflows/nightly-test-amd.yml
vendored
6
.github/workflows/nightly-test-amd.yml
vendored
@@ -28,14 +28,14 @@ jobs:
|
||||
- name: Setup docker
|
||||
run: |
|
||||
touch github_summary.md
|
||||
bash scripts/amd_ci_start_container.sh
|
||||
bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Nightly Test
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
|
||||
bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
|
||||
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
2
.github/workflows/nightly-test.yml
vendored
2
.github/workflows/nightly-test.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 120
|
||||
|
||||
4
.github/workflows/pr-benchmark-rust.yml
vendored
4
.github/workflows/pr-benchmark-rust.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_rust.sh
|
||||
bash scripts/ci/ci_install_rust.sh
|
||||
|
||||
- name: Cache Rust dependencies
|
||||
uses: actions/cache@v4
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_rust.sh
|
||||
bash scripts/ci/ci_install_rust.sh
|
||||
|
||||
- name: Cache Rust dependencies
|
||||
uses: actions/cache@v4
|
||||
|
||||
84
.github/workflows/pr-test-amd.yml
vendored
84
.github/workflows/pr-test-amd.yml
vendored
@@ -36,19 +36,19 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Evaluate Accuracy
|
||||
timeout-minutes: 30
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
|
||||
bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
|
||||
bash scripts/amd_ci_exec.sh python3 models/test_qwen_models.py
|
||||
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
|
||||
bash scripts/ci/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
|
||||
bash scripts/ci/amd_ci_exec.sh python3 models/test_qwen_models.py
|
||||
|
||||
accuracy-test-2-gpu-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -62,17 +62,17 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Evaluate accuracy (TP=2)
|
||||
timeout-minutes: 30
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
|
||||
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
|
||||
|
||||
mla-test-1-gpu-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -86,17 +86,17 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: MLA TEST
|
||||
timeout-minutes: 30
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 test_mla.py
|
||||
bash scripts/ci/amd_ci_exec.sh python3 test_mla.py
|
||||
|
||||
performance-test-1-gpu-part-1-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -110,33 +110,33 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Benchmark single latency
|
||||
timeout-minutes: 20
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
|
||||
|
||||
- name: Benchmark online latency
|
||||
timeout-minutes: 15
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
|
||||
|
||||
- name: Benchmark offline throughput
|
||||
timeout-minutes: 15
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
|
||||
|
||||
- name: Benchmark offline throughput (Non-streaming, small batch size)
|
||||
timeout-minutes: 15
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
|
||||
|
||||
performance-test-1-gpu-part-2-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -150,27 +150,27 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Benchmark offline throughput (w/o RadixAttention)
|
||||
timeout-minutes: 15
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
|
||||
|
||||
- name: Benchmark offline throughput (w/ Triton)
|
||||
timeout-minutes: 15
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
|
||||
|
||||
- name: Benchmark offline throughput (w/ FP8)
|
||||
timeout-minutes: 15
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
|
||||
|
||||
bench-test-2-gpu-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -184,37 +184,37 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Benchmark dummy grok (TP=2)
|
||||
timeout-minutes: 30
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
|
||||
bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
|
||||
|
||||
- name: Benchmark single latency (TP=2)
|
||||
timeout-minutes: 25
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
|
||||
|
||||
- name: Benchmark single latency + torch.compile (TP=2)
|
||||
timeout-minutes: 25
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
|
||||
|
||||
- name: Benchmark offline throughput (TP=2)
|
||||
timeout-minutes: 25
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
|
||||
|
||||
- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
|
||||
timeout-minutes: 25
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
|
||||
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
|
||||
|
||||
unit-test-backend-1-gpu-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -230,17 +230,17 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 50
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
|
||||
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
|
||||
|
||||
unit-test-backend-2-gpu-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -254,17 +254,17 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 40
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
|
||||
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
|
||||
|
||||
unit-test-backend-8-gpu-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -278,22 +278,22 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: bash scripts/amd_ci_install_dependency.sh
|
||||
run: bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
|
||||
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
|
||||
|
||||
- name: Run CustomAllReduce test
|
||||
timeout-minutes: 20
|
||||
run: |
|
||||
bash scripts/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
|
||||
bash scripts/ci/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
|
||||
|
||||
unit-test-sgl-kernel-amd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
@@ -308,13 +308,13 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Start CI container
|
||||
run: bash scripts/amd_ci_start_container.sh
|
||||
run: bash scripts/ci/amd_ci_start_container.sh
|
||||
env:
|
||||
GITHUB_WORKSPACE: ${{ github.workspace }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/amd_ci_install_dependency.sh
|
||||
bash scripts/ci/amd_ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 10
|
||||
|
||||
6
.github/workflows/pr-test-npu.yml
vendored
6
.github/workflows/pr-test-npu.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/npu_ci_install_dependency.sh
|
||||
bash scripts/ci/npu_ci_install_dependency.sh
|
||||
# copy required file from our daily cache
|
||||
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||
# copy download through proxy
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/npu_ci_install_dependency.sh
|
||||
bash scripts/ci/npu_ci_install_dependency.sh
|
||||
# copy required file from our daily cache
|
||||
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||
# copy download through proxy
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/npu_ci_install_dependency.sh
|
||||
bash scripts/ci/npu_ci_install_dependency.sh
|
||||
# copy required file from our daily cache
|
||||
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||
# copy download through proxy
|
||||
|
||||
8
.github/workflows/pr-test-pd-router.yml
vendored
8
.github/workflows/pr-test-pd-router.yml
vendored
@@ -5,13 +5,13 @@ on:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'python/sglang/srt/disaggregation/**'
|
||||
- 'scripts/ci_start_disaggregation_servers.sh'
|
||||
- 'scripts/ci/ci_start_disaggregation_servers.sh'
|
||||
- 'sgl-router/**'
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'python/sglang/srt/disaggregation/**'
|
||||
- 'scripts/ci_start_disaggregation_servers.sh'
|
||||
- 'scripts/ci/ci_start_disaggregation_servers.sh'
|
||||
- 'sgl-router/**'
|
||||
workflow_dispatch:
|
||||
|
||||
@@ -44,7 +44,7 @@ jobs:
|
||||
|
||||
- name: Setup Rust
|
||||
run: |
|
||||
bash scripts/ci_install_rust.sh
|
||||
bash scripts/ci/ci_install_rust.sh
|
||||
|
||||
- name: Cache Rust dependencies
|
||||
uses: actions/cache@v4
|
||||
@@ -132,7 +132,7 @@ jobs:
|
||||
id: start_servers
|
||||
run: |
|
||||
echo "Starting disaggregation servers..."
|
||||
bash scripts/ci_start_disaggregation_servers.sh &
|
||||
bash scripts/ci/ci_start_disaggregation_servers.sh &
|
||||
SERVER_PID=$!
|
||||
echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT
|
||||
|
||||
|
||||
4
.github/workflows/pr-test-rust.yml
vendored
4
.github/workflows/pr-test-rust.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_rust.sh
|
||||
bash scripts/ci/ci_install_rust.sh
|
||||
|
||||
- name: Run fmt
|
||||
run: |
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
|
||||
- name: Install rust dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_rust.sh
|
||||
bash scripts/ci/ci_install_rust.sh
|
||||
|
||||
- name: Build python binding
|
||||
run: |
|
||||
|
||||
4
.github/workflows/pr-test-sgl-kernel.yml
vendored
4
.github/workflows/pr-test-sgl-kernel.yml
vendored
@@ -84,7 +84,7 @@ jobs:
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 && pip3 install pytest
|
||||
pip3 uninstall sgl-kernel -y || true
|
||||
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
|
||||
@@ -116,7 +116,7 @@ jobs:
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
|
||||
pip3 uninstall sgl-kernel -y || true
|
||||
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
|
||||
|
||||
26
.github/workflows/pr-test.yml
vendored
26
.github/workflows/pr-test.yml
vendored
@@ -52,7 +52,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 10
|
||||
@@ -76,7 +76,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 30
|
||||
@@ -96,7 +96,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 30
|
||||
@@ -120,7 +120,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 20
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 20
|
||||
@@ -164,7 +164,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Benchmark single latency
|
||||
timeout-minutes: 10
|
||||
@@ -216,7 +216,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Benchmark offline throughput (w/o RadixAttention)
|
||||
timeout-minutes: 10
|
||||
@@ -260,7 +260,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Benchmark single latency (TP=2)
|
||||
timeout-minutes: 10
|
||||
@@ -310,7 +310,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
git clone https://github.com/merrymercy/human-eval.git
|
||||
cd human-eval
|
||||
pip install -e .
|
||||
@@ -333,7 +333,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
git clone https://github.com/merrymercy/human-eval.git
|
||||
cd human-eval
|
||||
pip install -e .
|
||||
@@ -356,7 +356,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_deepep.sh
|
||||
bash scripts/ci/ci_install_deepep.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 20
|
||||
@@ -376,7 +376,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_deepep.sh
|
||||
bash scripts/ci/ci_install_deepep.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 20
|
||||
@@ -398,7 +398,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
IS_BLACKWELL=1 bash scripts/ci_install_dependency.sh
|
||||
IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 20
|
||||
|
||||
17
.github/workflows/release-docs.yml
vendored
17
.github/workflows/release-docs.yml
vendored
@@ -24,29 +24,28 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
find /public_sglang_ci/runner-a-gpu-1/_work/_tool/Python/3.10.13/x64/lib/python3.10/site-packages -name "sgl-kernel*" -exec rm -rf {} + || true
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
pip install -r docs/requirements.txt
|
||||
apt-get update
|
||||
apt-get install -y pandoc
|
||||
apt-get update && apt-get install -y parallel retry
|
||||
|
||||
apt-get update && apt-get install -y pandoc parallel retry
|
||||
ln -sf "$(which python3)" /usr/bin/python
|
||||
|
||||
- name: Setup Jupyter Kernel
|
||||
run: |
|
||||
python -m ipykernel install --user --name python3 --display-name "Python 3"
|
||||
|
||||
- name: Execute notebooks and push to documents
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.DOCUMENTATION_PAT_TOKEN }}
|
||||
- name: Execute notebooks
|
||||
timeout-minutes: 40
|
||||
run: |
|
||||
cd docs
|
||||
make clean
|
||||
make compile
|
||||
|
||||
- name: Push HTML to sgl-project.github.io
|
||||
run: |
|
||||
cd docs
|
||||
make html
|
||||
python3 wrap_run_llm.py
|
||||
|
||||
cd _build/html
|
||||
|
||||
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
|
||||
|
||||
2
.github/workflows/vllm-dependency-test.yml
vendored
2
.github/workflows/vllm-dependency-test.yml
vendored
@@ -29,7 +29,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
pip install "vllm==0.10.0"
|
||||
pip install "openai==1.99.1"
|
||||
pip install "bitsandbytes>=0.44.0"
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
| [**Documentation**](https://docs.sglang.ai/)
|
||||
| [**Join Slack**](https://slack.sglang.ai/)
|
||||
| [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
|
||||
| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
|
||||
| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
|
||||
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
||||
|
||||
## News
|
||||
|
||||
@@ -3,7 +3,7 @@ set -euo pipefail
|
||||
|
||||
# Get version from SGLang version.py file
|
||||
FALLBACK_SGLANG_VERSION="v0.4.10.post2"
|
||||
SGLANG_VERSION_FILE="$(dirname "$0")/../python/sglang/version.py"
|
||||
SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
|
||||
|
||||
if [ -f "$SGLANG_VERSION_FILE" ]; then
|
||||
SGLANG_VERSION=$(python3 -c '
|
||||
@@ -2,7 +2,7 @@
|
||||
# Install the dependency in CI.
|
||||
set -euxo pipefail
|
||||
|
||||
bash scripts/ci_install_dependency.sh
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
|
||||
export NVSHMEM_DIR=/opt/nvshmem/install
|
||||
@@ -12,7 +12,7 @@ fi
|
||||
|
||||
# Kill existing processes
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
bash "${SCRIPT_DIR}/killall_sglang.sh"
|
||||
bash "${SCRIPT_DIR}/../killall_sglang.sh"
|
||||
|
||||
# Install apt packages
|
||||
apt install -y git libnuma-dev
|
||||
@@ -1,40 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -euxo pipefail
|
||||
|
||||
mapfile -t models < <(python3 -c "from sglang.test.test_utils import _get_default_models; print(_get_default_models())" | jq -r '.[]')
|
||||
|
||||
if [ ${#models[@]} -eq 0 ]; then
|
||||
echo "Failed to get default models."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cache_dir="${DEFAULT_MODEL_CACHE_DIR:-}"
|
||||
|
||||
if [ -z "$cache_dir" ]; then
|
||||
echo "DEFAULT_MODEL_CACHE_DIR environment variable is not set."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
failed_models=()
|
||||
for model in "${models[@]}"; do
|
||||
local_model_dir="$cache_dir/$model"
|
||||
echo "Caching model: $model to $local_model_dir"
|
||||
mkdir -p "$local_model_dir"
|
||||
|
||||
if ! huggingface-cli download "$model" \
|
||||
--local-dir "$local_model_dir" \
|
||||
--local-dir-use-symlinks False 2>/dev/null; then
|
||||
echo "WARNING: Failed to cache model: $model"
|
||||
rm -rf "$local_model_dir"
|
||||
failed_models+=("$model")
|
||||
continue
|
||||
fi
|
||||
echo "Successfully cached model: $model"
|
||||
done
|
||||
|
||||
if [ ${#failed_models[@]} -gt 0 ]; then
|
||||
echo -e "\n[Summary] Failed to cache following models:"
|
||||
printf ' - %s\n' "${failed_models[@]}"
|
||||
else
|
||||
echo -e "\n[Summary] All models cached successfully"
|
||||
fi
|
||||
@@ -87,6 +87,7 @@ FetchContent_Declare(
|
||||
GIT_SHALLOW OFF
|
||||
)
|
||||
FetchContent_Populate(repo-flashinfer)
|
||||
|
||||
# flash-attention
|
||||
FetchContent_Declare(
|
||||
repo-flash-attention
|
||||
@@ -95,6 +96,7 @@ FetchContent_Declare(
|
||||
GIT_SHALLOW OFF
|
||||
)
|
||||
FetchContent_Populate(repo-flash-attention)
|
||||
|
||||
# mscclpp
|
||||
FetchContent_Declare(
|
||||
repo-mscclpp
|
||||
@@ -232,6 +234,7 @@ set(SOURCES
|
||||
"csrc/elementwise/activation.cu"
|
||||
"csrc/elementwise/fused_add_rms_norm_kernel.cu"
|
||||
"csrc/elementwise/rope.cu"
|
||||
"csrc/common_extension.cc"
|
||||
"csrc/gemm/awq_kernel.cu"
|
||||
"csrc/gemm/bmm_fp8.cu"
|
||||
"csrc/gemm/dsv3_fused_a_gemm.cu"
|
||||
@@ -251,24 +254,10 @@ set(SOURCES
|
||||
"csrc/gemm/per_token_quant_fp8.cu"
|
||||
"csrc/gemm/qserve_w4a8_per_chn_gemm.cu"
|
||||
"csrc/gemm/qserve_w4a8_per_group_gemm.cu"
|
||||
"csrc/moe/moe_align_kernel.cu"
|
||||
"csrc/moe/moe_fused_gate.cu"
|
||||
"csrc/moe/moe_topk_softmax_kernels.cu"
|
||||
"csrc/moe/nvfp4_blockwise_moe.cu"
|
||||
"csrc/moe/fp8_blockwise_moe_kernel.cu"
|
||||
"csrc/moe/prepare_moe_input.cu"
|
||||
"csrc/moe/ep_moe_reorder_kernel.cu"
|
||||
"csrc/moe/ep_moe_silu_and_mul_kernel.cu"
|
||||
"csrc/speculative/eagle_utils.cu"
|
||||
"csrc/speculative/packbit.cu"
|
||||
"csrc/spatial/greenctx_stream.cu"
|
||||
"csrc/speculative/speculative_sampling.cu"
|
||||
"csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
|
||||
"csrc/kvcacheio/transfer.cu"
|
||||
"csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu"
|
||||
"csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu"
|
||||
"csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu"
|
||||
"csrc/common_extension.cc"
|
||||
"csrc/moe/marlin_moe_wna16/ops.cu"
|
||||
"csrc/moe/marlin_moe_wna16/gptq_marlin_repack.cu"
|
||||
"csrc/moe/marlin_moe_wna16/awq_marlin_repack.cu"
|
||||
@@ -278,6 +267,19 @@ set(SOURCES
|
||||
"csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cu"
|
||||
"csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cu"
|
||||
"csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cu"
|
||||
"csrc/moe/moe_align_kernel.cu"
|
||||
"csrc/moe/moe_fused_gate.cu"
|
||||
"csrc/moe/moe_topk_softmax_kernels.cu"
|
||||
"csrc/moe/nvfp4_blockwise_moe.cu"
|
||||
"csrc/moe/fp8_blockwise_moe_kernel.cu"
|
||||
"csrc/moe/prepare_moe_input.cu"
|
||||
"csrc/moe/ep_moe_reorder_kernel.cu"
|
||||
"csrc/moe/ep_moe_silu_and_mul_kernel.cu"
|
||||
"csrc/kvcacheio/transfer.cu"
|
||||
"csrc/speculative/eagle_utils.cu"
|
||||
"csrc/speculative/packbit.cu"
|
||||
"csrc/spatial/greenctx_stream.cu"
|
||||
"csrc/speculative/speculative_sampling.cu"
|
||||
"${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
|
||||
"${repo-flashinfer_SOURCE_DIR}/csrc/renorm.cu"
|
||||
"${repo-flashinfer_SOURCE_DIR}/csrc/sampling.cu"
|
||||
@@ -312,12 +314,15 @@ else()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
|
||||
endif()
|
||||
|
||||
# mscclpp
|
||||
set(MSCCLPP_USE_CUDA ON)
|
||||
set(MSCCLPP_BYPASS_GPU_CHECK ON)
|
||||
set(MSCCLPP_BUILD_TESTS OFF)
|
||||
add_subdirectory(${repo-mscclpp_SOURCE_DIR})
|
||||
target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static)
|
||||
|
||||
# flash attention
|
||||
target_compile_definitions(common_ops PRIVATE
|
||||
FLASHATTENTION_DISABLE_BACKWARD
|
||||
FLASHATTENTION_DISABLE_DROPOUT
|
||||
|
||||
@@ -5,6 +5,11 @@
|
||||
[](https://pypi.org/project/sgl-kernel)
|
||||
|
||||
## Installation
|
||||
For CUDA 12.1 and above:
|
||||
|
||||
```bash
|
||||
pip3 install sgl-kernel
|
||||
```
|
||||
|
||||
For CUDA 11.8:
|
||||
|
||||
@@ -12,11 +17,6 @@ For CUDA 11.8:
|
||||
pip3 install sgl-kernel -i https://docs.sglang.ai/whl/cu118
|
||||
```
|
||||
|
||||
For CUDA 12.1 or CUDA 12.4:
|
||||
|
||||
```bash
|
||||
pip3 install sgl-kernel
|
||||
```
|
||||
## Build from source
|
||||
|
||||
Development build:
|
||||
|
||||
@@ -43,7 +43,7 @@ class TestW8A8(CustomTestCase):
|
||||
metrics = run_eval(args)
|
||||
print(metrics)
|
||||
|
||||
self.assertGreater(metrics["accuracy"], 0.7)
|
||||
self.assertGreater(metrics["accuracy"], 0.69)
|
||||
|
||||
def run_decode(self, max_new_tokens):
|
||||
response = requests.post(
|
||||
@@ -13,13 +13,16 @@ class TestFile:
|
||||
|
||||
suites = {
|
||||
"per-commit": [
|
||||
TestFile("models/lora/test_lora.py", 200),
|
||||
TestFile("models/lora/test_lora_eviction.py", 200),
|
||||
TestFile("models/lora/test_lora_backend.py", 99),
|
||||
TestFile("models/lora/test_multi_lora_backend.py", 60),
|
||||
TestFile("models/lora/test_lora_cuda_graph.py", 250),
|
||||
TestFile("models/lora/test_lora_update.py", 800),
|
||||
TestFile("models/lora/test_lora_qwen3.py", 97),
|
||||
TestFile("hicache/test_hicache.py", 116),
|
||||
TestFile("hicache/test_hicache_mla.py", 127),
|
||||
TestFile("hicache/test_hicache_storage.py", 127),
|
||||
TestFile("lora/test_lora.py", 200),
|
||||
TestFile("lora/test_lora_eviction.py", 200),
|
||||
TestFile("lora/test_lora_backend.py", 99),
|
||||
TestFile("lora/test_multi_lora_backend.py", 60),
|
||||
TestFile("lora/test_lora_cuda_graph.py", 250),
|
||||
TestFile("lora/test_lora_update.py", 800),
|
||||
TestFile("lora/test_lora_qwen3.py", 97),
|
||||
TestFile("models/test_embedding_models.py", 73),
|
||||
# TestFile("models/test_clip_models.py", 52),
|
||||
TestFile("models/test_encoder_embedding_models.py", 100),
|
||||
@@ -50,8 +53,13 @@ suites = {
|
||||
TestFile("openai_server/validation/test_matched_stop.py", 60),
|
||||
TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
|
||||
TestFile("openai_server/validation/test_request_length_validation.py", 31),
|
||||
TestFile("quant/test_block_int8.py", 22),
|
||||
TestFile("quant/test_fp8_kernel.py", 8),
|
||||
TestFile("quant/test_int8_kernel.py", 8),
|
||||
TestFile("quant/test_w8a8_quantization.py", 46),
|
||||
TestFile("rl/test_update_weights_from_disk.py", 114),
|
||||
TestFile("rl/test_update_weights_from_tensor.py", 48),
|
||||
TestFile("test_abort.py", 51),
|
||||
TestFile("test_block_int8.py", 22),
|
||||
TestFile("test_create_kvindices.py", 2),
|
||||
TestFile("test_chunked_prefill.py", 313),
|
||||
TestFile("test_eagle_infer_a.py", 370),
|
||||
@@ -60,15 +68,11 @@ suites = {
|
||||
TestFile("test_eval_fp8_accuracy.py", 303),
|
||||
TestFile("test_fa3.py", 376),
|
||||
# TestFile("test_flashmla.py", 352),
|
||||
TestFile("test_fp8_kernel.py", 8),
|
||||
TestFile("test_function_call_parser.py", 10),
|
||||
TestFile("test_fused_moe.py", 30),
|
||||
TestFile("test_gpt_oss_1gpu.py", 600),
|
||||
TestFile("test_hicache.py", 116),
|
||||
TestFile("test_hicache_mla.py", 127),
|
||||
TestFile("test_hicache_storage.py", 127),
|
||||
TestFile("test_hidden_states.py", 55),
|
||||
TestFile("test_int8_kernel.py", 8),
|
||||
TestFile("test_hybrid_attn_backend.py", 100),
|
||||
TestFile("test_input_embeddings.py", 38),
|
||||
TestFile("test_io_struct.py", 8),
|
||||
TestFile("test_jinja_template_utils.py", 1),
|
||||
@@ -85,6 +89,7 @@ suites = {
|
||||
TestFile("test_pytorch_sampling_backend.py", 66),
|
||||
TestFile("test_radix_attention.py", 105),
|
||||
TestFile("test_regex_constrained.py", 64),
|
||||
TestFile("test_reasoning_parser.py", 5),
|
||||
TestFile("test_retract_decode.py", 54),
|
||||
TestFile("test_request_queue_validation.py", 30),
|
||||
TestFile("test_server_args.py", 1),
|
||||
@@ -100,23 +105,18 @@ suites = {
|
||||
TestFile("test_triton_attention_backend.py", 150),
|
||||
TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
|
||||
TestFile("test_triton_sliding_window.py", 250),
|
||||
TestFile("test_update_weights_from_disk.py", 114),
|
||||
TestFile("test_update_weights_from_tensor.py", 48),
|
||||
TestFile("test_utils_update_weights.py", 48),
|
||||
TestFile("test_vision_chunked_prefill.py", 175),
|
||||
TestFile("test_vlm_input_format.py", 300),
|
||||
TestFile("test_vision_openai_server_a.py", 989),
|
||||
TestFile("test_vision_openai_server_b.py", 620),
|
||||
TestFile("test_w8a8_quantization.py", 46),
|
||||
TestFile("test_reasoning_parser.py", 5),
|
||||
TestFile("test_hybrid_attn_backend.py", 100),
|
||||
],
|
||||
"per-commit-2-gpu": [
|
||||
TestFile("models/lora/test_lora_tp.py", 116),
|
||||
TestFile("lora/test_lora_tp.py", 116),
|
||||
TestFile("rl/test_update_weights_from_distributed.py", 103),
|
||||
TestFile("test_data_parallelism.py", 73),
|
||||
TestFile("test_dp_attention.py", 277),
|
||||
TestFile("test_patch_torch.py", 19),
|
||||
TestFile("test_update_weights_from_distributed.py", 103),
|
||||
TestFile("test_release_memory_occupation.py", 127),
|
||||
],
|
||||
"per-commit-4-gpu": [
|
||||
@@ -127,7 +127,7 @@ suites = {
|
||||
],
|
||||
"per-commit-8-gpu": [
|
||||
# Disabled because it hangs on the CI.
|
||||
# TestFile("test_moe_ep.py", 181),
|
||||
# TestFile("ep/test_moe_ep.py", 181),
|
||||
TestFile("test_disaggregation.py", 499),
|
||||
TestFile("test_disaggregation_different_tp.py", 155),
|
||||
TestFile("test_full_deepseek_v3.py", 333),
|
||||
@@ -136,16 +136,16 @@ suites = {
|
||||
# add more here
|
||||
],
|
||||
"per-commit-4-gpu-deepep": [
|
||||
TestFile("test_deepep_small.py", 531),
|
||||
TestFile("ep/test_deepep_small.py", 531),
|
||||
],
|
||||
"per-commit-8-gpu-deepep": [
|
||||
TestFile("test_deepep_large.py", 338),
|
||||
TestFile("ep/test_deepep_large.py", 338),
|
||||
],
|
||||
"nightly": [
|
||||
TestFile("test_nightly_gsm8k_eval.py"),
|
||||
],
|
||||
"vllm_dependency_test": [
|
||||
TestFile("test_awq.py", 163),
|
||||
TestFile("quant/test_awq.py", 163),
|
||||
TestFile("test_bnb.py", 5),
|
||||
TestFile("test_gguf.py", 96),
|
||||
TestFile("test_gptqmodel_dynamic.py", 102),
|
||||
@@ -156,13 +156,9 @@ suites = {
|
||||
# Add AMD tests
|
||||
suite_amd = {
|
||||
"per-commit-amd": [
|
||||
TestFile("models/lora/test_lora_backend.py", 99),
|
||||
TestFile("models/lora/test_multi_lora_backend.py", 60),
|
||||
TestFile("models/lora/test_lora_cuda_graph.py", 250),
|
||||
TestFile("test_mla.py", 242),
|
||||
TestFile("test_mla_deepseek_v3.py", 221),
|
||||
TestFile("test_torch_compile.py", 76),
|
||||
TestFile("test_torch_compile_moe.py", 172),
|
||||
TestFile("lora/test_lora_backend.py", 99),
|
||||
TestFile("lora/test_multi_lora_backend.py", 60),
|
||||
TestFile("lora/test_lora_cuda_graph.py", 250),
|
||||
TestFile("models/test_qwen_models.py", 82),
|
||||
TestFile("models/test_reward_models.py", 132),
|
||||
TestFile("openai_server/basic/test_openai_embedding.py", 141),
|
||||
@@ -170,14 +166,18 @@ suite_amd = {
|
||||
TestFile("openai_server/features/test_reasoning_content.py", 89),
|
||||
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
|
||||
TestFile("openai_server/validation/test_request_length_validation.py", 31),
|
||||
TestFile("quant/test_block_int8.py", 22),
|
||||
TestFile("quant/test_awq_dequant.py", 2),
|
||||
TestFile("rl/test_update_weights_from_disk.py", 114),
|
||||
TestFile("test_abort.py", 51),
|
||||
TestFile("test_block_int8.py", 22),
|
||||
TestFile("test_create_kvindices.py", 2),
|
||||
TestFile("test_chunked_prefill.py", 313),
|
||||
TestFile("test_eval_fp8_accuracy.py", 303),
|
||||
TestFile("test_function_call_parser.py", 10),
|
||||
TestFile("test_fused_moe.py", 30),
|
||||
TestFile("test_input_embeddings.py", 38),
|
||||
TestFile("test_mla.py", 242),
|
||||
TestFile("test_mla_deepseek_v3.py", 221),
|
||||
TestFile("test_metrics.py", 32),
|
||||
TestFile("test_no_chunked_prefill.py", 108),
|
||||
# TestFile("test_no_overlap_scheduler.py", 234), # Disabled temporarily and track in #7703
|
||||
@@ -186,22 +186,21 @@ suite_amd = {
|
||||
TestFile("test_pytorch_sampling_backend.py", 66),
|
||||
TestFile("test_radix_attention.py", 105),
|
||||
TestFile("test_retract_decode.py", 54),
|
||||
TestFile("test_server_args.py", 1),
|
||||
TestFile("test_skip_tokenizer_init.py", 117),
|
||||
TestFile("test_torch_native_attention_backend.py", 123),
|
||||
TestFile("test_triton_attention_backend.py", 150),
|
||||
TestFile("test_update_weights_from_disk.py", 114),
|
||||
TestFile("test_vertex_endpoint.py", 31),
|
||||
# TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
|
||||
TestFile("test_reasoning_parser.py", 5),
|
||||
TestFile("test_rope_rocm.py", 3),
|
||||
TestFile("test_awq_dequant.py", 2),
|
||||
TestFile("test_server_args.py", 1),
|
||||
TestFile("test_skip_tokenizer_init.py", 117),
|
||||
TestFile("test_torch_compile.py", 76),
|
||||
TestFile("test_torch_compile_moe.py", 172),
|
||||
TestFile("test_torch_native_attention_backend.py", 123),
|
||||
TestFile("test_triton_attention_backend.py", 150),
|
||||
# TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
|
||||
],
|
||||
"per-commit-2-gpu-amd": [
|
||||
TestFile("models/lora/test_lora_tp.py", 116),
|
||||
TestFile("lora/test_lora_tp.py", 116),
|
||||
TestFile("rl/test_update_weights_from_distributed.py", 103),
|
||||
TestFile("test_data_parallelism.py", 73),
|
||||
TestFile("test_patch_torch.py", 19),
|
||||
TestFile("test_update_weights_from_distributed.py", 103),
|
||||
],
|
||||
"per-commit-4-gpu-amd": [
|
||||
TestFile("test_pp_single_node.py", 150),
|
||||
@@ -236,13 +235,13 @@ suite_xeon = {
|
||||
# Add Ascend NPU tests
|
||||
suite_ascend = {
|
||||
"per-commit-1-ascend-npu": [
|
||||
TestFile("test_ascend_tp1_bf16.py", 400),
|
||||
TestFile("ascend/test_ascend_tp1_bf16.py", 400),
|
||||
],
|
||||
"per-commit-2-ascend-npu": [
|
||||
TestFile("test_ascend_tp2_bf16.py", 400),
|
||||
TestFile("ascend/test_ascend_tp2_bf16.py", 400),
|
||||
],
|
||||
"per-commit-4-ascend-npu": [
|
||||
TestFile("test_ascend_mla_w8a8int8.py", 400),
|
||||
TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,10 @@ class TestBenchServing(CustomTestCase):
|
||||
f"### test_offline_throughput_non_stream_small_batch_size\n"
|
||||
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
|
||||
)
|
||||
self.assertGreater(res["output_throughput"], 1045)
|
||||
if is_in_amd_ci():
|
||||
self.assertGreater(res["output_throughput"], 1000)
|
||||
else:
|
||||
self.assertGreater(res["output_throughput"], 1050)
|
||||
|
||||
def test_offline_throughput_without_radix_cache(self):
|
||||
res = run_bench_serving(
|
||||
|
||||
@@ -70,7 +70,7 @@ class TestIntelAMXAttnBackend(CustomTestCase):
|
||||
)
|
||||
|
||||
metrics = run_eval(args)
|
||||
self.assertGreater(metrics["score"], 0.5)
|
||||
self.assertGreater(metrics["score"], 0.45)
|
||||
finally:
|
||||
kill_process_tree(process.pid)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user