Reorganize CI and test files (#9027)

This commit is contained in:
Lianmin Zheng
2025-08-10 12:30:06 -07:00
committed by GitHub
parent b58ae7a2a0
commit 2c7f01bc89
66 changed files with 161 additions and 195 deletions

View File

@@ -24,7 +24,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
pip install -r docs/requirements.txt
apt-get update && apt-get install -y pandoc parallel retry
ln -sf "$(which python3)" /usr/bin/python

View File

@@ -21,7 +21,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Test experiment runner
timeout-minutes: 120

View File

@@ -28,14 +28,14 @@ jobs:
- name: Setup docker
run: |
touch github_summary.md
bash scripts/amd_ci_start_container.sh
bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Nightly Test
run: |
bash scripts/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY

View File

@@ -24,7 +24,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 120

View File

@@ -31,7 +31,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_rust.sh
bash scripts/ci/ci_install_rust.sh
- name: Cache Rust dependencies
uses: actions/cache@v4
@@ -78,7 +78,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_rust.sh
bash scripts/ci/ci_install_rust.sh
- name: Cache Rust dependencies
uses: actions/cache@v4

View File

@@ -36,19 +36,19 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Evaluate Accuracy
timeout-minutes: 30
run: |
bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
bash scripts/amd_ci_exec.sh python3 models/test_qwen_models.py
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
bash scripts/ci/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
bash scripts/ci/amd_ci_exec.sh python3 models/test_qwen_models.py
accuracy-test-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -62,17 +62,17 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Evaluate accuracy (TP=2)
timeout-minutes: 30
run: |
bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
mla-test-1-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -86,17 +86,17 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: MLA TEST
timeout-minutes: 30
run: |
bash scripts/amd_ci_exec.sh python3 test_mla.py
bash scripts/ci/amd_ci_exec.sh python3 test_mla.py
performance-test-1-gpu-part-1-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -110,33 +110,33 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Benchmark single latency
timeout-minutes: 20
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
- name: Benchmark online latency
timeout-minutes: 15
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
- name: Benchmark offline throughput
timeout-minutes: 15
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
- name: Benchmark offline throughput (Non-streaming, small batch size)
timeout-minutes: 15
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
performance-test-1-gpu-part-2-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -150,27 +150,27 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Benchmark offline throughput (w/o RadixAttention)
timeout-minutes: 15
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
- name: Benchmark offline throughput (w/ Triton)
timeout-minutes: 15
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
- name: Benchmark offline throughput (w/ FP8)
timeout-minutes: 15
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
bench-test-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -184,37 +184,37 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Benchmark dummy grok (TP=2)
timeout-minutes: 30
run: |
bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
- name: Benchmark single latency (TP=2)
timeout-minutes: 25
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 25
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
- name: Benchmark offline throughput (TP=2)
timeout-minutes: 25
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
timeout-minutes: 25
run: |
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
unit-test-backend-1-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -230,17 +230,17 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 50
run: |
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
unit-test-backend-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -254,17 +254,17 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 40
run: |
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
unit-test-backend-8-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -278,22 +278,22 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/amd_ci_install_dependency.sh
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 60
run: |
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
- name: Run CustomAllReduce test
timeout-minutes: 20
run: |
bash scripts/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
bash scripts/ci/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
unit-test-sgl-kernel-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -308,13 +308,13 @@ jobs:
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/amd_ci_start_container.sh
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: |
bash scripts/amd_ci_install_dependency.sh
bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 10

View File

@@ -34,7 +34,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/npu_ci_install_dependency.sh
bash scripts/ci/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# copy download through proxy
@@ -63,7 +63,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/npu_ci_install_dependency.sh
bash scripts/ci/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# copy download through proxy
@@ -92,7 +92,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/npu_ci_install_dependency.sh
bash scripts/ci/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# copy download through proxy

View File

@@ -5,13 +5,13 @@ on:
branches: [ main ]
paths:
- 'python/sglang/srt/disaggregation/**'
- 'scripts/ci_start_disaggregation_servers.sh'
- 'scripts/ci/ci_start_disaggregation_servers.sh'
- 'sgl-router/**'
pull_request:
branches: [ main ]
paths:
- 'python/sglang/srt/disaggregation/**'
- 'scripts/ci_start_disaggregation_servers.sh'
- 'scripts/ci/ci_start_disaggregation_servers.sh'
- 'sgl-router/**'
workflow_dispatch:
@@ -44,7 +44,7 @@ jobs:
- name: Setup Rust
run: |
bash scripts/ci_install_rust.sh
bash scripts/ci/ci_install_rust.sh
- name: Cache Rust dependencies
uses: actions/cache@v4
@@ -132,7 +132,7 @@ jobs:
id: start_servers
run: |
echo "Starting disaggregation servers..."
bash scripts/ci_start_disaggregation_servers.sh &
bash scripts/ci/ci_start_disaggregation_servers.sh &
SERVER_PID=$!
echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT

View File

@@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_rust.sh
bash scripts/ci/ci_install_rust.sh
- name: Run fmt
run: |
@@ -64,7 +64,7 @@ jobs:
- name: Install rust dependencies
run: |
bash scripts/ci_install_rust.sh
bash scripts/ci/ci_install_rust.sh
- name: Build python binding
run: |

View File

@@ -84,7 +84,7 @@ jobs:
- name: Install
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 && pip3 install pytest
pip3 uninstall sgl-kernel -y || true
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
@@ -116,7 +116,7 @@ jobs:
- name: Install
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
pip3 uninstall sgl-kernel -y || true
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps

View File

@@ -52,7 +52,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 10
@@ -76,7 +76,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
@@ -96,7 +96,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
@@ -120,7 +120,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
@@ -144,7 +144,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
@@ -164,7 +164,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Benchmark single latency
timeout-minutes: 10
@@ -216,7 +216,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Benchmark offline throughput (w/o RadixAttention)
timeout-minutes: 10
@@ -260,7 +260,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
- name: Benchmark single latency (TP=2)
timeout-minutes: 10
@@ -310,7 +310,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
pip install -e .
@@ -333,7 +333,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
pip install -e .
@@ -356,7 +356,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_deepep.sh
bash scripts/ci/ci_install_deepep.sh
- name: Run test
timeout-minutes: 20
@@ -376,7 +376,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_deepep.sh
bash scripts/ci/ci_install_deepep.sh
- name: Run test
timeout-minutes: 20
@@ -398,7 +398,7 @@ jobs:
- name: Install dependencies
run: |
IS_BLACKWELL=1 bash scripts/ci_install_dependency.sh
IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20

View File

@@ -24,29 +24,28 @@ jobs:
- name: Install dependencies
run: |
find /public_sglang_ci/runner-a-gpu-1/_work/_tool/Python/3.10.13/x64/lib/python3.10/site-packages -name "sgl-kernel*" -exec rm -rf {} + || true
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
pip install -r docs/requirements.txt
apt-get update
apt-get install -y pandoc
apt-get update && apt-get install -y parallel retry
apt-get update && apt-get install -y pandoc parallel retry
ln -sf "$(which python3)" /usr/bin/python
- name: Setup Jupyter Kernel
run: |
python -m ipykernel install --user --name python3 --display-name "Python 3"
- name: Execute notebooks and push to documents
env:
GITHUB_TOKEN: ${{ secrets.DOCUMENTATION_PAT_TOKEN }}
- name: Execute notebooks
timeout-minutes: 40
run: |
cd docs
make clean
make compile
- name: Push HTML to sgl-project.github.io
run: |
cd docs
make html
python3 wrap_run_llm.py
cd _build/html
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1

View File

@@ -29,7 +29,7 @@ jobs:
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
pip install "vllm==0.10.0"
pip install "openai==1.99.1"
pip install "bitsandbytes>=0.44.0"

View File

@@ -16,7 +16,7 @@
| [**Documentation**](https://docs.sglang.ai/)
| [**Join Slack**](https://slack.sglang.ai/)
| [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
## News

View File

@@ -3,7 +3,7 @@ set -euo pipefail
# Get version from SGLang version.py file
FALLBACK_SGLANG_VERSION="v0.4.10.post2"
SGLANG_VERSION_FILE="$(dirname "$0")/../python/sglang/version.py"
SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
if [ -f "$SGLANG_VERSION_FILE" ]; then
SGLANG_VERSION=$(python3 -c '

View File

@@ -2,7 +2,7 @@
# Install the dependency in CI.
set -euxo pipefail
bash scripts/ci_install_dependency.sh
bash scripts/ci/ci_install_dependency.sh
export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
export NVSHMEM_DIR=/opt/nvshmem/install

View File

@@ -12,7 +12,7 @@ fi
# Kill existing processes
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
bash "${SCRIPT_DIR}/killall_sglang.sh"
bash "${SCRIPT_DIR}/../killall_sglang.sh"
# Install apt packages
apt install -y git libnuma-dev

View File

@@ -1,40 +0,0 @@
#!/bin/bash
# Pre-download the default test models into a local cache directory.
#
# Inputs:
#   DEFAULT_MODEL_CACHE_DIR  (env, required) root directory; each model is
#                            stored under "$DEFAULT_MODEL_CACHE_DIR/<model>".
# Requires python3 (with sglang importable), jq and huggingface-cli on PATH.
#
# Exit status: 1 when the model list is empty or the cache directory is not
# configured. Individual download failures are only reported in the summary;
# nothing after the loop exits non-zero because of them.
set -euxo pipefail
# Read the model names into an array, one per line. The Python output is fed
# through `jq -r '.[]'`, so it is consumed as a JSON array of strings.
mapfile -t models < <(python3 -c "from sglang.test.test_utils import _get_default_models; print(_get_default_models())" | jq -r '.[]')
# A failure inside the process substitution above does not abort the script
# on its own, so an empty result is checked for explicitly.
if [ ${#models[@]} -eq 0 ]; then
echo "Failed to get default models."
exit 1
fi
# The ":-" default keeps `set -u` from aborting on an unset variable, so the
# friendlier message below can be printed instead.
cache_dir="${DEFAULT_MODEL_CACHE_DIR:-}"
if [ -z "$cache_dir" ]; then
echo "DEFAULT_MODEL_CACHE_DIR environment variable is not set."
exit 1
fi
# Models whose download failed; reported in the summary at the end.
failed_models=()
for model in "${models[@]}"; do
local_model_dir="$cache_dir/$model"
echo "Caching model: $model to $local_model_dir"
mkdir -p "$local_model_dir"
# Running the download as an `if !` condition keeps `set -e` from aborting on
# failure. stderr is discarded, so the downloader's own error text is not shown.
if ! huggingface-cli download "$model" \
--local-dir "$local_model_dir" \
--local-dir-use-symlinks False 2>/dev/null; then
echo "WARNING: Failed to cache model: $model"
# Remove the directory of the failed download so no partial copy is left behind.
rm -rf "$local_model_dir"
failed_models+=("$model")
continue
fi
echo "Successfully cached model: $model"
done
# Final report: list every model that could not be cached, or confirm success.
if [ ${#failed_models[@]} -gt 0 ]; then
echo -e "\n[Summary] Failed to cache following models:"
printf ' - %s\n' "${failed_models[@]}"
else
echo -e "\n[Summary] All models cached successfully"
fi

View File

@@ -87,6 +87,7 @@ FetchContent_Declare(
GIT_SHALLOW OFF
)
FetchContent_Populate(repo-flashinfer)
# flash-attention
FetchContent_Declare(
repo-flash-attention
@@ -95,6 +96,7 @@ FetchContent_Declare(
GIT_SHALLOW OFF
)
FetchContent_Populate(repo-flash-attention)
# mscclpp
FetchContent_Declare(
repo-mscclpp
@@ -232,6 +234,7 @@ set(SOURCES
"csrc/elementwise/activation.cu"
"csrc/elementwise/fused_add_rms_norm_kernel.cu"
"csrc/elementwise/rope.cu"
"csrc/common_extension.cc"
"csrc/gemm/awq_kernel.cu"
"csrc/gemm/bmm_fp8.cu"
"csrc/gemm/dsv3_fused_a_gemm.cu"
@@ -251,24 +254,10 @@ set(SOURCES
"csrc/gemm/per_token_quant_fp8.cu"
"csrc/gemm/qserve_w4a8_per_chn_gemm.cu"
"csrc/gemm/qserve_w4a8_per_group_gemm.cu"
"csrc/moe/moe_align_kernel.cu"
"csrc/moe/moe_fused_gate.cu"
"csrc/moe/moe_topk_softmax_kernels.cu"
"csrc/moe/nvfp4_blockwise_moe.cu"
"csrc/moe/fp8_blockwise_moe_kernel.cu"
"csrc/moe/prepare_moe_input.cu"
"csrc/moe/ep_moe_reorder_kernel.cu"
"csrc/moe/ep_moe_silu_and_mul_kernel.cu"
"csrc/speculative/eagle_utils.cu"
"csrc/speculative/packbit.cu"
"csrc/spatial/greenctx_stream.cu"
"csrc/speculative/speculative_sampling.cu"
"csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
"csrc/kvcacheio/transfer.cu"
"csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu"
"csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu"
"csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu"
"csrc/common_extension.cc"
"csrc/moe/marlin_moe_wna16/ops.cu"
"csrc/moe/marlin_moe_wna16/gptq_marlin_repack.cu"
"csrc/moe/marlin_moe_wna16/awq_marlin_repack.cu"
@@ -278,6 +267,19 @@ set(SOURCES
"csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cu"
"csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cu"
"csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cu"
"csrc/moe/moe_align_kernel.cu"
"csrc/moe/moe_fused_gate.cu"
"csrc/moe/moe_topk_softmax_kernels.cu"
"csrc/moe/nvfp4_blockwise_moe.cu"
"csrc/moe/fp8_blockwise_moe_kernel.cu"
"csrc/moe/prepare_moe_input.cu"
"csrc/moe/ep_moe_reorder_kernel.cu"
"csrc/moe/ep_moe_silu_and_mul_kernel.cu"
"csrc/kvcacheio/transfer.cu"
"csrc/speculative/eagle_utils.cu"
"csrc/speculative/packbit.cu"
"csrc/spatial/greenctx_stream.cu"
"csrc/speculative/speculative_sampling.cu"
"${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
"${repo-flashinfer_SOURCE_DIR}/csrc/renorm.cu"
"${repo-flashinfer_SOURCE_DIR}/csrc/sampling.cu"
@@ -312,12 +314,15 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
endif()
# mscclpp
set(MSCCLPP_USE_CUDA ON)
set(MSCCLPP_BYPASS_GPU_CHECK ON)
set(MSCCLPP_BUILD_TESTS OFF)
add_subdirectory(${repo-mscclpp_SOURCE_DIR})
target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static)
# flash attention
target_compile_definitions(common_ops PRIVATE
FLASHATTENTION_DISABLE_BACKWARD
FLASHATTENTION_DISABLE_DROPOUT

View File

@@ -5,6 +5,11 @@
[![PyPI](https://img.shields.io/pypi/v/sgl-kernel)](https://pypi.org/project/sgl-kernel)
## Installation
For CUDA 12.1 and above:
```bash
pip3 install sgl-kernel
```
For CUDA 11.8:
@@ -12,11 +17,6 @@ For CUDA 11.8:
pip3 install sgl-kernel -i https://docs.sglang.ai/whl/cu118
```
For CUDA 12.1 or CUDA 12.4:
```bash
pip3 install sgl-kernel
```
## Build from source
Development build:

View File

@@ -43,7 +43,7 @@ class TestW8A8(CustomTestCase):
metrics = run_eval(args)
print(metrics)
self.assertGreater(metrics["accuracy"], 0.7)
self.assertGreater(metrics["accuracy"], 0.69)
def run_decode(self, max_new_tokens):
response = requests.post(

View File

@@ -13,13 +13,16 @@ class TestFile:
suites = {
"per-commit": [
TestFile("models/lora/test_lora.py", 200),
TestFile("models/lora/test_lora_eviction.py", 200),
TestFile("models/lora/test_lora_backend.py", 99),
TestFile("models/lora/test_multi_lora_backend.py", 60),
TestFile("models/lora/test_lora_cuda_graph.py", 250),
TestFile("models/lora/test_lora_update.py", 800),
TestFile("models/lora/test_lora_qwen3.py", 97),
TestFile("hicache/test_hicache.py", 116),
TestFile("hicache/test_hicache_mla.py", 127),
TestFile("hicache/test_hicache_storage.py", 127),
TestFile("lora/test_lora.py", 200),
TestFile("lora/test_lora_eviction.py", 200),
TestFile("lora/test_lora_backend.py", 99),
TestFile("lora/test_multi_lora_backend.py", 60),
TestFile("lora/test_lora_cuda_graph.py", 250),
TestFile("lora/test_lora_update.py", 800),
TestFile("lora/test_lora_qwen3.py", 97),
TestFile("models/test_embedding_models.py", 73),
# TestFile("models/test_clip_models.py", 52),
TestFile("models/test_encoder_embedding_models.py", 100),
@@ -50,8 +53,13 @@ suites = {
TestFile("openai_server/validation/test_matched_stop.py", 60),
TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
TestFile("openai_server/validation/test_request_length_validation.py", 31),
TestFile("quant/test_block_int8.py", 22),
TestFile("quant/test_fp8_kernel.py", 8),
TestFile("quant/test_int8_kernel.py", 8),
TestFile("quant/test_w8a8_quantization.py", 46),
TestFile("rl/test_update_weights_from_disk.py", 114),
TestFile("rl/test_update_weights_from_tensor.py", 48),
TestFile("test_abort.py", 51),
TestFile("test_block_int8.py", 22),
TestFile("test_create_kvindices.py", 2),
TestFile("test_chunked_prefill.py", 313),
TestFile("test_eagle_infer_a.py", 370),
@@ -60,15 +68,11 @@ suites = {
TestFile("test_eval_fp8_accuracy.py", 303),
TestFile("test_fa3.py", 376),
# TestFile("test_flashmla.py", 352),
TestFile("test_fp8_kernel.py", 8),
TestFile("test_function_call_parser.py", 10),
TestFile("test_fused_moe.py", 30),
TestFile("test_gpt_oss_1gpu.py", 600),
TestFile("test_hicache.py", 116),
TestFile("test_hicache_mla.py", 127),
TestFile("test_hicache_storage.py", 127),
TestFile("test_hidden_states.py", 55),
TestFile("test_int8_kernel.py", 8),
TestFile("test_hybrid_attn_backend.py", 100),
TestFile("test_input_embeddings.py", 38),
TestFile("test_io_struct.py", 8),
TestFile("test_jinja_template_utils.py", 1),
@@ -85,6 +89,7 @@ suites = {
TestFile("test_pytorch_sampling_backend.py", 66),
TestFile("test_radix_attention.py", 105),
TestFile("test_regex_constrained.py", 64),
TestFile("test_reasoning_parser.py", 5),
TestFile("test_retract_decode.py", 54),
TestFile("test_request_queue_validation.py", 30),
TestFile("test_server_args.py", 1),
@@ -100,23 +105,18 @@ suites = {
TestFile("test_triton_attention_backend.py", 150),
TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
TestFile("test_triton_sliding_window.py", 250),
TestFile("test_update_weights_from_disk.py", 114),
TestFile("test_update_weights_from_tensor.py", 48),
TestFile("test_utils_update_weights.py", 48),
TestFile("test_vision_chunked_prefill.py", 175),
TestFile("test_vlm_input_format.py", 300),
TestFile("test_vision_openai_server_a.py", 989),
TestFile("test_vision_openai_server_b.py", 620),
TestFile("test_w8a8_quantization.py", 46),
TestFile("test_reasoning_parser.py", 5),
TestFile("test_hybrid_attn_backend.py", 100),
],
"per-commit-2-gpu": [
TestFile("models/lora/test_lora_tp.py", 116),
TestFile("lora/test_lora_tp.py", 116),
TestFile("rl/test_update_weights_from_distributed.py", 103),
TestFile("test_data_parallelism.py", 73),
TestFile("test_dp_attention.py", 277),
TestFile("test_patch_torch.py", 19),
TestFile("test_update_weights_from_distributed.py", 103),
TestFile("test_release_memory_occupation.py", 127),
],
"per-commit-4-gpu": [
@@ -127,7 +127,7 @@ suites = {
],
"per-commit-8-gpu": [
# Disabled because it hangs on the CI.
# TestFile("test_moe_ep.py", 181),
# TestFile("ep/test_moe_ep.py", 181),
TestFile("test_disaggregation.py", 499),
TestFile("test_disaggregation_different_tp.py", 155),
TestFile("test_full_deepseek_v3.py", 333),
@@ -136,16 +136,16 @@ suites = {
# add more here
],
"per-commit-4-gpu-deepep": [
TestFile("test_deepep_small.py", 531),
TestFile("ep/test_deepep_small.py", 531),
],
"per-commit-8-gpu-deepep": [
TestFile("test_deepep_large.py", 338),
TestFile("ep/test_deepep_large.py", 338),
],
"nightly": [
TestFile("test_nightly_gsm8k_eval.py"),
],
"vllm_dependency_test": [
TestFile("test_awq.py", 163),
TestFile("quant/test_awq.py", 163),
TestFile("test_bnb.py", 5),
TestFile("test_gguf.py", 96),
TestFile("test_gptqmodel_dynamic.py", 102),
@@ -156,13 +156,9 @@ suites = {
# Add AMD tests
suite_amd = {
"per-commit-amd": [
TestFile("models/lora/test_lora_backend.py", 99),
TestFile("models/lora/test_multi_lora_backend.py", 60),
TestFile("models/lora/test_lora_cuda_graph.py", 250),
TestFile("test_mla.py", 242),
TestFile("test_mla_deepseek_v3.py", 221),
TestFile("test_torch_compile.py", 76),
TestFile("test_torch_compile_moe.py", 172),
TestFile("lora/test_lora_backend.py", 99),
TestFile("lora/test_multi_lora_backend.py", 60),
TestFile("lora/test_lora_cuda_graph.py", 250),
TestFile("models/test_qwen_models.py", 82),
TestFile("models/test_reward_models.py", 132),
TestFile("openai_server/basic/test_openai_embedding.py", 141),
@@ -170,14 +166,18 @@ suite_amd = {
TestFile("openai_server/features/test_reasoning_content.py", 89),
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
TestFile("openai_server/validation/test_request_length_validation.py", 31),
TestFile("quant/test_block_int8.py", 22),
TestFile("quant/test_awq_dequant.py", 2),
TestFile("rl/test_update_weights_from_disk.py", 114),
TestFile("test_abort.py", 51),
TestFile("test_block_int8.py", 22),
TestFile("test_create_kvindices.py", 2),
TestFile("test_chunked_prefill.py", 313),
TestFile("test_eval_fp8_accuracy.py", 303),
TestFile("test_function_call_parser.py", 10),
TestFile("test_fused_moe.py", 30),
TestFile("test_input_embeddings.py", 38),
TestFile("test_mla.py", 242),
TestFile("test_mla_deepseek_v3.py", 221),
TestFile("test_metrics.py", 32),
TestFile("test_no_chunked_prefill.py", 108),
# TestFile("test_no_overlap_scheduler.py", 234), # Disabled temporarily and track in #7703
@@ -186,22 +186,21 @@ suite_amd = {
TestFile("test_pytorch_sampling_backend.py", 66),
TestFile("test_radix_attention.py", 105),
TestFile("test_retract_decode.py", 54),
TestFile("test_server_args.py", 1),
TestFile("test_skip_tokenizer_init.py", 117),
TestFile("test_torch_native_attention_backend.py", 123),
TestFile("test_triton_attention_backend.py", 150),
TestFile("test_update_weights_from_disk.py", 114),
TestFile("test_vertex_endpoint.py", 31),
# TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
TestFile("test_reasoning_parser.py", 5),
TestFile("test_rope_rocm.py", 3),
TestFile("test_awq_dequant.py", 2),
TestFile("test_server_args.py", 1),
TestFile("test_skip_tokenizer_init.py", 117),
TestFile("test_torch_compile.py", 76),
TestFile("test_torch_compile_moe.py", 172),
TestFile("test_torch_native_attention_backend.py", 123),
TestFile("test_triton_attention_backend.py", 150),
# TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
],
"per-commit-2-gpu-amd": [
TestFile("models/lora/test_lora_tp.py", 116),
TestFile("lora/test_lora_tp.py", 116),
TestFile("rl/test_update_weights_from_distributed.py", 103),
TestFile("test_data_parallelism.py", 73),
TestFile("test_patch_torch.py", 19),
TestFile("test_update_weights_from_distributed.py", 103),
],
"per-commit-4-gpu-amd": [
TestFile("test_pp_single_node.py", 150),
@@ -236,13 +235,13 @@ suite_xeon = {
# Add Ascend NPU tests
suite_ascend = {
"per-commit-1-ascend-npu": [
TestFile("test_ascend_tp1_bf16.py", 400),
TestFile("ascend/test_ascend_tp1_bf16.py", 400),
],
"per-commit-2-ascend-npu": [
TestFile("test_ascend_tp2_bf16.py", 400),
TestFile("ascend/test_ascend_tp2_bf16.py", 400),
],
"per-commit-4-ascend-npu": [
TestFile("test_ascend_mla_w8a8int8.py", 400),
TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),
],
}

View File

@@ -56,7 +56,10 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_non_stream_small_batch_size\n"
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
self.assertGreater(res["output_throughput"], 1045)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 1000)
else:
self.assertGreater(res["output_throughput"], 1050)
def test_offline_throughput_without_radix_cache(self):
res = run_bench_serving(

View File

@@ -70,7 +70,7 @@ class TestIntelAMXAttnBackend(CustomTestCase):
)
metrics = run_eval(args)
self.assertGreater(metrics["score"], 0.5)
self.assertGreater(metrics["score"], 0.45)
finally:
kill_process_tree(process.pid)