diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 0a35d3aef..2cdb55ef1 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -27,32 +27,15 @@ jobs: - name: Setup docker run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi touch github_summary.md - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
- docker exec ci_sglang pip install huggingface_hub[hf_xet] + run: bash scripts/amd_ci_install_dependency.sh - name: Nightly Test run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" ci_sglang python3 run_suite.py --suite nightly-amd --timeout-per-file 7200 + bash scripts/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200 echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 8da2fabb7..2f7bba308 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -35,37 +35,20 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec 
-w /human-eval ci_sglang pip install -e . + run: bash scripts/amd_ci_install_dependency.sh - name: Evaluate Accuracy timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_accuracy_large.py - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py + bash scripts/amd_ci_exec.sh python3 test_eval_accuracy_large.py + bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py + bash scripts/amd_ci_exec.sh python3 models/test_qwen_models.py accuracy-test-2-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -78,35 +61,18 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git 
clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . + run: bash scripts/amd_ci_install_dependency.sh - name: Evaluate accuracy (TP=2) timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_moe_eval_accuracy_large.py + bash scripts/amd_ci_exec.sh python3 test_moe_eval_accuracy_large.py mla-test-1-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -119,35 +85,18 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
+ run: bash scripts/amd_ci_install_dependency.sh - name: MLA TEST timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py + bash scripts/amd_ci_exec.sh python3 test_mla.py performance-test-1-gpu-part-1-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -160,56 +109,39 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
+ run: bash scripts/amd_ci_install_dependency.sh - name: Benchmark single latency timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default - name: Benchmark online latency timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default - name: Benchmark offline throughput timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default - name: Benchmark offline throughput (Non-streaming, small batch size) timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size - name: Benchmark online latency (EAGLE) timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest 
test_bench_serving.TestBenchServing.test_online_latency_eagle + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle performance-test-1-gpu-part-2-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -222,45 +154,28 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
+ run: bash scripts/amd_ci_install_dependency.sh - name: Benchmark offline throughput (w/o RadixAttention) timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache - name: Benchmark offline throughput (w/ Triton) timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend - name: Benchmark offline throughput (w/ FP8) timeout-minutes: 10 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 bench-test-2-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -273,59 +188,38 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. 
- if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
- - docker exec -w / ci_sglang mkdir -p /dummy-grok - mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json - docker cp ./dummy-grok ci_sglang:/ + run: bash scripts/amd_ci_install_dependency.sh - name: Benchmark dummy grok (TP=2) timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_dummy_grok_models.py + bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py - name: Benchmark single latency (TP=2) timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - name: Benchmark single latency + torch.compile (TP=2) timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 - name: Benchmark offline throughput (TP=2) timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default + bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache + bash scripts/amd_ci_exec.sh python3 -m unittest 
test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache unit-test-backend-1-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -338,35 +232,18 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
+ run: bash scripts/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 -e SGLANG_AITER_MOE=1 ci_sglang python3 run_suite.py --suite per-commit-amd + bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd unit-test-backend-2-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -379,35 +256,18 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
+ run: bash scripts/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 -e SGLANG_AITER_MOE=1 ci_sglang python3 run_suite.py --suite per-commit-2-gpu-amd + bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd unit-test-backend-8-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -420,35 +280,18 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup docker - run: | - # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG. - if [ -f "/etc/podinfo/gha-render-devices" ]; then - DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) - else - DEVICE_FLAG="--device /dev/dri" - fi - docker pull lmsysorg/sglang:v0.4.6.post3-rocm630 - docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ - -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ - --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ - -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.6.post3-rocm630 + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: | - docker exec ci_sglang pip install --upgrade pip - docker exec ci_sglang pip uninstall sgl-kernel -y || true - docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" - docker exec ci_sglang pip install -e "python[dev_hip]" - - docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git - docker exec -w /human-eval ci_sglang pip install -e . 
+ run: bash scripts/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 -e SGLANG_AITER_MOE=1 ci_sglang python3 run_suite.py --suite per-commit-8-gpu-amd + bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd finish: if: always() diff --git a/scripts/amd_ci_exec.sh b/scripts/amd_ci_exec.sh new file mode 100755 index 000000000..a57e608e6 --- /dev/null +++ b/scripts/amd_ci_exec.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -euo pipefail + +# Default working directory +WORKDIR="/sglang-checkout/test/srt" +ENV_ARGS=( + -e SGLANG_AMD_CI=1 + -e SGLANG_IS_IN_CI=1 + -e SGLANG_AITER_MOE=1 +) + +# Parse optional -w/--workdir and -e ENV=VAL flags +while [[ $# -gt 0 ]]; do + case "$1" in + -w|--workdir) + WORKDIR="$2" + shift 2 + ;; + -e) + ENV_ARGS+=("-e" "$2") + shift 2 + ;; + --) + shift + break + ;; + *) + break + ;; + esac +done + +# Run docker exec +docker exec \ + -w "$WORKDIR" \ + "${ENV_ARGS[@]}" \ + ci_sglang "$@" diff --git a/scripts/amd_ci_install_dependency.sh b/scripts/amd_ci_install_dependency.sh new file mode 100755 index 000000000..eedbed020 --- /dev/null +++ b/scripts/amd_ci_install_dependency.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -euo pipefail + +# Install the required dependencies in CI. +docker exec ci_sglang pip install --upgrade pip +docker exec ci_sglang pip uninstall sgl-kernel -y || true +docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" +docker exec ci_sglang pip install -e "python[dev_hip]" + +docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git +docker exec -w /human-eval ci_sglang pip install -e . 
+ +docker exec -w / ci_sglang mkdir -p /dummy-grok +mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json +docker cp ./dummy-grok ci_sglang:/ diff --git a/scripts/amd_ci_start_container.sh b/scripts/amd_ci_start_container.sh new file mode 100755 index 000000000..30fd26d05 --- /dev/null +++ b/scripts/amd_ci_start_container.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -euo pipefail + +# Set up DEVICE_FLAG based on Kubernetes pod info +if [ -f "/etc/podinfo/gha-render-devices" ]; then + DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) +else + DEVICE_FLAG="--device /dev/dri" +fi + +# Pull the image +IMAGE="lmsysorg/sglang:v0.4.6.post3-rocm630" +echo "Pulling Docker image: $IMAGE" +docker pull "$IMAGE" + +# Run the container +echo "Starting container: ci_sglang" +docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ + -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \ + --ipc=host --group-add video \ + --cap-add=SYS_PTRACE \ + -e HF_TOKEN="${HF_TOKEN:-}" \ + --security-opt seccomp=unconfined \ + -w /sglang-checkout \ + --name ci_sglang \ + "$IMAGE"