[ci]use H20 to run disaggregation test (#11543)
This commit is contained in:
106
.github/workflows/pr-test-h20.yml
vendored
106
.github/workflows/pr-test-h20.yml
vendored
@@ -1,106 +0,0 @@
|
|||||||
name: PR Test (H20)
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [ main ]
|
|
||||||
pull_request:
|
|
||||||
branches: [ main ]
|
|
||||||
types: [synchronize, labeled]
|
|
||||||
workflow_dispatch:
|
|
||||||
inputs:
|
|
||||||
version:
|
|
||||||
required: true
|
|
||||||
type: choice
|
|
||||||
default: 'release'
|
|
||||||
options:
|
|
||||||
- 'release'
|
|
||||||
- 'nightly'
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: pr-test-h20-${{ github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
check-changes:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
outputs:
|
|
||||||
h20_files: ${{ steps.filter.outputs.h20_files }}
|
|
||||||
steps:
|
|
||||||
- name: Checkout code
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Fail if the PR does not have the 'run-ci' label
|
|
||||||
if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
|
|
||||||
run: |
|
|
||||||
echo "This pull request does not have the 'run-ci' label. Failing the workflow."
|
|
||||||
exit 1
|
|
||||||
|
|
||||||
- name: Fail if the PR is a draft
|
|
||||||
if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
|
|
||||||
run: |
|
|
||||||
echo "This pull request is a draft. Failing the workflow."
|
|
||||||
exit 1
|
|
||||||
|
|
||||||
- name: Detect file changes
|
|
||||||
id: filter
|
|
||||||
uses: dorny/paths-filter@v3
|
|
||||||
with:
|
|
||||||
filters: |
|
|
||||||
h20_files:
|
|
||||||
- "python/sglang/srt/models/deepseek*"
|
|
||||||
- "python/sglang/srt/layers/moe/**"
|
|
||||||
- ".github/workflows/pr-test-h20.yml"
|
|
||||||
- "python/pyproject.toml"
|
|
||||||
|
|
||||||
per-commit-8-gpu-h20:
|
|
||||||
needs: [check-changes]
|
|
||||||
if: needs.check-changes.outputs.h20_files == 'true'
|
|
||||||
runs-on: 8-gpu-h20
|
|
||||||
steps:
|
|
||||||
- name: Checkout code
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
bash scripts/ci/ci_install_dependency.sh
|
|
||||||
|
|
||||||
- name: Run test
|
|
||||||
timeout-minutes: 20
|
|
||||||
|
|
||||||
run: |
|
|
||||||
cd test/srt
|
|
||||||
python3 run_suite.py --suite per-commit-8-gpu-h20
|
|
||||||
|
|
||||||
pr-test-h20-finish:
|
|
||||||
needs: [
|
|
||||||
check-changes,
|
|
||||||
per-commit-8-gpu-h20,
|
|
||||||
]
|
|
||||||
if: always()
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Check all dependent job statuses
|
|
||||||
run: |
|
|
||||||
# Convert the 'needs' context to a JSON string
|
|
||||||
json_needs='${{ toJson(needs) }}'
|
|
||||||
|
|
||||||
# Get a list of all job names from the JSON keys
|
|
||||||
job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]')
|
|
||||||
|
|
||||||
for job in $job_names; do
|
|
||||||
# For each job, extract its result
|
|
||||||
result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')
|
|
||||||
|
|
||||||
# Print the job name and its result
|
|
||||||
echo "$job: $result"
|
|
||||||
|
|
||||||
# Check for failure or cancellation and exit if found
|
|
||||||
if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
|
|
||||||
echo "The above jobs failed."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# If the loop completes, all jobs were successful
|
|
||||||
echo "All jobs completed successfully"
|
|
||||||
exit 0
|
|
||||||
33
.github/workflows/pr-test.yml
vendored
33
.github/workflows/pr-test.yml
vendored
@@ -350,6 +350,39 @@ jobs:
|
|||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
||||||
|
|
||||||
|
unit-test-backend-8-gpu-h20:
|
||||||
|
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
||||||
|
if: always() && !failure() && !cancelled() &&
|
||||||
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
|
runs-on: 8-gpu-h20
|
||||||
|
env:
|
||||||
|
SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
part: [0, 1]
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Download artifacts
|
||||||
|
if: needs.check-changes.outputs.sgl_kernel == 'true'
|
||||||
|
uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
path: sgl-kernel/dist/
|
||||||
|
merge-multiple: true
|
||||||
|
pattern: wheel-python3.10-cuda12.9
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
|
||||||
|
|
||||||
|
- name: Run test
|
||||||
|
timeout-minutes: 20
|
||||||
|
run: |
|
||||||
|
cd test/srt
|
||||||
|
python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
||||||
|
|
||||||
performance-test-1-gpu-part-1:
|
performance-test-1-gpu-part-1:
|
||||||
needs: [check-changes, sgl-kernel-build-wheels]
|
needs: [check-changes, sgl-kernel-build-wheels]
|
||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
@@ -15,6 +16,8 @@ from sglang.test.test_utils import (
|
|||||||
popen_with_error_check,
|
popen_with_error_check,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TestDisaggregationBase(CustomTestCase):
|
class TestDisaggregationBase(CustomTestCase):
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -100,11 +103,28 @@ class TestDisaggregationBase(CustomTestCase):
|
|||||||
|
|
||||||
|
|
||||||
def get_rdma_devices_args():
|
def get_rdma_devices_args():
|
||||||
|
def _parse_list_env(var_name: str):
|
||||||
|
val = os.getenv(var_name)
|
||||||
|
if not val:
|
||||||
|
return None
|
||||||
|
items = [x.strip() for x in val.split(",") if x.strip()]
|
||||||
|
return items or None
|
||||||
|
|
||||||
|
def _pick_default_pair(rdma_all_devices):
|
||||||
|
return [rdma_all_devices[0], rdma_all_devices[len(rdma_all_devices) // 2]]
|
||||||
|
|
||||||
|
rdma_all_devices = _parse_list_env("SGLANG_CI_RDMA_ALL_DEVICES") or [
|
||||||
|
f"mlx5_roce{i}" for i in range(8)
|
||||||
|
]
|
||||||
|
logger.info("Resolved rdma_all_devices=%s", rdma_all_devices)
|
||||||
|
|
||||||
|
n_rdma = len(rdma_all_devices)
|
||||||
|
|
||||||
# 1. Get visible GPU indices
|
# 1. Get visible GPU indices
|
||||||
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
|
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
|
||||||
if not cuda_visible_devices:
|
if not cuda_visible_devices:
|
||||||
warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
|
warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
|
||||||
return "mlx5_roce0,mlx5_roce4"
|
return ",".join(_pick_default_pair(rdma_all_devices))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Convert to list of integers (handling possible spaces and empty strings)
|
# Convert to list of integers (handling possible spaces and empty strings)
|
||||||
@@ -112,29 +132,27 @@ def get_rdma_devices_args():
|
|||||||
int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
|
int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
|
||||||
]
|
]
|
||||||
if not gpu_indices or len(gpu_indices) > 4:
|
if not gpu_indices or len(gpu_indices) > 4:
|
||||||
return "mlx5_roce0,mlx5_roce4"
|
return ",".join(_pick_default_pair(rdma_all_devices))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
|
warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
|
||||||
return "mlx5_roce0,mlx5_roce4"
|
return ",".join(_pick_default_pair(rdma_all_devices))
|
||||||
|
|
||||||
# 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
|
# 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
|
||||||
base_rdma_group = min(gpu_indices) // 4 * 4
|
base_rdma_group = (min(gpu_indices) // 4) * 4
|
||||||
|
for gpu_idx in gpu_indices:
|
||||||
|
if not (base_rdma_group <= gpu_idx < base_rdma_group + 4):
|
||||||
|
warnings.warn(
|
||||||
|
f"GPU index {gpu_idx} is outside expected group "
|
||||||
|
f"{base_rdma_group}-{base_rdma_group+3}"
|
||||||
|
)
|
||||||
|
|
||||||
# 3. Generate RDMA device names
|
# 3. Generate RDMA device names
|
||||||
rdma_devices = []
|
rdma_devices = []
|
||||||
for gpu_idx in gpu_indices:
|
for gpu_idx in gpu_indices:
|
||||||
# Validate GPU index within expected range
|
nic_index = gpu_idx // (8 // n_rdma)
|
||||||
if gpu_idx < base_rdma_group or gpu_idx >= base_rdma_group + 4:
|
rdma_devices.append(rdma_all_devices[nic_index])
|
||||||
warnings.warn(
|
|
||||||
f"GPU index {gpu_idx} is outside expected group {base_rdma_group}-{base_rdma_group+3}"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Map GPU index to RDMA device index
|
|
||||||
rdma_index = base_rdma_group // 4 * 4 + (gpu_idx % 4)
|
|
||||||
rdma_devices.append(f"mlx5_roce{rdma_index}")
|
|
||||||
|
|
||||||
if not rdma_devices:
|
if not rdma_devices:
|
||||||
return "mlx5_roce0,mlx5_roce4"
|
return ",".join(_pick_default_pair(rdma_all_devices))
|
||||||
|
|
||||||
return ",".join(rdma_devices)
|
return ",".join(rdma_devices)
|
||||||
|
|||||||
@@ -163,9 +163,7 @@ suites = {
|
|||||||
TestFile("lora/test_lora_llama4.py", 400),
|
TestFile("lora/test_lora_llama4.py", 400),
|
||||||
TestFile("test_deepseek_v3_basic.py", 275),
|
TestFile("test_deepseek_v3_basic.py", 275),
|
||||||
TestFile("test_deepseek_v3_mtp.py", 275),
|
TestFile("test_deepseek_v3_mtp.py", 275),
|
||||||
TestFile("test_disaggregation_different_tp.py", 600),
|
|
||||||
TestFile("test_disaggregation_hybrid_attention.py", 200),
|
TestFile("test_disaggregation_hybrid_attention.py", 200),
|
||||||
TestFile("test_disaggregation_pp.py", 140),
|
|
||||||
],
|
],
|
||||||
"per-commit-4-gpu-b200": [
|
"per-commit-4-gpu-b200": [
|
||||||
# TestFile("test_gpt_oss_4gpu.py", 600),
|
# TestFile("test_gpt_oss_4gpu.py", 600),
|
||||||
@@ -182,6 +180,8 @@ suites = {
|
|||||||
TestFile("test_deepseek_v32_basic.py", 275),
|
TestFile("test_deepseek_v32_basic.py", 275),
|
||||||
],
|
],
|
||||||
"per-commit-8-gpu-h20": [
|
"per-commit-8-gpu-h20": [
|
||||||
|
TestFile("test_disaggregation_different_tp.py", 600),
|
||||||
|
TestFile("test_disaggregation_pp.py", 140),
|
||||||
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
|
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
|
||||||
],
|
],
|
||||||
"vllm_dependency_test": [
|
"vllm_dependency_test": [
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from sglang.test.test_utils import (
|
|||||||
DEFAULT_MODEL_NAME_FOR_TEST_MLA,
|
DEFAULT_MODEL_NAME_FOR_TEST_MLA,
|
||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
popen_launch_pd_server,
|
popen_launch_pd_server,
|
||||||
|
try_cached_model,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -19,7 +20,7 @@ class TestDisaggregationMooncakePrefillLargerTP(TestDisaggregationBase):
|
|||||||
# Temporarily disable JIT DeepGEMM
|
# Temporarily disable JIT DeepGEMM
|
||||||
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
||||||
|
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
|
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST_MLA)
|
||||||
|
|
||||||
# Non blocking start servers
|
# Non blocking start servers
|
||||||
cls.start_prefill()
|
cls.start_prefill()
|
||||||
@@ -90,7 +91,7 @@ class TestDisaggregationMooncakeDecodeLargerTP(TestDisaggregationBase):
|
|||||||
# Temporarily disable JIT DeepGEMM
|
# Temporarily disable JIT DeepGEMM
|
||||||
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
||||||
|
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
|
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST_MLA)
|
||||||
|
|
||||||
# Non blocking start servers
|
# Non blocking start servers
|
||||||
cls.start_prefill()
|
cls.start_prefill()
|
||||||
@@ -161,7 +162,7 @@ class TestDisaggregationMooncakeMHAPrefillLargerTP(TestDisaggregationBase):
|
|||||||
# Temporarily disable JIT DeepGEMM
|
# Temporarily disable JIT DeepGEMM
|
||||||
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
||||||
|
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)
|
||||||
|
|
||||||
# Non blocking start servers
|
# Non blocking start servers
|
||||||
cls.start_prefill()
|
cls.start_prefill()
|
||||||
@@ -232,7 +233,7 @@ class TestDisaggregationMooncakeMHADecodeLargerTP(TestDisaggregationBase):
|
|||||||
# Temporarily disable JIT DeepGEMM
|
# Temporarily disable JIT DeepGEMM
|
||||||
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
|
||||||
|
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)
|
||||||
|
|
||||||
# Non blocking start servers
|
# Non blocking start servers
|
||||||
cls.start_prefill()
|
cls.start_prefill()
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from sglang.test.test_utils import (
|
|||||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
popen_launch_pd_server,
|
popen_launch_pd_server,
|
||||||
|
try_cached_model,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -15,7 +16,7 @@ class TestDisaggregationPPAccuracy(TestDisaggregationBase):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
super().setUpClass()
|
super().setUpClass()
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)
|
||||||
|
|
||||||
# Non blocking start servers
|
# Non blocking start servers
|
||||||
cls.start_prefill()
|
cls.start_prefill()
|
||||||
|
|||||||
Reference in New Issue
Block a user