diff --git a/.github/workflows/pr-test-h20.yml b/.github/workflows/pr-test-h20.yml deleted file mode 100644 index f91b22108..000000000 --- a/.github/workflows/pr-test-h20.yml +++ /dev/null @@ -1,106 +0,0 @@ -name: PR Test (H20) - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - types: [synchronize, labeled] - workflow_dispatch: - inputs: - version: - required: true - type: choice - default: 'release' - options: - - 'release' - - 'nightly' - -concurrency: - group: pr-test-h20-${{ github.ref }} - cancel-in-progress: true - -jobs: - check-changes: - runs-on: ubuntu-latest - outputs: - h20_files: ${{ steps.filter.outputs.h20_files }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Fail if the PR does not have the 'run-ci' label - if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci') - run: | - echo "This pull request does not have the 'run-ci' label. Failing the workflow." - exit 1 - - - name: Fail if the PR is a draft - if: github.event_name == 'pull_request' && github.event.pull_request.draft == true - run: | - echo "This pull request is a draft. Failing the workflow." 
- exit 1 - - - name: Detect file changes - id: filter - uses: dorny/paths-filter@v3 - with: - filters: | - h20_files: - - "python/sglang/srt/models/deepseek*" - - "python/sglang/srt/layers/moe/**" - - ".github/workflows/pr-test-h20.yml" - - "python/pyproject.toml" - - per-commit-8-gpu-h20: - needs: [check-changes] - if: needs.check-changes.outputs.h20_files == 'true' - runs-on: 8-gpu-h20 - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 20 - - run: | - cd test/srt - python3 run_suite.py --suite per-commit-8-gpu-h20 - - pr-test-h20-finish: - needs: [ - check-changes, - per-commit-8-gpu-h20, - ] - if: always() - runs-on: ubuntu-latest - steps: - - name: Check all dependent job statuses - run: | - # Convert the 'needs' context to a JSON string - json_needs='${{ toJson(needs) }}' - - # Get a list of all job names from the JSON keys - job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]') - - for job in $job_names; do - # For each job, extract its result - result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result') - - # Print the job name and its result - echo "$job: $result" - - # Check for failure or cancellation and exit if found - if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then - echo "The above jobs failed." 
- exit 1 - fi - done - - # If the loop completes, all jobs were successful - echo "All jobs completed successfully" - exit 0 diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 277f9a281..41d5d2a17 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -350,6 +350,39 @@ jobs: cd test/srt python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 + unit-test-backend-8-gpu-h20: + needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels] + if: always() && !failure() && !cancelled() && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + runs-on: 8-gpu-h20 + env: + SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4" + strategy: + fail-fast: false + matrix: + part: [0, 1] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-python3.10-cuda12.9 + + - name: Install dependencies + run: | + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh + + - name: Run test + timeout-minutes: 20 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 + performance-test-1-gpu-part-1: needs: [check-changes, sgl-kernel-build-wheels] if: always() && !failure() && !cancelled() && diff --git a/python/sglang/test/test_disaggregation_utils.py b/python/sglang/test/test_disaggregation_utils.py index e8084f802..e4396170f 100644 --- a/python/sglang/test/test_disaggregation_utils.py +++ b/python/sglang/test/test_disaggregation_utils.py @@ -1,3 +1,4 @@ +import logging import os import time import warnings @@ -15,6 +16,8 @@ from sglang.test.test_utils import ( 
def get_rdma_devices_args() -> str:
    """Build the comma-separated RDMA device list for the visible GPUs.

    The NIC inventory is read from ``SGLANG_CI_RDMA_ALL_DEVICES`` (comma
    separated, blank entries ignored). When that variable is unset or empty
    the inventory defaults to ``mlx5_roce0`` .. ``mlx5_roce7``.

    Every index listed in ``CUDA_VISIBLE_DEVICES`` is mapped proportionally
    onto the inventory, treating valid GPU indices as 0-7. For inventories
    of 1, 2, 4 or 8 NICs this is the same as ``gpu_idx // (8 // n_nics)``,
    but it also stays in range for any other inventory size.

    Returns:
        One device name per usable GPU index, joined by commas. A fallback
        pair (first and middle inventory entries) is returned instead when
        ``CUDA_VISIBLE_DEVICES`` is unset, cannot be parsed, lists no GPU,
        lists more than four GPUs, or contains no index within 0-7.
    """
    gpus_per_node = 8

    # Resolve the NIC inventory, falling back to the historical defaults.
    raw_inventory = os.getenv("SGLANG_CI_RDMA_ALL_DEVICES") or ""
    rdma_all_devices = [
        name.strip() for name in raw_inventory.split(",") if name.strip()
    ]
    if not rdma_all_devices:
        rdma_all_devices = [f"mlx5_roce{i}" for i in range(8)]
    logging.getLogger(__name__).info(
        "Resolved rdma_all_devices=%s", rdma_all_devices
    )

    n_rdma = len(rdma_all_devices)
    default_pair = ",".join((rdma_all_devices[0], rdma_all_devices[n_rdma // 2]))

    # 1. Get visible GPU indices
    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if not cuda_visible_devices:
        warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
        return default_pair

    try:
        # Convert to list of integers (handling possible spaces and empty strings)
        gpu_indices = [
            int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
        ]
    except ValueError:
        warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
        return default_pair

    if not gpu_indices or len(gpu_indices) > 4:
        return default_pair

    # 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
    base_rdma_group = (min(gpu_indices) // 4) * 4

    # 3. Generate RDMA device names
    rdma_devices = []
    for gpu_idx in gpu_indices:
        if not (base_rdma_group <= gpu_idx < base_rdma_group + 4):
            warnings.warn(
                f"GPU index {gpu_idx} is outside expected group "
                f"{base_rdma_group}-{base_rdma_group+3}"
            )

        # An index outside 0..gpus_per_node-1 has no NIC to map to: it would
        # either run past the inventory or, if negative, wrap around to its
        # end. Skip it rather than crash or pick an arbitrary device.
        if not (0 <= gpu_idx < gpus_per_node):
            warnings.warn(
                f"GPU index {gpu_idx} is outside 0-{gpus_per_node - 1}; "
                "skipping it for RDMA device mapping"
            )
            continue

        # Proportional mapping: always < n_rdma, and never divides by zero
        # even when there are more NICs than GPUs.
        nic_index = gpu_idx * n_rdma // gpus_per_node
        rdma_devices.append(rdma_all_devices[nic_index])

    if not rdma_devices:
        return default_pair

    return ",".join(rdma_devices)
TestDisaggregationMooncakePrefillLargerTP(TestDisaggregationBase): # Temporarily disable JIT DeepGEMM envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False) - cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST_MLA) # Non blocking start servers cls.start_prefill() @@ -90,7 +91,7 @@ class TestDisaggregationMooncakeDecodeLargerTP(TestDisaggregationBase): # Temporarily disable JIT DeepGEMM envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False) - cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST_MLA) # Non blocking start servers cls.start_prefill() @@ -161,7 +162,7 @@ class TestDisaggregationMooncakeMHAPrefillLargerTP(TestDisaggregationBase): # Temporarily disable JIT DeepGEMM envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False) - cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST) # Non blocking start servers cls.start_prefill() @@ -232,7 +233,7 @@ class TestDisaggregationMooncakeMHADecodeLargerTP(TestDisaggregationBase): # Temporarily disable JIT DeepGEMM envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False) - cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST) # Non blocking start servers cls.start_prefill() diff --git a/test/srt/test_disaggregation_pp.py b/test/srt/test_disaggregation_pp.py index b20ba8898..29df8b070 100644 --- a/test/srt/test_disaggregation_pp.py +++ b/test/srt/test_disaggregation_pp.py @@ -8,6 +8,7 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, popen_launch_pd_server, + try_cached_model, ) @@ -15,7 +16,7 @@ class TestDisaggregationPPAccuracy(TestDisaggregationBase): @classmethod def setUpClass(cls): super().setUpClass() - cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST) # Non blocking start servers cls.start_prefill()