diff --git a/.github/workflows/experiment-runner.yml b/.github/workflows/experiment-runner.yml deleted file mode 100644 index 487ed9ba3..000000000 --- a/.github/workflows/experiment-runner.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Experiment Runner - -on: - workflow_dispatch: - inputs: - script: - description: "Experiment Runner Script" - default: "configs/sharegpt_config.yaml" - -concurrency: - group: experiment-runner-${{ github.ref }} - cancel-in-progress: true - -jobs: - experiment-runner-1-gpu: - if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: 1-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci/ci_install_dependency.sh - - - name: Test experiment runner - timeout-minutes: 120 - run: | - cd test/srt - python3 experiment_runner.py --config ${{ inputs.script }} diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index bd434c5ed..487b0719d 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -615,7 +615,7 @@ jobs: CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh - name: Run test - timeout-minutes: 60 + timeout-minutes: 45 run: | cd test/srt python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600 diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index 57bfb06ca..78fafc60b 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -41,7 +41,7 @@ jobs: make compile - name: Push HTML to sgl-project.github.io - timeout-minutes: 60 + timeout-minutes: 30 env: GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_DOCUMENTATION }} run: | diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml index 6c4755009..64fdc5cb2 100644 --- 
a/.github/workflows/vllm-dependency-test.yml +++ b/.github/workflows/vllm-dependency-test.yml @@ -39,4 +39,4 @@ jobs: timeout-minutes: 30 run: | cd test/srt - python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 600 + python3 run_suite.py --suite vllm_dependency_test diff --git a/docs/developer_guide/contribution_guide.md b/docs/developer_guide/contribution_guide.md index 291dc89d3..479713446 100644 --- a/docs/developer_guide/contribution_guide.md +++ b/docs/developer_guide/contribution_guide.md @@ -78,10 +78,14 @@ To trigger CI, the pull request must have the "run-ci" label. ## General code style - Avoid code duplication. If the same code snippet (more than five lines) appears multiple times, extract it into a shared function. - Minimize device synchronization. Reduce expensive CPU-GPU synchronization operations, such as `tensor.item()` or `tensor.cpu()`, whenever possible. Use vectorized code. -- Keep files concise. If a file exceeds 2,000 lines of code, split it into multiple smaller files. - Prioritize extreme efficiency. SGLang is a runtime, and most of your code runs on the critical path for every request. Optimize all minor overheads as much as possible, especially in the model forward code. - A common pattern is some runtime checks in the model forward pass (e.g., [this](https://github.com/sgl-project/sglang/blob/f1b0eda55c2c4838e8ab90a0fac7fb1e3d7064ab/python/sglang/srt/models/deepseek_v2.py#L486-L491)). These are very likely the same for every layer. Please cache the result as a single boolean value whenever possible. -- Strive to make functions as pure as possible. Avoid in-place modification of arguments. +- Make functions as pure as possible. Avoid in-place modification of arguments. +- Keep files concise. If a file exceeds 2,000 lines of code, split it into multiple smaller files (e.g., `scheduler.py`, `scheduler_output_processor_mixin.py`). +- Keep tests running fast.
+ - If a single test file runs longer than 500 seconds, split it into multiple smaller files (e.g., `test_eagle_infer_a.py`, `test_eagle_infer_b.py`). + - If a single job in a GitHub workflow runs longer than 30 minutes, split it into smaller jobs/steps. + - Reuse server launches in your unit tests to make tests run faster. - When supporting new hardware or features, follow these guidelines: - Do not drastically change existing code. - Always prefer new files to introduce specific components for your new hardware (e.g., `allocator_ascend.py`). diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index a54330dfc..5dd588649 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -154,7 +154,7 @@ suites = { ], "per-commit-4-gpu-b200": [ # TestFile("test_gpt_oss_4gpu.py", 600), - TestFile("test_deepseek_v3_fp4_4gpu.py", 3600), + # TestFile("test_deepseek_v3_fp4_4gpu.py", 3600), ], "per-commit-4-gpu-deepep": [ TestFile("ep/test_deepep_small.py", 531), @@ -369,7 +369,7 @@ if __name__ == "__main__": arg_parser.add_argument( "--timeout-per-file", type=int, - default=1800, + default=1200, help="The time limit for running one file in seconds.", ) arg_parser.add_argument( diff --git a/test/srt/test_lookahead_speculative_decoding.py b/test/srt/test_lookahead_speculative_decoding.py index b0e7da529..1cf3e2101 100644 --- a/test/srt/test_lookahead_speculative_decoding.py +++ b/test/srt/test_lookahead_speculative_decoding.py @@ -35,7 +35,7 @@ class TestStandaloneSpeculativeDecodingBase(CustomTestCase): model = DEFAULT_LOOKAHEAD_SPECULATIVE_TARGET_MODEL_FOR_TEST base_url = DEFAULT_URL_FOR_TEST - accuracy_threshold = 0.8 # derived tests need to override this + accuracy_threshold = 0.79 # derived tests need to override this spec_decode_threshold = 1.8 # derived spec decoding tests need to override this @classmethod