diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-test.yml
similarity index 62%
rename from .github/workflows/nightly-eval.yml
rename to .github/workflows/nightly-test.yml
index 7b77c63a5..04a109f23 100644
--- a/.github/workflows/nightly-eval.yml
+++ b/.github/workflows/nightly-test.yml
@@ -1,4 +1,4 @@
-name: Nightly Evaluation
+name: Nightly Test
 
 on:
   schedule:
@@ -11,11 +11,11 @@ on:
   workflow_dispatch:
 
 concurrency:
-  group: nightly-eval-${{ github.ref }}
+  group: nightly-test-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
-  nightly-eval-2-gpu:
+  nightly-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 2-gpu-runner
     steps:
@@ -27,14 +27,8 @@ jobs:
           bash scripts/ci_install_dependency.sh
           pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
 
-      - name: Test gsm8k
-        timeout-minutes: 120
+      - name: Run test
+        timeout-minutes: 10
         run: |
-          cd test/srt
-          python3 test_nightly_gsm8k_eval.py
-
-      - name: Test human eval
-        timeout-minutes: 120
-        run: |
-          cd test/srt
-          python3 test_nightly_human_eval.py
+          cd test/lang
+          python3 run_suite.py --suite nightly --timeout-per-file 2400
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index dd872c768..36b53baa3 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -45,7 +45,7 @@ jobs:
         timeout-minutes: 10
         run: |
           cd test/lang
-          python3 run_suite.py --suite minimal
+          python3 run_suite.py --suite per-commit
 
   unit-test-backend-1-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -70,7 +70,7 @@ jobs:
           RANGE=${{ matrix.range }}
           range_begin=${RANGE%-*}
           range_end=${RANGE#*-}
-          python3 run_suite.py --suite minimal --range-begin ${range_begin} --range-end ${range_end}
+          python3 run_suite.py --suite per-commit --range-begin ${range_begin} --range-end ${range_end}
 
   unit-test-backend-2-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
diff --git a/docs/references/benchmark_and_profiling.md b/docs/references/benchmark_and_profiling.md
index 329dad336..87ac51774 100644
--- a/docs/references/benchmark_and_profiling.md
+++ b/docs/references/benchmark_and_profiling.md
@@ -56,6 +56,8 @@ with nvtx.annotate("description", color="color"):
 ## Other tips
 
 1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
+2. You can benchmark a model with modified configs (e.g., fewer layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 1 layer and 1 kv head using `python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'`
+
 
 ## Profile with PyTorch Profiler
 - To profile a server
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index ac2474549..6067a7444 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -897,6 +897,7 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # Limit concurrency
     # From https://github.com/vllm-project/vllm/pull/9390
     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
 
@@ -906,6 +907,7 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
+    # Warmup
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
             f"are correctly specified. Error: {test_output.error}"
         )
     else:
-        requests.post(base_url + "/flush_cache")
         print("Initial test run completed. Starting main benchmark run...")
 
-    time.sleep(1.5)
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)
 
+    # Start profiler
     if profile:
         print("Starting profiler...")
         profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
+    # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    # Stop profiler
     if profile:
         print("Stopping profiler...")
         profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
     if pbar is not None:
         pbar.close()
 
+    # Compute metrics and print results
     benchmark_duration = time.perf_counter() - benchmark_start_time
-
     metrics, output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/README b/python/sglang/srt/layers/moe/fused_moe_triton/configs/README
index 45d40cbfb..4aa527f27 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/README
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/README
@@ -8,3 +8,5 @@ the JSON file contains a mapping from M (batch size) to the chosen configuration
 The example configurations provided are for the Mixtral model for TP2 on H100
 and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
 N = 7168 and for TP4 we have N = 3584.
+
+See `benchmark/kernels/fused_moe_triton/README.md` for instructions on how to generate these config files.
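Note: the `# Limit concurrency` comment added to `bench_serving.py` above describes the semaphore-gating pattern already present in that function. The following is a minimal, self-contained sketch of that pattern; `fake_request`, `limited_request`, and `run_benchmark` are hypothetical stand-ins for `request_func` and `benchmark`, not part of sglang.

```python
import asyncio
import random
from typing import Optional


async def fake_request(i: int) -> int:
    # Hypothetical stand-in for request_func(request_func_input=..., pbar=...).
    await asyncio.sleep(random.uniform(0.01, 0.05))
    return i


async def run_benchmark(n_requests: int = 100, max_concurrency: Optional[int] = 8) -> None:
    # Same shape as bench_serving.py: the semaphore is optional, and each request
    # acquires it before running, so at most max_concurrency requests are in
    # flight at any time.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited_request(i: int) -> int:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:
            return await fake_request(i)

    results = await asyncio.gather(*(limited_request(i) for i in range(n_requests)))
    print(f"Completed {len(results)} requests")


if __name__ == "__main__":
    asyncio.run(run_benchmark())
```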
diff --git a/sgl-kernel/tests/.gitkeep b/sgl-kernel/tests/.gitkeep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/test/lang/run_suite.py b/test/lang/run_suite.py
index 379427afa..ebc26e608 100644
--- a/test/lang/run_suite.py
+++ b/test/lang/run_suite.py
@@ -4,7 +4,7 @@ import glob
 from sglang.test.test_utils import run_unittest_files
 
 suites = {
-    "minimal": ["test_srt_backend.py", "test_openai_backend.py"],
+    "per-commit": ["test_srt_backend.py", "test_openai_backend.py"],
 }
 
 
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 137507656..a0ca5fabb 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -4,7 +4,7 @@ import glob
 from sglang.test.test_utils import run_unittest_files
 
 suites = {
-    "minimal": [
+    "per-commit": [
         "models/test_embedding_models.py",
         "models/test_generation_models.py",
         "models/test_lora.py",
diff --git a/test/srt/test_triton_attention_backend.py b/test/srt/test_triton_attention_backend.py
index 905590965..88904c55f 100644
--- a/test/srt/test_triton_attention_backend.py
+++ b/test/srt/test_triton_attention_backend.py
@@ -30,7 +30,7 @@ class TestTritonAttnBackend(unittest.TestCase):
         )
 
         if is_in_ci():
-            assert output_throughput > 153, f"{output_throughput=}"
+            self.assertGreater(output_throughput, 153)
 
     def test_mmlu(self):
         model = DEFAULT_MODEL_NAME_FOR_TEST
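For reference, the switch from a bare `assert` to `self.assertGreater` in `test_triton_attention_backend.py` follows the standard `unittest` idiom: `assertGreater` reports both operands on failure without a hand-written message, and unlike a bare `assert` it is not stripped when Python runs with `-O`. A minimal sketch of the idiom follows; the class name and the hard-coded throughput value are illustrative, not taken from the sglang test.

```python
import unittest


class ThroughputCheck(unittest.TestCase):
    def test_throughput(self):
        # Illustrative measured value; the real test obtains this from
        # run_bench_one_batch(...).
        output_throughput = 160.0
        # On failure this reports e.g. "AssertionError: 150.0 not greater
        # than 153", so no f-string message is needed.
        self.assertGreater(output_throughput, 153)


if __name__ == "__main__":
    unittest.main()
```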