Fix test and benchmark scripts (#2598)

This commit is contained in:
Lianmin Zheng
2024-12-26 07:56:26 -08:00
committed by GitHub
parent a74d194146
commit dc3bee4815
9 changed files with 27 additions and 21 deletions

View File

@@ -1,4 +1,4 @@
name: Nightly Evaluation name: Nightly Test
on: on:
schedule: schedule:
@@ -11,11 +11,11 @@ on:
workflow_dispatch: workflow_dispatch:
concurrency: concurrency:
group: nightly-eval-${{ github.ref }} group: nightly-test-${{ github.ref }}
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
nightly-eval-2-gpu: nightly-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 2-gpu-runner runs-on: 2-gpu-runner
steps: steps:
@@ -27,14 +27,8 @@ jobs:
bash scripts/ci_install_dependency.sh bash scripts/ci_install_dependency.sh
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus" pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
- name: Test gsm8k - name: Run test
timeout-minutes: 120 timeout-minutes: 10
run: | run: |
cd test/srt cd test/lang
python3 test_nightly_gsm8k_eval.py python3 run_suite.py --suite nightly --timeout-per-file 2400
- name: Test human eval
timeout-minutes: 120
run: |
cd test/srt
python3 test_nightly_human_eval.py

View File

@@ -45,7 +45,7 @@ jobs:
timeout-minutes: 10 timeout-minutes: 10
run: | run: |
cd test/lang cd test/lang
python3 run_suite.py --suite minimal python3 run_suite.py --suite per-commit
unit-test-backend-1-gpu: unit-test-backend-1-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -70,7 +70,7 @@ jobs:
RANGE=${{ matrix.range }} RANGE=${{ matrix.range }}
range_begin=${RANGE%-*} range_begin=${RANGE%-*}
range_end=${RANGE#*-} range_end=${RANGE#*-}
python3 run_suite.py --suite minimal --range-begin ${range_begin} --range-end ${range_end} python3 run_suite.py --suite per-commit --range-begin ${range_begin} --range-end ${range_end}
unit-test-backend-2-gpu: unit-test-backend-2-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'

View File

@@ -56,6 +56,8 @@ with nvtx.annotate("description", color="color"):
## Other tips ## Other tips
1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder. 1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
2. You can benchmark a model with modified configs (e.g., fewer layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 1 layer and 1 kv head using `python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'`.
## Profile with PyTorch Profiler ## Profile with PyTorch Profiler
- To profile a server - To profile a server

View File

@@ -897,6 +897,7 @@ async def benchmark(
else: else:
raise ValueError(f"Unknown backend: {backend}") raise ValueError(f"Unknown backend: {backend}")
# Limit concurrency
# From https://github.com/vllm-project/vllm/pull/9390 # From https://github.com/vllm-project/vllm/pull/9390
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
@@ -906,6 +907,7 @@ async def benchmark(
async with semaphore: async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar) return await request_func(request_func_input=request_func_input, pbar=pbar)
# Warmup
print("Starting initial single prompt test run...") print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len = input_requests[0] test_prompt, test_prompt_len, test_output_len = input_requests[0]
test_input = RequestFuncInput( test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
f"are correctly specified. Error: {test_output.error}" f"are correctly specified. Error: {test_output.error}"
) )
else: else:
requests.post(base_url + "/flush_cache")
print("Initial test run completed. Starting main benchmark run...") print("Initial test run completed. Starting main benchmark run...")
time.sleep(1.5) # Flush cache
if "sglang" in backend:
requests.post(base_url + "/flush_cache")
time.sleep(1.0)
# Start profiler
if profile: if profile:
print("Starting profiler...") print("Starting profiler...")
profile_output = await async_request_profile( profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests)) pbar = None if disable_tqdm else tqdm(total=len(input_requests))
# Run all requests
benchmark_start_time = time.perf_counter() benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = [] tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate): async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
) )
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
# Stop profiler
if profile: if profile:
print("Stopping profiler...") print("Stopping profiler...")
profile_output = await async_request_profile(api_url=base_url + "/stop_profile") profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
if pbar is not None: if pbar is not None:
pbar.close() pbar.close()
# Compute metrics and print results
benchmark_duration = time.perf_counter() - benchmark_start_time benchmark_duration = time.perf_counter() - benchmark_start_time
metrics, output_lens = calculate_metrics( metrics, output_lens = calculate_metrics(
input_requests=input_requests, input_requests=input_requests,
outputs=outputs, outputs=outputs,

View File

@@ -8,3 +8,5 @@ the JSON file contains a mapping from M (batch size) to the chosen configuration
The example configurations provided are for the Mixtral model for TP2 on H100 The example configurations provided are for the Mixtral model for TP2 on H100
and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
N = 7168 and for TP4 we have N = 3584. N = 7168 and for TP4 we have N = 3584.
See `benchmark/kernels/fused_moe_triton/README.md` for how to generate these config files.

View File

@@ -4,7 +4,7 @@ import glob
from sglang.test.test_utils import run_unittest_files from sglang.test.test_utils import run_unittest_files
suites = { suites = {
"minimal": ["test_srt_backend.py", "test_openai_backend.py"], "per-commit": ["test_srt_backend.py", "test_openai_backend.py"],
} }

View File

@@ -4,7 +4,7 @@ import glob
from sglang.test.test_utils import run_unittest_files from sglang.test.test_utils import run_unittest_files
suites = { suites = {
"minimal": [ "per-commit": [
"models/test_embedding_models.py", "models/test_embedding_models.py",
"models/test_generation_models.py", "models/test_generation_models.py",
"models/test_lora.py", "models/test_lora.py",

View File

@@ -30,7 +30,7 @@ class TestTritonAttnBackend(unittest.TestCase):
) )
if is_in_ci(): if is_in_ci():
assert output_throughput > 153, f"{output_throughput=}" self.assertGreater(output_throughput, 153)
def test_mmlu(self): def test_mmlu(self):
model = DEFAULT_MODEL_NAME_FOR_TEST model = DEFAULT_MODEL_NAME_FOR_TEST