Fix test and benchmark scripts (#2598)
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
name: Nightly Evaluation
|
name: Nightly Test
|
||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
@@ -11,11 +11,11 @@ on:
|
|||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: nightly-eval-${{ github.ref }}
|
group: nightly-test-${{ github.ref }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
nightly-eval-2-gpu:
|
nightly-test:
|
||||||
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||||
runs-on: 2-gpu-runner
|
runs-on: 2-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
@@ -27,14 +27,8 @@ jobs:
|
|||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
|
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
|
||||||
|
|
||||||
- name: Test gsm8k
|
- name: Run test
|
||||||
timeout-minutes: 120
|
timeout-minutes: 10
|
||||||
run: |
|
run: |
|
||||||
cd test/srt
|
cd test/lang
|
||||||
python3 test_nightly_gsm8k_eval.py
|
python3 run_suite.py --suite nightly --timeout-per-file 2400
|
||||||
|
|
||||||
- name: Test human eval
|
|
||||||
timeout-minutes: 120
|
|
||||||
run: |
|
|
||||||
cd test/srt
|
|
||||||
python3 test_nightly_human_eval.py
|
|
||||||
4
.github/workflows/pr-test.yml
vendored
4
.github/workflows/pr-test.yml
vendored
@@ -45,7 +45,7 @@ jobs:
|
|||||||
timeout-minutes: 10
|
timeout-minutes: 10
|
||||||
run: |
|
run: |
|
||||||
cd test/lang
|
cd test/lang
|
||||||
python3 run_suite.py --suite minimal
|
python3 run_suite.py --suite per-commit
|
||||||
|
|
||||||
unit-test-backend-1-gpu:
|
unit-test-backend-1-gpu:
|
||||||
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||||
@@ -70,7 +70,7 @@ jobs:
|
|||||||
RANGE=${{ matrix.range }}
|
RANGE=${{ matrix.range }}
|
||||||
range_begin=${RANGE%-*}
|
range_begin=${RANGE%-*}
|
||||||
range_end=${RANGE#*-}
|
range_end=${RANGE#*-}
|
||||||
python3 run_suite.py --suite minimal --range-begin ${range_begin} --range-end ${range_end}
|
python3 run_suite.py --suite per-commit --range-begin ${range_begin} --range-end ${range_end}
|
||||||
|
|
||||||
unit-test-backend-2-gpu:
|
unit-test-backend-2-gpu:
|
||||||
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||||
|
|||||||
@@ -56,6 +56,8 @@ with nvtx.annotate("description", color="color"):
|
|||||||
## Other tips
|
## Other tips
|
||||||
|
|
||||||
1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
|
1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
|
||||||
|
2. You can benchmark a model with modified configs (e.g., fewer layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 1 layer and 1 kv head using `python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'`
|
||||||
|
|
||||||
|
|
||||||
## Profile with PyTorch Profiler
|
## Profile with PyTorch Profiler
|
||||||
- To profile a server
|
- To profile a server
|
||||||
|
|||||||
@@ -897,6 +897,7 @@ async def benchmark(
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown backend: {backend}")
|
raise ValueError(f"Unknown backend: {backend}")
|
||||||
|
|
||||||
|
# Limit concurrency
|
||||||
# From https://github.com/vllm-project/vllm/pull/9390
|
# From https://github.com/vllm-project/vllm/pull/9390
|
||||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||||
|
|
||||||
@@ -906,6 +907,7 @@ async def benchmark(
|
|||||||
async with semaphore:
|
async with semaphore:
|
||||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
|
|
||||||
|
# Warmup
|
||||||
print("Starting initial single prompt test run...")
|
print("Starting initial single prompt test run...")
|
||||||
test_prompt, test_prompt_len, test_output_len = input_requests[0]
|
test_prompt, test_prompt_len, test_output_len = input_requests[0]
|
||||||
test_input = RequestFuncInput(
|
test_input = RequestFuncInput(
|
||||||
@@ -924,11 +926,15 @@ async def benchmark(
|
|||||||
f"are correctly specified. Error: {test_output.error}"
|
f"are correctly specified. Error: {test_output.error}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
requests.post(base_url + "/flush_cache")
|
|
||||||
print("Initial test run completed. Starting main benchmark run...")
|
print("Initial test run completed. Starting main benchmark run...")
|
||||||
|
|
||||||
time.sleep(1.5)
|
# Flush cache
|
||||||
|
if "sglang" in backend:
|
||||||
|
requests.post(base_url + "/flush_cache")
|
||||||
|
|
||||||
|
time.sleep(1.0)
|
||||||
|
|
||||||
|
# Start profiler
|
||||||
if profile:
|
if profile:
|
||||||
print("Starting profiler...")
|
print("Starting profiler...")
|
||||||
profile_output = await async_request_profile(
|
profile_output = await async_request_profile(
|
||||||
@@ -939,6 +945,7 @@ async def benchmark(
|
|||||||
|
|
||||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||||
|
|
||||||
|
# Run all requests
|
||||||
benchmark_start_time = time.perf_counter()
|
benchmark_start_time = time.perf_counter()
|
||||||
tasks: List[asyncio.Task] = []
|
tasks: List[asyncio.Task] = []
|
||||||
async for request in get_request(input_requests, request_rate):
|
async for request in get_request(input_requests, request_rate):
|
||||||
@@ -959,6 +966,7 @@ async def benchmark(
|
|||||||
)
|
)
|
||||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
# Stop profiler
|
||||||
if profile:
|
if profile:
|
||||||
print("Stopping profiler...")
|
print("Stopping profiler...")
|
||||||
profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
|
profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
|
||||||
@@ -968,8 +976,8 @@ async def benchmark(
|
|||||||
if pbar is not None:
|
if pbar is not None:
|
||||||
pbar.close()
|
pbar.close()
|
||||||
|
|
||||||
|
# Compute metrics and print results
|
||||||
benchmark_duration = time.perf_counter() - benchmark_start_time
|
benchmark_duration = time.perf_counter() - benchmark_start_time
|
||||||
|
|
||||||
metrics, output_lens = calculate_metrics(
|
metrics, output_lens = calculate_metrics(
|
||||||
input_requests=input_requests,
|
input_requests=input_requests,
|
||||||
outputs=outputs,
|
outputs=outputs,
|
||||||
|
|||||||
@@ -8,3 +8,5 @@ the JSON file contains a mapping from M (batch size) to the chosen configuration
|
|||||||
The example configurations provided are for the Mixtral model for TP2 on H100
|
The example configurations provided are for the Mixtral model for TP2 on H100
|
||||||
and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
|
and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
|
||||||
N = 7168 and for TP4 we have N = 3584.
|
N = 7168 and for TP4 we have N = 3584.
|
||||||
|
|
||||||
|
See `benchmark/kernels/fused_moe_triton/README.md` for instructions on how to generate these config files.
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import glob
|
|||||||
from sglang.test.test_utils import run_unittest_files
|
from sglang.test.test_utils import run_unittest_files
|
||||||
|
|
||||||
suites = {
|
suites = {
|
||||||
"minimal": ["test_srt_backend.py", "test_openai_backend.py"],
|
"per-commit": ["test_srt_backend.py", "test_openai_backend.py"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import glob
|
|||||||
from sglang.test.test_utils import run_unittest_files
|
from sglang.test.test_utils import run_unittest_files
|
||||||
|
|
||||||
suites = {
|
suites = {
|
||||||
"minimal": [
|
"per-commit": [
|
||||||
"models/test_embedding_models.py",
|
"models/test_embedding_models.py",
|
||||||
"models/test_generation_models.py",
|
"models/test_generation_models.py",
|
||||||
"models/test_lora.py",
|
"models/test_lora.py",
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class TestTritonAttnBackend(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if is_in_ci():
|
if is_in_ci():
|
||||||
assert output_throughput > 153, f"{output_throughput=}"
|
self.assertGreater(output_throughput, 153)
|
||||||
|
|
||||||
def test_mmlu(self):
|
def test_mmlu(self):
|
||||||
model = DEFAULT_MODEL_NAME_FOR_TEST
|
model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||||
|
|||||||
Reference in New Issue
Block a user