[Generative Scores API] add performance tests to CICD (#10830)

.github/workflows/pr-test.yml (+33 lines)
@@ -460,6 +460,39 @@ jobs:
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency

  performance-test-1-gpu-part-3:
    needs: [check-changes, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

      - name: Benchmark Scores online latency and throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput

      - name: Benchmark Scores online latency and throughput (batch size scaling)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling

  performance-test-2-gpu:
    needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
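The two new benchmark steps drive SGLang's /v1/score endpoint. For reference, a request with the same shape as those generated by the benchmark code below can be sent by hand; a minimal sketch, assuming a server is already running at a local address (the address is an illustration, not part of this diff; the field names and the Yes/No label token IDs [9454, 2753] come from the benchmark code added in this commit):

    import requests

    # Minimal /v1/score request mirroring the benchmark payloads below.
    # The server address is an assumption for illustration only.
    payload = {
        "query": "Is this passage relevant to the query?",
        "items": ["candidate passage one", "candidate passage two"],
        "label_token_ids": [9454, 2753],  # Yes/No token IDs used by the tests
        "model": "Qwen/Qwen3-Reranker-0.6B",
        "apply_softmax": True,
    }
    response = requests.post("http://127.0.0.1:30000/v1/score", json=payload)
    print(response.json())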
@@ -43,6 +43,7 @@ from sglang.utils import get_exception_traceback
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
@@ -873,6 +874,154 @@ def run_bench_serving(
    return res


def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Score API benchmark function compatible with the run_bench_serving pattern."""
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"

        def generate_text_with_token_count(num_tokens):
            """Generate text with a precise token count by repeating a single token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # Rescale the repetition count if the special token does not
                # encode to exactly one token.
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        if need_warmup:
            warmup_data = {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens) for _ in range(3)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

            async with aiohttp.ClientSession() as session:
                try:
                    await session.post(
                        f"{base_url}/v1/score",
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    )
                except Exception:
                    pass  # Ignore warmup errors

        test_requests = []
        for _ in range(num_requests):
            query = generate_text_with_token_count(score_query_tokens)
            items = [
                generate_text_with_token_count(score_item_tokens)
                for _ in range(batch_size)
            ]

            score_data = {
                "query": query,
                "items": items,
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }
            test_requests.append(score_data)

        start_time = time.monotonic()
        successful_requests = 0
        total_latency = 0
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        f"{base_url}/v1/score",
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()

                            if "scores" in response_data or "logprobs" in response_data:
                                latency_ms = (request_end - request_start) * 1000
                                latencies.append(latency_ms)
                                total_latency += latency_ms
                                successful_requests += 1
                except Exception:
                    continue

        end_time = time.monotonic()
        total_time = end_time - start_time

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = total_latency / successful_requests
            latencies.sort()
            p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0

            return {
                "completed": successful_requests,
                "total_requests": num_requests,
                "throughput": throughput,
                "avg_latency_ms": avg_latency,
                "p95_latency_ms": p95_latency,
                "successful_requests": successful_requests,
            }
        else:
            return {
                "completed": 0,
                "total_requests": num_requests,
                "throughput": 0,
                "avg_latency_ms": 0,
                "p95_latency_ms": 0,
                "successful_requests": 0,
            }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
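A minimal sketch of driving the new helper directly, e.g. for a quick local check before pushing (assumes a GPU machine with the SGLang test dependencies installed; the reduced request count is illustrative, not the CI setting):

    from sglang.test.test_utils import (
        DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
        run_score_benchmark,
    )

    # Smaller than the CI runs (1000/500 requests) so it finishes quickly.
    res = run_score_benchmark(
        model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
        num_requests=50,
        batch_size=10,
        need_warmup=True,
    )
    print(
        f"{res['successful_requests']}/{res['total_requests']} ok, "
        f"{res['throughput']:.2f} req/s, avg {res['avg_latency_ms']:.2f} ms, "
        f"p95 {res['p95_latency_ms']:.2f} ms"
    )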


def run_bench_serving_multi(
    model,
    base_url,
@@ -4,17 +4,20 @@ import unittest

import requests

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
    CustomTestCase,
    is_in_amd_ci,
    is_in_ci,
    run_bench_serving,
    run_score_benchmark,
    write_github_step_summary,
)
@@ -440,6 +443,71 @@ class TestBenchServing(CustomTestCase):
        )
        self.assertGreater(res["input_throughput"], 4000)

    def test_score_api_latency_throughput(self):
        """Test score API latency and throughput performance."""
        res = run_score_benchmark(
            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
            num_requests=1000,
            batch_size=10,
            other_server_args=[],
            need_warmup=True,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_score_api_latency_throughput\n"
                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                f"Score API throughput: {res['throughput']:.2f} req/s\n"
                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
            )

        self.assertEqual(res["successful_requests"], res["total_requests"])
        self.assertLess(res["avg_latency_ms"], 48)
        self.assertLess(res["p95_latency_ms"], 50)
        self.assertGreater(res["throughput"], 20)

    def test_score_api_batch_scaling(self):
        """Test score API performance with different batch sizes."""
        batch_sizes = [10, 25, 50]

        for batch_size in batch_sizes:
            res = run_score_benchmark(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
                num_requests=500,
                batch_size=batch_size,
            )

            if is_in_ci():
                write_github_step_summary(
                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
                    f"Batch size: {batch_size}\n"
                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                    f"Throughput: {res['throughput']:.2f} req/s\n"
                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
                )

            self.assertEqual(res["successful_requests"], res["total_requests"])
            # Latency bounds loosen as the per-request batch size grows.
            avg_latency_bound = {10: 45, 25: 50, 50: 60}.get(batch_size, 60)
            self.assertLess(res["avg_latency_ms"], avg_latency_bound)
            p95_latency_bound = {10: 50, 25: 60, 50: 65}.get(batch_size, 65)
            self.assertLess(res["p95_latency_ms"], p95_latency_bound)


if __name__ == "__main__":
    unittest.main()
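For scale: the latency test issues 1000 sequential requests and asserts throughput above 20 req/s, so a passing run finishes in roughly 50 seconds, comfortably inside the step's 10-minute timeout. A back-of-envelope sketch using the numbers from the tests above:

    # Worst case consistent with the assertions in test_score_api_latency_throughput:
    num_requests = 1000
    min_throughput = 20  # req/s, the asserted lower bound
    worst_case_seconds = num_requests / min_throughput  # 50.0
    assert worst_case_seconds < 10 * 60  # timeout-minutes: 10 in the workflow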