[Generative Scores API] add performance tests to CICD (#10830)

This commit is contained in:
Vedant V Jhaveri
2025-10-02 19:57:55 -07:00
committed by GitHub
parent 3c699772c9
commit 7e61737d3f
3 changed files with 250 additions and 0 deletions

View File

@@ -4,17 +4,20 @@ import unittest
import requests
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST_FP8,
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
CustomTestCase,
is_in_amd_ci,
is_in_ci,
run_bench_serving,
run_score_benchmark,
write_github_step_summary,
)
@@ -440,6 +443,71 @@ class TestBenchServing(CustomTestCase):
)
self.assertGreater(res["input_throughput"], 4000)
def test_score_api_latency_throughput(self):
    """Benchmark the score API end-to-end and enforce latency/throughput SLOs.

    Runs 1000 warmed-up requests at batch size 10 against the small score
    model, publishes the numbers to the GitHub step summary when in CI, and
    asserts that every request succeeded and that average latency, P95
    latency, and throughput stay within their bounds.
    """
    result = run_score_benchmark(
        model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
        num_requests=1000,
        batch_size=10,
        other_server_args=[],
        need_warmup=True,
    )
    if is_in_ci():
        summary = (
            f"### test_score_api_throughput\n"
            f"Average latency: {result['avg_latency_ms']:.2f} ms\n"
            f"P95 latency: {result['p95_latency_ms']:.2f} ms\n"
            f"Score API throughput: {result['throughput']:.2f} req/s\n"
            f"Successful requests: {result['successful_requests']}/{result['total_requests']}\n"
        )
        write_github_step_summary(summary)
    # All requests must succeed before latency numbers are meaningful.
    self.assertEqual(result["successful_requests"], result["total_requests"])
    self.assertLess(result["avg_latency_ms"], 48)
    self.assertLess(result["p95_latency_ms"], 50)
    self.assertGreater(result["throughput"], 20)
def test_score_api_batch_scaling(self):
    """Benchmark the score API at several batch sizes and check latency bounds.

    For each batch size, runs 500 requests against the small score model,
    publishes the numbers to the GitHub step summary when in CI, and asserts
    that all requests succeeded and that average/P95 latency stay within the
    per-batch-size bounds (larger batches are allowed higher latency).

    Improvements over the original: the two parallel if/elif chains are
    collapsed into bound tables, and each batch size runs inside
    ``self.subTest`` so a failure names the offending batch size and the
    remaining sizes still execute.
    """
    # Latency bounds in ms, keyed by batch size; .get() defaults preserve
    # the original else-branch values for any batch size added later.
    avg_latency_bounds_ms = {10: 45, 25: 50, 50: 60}
    p95_latency_bounds_ms = {10: 50, 25: 60, 50: 65}
    batch_sizes = [10, 25, 50]
    for batch_size in batch_sizes:
        with self.subTest(batch_size=batch_size):
            res = run_score_benchmark(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
                num_requests=500,
                batch_size=batch_size,
            )
            if is_in_ci():
                write_github_step_summary(
                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
                    f"Batch size: {batch_size}\n"
                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                    f"Throughput: {res['throughput']:.2f} req/s\n"
                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
                )
            # All requests must succeed before latency numbers are meaningful.
            self.assertEqual(res["successful_requests"], res["total_requests"])
            self.assertLess(
                res["avg_latency_ms"], avg_latency_bounds_ms.get(batch_size, 60)
            )
            self.assertLess(
                res["p95_latency_ms"], p95_latency_bounds_ms.get(batch_size, 65)
            )
# Allow running this benchmark suite directly (e.g. `python <file>`),
# in addition to discovery-based invocation in CI.
if __name__ == "__main__":
    unittest.main()