[Generative Scores API] add performance tests to CICD (#10830)

Author: Vedant V Jhaveri (committed by GitHub)
Date: 2025-10-02 19:57:55 -07:00
parent 3c699772c9
commit 7e61737d3f
3 changed files with 250 additions and 0 deletions


@@ -43,6 +43,7 @@ from sglang.utils import get_exception_traceback
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
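The new `DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE` constant points the score benchmark at a small reranker model. For orientation, a standalone call to the `/v1/score` endpoint that this commit benchmarks looks roughly like the sketch below; the base URL and the example strings are illustrative assumptions, while the field names mirror the payload built in `run_score_benchmark` in the next hunk.

```python
# Hedged sketch of a single /v1/score request; assumes a server is already
# running and that the URL/port shown here is illustrative, not from this diff.
import requests

payload = {
    "query": "Is the following passage relevant to the query?",
    "items": ["candidate passage A", "candidate passage B"],
    "label_token_ids": [9454, 2753],  # Yes/No token IDs, as in the benchmark below
    "model": "Qwen/Qwen3-Reranker-0.6B",
    "apply_softmax": True,
}
resp = requests.post("http://127.0.0.1:30000/v1/score", json=payload, timeout=30)
print(resp.json())  # the benchmark below checks for a "scores" or "logprobs" key
```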
@@ -873,6 +874,154 @@ def run_bench_serving(
    return res


def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Score API benchmark function compatible with the run_bench_serving pattern."""
    if other_server_args is None:
        other_server_args = []

    # Resolve "auto" to a concrete device (kept for parity with run_bench_serving)
    if device == "auto":
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )
    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"

        def generate_text_with_token_count(num_tokens):
            """Generate text with a precise token count by repeating a special token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # The special token may encode to more than one token id, so
                # scale the repetition count down to hit the target budget.
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text
        if need_warmup:
            warmup_data = {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens) for _ in range(3)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

            async with aiohttp.ClientSession() as session:
                try:
                    await session.post(
                        f"{base_url}/v1/score",
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    )
                except Exception:
                    pass  # Ignore warmup errors
        # Pre-generate all payloads so data generation is excluded from timing
        test_requests = []
        for _ in range(num_requests):
            query = generate_text_with_token_count(score_query_tokens)
            items = [
                generate_text_with_token_count(score_item_tokens)
                for _ in range(batch_size)
            ]
            score_data = {
                "query": query,
                "items": items,
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }
            test_requests.append(score_data)
        start_time = time.monotonic()
        successful_requests = 0
        total_latency = 0
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        f"{base_url}/v1/score",
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()
                            if "scores" in response_data or "logprobs" in response_data:
                                latency_ms = (request_end - request_start) * 1000
                                latencies.append(latency_ms)
                                total_latency += latency_ms
                                successful_requests += 1
                except Exception:
                    continue
        end_time = time.monotonic()
        total_time = end_time - start_time

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = total_latency / successful_requests
            latencies.sort()
            # p95 via nearest-rank index into the sorted latency list
            p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0
            return {
                "completed": successful_requests,
                "total_requests": num_requests,
                "throughput": throughput,
                "avg_latency_ms": avg_latency,
                "p95_latency_ms": p95_latency,
                "successful_requests": successful_requests,
            }
        else:
            return {
                "completed": 0,
                "total_requests": num_requests,
                "throughput": 0,
                "avg_latency_ms": 0,
                "p95_latency_ms": 0,
                "successful_requests": 0,
            }
    try:
        res = asyncio.run(_run_benchmark())
    finally:
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
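For reference, a CI test could drive the new helper roughly as sketched below; the test class name, the thresholds, and the import path (assumed to be `sglang.test.test_utils`, where these defaults live) are illustrative assumptions, not part of this commit.

```python
# Hypothetical CI test built on the helper above; names and thresholds are
# illustrative assumptions, not from this diff.
import unittest

from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
    run_score_benchmark,
)


class TestScoreAPIPerformance(unittest.TestCase):
    def test_score_api_throughput(self):
        res = run_score_benchmark(
            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
            num_requests=50,
            batch_size=5,
            need_warmup=True,
        )
        # Every request should succeed and throughput should be non-zero.
        self.assertEqual(res["completed"], res["total_requests"])
        self.assertGreater(res["throughput"], 0)


if __name__ == "__main__":
    unittest.main()
```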


def run_bench_serving_multi(
    model,
    base_url,