diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 4e7ab9075..d983c91bf 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -460,6 +460,39 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
 
+  performance-test-1-gpu-part-3:
+    needs: [check-changes, sgl-kernel-build-wheels]
+    if: always() && !failure() && !cancelled() &&
+      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Download artifacts
+        if: needs.check-changes.outputs.sgl_kernel == 'true'
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Install dependencies
+        run: |
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
+
+      - name: Benchmark Scores online latency and throughput
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput
+
+      - name: Benchmark Scores online latency and throughput (batch size scaling)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling
+
   performance-test-2-gpu:
     needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
     if: always() && !failure() && !cancelled() &&
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 360c852cb..ddab8bbf6 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -43,6 +43,7 @@ from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
@@ -873,6 +874,154 @@ def run_bench_serving(
     return res
 
 
+def run_score_benchmark(
+    model,
+    num_requests=100,
+    batch_size=5,
+    other_server_args=None,
+    need_warmup=False,
+    device="auto",
+):
+    """Score API benchmark function compatible with the run_bench_serving pattern."""
+    if other_server_args is None:
+        other_server_args = []
+
+    if device == "auto":
+        device = auto_config_device()
+
+    # Launch the server (consistent with run_bench_serving)
+    base_url = DEFAULT_URL_FOR_TEST
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    async def _run_benchmark():
+
+        # Load tokenizer for generating test data
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
+        tokenizer = get_tokenizer(model)
+
+        # Score API configuration
+        score_query_tokens = 120
+        score_item_tokens = 180
+        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
+        special_token = "<|im_start|>"
+
+        def generate_text_with_token_count(num_tokens):
+            """Generate text with a precise token count by replicating a single token."""
+            text = special_token * num_tokens
+            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
+            if actual_tokens != num_tokens:
+                text = special_token * (
+                    num_tokens
+                    // len(tokenizer.encode(special_token, add_special_tokens=False))
+                )
+            return text
+
+        if need_warmup:
+            warmup_data = {
+                "query": generate_text_with_token_count(score_query_tokens),
+                "items": [
+                    generate_text_with_token_count(score_item_tokens) for _ in range(3)
+                ],
+                "label_token_ids": score_label_token_ids,
+                "model": model,
+                "apply_softmax": True,
+            }
+
+            async with aiohttp.ClientSession() as session:
+                try:
+                    await session.post(
+                        f"{base_url}/v1/score",
+                        json=warmup_data,
+                        timeout=aiohttp.ClientTimeout(total=30),
+                    )
+                except Exception:
+                    pass  # Ignore warmup errors
+
+        test_requests = []
+        for i in range(num_requests):
+            query = generate_text_with_token_count(score_query_tokens)
+            items = [
+                generate_text_with_token_count(score_item_tokens)
+                for _ in range(batch_size)
+            ]
+
+            score_data = {
+                "query": query,
+                "items": items,
+                "label_token_ids": score_label_token_ids,
+                "model": model,
+                "apply_softmax": True,
+            }
+            test_requests.append(score_data)
+
+        start_time = time.monotonic()
+        successful_requests = 0
+        total_latency = 0
+        latencies = []
+
+        async with aiohttp.ClientSession() as session:
+            for request_data in test_requests:
+                try:
+                    request_start = time.monotonic()
+                    async with session.post(
+                        f"{base_url}/v1/score",
+                        json=request_data,
+                        timeout=aiohttp.ClientTimeout(total=30),
+                    ) as response:
+                        if response.status == 200:
+                            response_data = await response.json()
+                            request_end = time.monotonic()
+
+                            if "scores" in response_data or "logprobs" in response_data:
+                                latency_ms = (request_end - request_start) * 1000
+                                latencies.append(latency_ms)
+                                total_latency += latency_ms
+                                successful_requests += 1
+                except Exception:
+                    continue
+
+        end_time = time.monotonic()
+        total_time = end_time - start_time
+
+        if successful_requests > 0:
+            throughput = successful_requests / total_time
+            avg_latency = total_latency / successful_requests
+            latencies.sort()
+            p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0
+
+            return {
+                "completed": successful_requests,
+                "total_requests": num_requests,
+                "throughput": throughput,
+                "avg_latency_ms": avg_latency,
+                "p95_latency_ms": p95_latency,
+                "successful_requests": successful_requests,
+            }
+        else:
+            return {
+                "completed": 0,
+                "total_requests": num_requests,
+                "throughput": 0,
+                "avg_latency_ms": 0,
+                "p95_latency_ms": 0,
+                "successful_requests": 0,
+            }
+
+    try:
+        res = asyncio.run(_run_benchmark())
+    finally:
+        kill_process_tree(process.pid)
+
+    assert res["completed"] == res["successful_requests"]
+    return res
+
+
 def run_bench_serving_multi(
     model,
     base_url,
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 608595b95..6a73566e1 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -4,17 +4,20 @@ import unittest
 
 import requests
 
+from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
     DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
     DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST_FP8,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
     DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
     CustomTestCase,
     is_in_amd_ci,
     is_in_ci,
     run_bench_serving,
+    run_score_benchmark,
     write_github_step_summary,
 )
 
@@ -440,6 +443,71 @@ class TestBenchServing(CustomTestCase):
         )
         self.assertGreater(res["input_throughput"], 4000)
 
+    def test_score_api_latency_throughput(self):
+        """Test score API latency and throughput performance"""
+        res = run_score_benchmark(
+            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
+            num_requests=1000,
+            batch_size=10,
+            other_server_args=[],
+            need_warmup=True,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_score_api_latency_throughput\n"
+                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
+                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
+                f"Score API throughput: {res['throughput']:.2f} req/s\n"
+                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
+            )
+
+        self.assertEqual(res["successful_requests"], res["total_requests"])
+        self.assertLess(res["avg_latency_ms"], 48)
+        self.assertLess(res["p95_latency_ms"], 50)
+        self.assertGreater(res["throughput"], 20)
+
+    def test_score_api_batch_scaling(self):
+        """Test score API performance with different batch sizes"""
+        batch_sizes = [10, 25, 50]
+
+        for batch_size in batch_sizes:
+            res = run_score_benchmark(
+                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
+                num_requests=500,
+                batch_size=batch_size,
+            )
+
+            if is_in_ci():
+                write_github_step_summary(
+                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
+                    f"Batch size: {batch_size}\n"
+                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
+                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
+                    f"Throughput: {res['throughput']:.2f} req/s\n"
+                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
+                )
+
+            self.assertEqual(res["successful_requests"], res["total_requests"])
+            if batch_size == 10:
+                avg_latency_bound = 45
+            elif batch_size == 25:
+                avg_latency_bound = 50
+            elif batch_size == 50:
+                avg_latency_bound = 60
+            else:
+                avg_latency_bound = 60
+            self.assertLess(res["avg_latency_ms"], avg_latency_bound)
+            if batch_size == 10:
+                p95_latency_bound = 50
+            elif batch_size == 25:
+                p95_latency_bound = 60
+            elif batch_size == 50:
+                p95_latency_bound = 65
+            else:
+                p95_latency_bound = 65
+            self.assertLess(res["p95_latency_ms"], p95_latency_bound)
+
 
 if __name__ == "__main__":
     unittest.main()
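For a quick smoke run outside CI, a minimal sketch of driving the new helper directly; this is not part of the patch, and it assumes a GPU machine with sglang and its test dependencies installed and the reranker model downloadable.

```python
# Hypothetical local smoke run of the new score benchmark helper (not part of
# this PR). run_score_benchmark launches the server itself via
# popen_launch_server, so no server needs to be running beforehand.
from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
    run_score_benchmark,
)

res = run_score_benchmark(
    model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
    num_requests=50,  # smaller than the CI tests for a faster local check
    batch_size=10,
    need_warmup=True,
)
print(
    f"throughput={res['throughput']:.2f} req/s, "
    f"avg={res['avg_latency_ms']:.2f} ms, "
    f"p95={res['p95_latency_ms']:.2f} ms"
)
```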