[Generative Scores API] add performance tests to CICD (#10830)

.github/workflows/pr-test.yml (+33 lines)
@@ -460,6 +460,39 @@ jobs:
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency

  performance-test-1-gpu-part-3:
    needs: [check-changes, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

      - name: Benchmark Scores online latency and throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput

      - name: Benchmark Scores online latency and throughput (batch size scaling)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling

  performance-test-2-gpu:
    needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
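The two new benchmark steps drive SGLang's /v1/score endpoint. For reference, a request with the same shape as those generated by the benchmark code below can be sent by hand; a minimal sketch, assuming a server is already running at a local address (the address is an illustration, not part of this diff; the field names and the Yes/No label token IDs [9454, 2753] come from the benchmark code added in this commit):

    import requests

    # Minimal /v1/score request mirroring the benchmark payloads below.
    # The server address is an assumption for illustration only.
    payload = {
        "query": "Is this passage relevant to the query?",
        "items": ["candidate passage one", "candidate passage two"],
        "label_token_ids": [9454, 2753],  # Yes/No token IDs used by the tests
        "model": "Qwen/Qwen3-Reranker-0.6B",
        "apply_softmax": True,
    }
    response = requests.post("http://127.0.0.1:30000/v1/score", json=payload)
    print(response.json())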
@@ -43,6 +43,7 @@ from sglang.utils import get_exception_traceback
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
@@ -873,6 +874,154 @@ def run_bench_serving(
    return res


def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Score API benchmark function compatible with the run_bench_serving pattern."""
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"

        def generate_text_with_token_count(num_tokens):
            """Generate text with a precise token count by repeating a single token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # Rescale the repetition count if the special token does not
                # encode to exactly one token.
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        if need_warmup:
            warmup_data = {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens) for _ in range(3)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

            async with aiohttp.ClientSession() as session:
                try:
                    await session.post(
                        f"{base_url}/v1/score",
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    )
                except Exception:
                    pass  # Ignore warmup errors

        test_requests = []
        for _ in range(num_requests):
            query = generate_text_with_token_count(score_query_tokens)
            items = [
                generate_text_with_token_count(score_item_tokens)
                for _ in range(batch_size)
            ]

            score_data = {
                "query": query,
                "items": items,
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }
            test_requests.append(score_data)

        start_time = time.monotonic()
        successful_requests = 0
        total_latency = 0
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        f"{base_url}/v1/score",
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()

                            if "scores" in response_data or "logprobs" in response_data:
                                latency_ms = (request_end - request_start) * 1000
                                latencies.append(latency_ms)
                                total_latency += latency_ms
                                successful_requests += 1
                except Exception:
                    continue

        end_time = time.monotonic()
        total_time = end_time - start_time

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = total_latency / successful_requests
            latencies.sort()
            p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0

            return {
                "completed": successful_requests,
                "total_requests": num_requests,
                "throughput": throughput,
                "avg_latency_ms": avg_latency,
                "p95_latency_ms": p95_latency,
                "successful_requests": successful_requests,
            }
        else:
            return {
                "completed": 0,
                "total_requests": num_requests,
                "throughput": 0,
                "avg_latency_ms": 0,
                "p95_latency_ms": 0,
                "successful_requests": 0,
            }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
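A minimal sketch of driving the new helper directly, e.g. for a quick local check before pushing (assumes a GPU machine with the SGLang test dependencies installed; the reduced request count is illustrative, not the CI setting):

    from sglang.test.test_utils import (
        DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
        run_score_benchmark,
    )

    # Smaller than the CI runs (1000/500 requests) so it finishes quickly.
    res = run_score_benchmark(
        model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
        num_requests=50,
        batch_size=10,
        need_warmup=True,
    )
    print(
        f"{res['successful_requests']}/{res['total_requests']} ok, "
        f"{res['throughput']:.2f} req/s, avg {res['avg_latency_ms']:.2f} ms, "
        f"p95 {res['p95_latency_ms']:.2f} ms"
    )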


def run_bench_serving_multi(
    model,
    base_url,
@@ -4,17 +4,20 @@ import unittest

import requests

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
    CustomTestCase,
    is_in_amd_ci,
    is_in_ci,
    run_bench_serving,
    run_score_benchmark,
    write_github_step_summary,
)
@@ -440,6 +443,71 @@ class TestBenchServing(CustomTestCase):
        )
        self.assertGreater(res["input_throughput"], 4000)

    def test_score_api_latency_throughput(self):
        """Test score API latency and throughput performance."""
        res = run_score_benchmark(
            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
            num_requests=1000,
            batch_size=10,
            other_server_args=[],
            need_warmup=True,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_score_api_latency_throughput\n"
                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                f"Score API throughput: {res['throughput']:.2f} req/s\n"
                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
            )

        self.assertEqual(res["successful_requests"], res["total_requests"])
        self.assertLess(res["avg_latency_ms"], 48)
        self.assertLess(res["p95_latency_ms"], 50)
        self.assertGreater(res["throughput"], 20)

    def test_score_api_batch_scaling(self):
        """Test score API performance with different batch sizes."""
        batch_sizes = [10, 25, 50]

        for batch_size in batch_sizes:
            res = run_score_benchmark(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
                num_requests=500,
                batch_size=batch_size,
            )

            if is_in_ci():
                write_github_step_summary(
                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
                    f"Batch size: {batch_size}\n"
                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                    f"Throughput: {res['throughput']:.2f} req/s\n"
                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
                )

            self.assertEqual(res["successful_requests"], res["total_requests"])
            # Latency bounds loosen as the per-request batch size grows.
            avg_latency_bound = {10: 45, 25: 50, 50: 60}.get(batch_size, 60)
            self.assertLess(res["avg_latency_ms"], avg_latency_bound)
            p95_latency_bound = {10: 50, 25: 60, 50: 65}.get(batch_size, 65)
            self.assertLess(res["p95_latency_ms"], p95_latency_bound)


if __name__ == "__main__":
    unittest.main()
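For scale: the latency test issues 1000 sequential requests and asserts throughput above 20 req/s, so a passing run finishes in roughly 50 seconds, comfortably inside the step's 10-minute timeout. A back-of-envelope sketch using the numbers from the tests above:

    # Worst case consistent with the assertions in test_score_api_latency_throughput:
    num_requests = 1000
    min_throughput = 20  # req/s, the asserted lower bound
    worst_case_seconds = num_requests / min_throughput  # 50.0
    assert worst_case_seconds < 10 * 60  # timeout-minutes: 10 in the workflow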