[Benchmark] Prefil-only benchmark scripts (#10240)

2025-09-09 19:59:07 -07:00
parent dccf52f9c8
commit a1d038924b
4 changed files with 1153 additions and 603 deletions
--- a/benchmark/prefill_only/bench_score.py
+++ b/benchmark/prefill_only/bench_score.py
@@ -0,0 +1,192 @@
+"""
+SGLang Scoring Benchmark Script
+
+This script benchmarks SGLang's scoring API performance using HTTP requests.
+
+Current Features:
+- HTTP-only implementation (open source compatible)
+- Uses /v1/score API endpoint directly
+- Single item scoring with batching support
+- Configurable RPS, duration, and batch sizes
+- Progress tracking and detailed metrics
+- Poisson and constant request distributions
+
+Usage:
+- Update configuration variables at the top of the file
+- Ensure SGLang server is running on the configured HTTP_URL
+- Run: python bench_score.py
+- Each request will contain ITEM_COUNT_VALUES items for batch scoring
+
+"""
+
+import asyncio
+
+from transformers import AutoTokenizer
+from util import (
+    BenchmarkConfig,
+    generate_text_with_token_count,
+    run_benchmark_main,
+    run_generic_benchmark,
+)
+
+###############################################################################
+# CONFIG
+###############################################################################
+# Create benchmark configuration
+config = BenchmarkConfig()
+config.rps_values = [160]
+config.duration_secs_values = [60]
+config.num_unique_requests = 100
+config.distribution = "POISSON"
+config.profile = False
+config.freeze_gc = True  # Enable GC freeze functionality
+# Profiler output directory - by default uses present working directory (pwd)
+# Uncomment and customize the line below to override the default location:
+# config.profiler_dir = "/sglang-oss-trace"
+
+# HTTP Configuration
+HTTP_URL = "http://localhost:30000/v1/score"  # Use score API directly
+
+# Score API Config
+# ITEM_COUNT_VALUES determines number of items per score request (batch size)
+SCORE_QUERY_TOKENS = 120
+SCORE_ITEM_TOKENS = 180
+SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
+SCORE_LABEL_TOKEN_IDS = [9454, 2753]  # Yes/No token IDs
+ITEM_COUNT_VALUES = [10]  # Number of items per request
+
+# Special token to replicate for precise token counting
+SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
+
+
+###############################################################################
+# REQUEST GENERATION (in parallel)
+###############################################################################
+def create_score_request_builder():
+    """Create a score request builder function with shared tokenizer."""
+    # Load tokenizer once here to verify special token and get precise counts
+    print("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
+
+    # Verify that our special token produces exactly 1 token
+    special_token_count = len(
+        tokenizer.encode(config.special_replicated_token, add_special_tokens=False)
+    )
+    print(
+        f"Special token '{config.special_replicated_token}' produces "
+        f"{special_token_count} token(s)"
+    )
+
+    def generate_text_with_token_count_local(num_toks):
+        """Generate text with precise token count using replicated token."""
+        return generate_text_with_token_count(
+            SCORE_MODEL_PATH,
+            num_toks,
+            config.special_replicated_token,
+            tokenizer=tokenizer,
+        )
+
+    def build_score_request(index: int, item_count: int) -> tuple:
+        """Build a single score request."""
+        try:
+            # Generate query and items for score API
+            query = generate_text_with_token_count_local(SCORE_QUERY_TOKENS)
+            items = [
+                generate_text_with_token_count_local(SCORE_ITEM_TOKENS)
+                for _ in range(item_count)
+            ]
+
+            # Return as dict for score API format
+            score_data = {
+                "query": query,
+                "items": items,
+                "label_token_ids": SCORE_LABEL_TOKEN_IDS,
+                "model": SCORE_MODEL_PATH,
+            }
+            return (index, score_data)
+
+        except Exception as e:
+            print(f"Error building request {index}: {e}")
+            return (index, None)
+
+    return build_score_request
+
+
+def validate_score_response(response_data: dict) -> bool:
+    """Validate score API response."""
+    return "scores" in response_data or "logprobs" in response_data
+
+
+def build_warmup_score_request() -> dict:
+    """Build a warmup request for the score API."""
+    # Load tokenizer once for warmup generation
+    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
+
+    warmup_query = generate_text_with_token_count(
+        SCORE_MODEL_PATH,
+        SCORE_QUERY_TOKENS,
+        config.special_replicated_token,
+        tokenizer=tokenizer,
+    )
+    warmup_items = [
+        generate_text_with_token_count(
+            SCORE_MODEL_PATH,
+            SCORE_ITEM_TOKENS,
+            config.special_replicated_token,
+            tokenizer=tokenizer,
+        )
+        for _ in range(3)
+    ]
+
+    return {
+        "query": warmup_query,
+        "items": warmup_items,
+        "label_token_ids": SCORE_LABEL_TOKEN_IDS,
+        "model": SCORE_MODEL_PATH,
+        # Add missing parameters for consistency with the original warmup
+        "apply_softmax": True,
+        "item_first": False,
+    }
+
+
+###############################################################################
+# MAIN
+###############################################################################
+async def run_benchmark(rps, duration_secs, item_count):
+    """Run a single benchmark with the given RPS value."""
+    # Create the request builder function with shared tokenizer
+    build_request_func = create_score_request_builder()
+
+    return await run_generic_benchmark(
+        rps=rps,
+        duration_secs=duration_secs,
+        item_count=item_count,
+        config=config,
+        http_url=HTTP_URL,
+        build_request_func=build_request_func,
+        response_validator=validate_score_response,
+        api_name="SINGLE_ITEM_SCORING",
+        request_description="score requests",
+    )
+
+
+async def main():
+    """Main function that runs benchmarks for all RPS values."""
+    additional_info = {
+        "Query tokens per request": SCORE_QUERY_TOKENS,
+        "Item tokens per item": SCORE_ITEM_TOKENS,
+    }
+
+    await run_benchmark_main(
+        config,
+        run_benchmark,
+        "SINGLE_ITEM_SCORING",
+        HTTP_URL,
+        ITEM_COUNT_VALUES,
+        additional_info,
+        build_warmup_score_request,
+    )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())