sglang/benchmark/prefill_only/bench_score.py

"""
SGLang Scoring Benchmark Script

This script benchmarks SGLang's scoring API performance using HTTP requests.

Current Features:
- HTTP-only implementation (open source compatible)
- Uses /v1/score API endpoint directly
- Single item scoring with batching support
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions

Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_score.py
- Each request will contain ITEM_COUNT_VALUES items for batch scoring

"""

import asyncio

from transformers import AutoTokenizer
from util import (
    BenchmarkConfig,
    generate_text_with_token_count,
    run_benchmark_main,
    run_generic_benchmark,
)

###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [160]
config.duration_secs_values = [60]
config.num_unique_requests = 100
config.distribution = "POISSON"
config.profile = False
config.freeze_gc = True  # Enable GC freeze functionality
# Profiler output directory - by default uses present working directory (pwd)
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"

# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/score"  # Use score API directly

# Score API Config
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
SCORE_QUERY_TOKENS = 120
SCORE_ITEM_TOKENS = 180
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
SCORE_LABEL_TOKEN_IDS = [9454, 2753]  # Yes/No token IDs
ITEM_COUNT_VALUES = [10]  # Number of items per request

# Special token to replicate for precise token counting
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"


###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def create_score_request_builder():
    """Create a score request builder function with shared tokenizer."""
    # Load tokenizer once here to verify special token and get precise counts
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    # Verify that our special token produces exactly 1 token
    special_token_count = len(
        tokenizer.encode(config.special_replicated_token, add_special_tokens=False)
    )
    print(
        f"Special token '{config.special_replicated_token}' produces "
        f"{special_token_count} token(s)"
    )

    def generate_text_with_token_count_local(num_toks):
        """Generate text with precise token count using replicated token."""
        return generate_text_with_token_count(
            SCORE_MODEL_PATH,
            num_toks,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )

    def build_score_request(index: int, item_count: int) -> tuple:
        """Build a single score request."""
        try:
            # Generate query and items for score API
            query = generate_text_with_token_count_local(SCORE_QUERY_TOKENS)
            items = [
                generate_text_with_token_count_local(SCORE_ITEM_TOKENS)
                for _ in range(item_count)
            ]

            # Return as dict for score API format
            score_data = {
                "query": query,
                "items": items,
                "label_token_ids": SCORE_LABEL_TOKEN_IDS,
                "model": SCORE_MODEL_PATH,
            }
            return (index, score_data)

        except Exception as e:
            print(f"Error building request {index}: {e}")
            return (index, None)

    return build_score_request


def validate_score_response(response_data: dict) -> bool:
    """Validate score API response."""
    return "scores" in response_data or "logprobs" in response_data


def build_warmup_score_request() -> dict:
    """Build a warmup request for the score API."""
    # Load tokenizer once for warmup generation
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    warmup_query = generate_text_with_token_count(
        SCORE_MODEL_PATH,
        SCORE_QUERY_TOKENS,
        config.special_replicated_token,
        tokenizer=tokenizer,
    )
    warmup_items = [
        generate_text_with_token_count(
            SCORE_MODEL_PATH,
            SCORE_ITEM_TOKENS,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )
        for _ in range(3)
    ]

    return {
        "query": warmup_query,
        "items": warmup_items,
        "label_token_ids": SCORE_LABEL_TOKEN_IDS,
        "model": SCORE_MODEL_PATH,
        # Add missing parameters for consistency with the original warmup
        "apply_softmax": True,
        "item_first": False,
    }


###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
    """Run a single benchmark with the given RPS value."""
    # Create the request builder function with shared tokenizer
    build_request_func = create_score_request_builder()

    return await run_generic_benchmark(
        rps=rps,
        duration_secs=duration_secs,
        item_count=item_count,
        config=config,
        http_url=HTTP_URL,
        build_request_func=build_request_func,
        response_validator=validate_score_response,
        api_name="SINGLE_ITEM_SCORING",
        request_description="score requests",
    )


async def main():
    """Main function that runs benchmarks for all RPS values."""
    additional_info = {
        "Query tokens per request": SCORE_QUERY_TOKENS,
        "Item tokens per item": SCORE_ITEM_TOKENS,
    }

    await run_benchmark_main(
        config,
        run_benchmark,
        "SINGLE_ITEM_SCORING",
        HTTP_URL,
        ITEM_COUNT_VALUES,
        additional_info,
        build_warmup_score_request,
    )


if __name__ == "__main__":
    asyncio.run(main())