[Benchmark] Prefil-only benchmark scripts (#10240)
This commit is contained in:
committed by
GitHub
parent
dccf52f9c8
commit
a1d038924b
192
benchmark/prefill_only/bench_score.py
Normal file
192
benchmark/prefill_only/bench_score.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
SGLang Scoring Benchmark Script
|
||||
|
||||
This script benchmarks SGLang's scoring API performance using HTTP requests.
|
||||
|
||||
Current Features:
|
||||
- HTTP-only implementation (open source compatible)
|
||||
- Uses /v1/score API endpoint directly
|
||||
- Single item scoring with batching support
|
||||
- Configurable RPS, duration, and batch sizes
|
||||
- Progress tracking and detailed metrics
|
||||
- Poisson and constant request distributions
|
||||
|
||||
Usage:
|
||||
- Update configuration variables at the top of the file
|
||||
- Ensure SGLang server is running on the configured HTTP_URL
|
||||
- Run: python bench_score.py
|
||||
- Each request will contain ITEM_COUNT_VALUES items for batch scoring
|
||||
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from util import (
|
||||
BenchmarkConfig,
|
||||
generate_text_with_token_count,
|
||||
run_benchmark_main,
|
||||
run_generic_benchmark,
|
||||
)
|
||||
|
||||
###############################################################################
|
||||
# CONFIG
|
||||
###############################################################################
|
||||
# Create benchmark configuration
|
||||
config = BenchmarkConfig()
|
||||
config.rps_values = [160]
|
||||
config.duration_secs_values = [60]
|
||||
config.num_unique_requests = 100
|
||||
config.distribution = "POISSON"
|
||||
config.profile = False
|
||||
config.freeze_gc = True # Enable GC freeze functionality
|
||||
# Profiler output directory - by default uses present working directory (pwd)
|
||||
# Uncomment and customize the line below to override the default location:
|
||||
# config.profiler_dir = "/sglang-oss-trace"
|
||||
|
||||
# HTTP Configuration
|
||||
HTTP_URL = "http://localhost:30000/v1/score" # Use score API directly
|
||||
|
||||
# Score API Config
|
||||
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
|
||||
SCORE_QUERY_TOKENS = 120
|
||||
SCORE_ITEM_TOKENS = 180
|
||||
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
SCORE_LABEL_TOKEN_IDS = [9454, 2753] # Yes/No token IDs
|
||||
ITEM_COUNT_VALUES = [10] # Number of items per request
|
||||
|
||||
# Special token to replicate for precise token counting
|
||||
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
|
||||
|
||||
|
||||
###############################################################################
|
||||
# REQUEST GENERATION (in parallel)
|
||||
###############################################################################
|
||||
def create_score_request_builder():
|
||||
"""Create a score request builder function with shared tokenizer."""
|
||||
# Load tokenizer once here to verify special token and get precise counts
|
||||
print("Loading tokenizer...")
|
||||
tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
|
||||
|
||||
# Verify that our special token produces exactly 1 token
|
||||
special_token_count = len(
|
||||
tokenizer.encode(config.special_replicated_token, add_special_tokens=False)
|
||||
)
|
||||
print(
|
||||
f"Special token '{config.special_replicated_token}' produces "
|
||||
f"{special_token_count} token(s)"
|
||||
)
|
||||
|
||||
def generate_text_with_token_count_local(num_toks):
|
||||
"""Generate text with precise token count using replicated token."""
|
||||
return generate_text_with_token_count(
|
||||
SCORE_MODEL_PATH,
|
||||
num_toks,
|
||||
config.special_replicated_token,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
def build_score_request(index: int, item_count: int) -> tuple:
|
||||
"""Build a single score request."""
|
||||
try:
|
||||
# Generate query and items for score API
|
||||
query = generate_text_with_token_count_local(SCORE_QUERY_TOKENS)
|
||||
items = [
|
||||
generate_text_with_token_count_local(SCORE_ITEM_TOKENS)
|
||||
for _ in range(item_count)
|
||||
]
|
||||
|
||||
# Return as dict for score API format
|
||||
score_data = {
|
||||
"query": query,
|
||||
"items": items,
|
||||
"label_token_ids": SCORE_LABEL_TOKEN_IDS,
|
||||
"model": SCORE_MODEL_PATH,
|
||||
}
|
||||
return (index, score_data)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error building request {index}: {e}")
|
||||
return (index, None)
|
||||
|
||||
return build_score_request
|
||||
|
||||
|
||||
def validate_score_response(response_data: dict) -> bool:
|
||||
"""Validate score API response."""
|
||||
return "scores" in response_data or "logprobs" in response_data
|
||||
|
||||
|
||||
def build_warmup_score_request() -> dict:
|
||||
"""Build a warmup request for the score API."""
|
||||
# Load tokenizer once for warmup generation
|
||||
tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
|
||||
|
||||
warmup_query = generate_text_with_token_count(
|
||||
SCORE_MODEL_PATH,
|
||||
SCORE_QUERY_TOKENS,
|
||||
config.special_replicated_token,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
warmup_items = [
|
||||
generate_text_with_token_count(
|
||||
SCORE_MODEL_PATH,
|
||||
SCORE_ITEM_TOKENS,
|
||||
config.special_replicated_token,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
for _ in range(3)
|
||||
]
|
||||
|
||||
return {
|
||||
"query": warmup_query,
|
||||
"items": warmup_items,
|
||||
"label_token_ids": SCORE_LABEL_TOKEN_IDS,
|
||||
"model": SCORE_MODEL_PATH,
|
||||
# Add missing parameters for consistency with the original warmup
|
||||
"apply_softmax": True,
|
||||
"item_first": False,
|
||||
}
|
||||
|
||||
|
||||
###############################################################################
|
||||
# MAIN
|
||||
###############################################################################
|
||||
async def run_benchmark(rps, duration_secs, item_count):
|
||||
"""Run a single benchmark with the given RPS value."""
|
||||
# Create the request builder function with shared tokenizer
|
||||
build_request_func = create_score_request_builder()
|
||||
|
||||
return await run_generic_benchmark(
|
||||
rps=rps,
|
||||
duration_secs=duration_secs,
|
||||
item_count=item_count,
|
||||
config=config,
|
||||
http_url=HTTP_URL,
|
||||
build_request_func=build_request_func,
|
||||
response_validator=validate_score_response,
|
||||
api_name="SINGLE_ITEM_SCORING",
|
||||
request_description="score requests",
|
||||
)
|
||||
|
||||
|
||||
async def main():
|
||||
"""Main function that runs benchmarks for all RPS values."""
|
||||
additional_info = {
|
||||
"Query tokens per request": SCORE_QUERY_TOKENS,
|
||||
"Item tokens per item": SCORE_ITEM_TOKENS,
|
||||
}
|
||||
|
||||
await run_benchmark_main(
|
||||
config,
|
||||
run_benchmark,
|
||||
"SINGLE_ITEM_SCORING",
|
||||
HTTP_URL,
|
||||
ITEM_COUNT_VALUES,
|
||||
additional_info,
|
||||
build_warmup_score_request,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user