sglang/benchmark/benchmark_batch/benchmark_batch.py

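"""Benchmark batched requests against a running SGLang /generate endpoint.

Random prompts of a fixed token length are generated in parallel, grouped into
fixed-size batches, and sent to the server one request at a time; per-request
and per-prompt latencies plus overall throughput are printed at the end.
"""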
import concurrent.futures
import os
import random
import time
from concurrent.futures import ProcessPoolExecutor
from statistics import mean
import requests
from tqdm import tqdm
from transformers import AutoTokenizer
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
###############################################################################
# CONFIG
###############################################################################
ENDPOINT_URL = "http://127.0.0.1:30000"
TOKENIZER_DIR = "/models/meta-llama/Llama-3.2-3B"
# Benchmark configurations
NUM_REQUESTS = 10 # Total number of requests (each with BATCH_SIZE prompts)
NUM_TOKENS = 32000 # Tokens per prompt
BATCH_SIZE = 8 # Number of prompts per request
GEN_TOKENS = 0 # Tokens to generate per prompt
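# ENDPOINT_URL is assumed to point at an SGLang server that is already running,
# started with something roughly like (the model path is the same assumption as
# TOKENIZER_DIR above):
#   python -m sglang.launch_server --model-path /models/meta-llama/Llama-3.2-3B --port 30000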
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def generate_random_prompt(index, tokenizer_dir, num_tokens):
    """Generate a single random prompt with specified token count."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    vocab_size = tokenizer.vocab_size

    def generate_random_text(num_toks):
        random_token_ids = [random.randint(0, vocab_size - 1) for _ in range(num_toks)]
        return tokenizer.decode(random_token_ids, clean_up_tokenization_spaces=True)

    random_text = generate_random_text(num_tokens)
    return f"Prompt {index}: {random_text}"
def prepare_all_prompts(num_requests, batch_size, num_tokens, tokenizer_dir):
    """Generate prompts for all requests in parallel."""
    total_prompts = num_requests * batch_size
    all_prompts = [None] * total_prompts
    max_workers = min(os.cpu_count() or 1, total_prompts)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(generate_random_prompt, i, tokenizer_dir, num_tokens): i
            for i in range(total_prompts)
        }
        for future in tqdm(
            concurrent.futures.as_completed(future_to_index),
            total=total_prompts,
            desc="Generating prompts",
        ):
            all_prompts[future_to_index[future]] = future.result()

    batched_prompts = [
        all_prompts[i * batch_size : (i + 1) * batch_size] for i in range(num_requests)
    ]
    print(
        f"Generated {total_prompts} prompts with {num_tokens} tokens each, grouped into {num_requests} requests of {batch_size} prompts.\n"
    )
    return batched_prompts
###############################################################################
# HTTP CALLS
###############################################################################
def send_batch_request(endpoint, prompts, gen_tokens, request_id):
    """Send a batch of prompts to the /generate endpoint synchronously."""
    sampling_params = {
        "max_new_tokens": gen_tokens,
        "temperature": 0.7,
        "stop": "\n",
    }
    data = {"text": prompts, "sampling_params": sampling_params}

    start_time = time.perf_counter()
    try:
        response = requests.post(
            endpoint.base_url + "/generate", json=data, timeout=3600
        )
        if response.status_code != 200:
            error = response.json()
            raise RuntimeError(f"Request {request_id} failed: {error}")
        result = response.json()
        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
        avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
        return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
    except Exception as e:
        print(f"[Request] Error for request {request_id}: {e}")
        return request_id, 0, 0, False, len(prompts)
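

# send_batch_request reports a tuple of
# (request_id, latency_ms, avg_latency_per_prompt_ms, success, num_prompts);
# a failed request returns zero latencies and is tallied separately in the summary.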
def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
    """Run the benchmark sequentially."""
    results = []
    num_requests = len(batched_prompts)

    # Record start time for total latency
    benchmark_start_time = time.perf_counter()

    for i, batch_prompts in enumerate(batched_prompts):
        request_id = i + 1
        assert (
            len(batch_prompts) == batch_size
        ), f"Request {request_id} should have {batch_size} prompts, got {len(batch_prompts)}"
        print(
            f"[Request] Sending request {request_id}/{num_requests} with {len(batch_prompts)} prompts at {int(time.time()*1000)}"
        )
        result = send_batch_request(endpoint, batch_prompts, gen_tokens, request_id)
        results.append(result)

    # Calculate total latency
    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms
    return results, total_latency
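

# Because run_benchmark sends requests strictly one at a time, total_latency is
# roughly the sum of the individual request latencies plus client-side overhead.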
###############################################################################
# RESULTS
###############################################################################
def process_results(results, total_latency, num_requests):
    """Process and display benchmark results."""
    total_time = 0
    successful_requests = 0
    failed_requests = 0
    request_latencies = []
    per_prompt_latencies = []
    total_prompts = 0

    for request_id, elapsed_time, avg_per_prompt, success, batch_size in results:
        if success:
            successful_requests += 1
            total_prompts += batch_size
            request_latencies.append(elapsed_time)
            per_prompt_latencies.append(avg_per_prompt)
            total_time += elapsed_time / 1000  # Convert to seconds
        else:
            failed_requests += 1

    avg_request_latency = mean(request_latencies) if request_latencies else 0
    avg_per_prompt_latency = mean(per_prompt_latencies) if per_prompt_latencies else 0
    throughput = total_prompts / total_time if total_time > 0 else 0

    print("\nBenchmark Summary:")
    print(f" Total requests sent: {len(results)}")
    print(f" Total prompts sent: {total_prompts}")
    print(f" Successful requests: {successful_requests}")
    print(f" Failed requests: {failed_requests}")
    print(f" Total latency (all requests): {total_latency:.2f} ms")
    print(f" Avg per request latency: {avg_request_latency:.2f} ms")
    print(f" Avg per prompt latency: {avg_per_prompt_latency:.2f} ms")
    print(f" Throughput: {throughput:.2f} prompts/second\n")
###############################################################################
# MAIN
###############################################################################
def main():
    # Initialize endpoint
    endpoint = RuntimeEndpoint(ENDPOINT_URL)

    # Generate prompts
    batched_prompts = prepare_all_prompts(
        NUM_REQUESTS, BATCH_SIZE, NUM_TOKENS, TOKENIZER_DIR
    )

    # Flush cache before benchmark
    # endpoint.flush_cache()
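    # If enabled, flush_cache() asks the server to clear its prefix (KV) cache
    # so repeated runs are not served from prefills cached by earlier requests.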
    # Run benchmark
    print(
        f"Starting benchmark: NUM_TOKENS={NUM_TOKENS}, BATCH_SIZE={BATCH_SIZE}, NUM_REQUESTS={NUM_REQUESTS}\n"
    )
    results, total_latency = run_benchmark(
        endpoint, batched_prompts, BATCH_SIZE, GEN_TOKENS
    )

    # Process and display results
    process_results(results, total_latency, NUM_REQUESTS)


if __name__ == "__main__":
    random.seed(0)
    main()