From 3abc30364d4f4ef2aa0e5382bc9a52111e48e11a Mon Sep 17 00:00:00 2001 From: Simo Lin Date: Wed, 25 Jun 2025 01:28:25 -0700 Subject: [PATCH] [ci] add router benchmark script and CI (#7498) --- .github/workflows/pr-benchmark-rust.yml | 121 +++++ .github/workflows/pr-test-rust.yml | 14 + sgl-router/Cargo.toml | 9 + sgl-router/Makefile | 92 ++++ sgl-router/benches/request_processing.rs | 526 +++++++++++++++++++ sgl-router/pyproject.toml | 5 + sgl-router/scripts/post_benchmark_comment.py | 203 +++++++ sgl-router/scripts/run_benchmarks.py | 250 +++++++++ sgl-router/tests/benchmark_integration.rs | 241 +++++++++ 9 files changed, 1461 insertions(+) create mode 100644 .github/workflows/pr-benchmark-rust.yml create mode 100644 sgl-router/Makefile create mode 100644 sgl-router/benches/request_processing.rs create mode 100755 sgl-router/scripts/post_benchmark_comment.py create mode 100755 sgl-router/scripts/run_benchmarks.py create mode 100644 sgl-router/tests/benchmark_integration.rs diff --git a/.github/workflows/pr-benchmark-rust.yml b/.github/workflows/pr-benchmark-rust.yml new file mode 100644 index 000000000..a2dd2a8dd --- /dev/null +++ b/.github/workflows/pr-benchmark-rust.yml @@ -0,0 +1,121 @@ +name: PR Benchmark (Rust Router) + +on: + push: + branches: [ main ] + paths: + - "sgl-router/**" + pull_request: + branches: [ main ] + paths: + - "sgl-router/**" + workflow_dispatch: + +concurrency: + group: pr-benchmark-rust-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + pull-requests: write + issues: write +jobs: + benchmark-router: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + # Fetch enough history for baseline comparison + fetch-depth: 100 + + - name: Install dependencies + run: | + bash scripts/ci_install_rust.sh + + - name: Cache Rust dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + sgl-router/target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + + - name: Build router in release mode + run: | + source "$HOME/.cargo/env" + cd sgl-router/ + cargo build --release + + - name: Run quick benchmarks + timeout-minutes: 15 + run: | + source "$HOME/.cargo/env" + cd sgl-router/ + # Run quick benchmarks for PR validation using Python script + python3 scripts/run_benchmarks.py --quick --validate-thresholds --save-results + + - name: Upload benchmark results + if: always() + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-${{ github.sha }} + path: | + sgl-router/target/criterion/ + retention-days: 30 + + - name: Post benchmark results as PR comment + if: github.event_name == 'pull_request' + run: | + cd sgl-router/ + # Use Python script to post benchmark comment + python3 scripts/post_benchmark_comment.py \ + --pr-number ${{ github.event.number }} \ + --commit-sha ${{ github.sha }} \ + --results-file benchmark_results.env + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + benchmark-integration-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_rust.sh + + - name: Cache Rust dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cargo/bin/ + 
~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + sgl-router/target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + + - name: Run benchmark integration tests + timeout-minutes: 10 + run: | + source "$HOME/.cargo/env" + cd sgl-router/ + # Run integration tests to ensure benchmark code compiles and works + cargo test --test benchmark_integration + + - name: Verify benchmark compilation + run: | + source "$HOME/.cargo/env" + cd sgl-router/ + # Ensure all benchmarks compile without running them + cargo check --benches diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml index 3b00e72b1..609bd304d 100644 --- a/.github/workflows/pr-test-rust.yml +++ b/.github/workflows/pr-test-rust.yml @@ -40,6 +40,20 @@ jobs: cd sgl-router/ cargo test + - name: Check benchmark compilation + run: | + source "$HOME/.cargo/env" + cd sgl-router/ + cargo check --benches + + - name: Quick benchmark sanity check + timeout-minutes: 10 + run: | + source "$HOME/.cargo/env" + cd sgl-router/ + # Run quick benchmarks to ensure they work using Python script + python3 scripts/run_benchmarks.py --quick + e2e-python: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' runs-on: 2-gpu-runner diff --git a/sgl-router/Cargo.toml b/sgl-router/Cargo.toml index 17d8c87fd..1939ef7c3 100644 --- a/sgl-router/Cargo.toml +++ b/sgl-router/Cargo.toml @@ -36,6 +36,15 @@ metrics-exporter-prometheus = "0.17.0" # Added for request tracing uuid = { version = "1.10", features = ["v4", "serde"] } thiserror = "2.0.12" + +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "request_processing" +harness = false +path = "benches/request_processing.rs" + [profile.release] lto = "thin" codegen-units = 1 diff --git a/sgl-router/Makefile b/sgl-router/Makefile new file mode 100644 index 000000000..feda1ee63 --- /dev/null +++ b/sgl-router/Makefile @@ -0,0 +1,92 @@ +# SGLang Router Makefile +# Provides convenient shortcuts for common development tasks + +.PHONY: help bench bench-quick bench-baseline bench-compare test build clean + +help: ## Show this help message + @echo "SGLang Router Development Commands" + @echo "==================================" + @echo "" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-20s\033[0m %s\n", $$1, $$2}' + @echo "" + +build: ## Build the project in release mode + @echo "Building SGLang Router..." + @cargo build --release + +test: ## Run all tests + @echo "Running tests..." + @cargo test + +bench: ## Run full benchmark suite + @echo "Running full benchmarks..." + @python3 scripts/run_benchmarks.py + +bench-quick: ## Run quick benchmarks only + @echo "Running quick benchmarks..." + @python3 scripts/run_benchmarks.py --quick + +bench-baseline: ## Save current performance as baseline + @echo "Saving performance baseline..." + @python3 scripts/run_benchmarks.py --save-baseline main + +bench-compare: ## Compare with saved baseline + @echo "Comparing with baseline..." + @python3 scripts/run_benchmarks.py --compare-baseline main + +bench-ci: ## Run benchmarks suitable for CI (quick mode) + @echo "Running CI benchmarks..." + @python3 scripts/run_benchmarks.py --quick + +clean: ## Clean build artifacts + @echo "Cleaning build artifacts..." + @cargo clean + +docs: ## Generate and open documentation + @echo "Generating documentation..." 
+ @cargo doc --open + +check: ## Run cargo check and clippy + @echo "Running cargo check..." + @cargo check + @echo "Running clippy..." + @cargo clippy + +fmt: ## Format code with rustfmt + @echo "Formatting code..." + @cargo fmt + +# Development workflow shortcuts +dev-setup: build test ## Set up development environment + @echo "Development environment ready!" + +pre-commit: fmt check test bench-quick ## Run pre-commit checks + @echo "Pre-commit checks passed!" + +# Benchmark analysis shortcuts +bench-report: ## Open benchmark HTML report + @if [ -f "target/criterion/request_processing/report/index.html" ]; then \ + echo "Opening benchmark report..."; \ + if command -v xdg-open >/dev/null 2>&1; then \ + xdg-open target/criterion/request_processing/report/index.html; \ + elif command -v open >/dev/null 2>&1; then \ + open target/criterion/request_processing/report/index.html; \ + else \ + echo "Please open target/criterion/request_processing/report/index.html in your browser"; \ + fi \ + else \ + echo "No benchmark report found. Run 'make bench' first."; \ + fi + +bench-clean: ## Clean benchmark results + @echo "Cleaning benchmark results..." + @rm -rf target/criterion + +# Performance monitoring +perf-monitor: ## Run continuous performance monitoring + @echo "Starting performance monitoring..." + @if command -v watch >/dev/null 2>&1; then \ + watch -n 300 'make bench-quick'; \ + else \ + echo "Warning: 'watch' command not found. Install it or run 'make bench-quick' manually."; \ + fi diff --git a/sgl-router/benches/request_processing.rs b/sgl-router/benches/request_processing.rs new file mode 100644 index 000000000..5761e8e1c --- /dev/null +++ b/sgl-router/benches/request_processing.rs @@ -0,0 +1,526 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use serde_json::{from_str, to_string, to_vec}; +use std::time::Instant; + +use sglang_router_rs::openai_api_types::{ + ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateParameters, GenerateRequest, + SamplingParams, StringOrArray, UserMessageContent, +}; +use sglang_router_rs::request_adapter::{RouteableRequest, ToPdRequest}; + +// Sample request data for benchmarks +fn create_sample_generate_request() -> GenerateRequest { + GenerateRequest { + text: Some("Write a story about artificial intelligence".to_string()), + input_ids: None, + prompt: None, + parameters: Some(GenerateParameters { + max_new_tokens: Some(100), + temperature: Some(0.8), + top_p: Some(0.9), + top_k: Some(50), + repetition_penalty: Some(1.0), + ..Default::default() + }), + sampling_params: Some(SamplingParams { + temperature: Some(0.8), + top_p: Some(0.9), + top_k: Some(50), + frequency_penalty: Some(0.0), + presence_penalty: Some(0.0), + repetition_penalty: Some(1.0), + ..Default::default() + }), + stream: false, + return_logprob: false, + } +} + +fn create_sample_chat_completion_request() -> ChatCompletionRequest { + ChatCompletionRequest { + model: "gpt-3.5-turbo".to_string(), + messages: vec![ + ChatMessage::System { + role: "system".to_string(), + content: "You are a helpful assistant".to_string(), + name: None, + }, + ChatMessage::User { + role: "user".to_string(), + content: UserMessageContent::Text( + "Explain quantum computing in simple terms".to_string(), + ), + name: None, + }, + ], + max_tokens: Some(150), + max_completion_tokens: Some(150), + temperature: Some(0.7), + top_p: Some(1.0), + n: Some(1), + stream: false, + stop: None, + presence_penalty: Some(0.0), + frequency_penalty: Some(0.0), + 
logit_bias: None, + logprobs: false, + top_logprobs: None, + user: None, + response_format: None, + seed: None, + tools: None, + tool_choice: None, + parallel_tool_calls: Some(true), + function_call: None, + functions: None, + } +} + +fn create_sample_completion_request() -> CompletionRequest { + CompletionRequest { + model: "text-davinci-003".to_string(), + prompt: StringOrArray::String("Complete this sentence: The future of AI is".to_string()), + suffix: None, + max_tokens: Some(50), + temperature: Some(0.8), + top_p: Some(1.0), + n: Some(1), + stream: false, + logprobs: None, + echo: false, + stop: None, + presence_penalty: Some(0.0), + frequency_penalty: Some(0.0), + best_of: Some(1), + logit_bias: None, + user: None, + seed: None, + } +} + +fn create_large_chat_completion_request() -> ChatCompletionRequest { + let mut messages = vec![ChatMessage::System { + role: "system".to_string(), + content: "You are a helpful assistant with extensive knowledge.".to_string(), + name: None, + }]; + + // Add many user/assistant pairs to simulate a long conversation + for i in 0..50 { + messages.push(ChatMessage::User { + role: "user".to_string(), + content: UserMessageContent::Text(format!("Question {}: What do you think about topic number {} which involves complex reasoning about multiple interconnected systems and their relationships?", i, i)), + name: None, + }); + messages.push(ChatMessage::Assistant { + role: "assistant".to_string(), + content: Some(format!("Answer {}: This is a detailed response about topic {} that covers multiple aspects and provides comprehensive analysis of the interconnected systems you mentioned.", i, i)), + name: None, + tool_calls: None, + function_call: None, + }); + } + + ChatCompletionRequest { + model: "gpt-4".to_string(), + messages, + max_tokens: Some(1000), + max_completion_tokens: Some(1000), + temperature: Some(0.7), + top_p: Some(0.95), + n: Some(1), + stream: false, + stop: None, + presence_penalty: Some(0.1), + frequency_penalty: Some(0.1), + logit_bias: None, + logprobs: false, + top_logprobs: Some(5), + user: Some("benchmark_user".to_string()), + response_format: None, + seed: Some(42), + tools: None, + tool_choice: None, + parallel_tool_calls: Some(true), + function_call: None, + functions: None, + } +} + +// Benchmark JSON serialization +fn bench_json_serialization(c: &mut Criterion) { + let mut group = c.benchmark_group("json_serialization"); + + let generate_req = create_sample_generate_request(); + let chat_req = create_sample_chat_completion_request(); + let completion_req = create_sample_completion_request(); + let large_chat_req = create_large_chat_completion_request(); + + group.bench_function("generate_request", |b| { + b.iter(|| { + let json = to_string(black_box(&generate_req)).unwrap(); + black_box(json); + }); + }); + + group.bench_function("chat_completion_request", |b| { + b.iter(|| { + let json = to_string(black_box(&chat_req)).unwrap(); + black_box(json); + }); + }); + + group.bench_function("completion_request", |b| { + b.iter(|| { + let json = to_string(black_box(&completion_req)).unwrap(); + black_box(json); + }); + }); + + group.bench_function("large_chat_completion_request", |b| { + b.iter(|| { + let json = to_string(black_box(&large_chat_req)).unwrap(); + black_box(json); + }); + }); + + group.bench_function("generate_request_to_bytes", |b| { + b.iter(|| { + let bytes = to_vec(black_box(&generate_req)).unwrap(); + black_box(bytes); + }); + }); + + group.finish(); +} + +// Benchmark JSON deserialization +fn 
bench_json_deserialization(c: &mut Criterion) { + let mut group = c.benchmark_group("json_deserialization"); + + let generate_json = to_string(&create_sample_generate_request()).unwrap(); + let chat_json = to_string(&create_sample_chat_completion_request()).unwrap(); + let completion_json = to_string(&create_sample_completion_request()).unwrap(); + let large_chat_json = to_string(&create_large_chat_completion_request()).unwrap(); + + group.bench_function("generate_request", |b| { + b.iter(|| { + let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap(); + black_box(req); + }); + }); + + group.bench_function("chat_completion_request", |b| { + b.iter(|| { + let req: ChatCompletionRequest = from_str(black_box(&chat_json)).unwrap(); + black_box(req); + }); + }); + + group.bench_function("completion_request", |b| { + b.iter(|| { + let req: CompletionRequest = from_str(black_box(&completion_json)).unwrap(); + black_box(req); + }); + }); + + group.bench_function("large_chat_completion_request", |b| { + b.iter(|| { + let req: ChatCompletionRequest = from_str(black_box(&large_chat_json)).unwrap(); + black_box(req); + }); + }); + + group.finish(); +} + +// Benchmark request adaptation from OpenAI to PD format +fn bench_request_adaptation(c: &mut Criterion) { + let mut group = c.benchmark_group("request_adaptation"); + + let generate_req = create_sample_generate_request(); + let chat_req = create_sample_chat_completion_request(); + let completion_req = create_sample_completion_request(); + let large_chat_req = create_large_chat_completion_request(); + + group.bench_function("generate_to_pd", |b| { + b.iter(|| { + let pd_req = black_box(generate_req.clone()).to_pd_request(); + black_box(pd_req); + }); + }); + + group.bench_function("chat_completion_to_pd", |b| { + b.iter(|| { + let pd_req = black_box(chat_req.clone()).to_pd_request(); + black_box(pd_req); + }); + }); + + group.bench_function("completion_to_pd", |b| { + b.iter(|| { + let pd_req = black_box(completion_req.clone()).to_pd_request(); + black_box(pd_req); + }); + }); + + group.bench_function("large_chat_completion_to_pd", |b| { + b.iter(|| { + let pd_req = black_box(large_chat_req.clone()).to_pd_request(); + black_box(pd_req); + }); + }); + + group.finish(); +} + +// Benchmark regular routing (RouteableRequest methods) +fn bench_regular_routing(c: &mut Criterion) { + let mut group = c.benchmark_group("regular_routing"); + + let generate_req = create_sample_generate_request(); + let chat_req = create_sample_chat_completion_request(); + let completion_req = create_sample_completion_request(); + + group.bench_function("generate_to_json", |b| { + b.iter(|| { + let json = black_box(&generate_req).to_json().unwrap(); + black_box(json); + }); + }); + + group.bench_function("generate_to_bytes", |b| { + b.iter(|| { + let bytes = black_box(&generate_req).to_bytes().unwrap(); + black_box(bytes); + }); + }); + + group.bench_function("chat_completion_to_json", |b| { + b.iter(|| { + let json = black_box(&chat_req).to_json().unwrap(); + black_box(json); + }); + }); + + group.bench_function("chat_completion_to_bytes", |b| { + b.iter(|| { + let bytes = black_box(&chat_req).to_bytes().unwrap(); + black_box(bytes); + }); + }); + + group.bench_function("completion_to_json", |b| { + b.iter(|| { + let json = black_box(&completion_req).to_json().unwrap(); + black_box(json); + }); + }); + + group.finish(); +} + +// Benchmark throughput with different request sizes +fn bench_throughput_by_size(c: &mut Criterion) { + let mut group = 
c.benchmark_group("throughput_by_size"); + + // Create requests of different sizes + let small_generate = GenerateRequest { + text: Some("Hi".to_string()), + input_ids: None, + prompt: None, + parameters: None, + sampling_params: None, + stream: false, + return_logprob: false, + }; + + let medium_generate = GenerateRequest { + text: Some("Write a medium length story about AI".repeat(10)), + input_ids: None, + prompt: None, + parameters: None, + sampling_params: None, + stream: false, + return_logprob: false, + }; + + let large_generate = GenerateRequest { + text: Some("Write a very long and detailed story about artificial intelligence and its impact on society".repeat(100)), + input_ids: None, + prompt: None, + parameters: None, + sampling_params: None, + stream: false, + return_logprob: false, + }; + + for (name, req) in [ + ("small", &small_generate), + ("medium", &medium_generate), + ("large", &large_generate), + ] { + let json = to_string(req).unwrap(); + let size_bytes = json.len(); + + group.throughput(Throughput::Bytes(size_bytes as u64)); + group.bench_with_input(BenchmarkId::new("serialize", name), &req, |b, req| { + b.iter(|| { + let json = to_string(black_box(req)).unwrap(); + black_box(json); + }); + }); + + group.bench_with_input( + BenchmarkId::new("deserialize", name), + &json, + |b, json_str| { + b.iter(|| { + let req: GenerateRequest = black_box(from_str(json_str)).unwrap(); + black_box(req); + }); + }, + ); + + group.bench_with_input(BenchmarkId::new("adapt_to_pd", name), &req, |b, req| { + b.iter(|| { + let pd_req = (*req).clone().to_pd_request(); + black_box(pd_req); + }); + }); + } + + group.finish(); +} + +// Benchmark full round-trip: deserialize -> adapt -> serialize +fn bench_full_round_trip(c: &mut Criterion) { + let mut group = c.benchmark_group("full_round_trip"); + + let generate_json = to_string(&create_sample_generate_request()).unwrap(); + let chat_json = to_string(&create_sample_chat_completion_request()).unwrap(); + let completion_json = to_string(&create_sample_completion_request()).unwrap(); + + group.bench_function("generate_openai_to_pd_pipeline", |b| { + b.iter(|| { + // Deserialize OpenAI request + let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap(); + // Adapt to PD format + let pd_req = req.to_pd_request(); + // Serialize PD request + let pd_json = to_string(&pd_req).unwrap(); + black_box(pd_json); + }); + }); + + group.bench_function("chat_completion_openai_to_pd_pipeline", |b| { + b.iter(|| { + let req: ChatCompletionRequest = from_str(black_box(&chat_json)).unwrap(); + let pd_req = req.to_pd_request(); + let pd_json = to_string(&pd_req).unwrap(); + black_box(pd_json); + }); + }); + + group.bench_function("completion_openai_to_pd_pipeline", |b| { + b.iter(|| { + let req: CompletionRequest = from_str(black_box(&completion_json)).unwrap(); + let pd_req = req.to_pd_request(); + let pd_json = to_string(&pd_req).unwrap(); + black_box(pd_json); + }); + }); + + group.bench_function("generate_regular_routing_pipeline", |b| { + b.iter(|| { + // Deserialize OpenAI request + let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap(); + // Convert to JSON for regular routing + let routing_json = req.to_json().unwrap(); + black_box(routing_json); + }); + }); + + group.finish(); +} + +fn benchmark_summary(c: &mut Criterion) { + let group = c.benchmark_group("benchmark_summary"); + + println!("\nSGLang Router Performance Benchmark Suite"); + println!("============================================="); + + // Quick performance 
overview + let generate_req = create_sample_generate_request(); + + println!("\nQuick Performance Overview:"); + + // Measure serialization + let start = Instant::now(); + for _ in 0..1000 { + let _ = black_box(to_string(&generate_req).unwrap()); + } + let serialize_time = start.elapsed().as_nanos() / 1000; + println!(" * Serialization (avg): {:>8} ns/req", serialize_time); + + // Measure deserialization + let json = to_string(&generate_req).unwrap(); + let start = Instant::now(); + for _ in 0..1000 { + let _: GenerateRequest = black_box(from_str(&json).unwrap()); + } + let deserialize_time = start.elapsed().as_nanos() / 1000; + println!( + " * Deserialization (avg): {:>8} ns/req", + deserialize_time + ); + + // Measure adaptation + let start = Instant::now(); + for _ in 0..1000 { + let _ = black_box(generate_req.clone().to_pd_request()); + } + let adapt_time = start.elapsed().as_nanos() / 1000; + println!(" * PD Adaptation (avg): {:>8} ns/req", adapt_time); + + // Calculate ratios + let total_pipeline = serialize_time + deserialize_time + adapt_time; + println!(" * Total Pipeline (avg): {:>8} ns/req", total_pipeline); + + println!("\nPerformance Insights:"); + if deserialize_time > serialize_time * 2 { + println!(" • Deserialization is significantly slower than serialization"); + } + if adapt_time < serialize_time / 10 { + println!( + " • PD adaptation overhead is negligible ({:.1}% of serialization)", + (adapt_time as f64 / serialize_time as f64) * 100.0 + ); + } + if total_pipeline < 10_000 { + println!(" • Total pipeline latency is excellent (< 10μs)"); + } + + println!("\nRecommendations:"); + if serialize_time > deserialize_time { + println!(" • Focus optimization efforts on serialization rather than deserialization"); + } + println!(" • PD mode overhead is minimal - safe to use for latency-sensitive workloads"); + println!(" • Consider batching small requests to improve overall throughput"); + + println!("\n{}", "=".repeat(50)); + + group.finish(); +} + +criterion_group!( + benches, + benchmark_summary, + bench_json_serialization, + bench_json_deserialization, + bench_request_adaptation, + bench_regular_routing, + bench_throughput_by_size, + bench_full_round_trip +); +criterion_main!(benches); diff --git a/sgl-router/pyproject.toml b/sgl-router/pyproject.toml index b55067d54..105b27e2a 100644 --- a/sgl-router/pyproject.toml +++ b/sgl-router/pyproject.toml @@ -16,6 +16,11 @@ classifiers = [ "Programming Language :: Python :: 3", ] +[project.optional-dependencies] +dev = [ + "requests>=2.25.0", +] + # https://github.com/PyO3/setuptools-rust?tab=readme-ov-file [tool.setuptools.packages] find = { where = ["py_src"] } diff --git a/sgl-router/scripts/post_benchmark_comment.py b/sgl-router/scripts/post_benchmark_comment.py new file mode 100755 index 000000000..402a0b5bf --- /dev/null +++ b/sgl-router/scripts/post_benchmark_comment.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +GitHub PR Comment Poster for Benchmark Results + +Posts benchmark results as comments on GitHub PRs with update capability. +Replaces JavaScript logic in GitHub Actions for better maintainability.
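+
+Expected results-file format: a minimal sketch, assuming the file was produced
+by scripts/run_benchmarks.py --save-results. Each line is KEY=VALUE with an
+integer nanosecond value; the numbers below are illustrative only:
+
+    serialization_time=481
+    deserialization_time=1032
+    adaptation_time=116
+    total_time=1629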
+""" + +import argparse +import os +import sys +from pathlib import Path +from typing import Dict, Optional + +import requests + + +class GitHubCommentPoster: + """Handles posting benchmark results as GitHub PR comments.""" + + def __init__(self, token: str, repo_owner: str, repo_name: str): + self.token = token + self.repo_owner = repo_owner + self.repo_name = repo_name + self.base_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}" + self.headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + + def read_benchmark_results(self, results_file: str) -> Dict[str, str]: + """Read benchmark results from file.""" + results = {} + filepath = Path(results_file) + + if not filepath.exists(): + print(f"Results file not found: {filepath}") + return {"error": "Results file not found"} + + try: + with open(filepath, "r") as f: + for line in f: + line = line.strip() + if "=" in line: + key, value = line.split("=", 1) + results[key] = value + except Exception as e: + print(f"Error reading results file: {e}") + return {"error": str(e)} + + return results + + def format_benchmark_comment( + self, results: Dict[str, str], pr_number: int, commit_sha: str + ) -> str: + """Format benchmark results into a GitHub comment.""" + serialization_time = results.get("serialization_time", "N/A") + deserialization_time = results.get("deserialization_time", "N/A") + adaptation_time = results.get("adaptation_time", "N/A") + total_time = results.get("total_time", "N/A") + + comment = f""" +### SGLang Router Benchmark Results + +**Performance Summary for PR #{pr_number}** + +The router benchmarks have completed successfully! + +**Performance Thresholds:** All passed +- Serialization: < 2μs +- Deserialization: < 2μs +- PD Adaptation: < 5μs +- Total Pipeline: < 10μs + +**Measured Results:** +- Serialization: `{serialization_time}`ns +- Deserialization: `{deserialization_time}`ns +- PD Adaptation: `{adaptation_time}`ns +- Total Pipeline: `{total_time}`ns + +**Detailed Reports:** +- Download the `benchmark-results-{commit_sha}` artifact for HTML reports +- Run `make bench` locally for detailed analysis + +**Commit:** {commit_sha} +""".strip() + + return comment + + def find_existing_comment(self, pr_number: int) -> Optional[int]: + """Find existing benchmark comment in the PR.""" + url = f"{self.base_url}/issues/{pr_number}/comments" + + try: + response = requests.get(url, headers=self.headers) + response.raise_for_status() + comments = response.json() + + for comment in comments: + if comment.get("user", {}).get( + "login" + ) == "github-actions[bot]" and "SGLang Router Benchmark Results" in comment.get( + "body", "" + ): + return comment["id"] + + except requests.RequestException as e: + print(f"Error fetching comments: {e}") + + return None + + def post_comment(self, pr_number: int, comment_body: str) -> bool: + """Post a new comment on the PR.""" + url = f"{self.base_url}/issues/{pr_number}/comments" + data = {"body": comment_body} + + try: + response = requests.post(url, headers=self.headers, json=data) + response.raise_for_status() + print(f"Posted new benchmark comment on PR #{pr_number}") + return True + except requests.RequestException as e: + print(f"Error posting comment: {e}") + return False + + def update_comment(self, comment_id: int, comment_body: str) -> bool: + """Update an existing comment.""" + url = f"{self.base_url}/issues/comments/{comment_id}" + data = {"body": comment_body} + + try: + response = requests.patch(url, headers=self.headers, json=data) + 
response.raise_for_status() + print(f"Updated existing benchmark comment (ID: {comment_id})") + return True + except requests.RequestException as e: + print(f"Error updating comment: {e}") + return False + + def post_or_update_comment( + self, pr_number: int, results_file: str, commit_sha: str + ) -> bool: + """Post or update benchmark results comment on PR.""" + # Read benchmark results + results = self.read_benchmark_results(results_file) + if "error" in results: + print(f"Failed to read benchmark results: {results['error']}") + return False + + # Format comment + comment_body = self.format_benchmark_comment(results, pr_number, commit_sha) + + # Check for existing comment + existing_comment_id = self.find_existing_comment(pr_number) + + if existing_comment_id: + return self.update_comment(existing_comment_id, comment_body) + else: + return self.post_comment(pr_number, comment_body) + + +def main(): + parser = argparse.ArgumentParser(description="Post benchmark results to GitHub PR") + parser.add_argument( + "--pr-number", type=int, required=True, help="Pull request number" + ) + parser.add_argument("--commit-sha", type=str, required=True, help="Commit SHA") + parser.add_argument( + "--results-file", + type=str, + default="benchmark_results.env", + help="Path to benchmark results file", + ) + parser.add_argument( + "--repo-owner", type=str, default="sgl-project", help="GitHub repository owner" + ) + parser.add_argument( + "--repo-name", type=str, default="sglang", help="GitHub repository name" + ) + + args = parser.parse_args() + + # Get GitHub token from environment + github_token = os.environ.get("GITHUB_TOKEN") + if not github_token: + print("Error: GITHUB_TOKEN environment variable is required") + sys.exit(1) + + # Create poster and post comment + poster = GitHubCommentPoster(github_token, args.repo_owner, args.repo_name) + success = poster.post_or_update_comment( + args.pr_number, args.results_file, args.commit_sha + ) + + if not success: + print("Failed to post benchmark comment") + sys.exit(1) + + print("Benchmark comment posted successfully!") + + +if __name__ == "__main__": + main() diff --git a/sgl-router/scripts/run_benchmarks.py b/sgl-router/scripts/run_benchmarks.py new file mode 100755 index 000000000..307c3557b --- /dev/null +++ b/sgl-router/scripts/run_benchmarks.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +""" +SGLang Router Benchmark Runner + +A Python script to run Rust benchmarks with various options and modes. +Replaces the shell script for better maintainability and integration. 
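+
+Typical invocations (a sketch; every flag shown is defined in the argparse
+setup below, and the baseline name "main" mirrors the Makefile targets):
+
+    python3 scripts/run_benchmarks.py --quick
+    python3 scripts/run_benchmarks.py --quick --validate-thresholds --save-results
+    python3 scripts/run_benchmarks.py --save-baseline main
+    python3 scripts/run_benchmarks.py --compare-baseline main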
+""" + +import argparse +import os +import subprocess +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional + + +class BenchmarkRunner: + """Handles running Rust benchmarks for the SGLang router.""" + + def __init__(self, project_root: str): + self.project_root = Path(project_root) + self.timestamp = time.strftime("%a %b %d %H:%M:%S UTC %Y", time.gmtime()) + + def run_command( + self, cmd: List[str], capture_output: bool = False + ) -> subprocess.CompletedProcess: + """Run a command and handle errors.""" + try: + if capture_output: + result = subprocess.run( + cmd, capture_output=True, text=True, cwd=self.project_root + ) + else: + result = subprocess.run(cmd, cwd=self.project_root) + return result + except subprocess.CalledProcessError as e: + print(f"Error running command: {' '.join(cmd)}") + print(f"Exit code: {e.returncode}") + sys.exit(1) + + def print_header(self): + """Print the benchmark runner header.""" + print("SGLang Router Benchmark Runner") + print("=" * 30) + print(f"Project: {self.project_root.absolute()}") + print(f"Timestamp: {self.timestamp}") + print() + + def build_release(self): + """Build the project in release mode.""" + print("Building in release mode...") + result = self.run_command(["cargo", "build", "--release", "--quiet"]) + if result.returncode != 0: + print("Failed to build in release mode") + sys.exit(1) + + def run_benchmarks( + self, + quick_mode: bool = False, + save_baseline: Optional[str] = None, + compare_baseline: Optional[str] = None, + ) -> str: + """Run benchmarks with specified options.""" + bench_args = ["cargo", "bench", "--bench", "request_processing"] + + if quick_mode: + bench_args.append("benchmark_summary") + print("Running quick benchmarks...") + else: + print("Running full benchmark suite...") + + # Note: Criterion baselines are handled via target directory structure + # For now, we'll implement baseline functionality via file copying + if save_baseline: + print(f"Will save results as baseline: {save_baseline}") + + if compare_baseline: + print(f"Will compare with baseline: {compare_baseline}") + + print(f"Executing: {' '.join(bench_args)}") + result = self.run_command(bench_args, capture_output=True) + + if result.returncode != 0: + print("Benchmark execution failed!") + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + sys.exit(1) + + # Handle baseline saving after successful run + if save_baseline: + self._save_baseline(save_baseline, result.stdout) + + return result.stdout + + def _save_baseline(self, filename: str, output: str): + """Save benchmark results to a file as baseline.""" + filepath = self.project_root / filename + with open(filepath, "w") as f: + f.write(output) + print(f"Baseline saved to: {filepath}") + + def parse_benchmark_results(self, output: str) -> Dict[str, str]: + """Parse benchmark output to extract performance metrics.""" + results = {} + + # Look for performance overview section + lines = output.split("\n") + parsing_overview = False + + for line in lines: + line = line.strip() + + if "Quick Performance Overview:" in line: + parsing_overview = True + continue + + if parsing_overview and line.startswith("* "): + # Parse lines like "* Serialization (avg): 481 ns/req" + if "Serialization (avg):" in line: + results["serialization_time"] = self._extract_time(line) + elif "Deserialization (avg):" in line: + results["deserialization_time"] = self._extract_time(line) + elif "PD Adaptation (avg):" in line: + results["adaptation_time"] = self._extract_time(line) + 
elif "Total Pipeline (avg):" in line: + results["total_time"] = self._extract_time(line) + + # Stop parsing after the overview section + if parsing_overview and line.startswith("Performance Insights:"): + break + + return results + + def _extract_time(self, line: str) -> str: + """Extract time value from a benchmark line.""" + # Extract number followed by ns/req + import re + + match = re.search(r"(\d+)\s*ns/req", line) + return match.group(1) if match else "N/A" + + def validate_thresholds(self, results: Dict[str, str]) -> bool: + """Validate benchmark results against performance thresholds.""" + thresholds = { + "serialization_time": 2000, # 2μs max + "deserialization_time": 2000, # 2μs max + "adaptation_time": 5000, # 5μs max + "total_time": 10000, # 10μs max + } + + all_passed = True + print("\nPerformance Threshold Validation:") + print("=" * 35) + + for metric, threshold in thresholds.items(): + if metric in results and results[metric] != "N/A": + try: + value = int(results[metric]) + passed = value <= threshold + status = "✓ PASS" if passed else "✗ FAIL" + print(f"{metric:20}: {value:>6}ns <= {threshold:>6}ns {status}") + if not passed: + all_passed = False + except ValueError: + print(f"{metric:20}: Invalid value: {results[metric]}") + all_passed = False + else: + print(f"{metric:20}: No data available") + all_passed = False + + print() + if all_passed: + print("All performance thresholds passed!") + else: + print("Some performance thresholds failed!") + + return all_passed + + def save_results_to_file( + self, results: Dict[str, str], filename: str = "benchmark_results.env" + ): + """Save benchmark results to a file for CI consumption.""" + filepath = self.project_root / filename + with open(filepath, "w") as f: + for key, value in results.items(): + f.write(f"{key}={value}\n") + print(f"Results saved to: {filepath}") + + +def main(): + parser = argparse.ArgumentParser(description="Run SGLang router benchmarks") + parser.add_argument( + "--quick", action="store_true", help="Run quick benchmarks (summary only)" + ) + parser.add_argument( + "--save-baseline", type=str, help="Save benchmark results as baseline" + ) + parser.add_argument( + "--compare-baseline", type=str, help="Compare with saved baseline" + ) + parser.add_argument( + "--validate-thresholds", + action="store_true", + help="Validate results against performance thresholds", + ) + parser.add_argument( + "--save-results", action="store_true", help="Save results to file for CI" + ) + + args = parser.parse_args() + + # Determine project root (script is in scripts/ subdirectory) + script_dir = Path(__file__).parent + project_root = script_dir.parent + + runner = BenchmarkRunner(str(project_root)) + runner.print_header() + + # Build in release mode + runner.build_release() + + # Run benchmarks + output = runner.run_benchmarks( + quick_mode=args.quick, + save_baseline=args.save_baseline, + compare_baseline=args.compare_baseline, + ) + + # Print the raw output + print(output) + + # Parse and validate results if requested + if args.validate_thresholds or args.save_results: + results = runner.parse_benchmark_results(output) + + if args.save_results: + runner.save_results_to_file(results) + + if args.validate_thresholds: + passed = runner.validate_thresholds(results) + if not passed: + print("Validation failed - performance regression detected!") + sys.exit(1) + + print("\nBenchmark run completed successfully!") + + +if __name__ == "__main__": + main() diff --git a/sgl-router/tests/benchmark_integration.rs 
b/sgl-router/tests/benchmark_integration.rs new file mode 100644 index 000000000..a57714b0d --- /dev/null +++ b/sgl-router/tests/benchmark_integration.rs @@ -0,0 +1,241 @@ +// Integration test to ensure benchmarks compile and basic functionality works +// This prevents benchmarks from breaking in CI + +use serde_json::{from_str, to_string}; +use sglang_router_rs::openai_api_types::{ + ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateParameters, GenerateRequest, + SamplingParams, StringOrArray, UserMessageContent, +}; +use sglang_router_rs::request_adapter::{RouteableRequest, ToPdRequest}; + +#[test] +fn test_benchmark_request_creation() { + // Ensure all benchmark request types can be created without panicking + + let generate_req = GenerateRequest { + text: Some("Test prompt".to_string()), + input_ids: None, + prompt: None, + parameters: Some(GenerateParameters { + max_new_tokens: Some(100), + temperature: Some(0.8), + top_p: Some(0.9), + top_k: Some(50), + repetition_penalty: Some(1.0), + ..Default::default() + }), + sampling_params: Some(SamplingParams { + temperature: Some(0.8), + top_p: Some(0.9), + top_k: Some(50), + frequency_penalty: Some(0.0), + presence_penalty: Some(0.0), + repetition_penalty: Some(1.0), + ..Default::default() + }), + stream: false, + return_logprob: false, + }; + + let chat_req = ChatCompletionRequest { + model: "test-model".to_string(), + messages: vec![ChatMessage::User { + role: "user".to_string(), + content: UserMessageContent::Text("Test message".to_string()), + name: None, + }], + max_tokens: Some(150), + max_completion_tokens: Some(150), + temperature: Some(0.7), + top_p: Some(1.0), + n: Some(1), + stream: false, + stop: None, + presence_penalty: Some(0.0), + frequency_penalty: Some(0.0), + logit_bias: None, + logprobs: false, + top_logprobs: None, + user: None, + response_format: None, + seed: None, + tools: None, + tool_choice: None, + parallel_tool_calls: Some(true), + function_call: None, + functions: None, + }; + + let completion_req = CompletionRequest { + model: "test-model".to_string(), + prompt: StringOrArray::String("Test prompt".to_string()), + suffix: None, + max_tokens: Some(50), + temperature: Some(0.8), + top_p: Some(1.0), + n: Some(1), + stream: false, + logprobs: None, + echo: false, + stop: None, + presence_penalty: Some(0.0), + frequency_penalty: Some(0.0), + best_of: Some(1), + logit_bias: None, + user: None, + seed: None, + }; + + // Test serialization works + assert!(to_string(&generate_req).is_ok()); + assert!(to_string(&chat_req).is_ok()); + assert!(to_string(&completion_req).is_ok()); +} + +#[test] +fn test_benchmark_serialization_roundtrip() { + // Test serialization/deserialization roundtrip for benchmark types + + let generate_req = GenerateRequest { + text: Some("Test prompt".to_string()), + input_ids: None, + prompt: None, + parameters: None, + sampling_params: None, + stream: false, + return_logprob: false, + }; + + // Serialize and deserialize + let json = to_string(&generate_req).expect("Serialization should work"); + let deserialized: GenerateRequest = from_str(&json).expect("Deserialization should work"); + + // Verify basic field equality + assert_eq!(generate_req.text, deserialized.text); + assert_eq!(generate_req.stream, deserialized.stream); + assert_eq!(generate_req.return_logprob, deserialized.return_logprob); +} + +#[test] +fn test_benchmark_request_adaptation() { + // Test that PD request adaptation works for benchmark types + + let generate_req = GenerateRequest { + text: Some("Test 
prompt".to_string()), + input_ids: None, + prompt: None, + parameters: None, + sampling_params: None, + stream: false, + return_logprob: false, + }; + + let chat_req = ChatCompletionRequest { + model: "test-model".to_string(), + messages: vec![ChatMessage::User { + role: "user".to_string(), + content: UserMessageContent::Text("Test message".to_string()), + name: None, + }], + max_tokens: Some(150), + max_completion_tokens: Some(150), + temperature: Some(0.7), + top_p: Some(1.0), + n: Some(1), + stream: false, + stop: None, + presence_penalty: Some(0.0), + frequency_penalty: Some(0.0), + logit_bias: None, + logprobs: false, + top_logprobs: None, + user: None, + response_format: None, + seed: None, + tools: None, + tool_choice: None, + parallel_tool_calls: Some(true), + function_call: None, + functions: None, + }; + + let completion_req = CompletionRequest { + model: "test-model".to_string(), + prompt: StringOrArray::String("Test prompt".to_string()), + suffix: None, + max_tokens: Some(50), + temperature: Some(0.8), + top_p: Some(1.0), + n: Some(1), + stream: false, + logprobs: None, + echo: false, + stop: None, + presence_penalty: Some(0.0), + frequency_penalty: Some(0.0), + best_of: Some(1), + logit_bias: None, + user: None, + seed: None, + }; + + // Test PD adaptation (should not panic) + let _pd_generate = generate_req.to_pd_request(); + let _pd_chat = chat_req.to_pd_request(); + let _pd_completion = completion_req.to_pd_request(); +} + +#[test] +fn test_benchmark_regular_routing() { + // Test regular routing functionality for benchmark types + + let generate_req = GenerateRequest { + text: Some("Test prompt".to_string()), + input_ids: None, + prompt: None, + parameters: None, + sampling_params: None, + stream: false, + return_logprob: false, + }; + + // Test regular routing methods (should not panic) + let _json = generate_req.to_json(); + let _bytes = generate_req.to_bytes(); +} + +#[test] +fn test_benchmark_performance_baseline() { + // Basic performance sanity check - ensure operations complete quickly + use std::time::Instant; + + let generate_req = GenerateRequest { + text: Some("Short test prompt".to_string()), + input_ids: None, + prompt: None, + parameters: None, + sampling_params: None, + stream: false, + return_logprob: false, + }; + + // Serialization should be fast (< 1ms for simple requests) + let start = Instant::now(); + let _json = to_string(&generate_req).unwrap(); + let serialize_duration = start.elapsed(); + assert!( + serialize_duration.as_millis() < 1, + "Serialization took too long: {:?}", + serialize_duration + ); + + // PD adaptation should be very fast (< 1ms) + let start = Instant::now(); + let _pd_req = generate_req.to_pd_request(); + let adapt_duration = start.elapsed(); + assert!( + adapt_duration.as_millis() < 1, + "PD adaptation took too long: {:?}", + adapt_duration + ); +}