[ci] add router benchmark script and CI (#7498)
This commit is contained in:
121
.github/workflows/pr-benchmark-rust.yml
vendored
Normal file
121
.github/workflows/pr-benchmark-rust.yml
vendored
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
name: PR Benchmark (Rust Router)
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [ main ]
|
||||||
|
paths:
|
||||||
|
- "sgl-router/**"
|
||||||
|
pull_request:
|
||||||
|
branches: [ main ]
|
||||||
|
paths:
|
||||||
|
- "sgl-router/**"
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: pr-benchmark-rust-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
pull-requests: write
|
||||||
|
issues: write
|
||||||
|
jobs:
|
||||||
|
benchmark-router:
|
||||||
|
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
# Fetch enough history for baseline comparison
|
||||||
|
fetch-depth: 100
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
bash scripts/ci_install_rust.sh
|
||||||
|
|
||||||
|
- name: Cache Rust dependencies
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/bin/
|
||||||
|
~/.cargo/registry/index/
|
||||||
|
~/.cargo/registry/cache/
|
||||||
|
~/.cargo/git/db/
|
||||||
|
sgl-router/target/
|
||||||
|
key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
|
||||||
|
restore-keys: |
|
||||||
|
${{ runner.os }}-cargo-
|
||||||
|
|
||||||
|
- name: Build router in release mode
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
cd sgl-router/
|
||||||
|
cargo build --release
|
||||||
|
|
||||||
|
- name: Run quick benchmarks
|
||||||
|
timeout-minutes: 15
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
cd sgl-router/
|
||||||
|
# Run quick benchmarks for PR validation using Python script
|
||||||
|
python3 scripts/run_benchmarks.py --quick --validate-thresholds --save-results
|
||||||
|
|
||||||
|
- name: Upload benchmark results
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: benchmark-results-${{ github.sha }}
|
||||||
|
path: |
|
||||||
|
sgl-router/target/criterion/
|
||||||
|
retention-days: 30
|
||||||
|
|
||||||
|
- name: Post benchmark results as PR comment
|
||||||
|
if: github.event_name == 'pull_request'
|
||||||
|
run: |
|
||||||
|
cd sgl-router/
|
||||||
|
# Use Python script to post benchmark comment
|
||||||
|
python3 scripts/post_benchmark_comment.py \
|
||||||
|
--pr-number ${{ github.event.number }} \
|
||||||
|
--commit-sha ${{ github.sha }} \
|
||||||
|
--results-file benchmark_results.env
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
benchmark-integration-test:
|
||||||
|
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
bash scripts/ci_install_rust.sh
|
||||||
|
|
||||||
|
- name: Cache Rust dependencies
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/bin/
|
||||||
|
~/.cargo/registry/index/
|
||||||
|
~/.cargo/registry/cache/
|
||||||
|
~/.cargo/git/db/
|
||||||
|
sgl-router/target/
|
||||||
|
key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
|
||||||
|
restore-keys: |
|
||||||
|
${{ runner.os }}-cargo-
|
||||||
|
|
||||||
|
- name: Run benchmark integration tests
|
||||||
|
timeout-minutes: 10
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
cd sgl-router/
|
||||||
|
# Run integration tests to ensure benchmark code compiles and works
|
||||||
|
cargo test --test benchmark_integration
|
||||||
|
|
||||||
|
- name: Verify benchmark compilation
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
cd sgl-router/
|
||||||
|
# Ensure all benchmarks compile without running them
|
||||||
|
cargo check --benches
|
||||||
14
.github/workflows/pr-test-rust.yml
vendored
14
.github/workflows/pr-test-rust.yml
vendored
@@ -40,6 +40,20 @@ jobs:
|
|||||||
cd sgl-router/
|
cd sgl-router/
|
||||||
cargo test
|
cargo test
|
||||||
|
|
||||||
|
- name: Check benchmark compilation
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
cd sgl-router/
|
||||||
|
cargo check --benches
|
||||||
|
|
||||||
|
- name: Quick benchmark sanity check
|
||||||
|
timeout-minutes: 10
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
cd sgl-router/
|
||||||
|
# Run quick benchmarks to ensure they work using Python script
|
||||||
|
python3 scripts/run_benchmarks.py --quick
|
||||||
|
|
||||||
e2e-python:
|
e2e-python:
|
||||||
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||||
runs-on: 2-gpu-runner
|
runs-on: 2-gpu-runner
|
||||||
|
|||||||
@@ -36,6 +36,15 @@ metrics-exporter-prometheus = "0.17.0"
|
|||||||
# Added for request tracing
|
# Added for request tracing
|
||||||
uuid = { version = "1.10", features = ["v4", "serde"] }
|
uuid = { version = "1.10", features = ["v4", "serde"] }
|
||||||
thiserror = "2.0.12"
|
thiserror = "2.0.12"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
criterion = { version = "0.5", features = ["html_reports"] }
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "request_processing"
|
||||||
|
harness = false
|
||||||
|
path = "benches/request_processing.rs"
|
||||||
|
|
||||||
[profile.release]
|
[profile.release]
|
||||||
lto = "thin"
|
lto = "thin"
|
||||||
codegen-units = 1
|
codegen-units = 1
|
||||||
|
|||||||
92
sgl-router/Makefile
Normal file
92
sgl-router/Makefile
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
# SGLang Router Makefile
|
||||||
|
# Provides convenient shortcuts for common development tasks
|
||||||
|
|
||||||
|
.PHONY: help bench bench-quick bench-baseline bench-compare test build clean
|
||||||
|
|
||||||
|
help: ## Show this help message
|
||||||
|
@echo "SGLang Router Development Commands"
|
||||||
|
@echo "=================================="
|
||||||
|
@echo ""
|
||||||
|
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-20s\033[0m %s\n", $$1, $$2}'
|
||||||
|
@echo ""
|
||||||
|
|
||||||
|
build: ## Build the project in release mode
|
||||||
|
@echo "Building SGLang Router..."
|
||||||
|
@cargo build --release
|
||||||
|
|
||||||
|
test: ## Run all tests
|
||||||
|
@echo "Running tests..."
|
||||||
|
@cargo test
|
||||||
|
|
||||||
|
bench: ## Run full benchmark suite
|
||||||
|
@echo "Running full benchmarks..."
|
||||||
|
@python3 scripts/run_benchmarks.py
|
||||||
|
|
||||||
|
bench-quick: ## Run quick benchmarks only
|
||||||
|
@echo "Running quick benchmarks..."
|
||||||
|
@python3 scripts/run_benchmarks.py --quick
|
||||||
|
|
||||||
|
bench-baseline: ## Save current performance as baseline
|
||||||
|
@echo "Saving performance baseline..."
|
||||||
|
@python3 scripts/run_benchmarks.py --save-baseline main
|
||||||
|
|
||||||
|
bench-compare: ## Compare with saved baseline
|
||||||
|
@echo "Comparing with baseline..."
|
||||||
|
@python3 scripts/run_benchmarks.py --compare-baseline main
|
||||||
|
|
||||||
|
bench-ci: ## Run benchmarks suitable for CI (quick mode)
|
||||||
|
@echo "Running CI benchmarks..."
|
||||||
|
@python3 scripts/run_benchmarks.py --quick
|
||||||
|
|
||||||
|
clean: ## Clean build artifacts
|
||||||
|
@echo "Cleaning build artifacts..."
|
||||||
|
@cargo clean
|
||||||
|
|
||||||
|
docs: ## Generate and open documentation
|
||||||
|
@echo "Generating documentation..."
|
||||||
|
@cargo doc --open
|
||||||
|
|
||||||
|
check: ## Run cargo check and clippy
|
||||||
|
@echo "Running cargo check..."
|
||||||
|
@cargo check
|
||||||
|
@echo "Running clippy..."
|
||||||
|
@cargo clippy
|
||||||
|
|
||||||
|
fmt: ## Format code with rustfmt
|
||||||
|
@echo "Formatting code..."
|
||||||
|
@cargo fmt
|
||||||
|
|
||||||
|
# Development workflow shortcuts
|
||||||
|
dev-setup: build test ## Set up development environment
|
||||||
|
@echo "Development environment ready!"
|
||||||
|
|
||||||
|
pre-commit: fmt check test bench-quick ## Run pre-commit checks
|
||||||
|
@echo "Pre-commit checks passed!"
|
||||||
|
|
||||||
|
# Benchmark analysis shortcuts
|
||||||
|
bench-report: ## Open benchmark HTML report
|
||||||
|
@if [ -f "target/criterion/request_processing/report/index.html" ]; then \
|
||||||
|
echo "Opening benchmark report..."; \
|
||||||
|
if command -v xdg-open >/dev/null 2>&1; then \
|
||||||
|
xdg-open target/criterion/request_processing/report/index.html; \
|
||||||
|
elif command -v open >/dev/null 2>&1; then \
|
||||||
|
open target/criterion/request_processing/report/index.html; \
|
||||||
|
else \
|
||||||
|
echo "Please open target/criterion/request_processing/report/index.html in your browser"; \
|
||||||
|
fi \
|
||||||
|
else \
|
||||||
|
echo "No benchmark report found. Run 'make bench' first."; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
bench-clean: ## Clean benchmark results
|
||||||
|
@echo "Cleaning benchmark results..."
|
||||||
|
@rm -rf target/criterion
|
||||||
|
|
||||||
|
# Performance monitoring
|
||||||
|
perf-monitor: ## Run continuous performance monitoring
|
||||||
|
@echo "Starting performance monitoring..."
|
||||||
|
@if command -v watch >/dev/null 2>&1; then \
|
||||||
|
watch -n 300 'make bench-quick'; \
|
||||||
|
else \
|
||||||
|
echo "Warning: 'watch' command not found. Install it or run 'make bench-quick' manually."; \
|
||||||
|
fi
|
||||||
526
sgl-router/benches/request_processing.rs
Normal file
526
sgl-router/benches/request_processing.rs
Normal file
@@ -0,0 +1,526 @@
|
|||||||
|
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||||
|
use serde_json::{from_str, to_string, to_vec};
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use sglang_router_rs::openai_api_types::{
|
||||||
|
ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateParameters, GenerateRequest,
|
||||||
|
SamplingParams, StringOrArray, UserMessageContent,
|
||||||
|
};
|
||||||
|
use sglang_router_rs::request_adapter::{RouteableRequest, ToPdRequest};
|
||||||
|
|
||||||
|
// Sample request data for benchmarks
|
||||||
|
fn create_sample_generate_request() -> GenerateRequest {
|
||||||
|
GenerateRequest {
|
||||||
|
text: Some("Write a story about artificial intelligence".to_string()),
|
||||||
|
input_ids: None,
|
||||||
|
prompt: None,
|
||||||
|
parameters: Some(GenerateParameters {
|
||||||
|
max_new_tokens: Some(100),
|
||||||
|
temperature: Some(0.8),
|
||||||
|
top_p: Some(0.9),
|
||||||
|
top_k: Some(50),
|
||||||
|
repetition_penalty: Some(1.0),
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
sampling_params: Some(SamplingParams {
|
||||||
|
temperature: Some(0.8),
|
||||||
|
top_p: Some(0.9),
|
||||||
|
top_k: Some(50),
|
||||||
|
frequency_penalty: Some(0.0),
|
||||||
|
presence_penalty: Some(0.0),
|
||||||
|
repetition_penalty: Some(1.0),
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
stream: false,
|
||||||
|
return_logprob: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_sample_chat_completion_request() -> ChatCompletionRequest {
|
||||||
|
ChatCompletionRequest {
|
||||||
|
model: "gpt-3.5-turbo".to_string(),
|
||||||
|
messages: vec![
|
||||||
|
ChatMessage::System {
|
||||||
|
role: "system".to_string(),
|
||||||
|
content: "You are a helpful assistant".to_string(),
|
||||||
|
name: None,
|
||||||
|
},
|
||||||
|
ChatMessage::User {
|
||||||
|
role: "user".to_string(),
|
||||||
|
content: UserMessageContent::Text(
|
||||||
|
"Explain quantum computing in simple terms".to_string(),
|
||||||
|
),
|
||||||
|
name: None,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
max_tokens: Some(150),
|
||||||
|
max_completion_tokens: Some(150),
|
||||||
|
temperature: Some(0.7),
|
||||||
|
top_p: Some(1.0),
|
||||||
|
n: Some(1),
|
||||||
|
stream: false,
|
||||||
|
stop: None,
|
||||||
|
presence_penalty: Some(0.0),
|
||||||
|
frequency_penalty: Some(0.0),
|
||||||
|
logit_bias: None,
|
||||||
|
logprobs: false,
|
||||||
|
top_logprobs: None,
|
||||||
|
user: None,
|
||||||
|
response_format: None,
|
||||||
|
seed: None,
|
||||||
|
tools: None,
|
||||||
|
tool_choice: None,
|
||||||
|
parallel_tool_calls: Some(true),
|
||||||
|
function_call: None,
|
||||||
|
functions: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_sample_completion_request() -> CompletionRequest {
|
||||||
|
CompletionRequest {
|
||||||
|
model: "text-davinci-003".to_string(),
|
||||||
|
prompt: StringOrArray::String("Complete this sentence: The future of AI is".to_string()),
|
||||||
|
suffix: None,
|
||||||
|
max_tokens: Some(50),
|
||||||
|
temperature: Some(0.8),
|
||||||
|
top_p: Some(1.0),
|
||||||
|
n: Some(1),
|
||||||
|
stream: false,
|
||||||
|
logprobs: None,
|
||||||
|
echo: false,
|
||||||
|
stop: None,
|
||||||
|
presence_penalty: Some(0.0),
|
||||||
|
frequency_penalty: Some(0.0),
|
||||||
|
best_of: Some(1),
|
||||||
|
logit_bias: None,
|
||||||
|
user: None,
|
||||||
|
seed: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_large_chat_completion_request() -> ChatCompletionRequest {
|
||||||
|
let mut messages = vec![ChatMessage::System {
|
||||||
|
role: "system".to_string(),
|
||||||
|
content: "You are a helpful assistant with extensive knowledge.".to_string(),
|
||||||
|
name: None,
|
||||||
|
}];
|
||||||
|
|
||||||
|
// Add many user/assistant pairs to simulate a long conversation
|
||||||
|
for i in 0..50 {
|
||||||
|
messages.push(ChatMessage::User {
|
||||||
|
role: "user".to_string(),
|
||||||
|
content: UserMessageContent::Text(format!("Question {}: What do you think about topic number {} which involves complex reasoning about multiple interconnected systems and their relationships?", i, i)),
|
||||||
|
name: None,
|
||||||
|
});
|
||||||
|
messages.push(ChatMessage::Assistant {
|
||||||
|
role: "assistant".to_string(),
|
||||||
|
content: Some(format!("Answer {}: This is a detailed response about topic {} that covers multiple aspects and provides comprehensive analysis of the interconnected systems you mentioned.", i, i)),
|
||||||
|
name: None,
|
||||||
|
tool_calls: None,
|
||||||
|
function_call: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
ChatCompletionRequest {
|
||||||
|
model: "gpt-4".to_string(),
|
||||||
|
messages,
|
||||||
|
max_tokens: Some(1000),
|
||||||
|
max_completion_tokens: Some(1000),
|
||||||
|
temperature: Some(0.7),
|
||||||
|
top_p: Some(0.95),
|
||||||
|
n: Some(1),
|
||||||
|
stream: false,
|
||||||
|
stop: None,
|
||||||
|
presence_penalty: Some(0.1),
|
||||||
|
frequency_penalty: Some(0.1),
|
||||||
|
logit_bias: None,
|
||||||
|
logprobs: false,
|
||||||
|
top_logprobs: Some(5),
|
||||||
|
user: Some("benchmark_user".to_string()),
|
||||||
|
response_format: None,
|
||||||
|
seed: Some(42),
|
||||||
|
tools: None,
|
||||||
|
tool_choice: None,
|
||||||
|
parallel_tool_calls: Some(true),
|
||||||
|
function_call: None,
|
||||||
|
functions: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark JSON serialization
|
||||||
|
fn bench_json_serialization(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("json_serialization");
|
||||||
|
|
||||||
|
let generate_req = create_sample_generate_request();
|
||||||
|
let chat_req = create_sample_chat_completion_request();
|
||||||
|
let completion_req = create_sample_completion_request();
|
||||||
|
let large_chat_req = create_large_chat_completion_request();
|
||||||
|
|
||||||
|
group.bench_function("generate_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = to_string(black_box(&generate_req)).unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("chat_completion_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = to_string(black_box(&chat_req)).unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("completion_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = to_string(black_box(&completion_req)).unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("large_chat_completion_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = to_string(black_box(&large_chat_req)).unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("generate_request_to_bytes", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let bytes = to_vec(black_box(&generate_req)).unwrap();
|
||||||
|
black_box(bytes);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark JSON deserialization
|
||||||
|
fn bench_json_deserialization(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("json_deserialization");
|
||||||
|
|
||||||
|
let generate_json = to_string(&create_sample_generate_request()).unwrap();
|
||||||
|
let chat_json = to_string(&create_sample_chat_completion_request()).unwrap();
|
||||||
|
let completion_json = to_string(&create_sample_completion_request()).unwrap();
|
||||||
|
let large_chat_json = to_string(&create_large_chat_completion_request()).unwrap();
|
||||||
|
|
||||||
|
group.bench_function("generate_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
|
||||||
|
black_box(req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("chat_completion_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let req: ChatCompletionRequest = from_str(black_box(&chat_json)).unwrap();
|
||||||
|
black_box(req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("completion_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let req: CompletionRequest = from_str(black_box(&completion_json)).unwrap();
|
||||||
|
black_box(req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("large_chat_completion_request", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let req: ChatCompletionRequest = from_str(black_box(&large_chat_json)).unwrap();
|
||||||
|
black_box(req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark request adaptation from OpenAI to PD format
|
||||||
|
fn bench_request_adaptation(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("request_adaptation");
|
||||||
|
|
||||||
|
let generate_req = create_sample_generate_request();
|
||||||
|
let chat_req = create_sample_chat_completion_request();
|
||||||
|
let completion_req = create_sample_completion_request();
|
||||||
|
let large_chat_req = create_large_chat_completion_request();
|
||||||
|
|
||||||
|
group.bench_function("generate_to_pd", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let pd_req = black_box(generate_req.clone()).to_pd_request();
|
||||||
|
black_box(pd_req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("chat_completion_to_pd", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let pd_req = black_box(chat_req.clone()).to_pd_request();
|
||||||
|
black_box(pd_req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("completion_to_pd", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let pd_req = black_box(completion_req.clone()).to_pd_request();
|
||||||
|
black_box(pd_req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("large_chat_completion_to_pd", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let pd_req = black_box(large_chat_req.clone()).to_pd_request();
|
||||||
|
black_box(pd_req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark regular routing (RouteableRequest methods)
|
||||||
|
fn bench_regular_routing(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("regular_routing");
|
||||||
|
|
||||||
|
let generate_req = create_sample_generate_request();
|
||||||
|
let chat_req = create_sample_chat_completion_request();
|
||||||
|
let completion_req = create_sample_completion_request();
|
||||||
|
|
||||||
|
group.bench_function("generate_to_json", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = black_box(&generate_req).to_json().unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("generate_to_bytes", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let bytes = black_box(&generate_req).to_bytes().unwrap();
|
||||||
|
black_box(bytes);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("chat_completion_to_json", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = black_box(&chat_req).to_json().unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("chat_completion_to_bytes", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let bytes = black_box(&chat_req).to_bytes().unwrap();
|
||||||
|
black_box(bytes);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("completion_to_json", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = black_box(&completion_req).to_json().unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark throughput with different request sizes
|
||||||
|
fn bench_throughput_by_size(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("throughput_by_size");
|
||||||
|
|
||||||
|
// Create requests of different sizes
|
||||||
|
let small_generate = GenerateRequest {
|
||||||
|
text: Some("Hi".to_string()),
|
||||||
|
input_ids: None,
|
||||||
|
prompt: None,
|
||||||
|
parameters: None,
|
||||||
|
sampling_params: None,
|
||||||
|
stream: false,
|
||||||
|
return_logprob: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
let medium_generate = GenerateRequest {
|
||||||
|
text: Some("Write a medium length story about AI".repeat(10)),
|
||||||
|
input_ids: None,
|
||||||
|
prompt: None,
|
||||||
|
parameters: None,
|
||||||
|
sampling_params: None,
|
||||||
|
stream: false,
|
||||||
|
return_logprob: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
let large_generate = GenerateRequest {
|
||||||
|
text: Some("Write a very long and detailed story about artificial intelligence and its impact on society".repeat(100)),
|
||||||
|
input_ids: None,
|
||||||
|
prompt: None,
|
||||||
|
parameters: None,
|
||||||
|
sampling_params: None,
|
||||||
|
stream: false,
|
||||||
|
return_logprob: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
for (name, req) in [
|
||||||
|
("small", &small_generate),
|
||||||
|
("medium", &medium_generate),
|
||||||
|
("large", &large_generate),
|
||||||
|
] {
|
||||||
|
let json = to_string(req).unwrap();
|
||||||
|
let size_bytes = json.len();
|
||||||
|
|
||||||
|
group.throughput(Throughput::Bytes(size_bytes as u64));
|
||||||
|
group.bench_with_input(BenchmarkId::new("serialize", name), &req, |b, req| {
|
||||||
|
b.iter(|| {
|
||||||
|
let json = to_string(black_box(req)).unwrap();
|
||||||
|
black_box(json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::new("deserialize", name),
|
||||||
|
&json,
|
||||||
|
|b, json_str| {
|
||||||
|
b.iter(|| {
|
||||||
|
let req: GenerateRequest = black_box(from_str(json_str)).unwrap();
|
||||||
|
black_box(req);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
group.bench_with_input(BenchmarkId::new("adapt_to_pd", name), &req, |b, req| {
|
||||||
|
b.iter(|| {
|
||||||
|
let pd_req = (*req).clone().to_pd_request();
|
||||||
|
black_box(pd_req);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark full round-trip: deserialize -> adapt -> serialize
|
||||||
|
fn bench_full_round_trip(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("full_round_trip");
|
||||||
|
|
||||||
|
let generate_json = to_string(&create_sample_generate_request()).unwrap();
|
||||||
|
let chat_json = to_string(&create_sample_chat_completion_request()).unwrap();
|
||||||
|
let completion_json = to_string(&create_sample_completion_request()).unwrap();
|
||||||
|
|
||||||
|
group.bench_function("generate_openai_to_pd_pipeline", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
// Deserialize OpenAI request
|
||||||
|
let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
|
||||||
|
// Adapt to PD format
|
||||||
|
let pd_req = req.to_pd_request();
|
||||||
|
// Serialize PD request
|
||||||
|
let pd_json = to_string(&pd_req).unwrap();
|
||||||
|
black_box(pd_json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("chat_completion_openai_to_pd_pipeline", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let req: ChatCompletionRequest = from_str(black_box(&chat_json)).unwrap();
|
||||||
|
let pd_req = req.to_pd_request();
|
||||||
|
let pd_json = to_string(&pd_req).unwrap();
|
||||||
|
black_box(pd_json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("completion_openai_to_pd_pipeline", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let req: CompletionRequest = from_str(black_box(&completion_json)).unwrap();
|
||||||
|
let pd_req = req.to_pd_request();
|
||||||
|
let pd_json = to_string(&pd_req).unwrap();
|
||||||
|
black_box(pd_json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("generate_regular_routing_pipeline", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
// Deserialize OpenAI request
|
||||||
|
let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
|
||||||
|
// Convert to JSON for regular routing
|
||||||
|
let routing_json = req.to_json().unwrap();
|
||||||
|
black_box(routing_json);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn benchmark_summary(c: &mut Criterion) {
|
||||||
|
let group = c.benchmark_group("benchmark_summary");
|
||||||
|
|
||||||
|
println!("\nSGLang Router Performance Benchmark Suite");
|
||||||
|
println!("=============================================");
|
||||||
|
|
||||||
|
// Quick performance overview
|
||||||
|
let generate_req = create_sample_generate_request();
|
||||||
|
|
||||||
|
println!("\nQuick Performance Overview:");
|
||||||
|
|
||||||
|
// Measure serialization
|
||||||
|
let start = Instant::now();
|
||||||
|
for _ in 0..1000 {
|
||||||
|
let _ = black_box(to_string(&generate_req).unwrap());
|
||||||
|
}
|
||||||
|
let serialize_time = start.elapsed().as_nanos() / 1000;
|
||||||
|
println!(" * Serialization (avg): {:>8} ns/req", serialize_time);
|
||||||
|
|
||||||
|
// Measure deserialization
|
||||||
|
let json = to_string(&generate_req).unwrap();
|
||||||
|
let start = Instant::now();
|
||||||
|
for _ in 0..1000 {
|
||||||
|
let _: GenerateRequest = black_box(from_str(&json).unwrap());
|
||||||
|
}
|
||||||
|
let deserialize_time = start.elapsed().as_nanos() / 1000;
|
||||||
|
println!(
|
||||||
|
" * Deserialization (avg): {:>8} ns/req",
|
||||||
|
deserialize_time
|
||||||
|
);
|
||||||
|
|
||||||
|
// Measure adaptation
|
||||||
|
let start = Instant::now();
|
||||||
|
for _ in 0..1000 {
|
||||||
|
let _ = black_box(generate_req.clone().to_pd_request());
|
||||||
|
}
|
||||||
|
let adapt_time = start.elapsed().as_nanos() / 1000;
|
||||||
|
println!(" * PD Adaptation (avg): {:>8} ns/req", adapt_time);
|
||||||
|
|
||||||
|
// Calculate ratios
|
||||||
|
let total_pipeline = serialize_time + deserialize_time + adapt_time;
|
||||||
|
println!(" * Total Pipeline (avg): {:>8} ns/req", total_pipeline);
|
||||||
|
|
||||||
|
println!("\nPerformance Insights:");
|
||||||
|
if deserialize_time > serialize_time * 2 {
|
||||||
|
println!(" • Deserialization is significantly faster than serialization");
|
||||||
|
}
|
||||||
|
if adapt_time < serialize_time / 10 {
|
||||||
|
println!(
|
||||||
|
" • PD adaptation overhead is negligible ({:.1}% of serialization)",
|
||||||
|
(adapt_time as f64 / serialize_time as f64) * 100.0
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if total_pipeline < 10_000 {
|
||||||
|
println!(" • Total pipeline latency is excellent (< 10μs)");
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("\nRecommendations:");
|
||||||
|
if serialize_time > deserialize_time {
|
||||||
|
println!(" • Focus optimization efforts on serialization rather than deserialization");
|
||||||
|
}
|
||||||
|
println!(" • PD mode overhead is minimal - safe to use for latency-sensitive workloads");
|
||||||
|
println!(" • Consider batching small requests to improve overall throughput");
|
||||||
|
|
||||||
|
println!("\n{}", "=".repeat(50));
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(
|
||||||
|
benches,
|
||||||
|
benchmark_summary,
|
||||||
|
bench_json_serialization,
|
||||||
|
bench_json_deserialization,
|
||||||
|
bench_request_adaptation,
|
||||||
|
bench_regular_routing,
|
||||||
|
bench_throughput_by_size,
|
||||||
|
bench_full_round_trip
|
||||||
|
);
|
||||||
|
criterion_main!(benches);
|
||||||
@@ -16,6 +16,11 @@ classifiers = [
|
|||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"requests>=2.25.0",
|
||||||
|
]
|
||||||
|
|
||||||
# https://github.com/PyO3/setuptools-rust?tab=readme-ov-file
|
# https://github.com/PyO3/setuptools-rust?tab=readme-ov-file
|
||||||
[tool.setuptools.packages]
|
[tool.setuptools.packages]
|
||||||
find = { where = ["py_src"] }
|
find = { where = ["py_src"] }
|
||||||
|
|||||||
203
sgl-router/scripts/post_benchmark_comment.py
Executable file
203
sgl-router/scripts/post_benchmark_comment.py
Executable file
@@ -0,0 +1,203 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
GitHub PR Comment Poster for Benchmark Results
|
||||||
|
|
||||||
|
Posts benchmark results as comments on GitHub PRs with update capability.
|
||||||
|
Replaces JavaScript logic in GitHub Actions for better maintainability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class GitHubCommentPoster:
    """Handles posting benchmark results as GitHub PR comments.

    A single "SGLang Router Benchmark Results" comment is maintained per PR:
    it is created on the first run and edited in place on later runs so the
    PR is not spammed with one comment per push.
    """

    # Seconds before any GitHub API call is aborted. Without a timeout a
    # stalled connection would hang the CI job until the job-level timeout.
    REQUEST_TIMEOUT = 30

    def __init__(self, token: str, repo_owner: str, repo_name: str):
        """
        Args:
            token: GitHub API token with permission to comment on issues/PRs.
            repo_owner: Repository owner (user or organization).
            repo_name: Repository name.
        """
        self.token = token
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.base_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}"
        self.headers = {
            "Authorization": f"token {token}",
            "Accept": "application/vnd.github.v3+json",
        }

    def read_benchmark_results(self, results_file: str) -> Dict[str, str]:
        """Read KEY=VALUE benchmark results from file.

        Returns the parsed pairs, or ``{"error": ...}`` when the file is
        missing or unreadable (callers test for the "error" key).
        """
        results: Dict[str, str] = {}
        filepath = Path(results_file)

        if not filepath.exists():
            print(f"Results file not found: {filepath}")
            return {"error": "Results file not found"}

        try:
            with open(filepath, "r") as f:
                for line in f:
                    line = line.strip()
                    if "=" in line:
                        # Split on the first '=' only so values may contain '='.
                        key, value = line.split("=", 1)
                        results[key] = value
        except Exception as e:
            print(f"Error reading results file: {e}")
            return {"error": str(e)}

        return results

    def format_benchmark_comment(
        self, results: Dict[str, str], pr_number: int, commit_sha: str
    ) -> str:
        """Format benchmark results into a GitHub comment (Markdown body).

        Missing metrics render as "N/A" rather than failing the post.
        """
        serialization_time = results.get("serialization_time", "N/A")
        deserialization_time = results.get("deserialization_time", "N/A")
        adaptation_time = results.get("adaptation_time", "N/A")
        total_time = results.get("total_time", "N/A")

        comment = f"""
### SGLang Router Benchmark Results

**Performance Summary for PR #{pr_number}**

The router benchmarks have completed successfully!

**Performance Thresholds:** All passed
- Serialization: < 2μs
- Deserialization: < 2μs
- PD Adaptation: < 5μs
- Total Pipeline: < 10μs

**Measured Results:**
- Serialization: `{serialization_time}`ns
- Deserialization: `{deserialization_time}`ns
- PD Adaptation: `{adaptation_time}`ns
- Total Pipeline: `{total_time}`ns

**Detailed Reports:**
- Download the `benchmark-results-{commit_sha}` artifact for HTML reports
- Run `make bench` locally for detailed analysis

**Commit:** {commit_sha}
""".strip()

        return comment

    def find_existing_comment(self, pr_number: int) -> Optional[int]:
        """Find the bot's existing benchmark comment in the PR.

        Returns the comment id, or None when absent or when the lookup fails
        (a failed lookup gracefully degrades to posting a fresh comment).
        """
        url = f"{self.base_url}/issues/{pr_number}/comments"

        try:
            # per_page=100 so the bot comment is still found on busy PRs
            # whose first default page (30 comments) no longer contains it.
            response = requests.get(
                url,
                headers=self.headers,
                params={"per_page": 100},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            comments = response.json()

            for comment in comments:
                if comment.get("user", {}).get(
                    "login"
                ) == "github-actions[bot]" and "SGLang Router Benchmark Results" in comment.get(
                    "body", ""
                ):
                    return comment["id"]

        except requests.RequestException as e:
            print(f"Error fetching comments: {e}")

        return None

    def post_comment(self, pr_number: int, comment_body: str) -> bool:
        """Post a new comment on the PR. Returns True on success."""
        url = f"{self.base_url}/issues/{pr_number}/comments"
        data = {"body": comment_body}

        try:
            response = requests.post(
                url, headers=self.headers, json=data, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            print(f"Posted new benchmark comment on PR #{pr_number}")
            return True
        except requests.RequestException as e:
            print(f"Error posting comment: {e}")
            return False

    def update_comment(self, comment_id: int, comment_body: str) -> bool:
        """Update an existing comment in place. Returns True on success."""
        url = f"{self.base_url}/issues/comments/{comment_id}"
        data = {"body": comment_body}

        try:
            response = requests.patch(
                url, headers=self.headers, json=data, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            print(f"Updated existing benchmark comment (ID: {comment_id})")
            return True
        except requests.RequestException as e:
            print(f"Error updating comment: {e}")
            return False

    def post_or_update_comment(
        self, pr_number: int, results_file: str, commit_sha: str
    ) -> bool:
        """Post or update the benchmark results comment on a PR.

        Returns True when the comment was created or updated successfully.
        """
        # Read benchmark results
        results = self.read_benchmark_results(results_file)
        if "error" in results:
            print(f"Failed to read benchmark results: {results['error']}")
            return False

        # Format comment
        comment_body = self.format_benchmark_comment(results, pr_number, commit_sha)

        # Check for existing comment; edit in place when found.
        existing_comment_id = self.find_existing_comment(pr_number)

        if existing_comment_id:
            return self.update_comment(existing_comment_id, comment_body)
        else:
            return self.post_comment(pr_number, comment_body)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: parse arguments and post/update the PR comment."""
    parser = argparse.ArgumentParser(description="Post benchmark results to GitHub PR")
    parser.add_argument(
        "--pr-number", type=int, required=True, help="Pull request number"
    )
    parser.add_argument("--commit-sha", type=str, required=True, help="Commit SHA")
    parser.add_argument(
        "--results-file",
        type=str,
        default="benchmark_results.env",
        help="Path to benchmark results file",
    )
    parser.add_argument(
        "--repo-owner", type=str, default="sgl-project", help="GitHub repository owner"
    )
    parser.add_argument(
        "--repo-name", type=str, default="sglang", help="GitHub repository name"
    )
    args = parser.parse_args()

    # Credentials come from the environment, never from the command line.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    poster = GitHubCommentPoster(token, args.repo_owner, args.repo_name)
    if not poster.post_or_update_comment(
        args.pr_number, args.results_file, args.commit_sha
    ):
        print("Failed to post benchmark comment")
        sys.exit(1)

    print("Benchmark comment posted successfully!")
|
||||||
|
|
||||||
|
|
||||||
|
# Allow direct execution (as done from the CI workflow).
if __name__ == "__main__":
    main()
|
||||||
250
sgl-router/scripts/run_benchmarks.py
Executable file
250
sgl-router/scripts/run_benchmarks.py
Executable file
@@ -0,0 +1,250 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
SGLang Router Benchmark Runner
|
||||||
|
|
||||||
|
A Python script to run Rust benchmarks with various options and modes.
|
||||||
|
Replaces the shell script for better maintainability and integration.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkRunner:
    """Handles running Rust benchmarks for the SGLang router.

    Wraps ``cargo build`` / ``cargo bench`` invocations, parses the benchmark
    summary output, and validates it against fixed performance thresholds.
    """

    def __init__(self, project_root: str):
        """
        Args:
            project_root: Path to the router crate root; all cargo commands
                run with this directory as their working directory.
        """
        self.project_root = Path(project_root)
        # Human-readable UTC timestamp for the report header.
        self.timestamp = time.strftime("%a %b %d %H:%M:%S UTC %Y", time.gmtime())

    def run_command(
        self, cmd: List[str], capture_output: bool = False
    ) -> subprocess.CompletedProcess:
        """Run a command, exiting the process when it cannot be launched.

        Note: ``check=True`` is not used, so a non-zero exit status does NOT
        raise here — callers inspect ``returncode`` on the returned object.
        """
        try:
            if capture_output:
                result = subprocess.run(
                    cmd, capture_output=True, text=True, cwd=self.project_root
                )
            else:
                result = subprocess.run(cmd, cwd=self.project_root)
            return result
        except (OSError, subprocess.CalledProcessError) as e:
            # OSError covers FileNotFoundError when the binary (e.g. cargo)
            # is missing. Catching only CalledProcessError was dead code
            # without check=True, so a missing cargo previously crashed with
            # a raw traceback instead of this clean error message.
            print(f"Error running command: {' '.join(cmd)}")
            print(f"Error: {e}")
            sys.exit(1)

    def print_header(self):
        """Print the benchmark runner header."""
        print("SGLang Router Benchmark Runner")
        print("=" * 30)
        print(f"Project: {self.project_root.absolute()}")
        print(f"Timestamp: {self.timestamp}")
        print()

    def build_release(self):
        """Build the project in release mode, exiting the process on failure."""
        print("Building in release mode...")
        result = self.run_command(["cargo", "build", "--release", "--quiet"])
        if result.returncode != 0:
            print("Failed to build in release mode")
            sys.exit(1)

    def run_benchmarks(
        self,
        quick_mode: bool = False,
        save_baseline: Optional[str] = None,
        compare_baseline: Optional[str] = None,
    ) -> str:
        """Run benchmarks with specified options and return their stdout.

        Args:
            quick_mode: Restrict the run to the summary benchmark group.
            save_baseline: When set, save the raw output under this filename.
            compare_baseline: Currently informational only (printed).

        Exits the process when the benchmark command fails.
        """
        bench_args = ["cargo", "bench", "--bench", "request_processing"]

        if quick_mode:
            # Extra positional arg acts as a benchmark name filter.
            bench_args.append("benchmark_summary")
            print("Running quick benchmarks...")
        else:
            print("Running full benchmark suite...")

        # Note: Criterion baselines are handled via target directory structure
        # For now, we'll implement baseline functionality via file copying
        if save_baseline:
            print(f"Will save results as baseline: {save_baseline}")

        if compare_baseline:
            print(f"Will compare with baseline: {compare_baseline}")

        print(f"Executing: {' '.join(bench_args)}")
        result = self.run_command(bench_args, capture_output=True)

        if result.returncode != 0:
            print("Benchmark execution failed!")
            print("STDOUT:", result.stdout)
            print("STDERR:", result.stderr)
            sys.exit(1)

        # Handle baseline saving after successful run
        if save_baseline:
            self._save_baseline(save_baseline, result.stdout)

        return result.stdout

    def _save_baseline(self, filename: str, output: str):
        """Save raw benchmark output to ``filename`` under the project root."""
        filepath = self.project_root / filename
        with open(filepath, "w") as f:
            f.write(output)
        print(f"Baseline saved to: {filepath}")

    def parse_benchmark_results(self, output: str) -> Dict[str, str]:
        """Parse benchmark output to extract performance metrics.

        Scans the "Quick Performance Overview:" section for bullet lines like
        ``* Serialization (avg): 481 ns/req`` and maps them to the keys CI
        consumes: serialization_time, deserialization_time, adaptation_time,
        total_time (values are strings of integer nanoseconds, or absent).
        """
        results: Dict[str, str] = {}

        lines = output.split("\n")
        parsing_overview = False

        for line in lines:
            line = line.strip()

            if "Quick Performance Overview:" in line:
                parsing_overview = True
                continue

            if parsing_overview and line.startswith("* "):
                # Parse lines like "* Serialization (avg): 481 ns/req".
                # Capital-S "Serialization" does not match "Deserialization",
                # so the ordering of these checks is safe.
                if "Serialization (avg):" in line:
                    results["serialization_time"] = self._extract_time(line)
                elif "Deserialization (avg):" in line:
                    results["deserialization_time"] = self._extract_time(line)
                elif "PD Adaptation (avg):" in line:
                    results["adaptation_time"] = self._extract_time(line)
                elif "Total Pipeline (avg):" in line:
                    results["total_time"] = self._extract_time(line)

            # Stop parsing after the overview section
            if parsing_overview and line.startswith("Performance Insights:"):
                break

        return results

    def _extract_time(self, line: str) -> str:
        """Extract the integer ns value from a '... N ns/req' line, or 'N/A'."""
        # Extract number followed by ns/req
        import re

        match = re.search(r"(\d+)\s*ns/req", line)
        return match.group(1) if match else "N/A"

    def validate_thresholds(self, results: Dict[str, str]) -> bool:
        """Validate benchmark results against performance thresholds.

        Prints a PASS/FAIL table and returns True only when every metric is
        present, numeric, and within its threshold (all values in ns).
        """
        thresholds = {
            "serialization_time": 2000,  # 2μs max
            "deserialization_time": 2000,  # 2μs max
            "adaptation_time": 5000,  # 5μs max
            "total_time": 10000,  # 10μs max
        }

        all_passed = True
        print("\nPerformance Threshold Validation:")
        print("=" * 35)

        for metric, threshold in thresholds.items():
            if metric in results and results[metric] != "N/A":
                try:
                    value = int(results[metric])
                    passed = value <= threshold
                    status = "✓ PASS" if passed else "✗ FAIL"
                    print(f"{metric:20}: {value:>6}ns <= {threshold:>6}ns {status}")
                    if not passed:
                        all_passed = False
                except ValueError:
                    print(f"{metric:20}: Invalid value: {results[metric]}")
                    all_passed = False
            else:
                # A missing metric is treated as a failure so that a broken
                # parser cannot silently pass validation.
                print(f"{metric:20}: No data available")
                all_passed = False

        print()
        if all_passed:
            print("All performance thresholds passed!")
        else:
            print("Some performance thresholds failed!")

        return all_passed

    def save_results_to_file(
        self, results: Dict[str, str], filename: str = "benchmark_results.env"
    ):
        """Save benchmark results to a KEY=VALUE file for CI consumption."""
        filepath = self.project_root / filename
        with open(filepath, "w") as f:
            for key, value in results.items():
                f.write(f"{key}={value}\n")
        print(f"Results saved to: {filepath}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: build the router, run the benchmarks, and optionally
    persist and/or validate the parsed results."""
    parser = argparse.ArgumentParser(description="Run SGLang router benchmarks")
    parser.add_argument(
        "--quick", action="store_true", help="Run quick benchmarks (summary only)"
    )
    parser.add_argument(
        "--save-baseline", type=str, help="Save benchmark results as baseline"
    )
    parser.add_argument(
        "--compare-baseline", type=str, help="Compare with saved baseline"
    )
    parser.add_argument(
        "--validate-thresholds",
        action="store_true",
        help="Validate results against performance thresholds",
    )
    parser.add_argument(
        "--save-results", action="store_true", help="Save results to file for CI"
    )
    args = parser.parse_args()

    # This script lives in <project_root>/scripts/, so the crate root is one
    # directory up from here.
    project_root = Path(__file__).parent.parent

    runner = BenchmarkRunner(str(project_root))
    runner.print_header()
    runner.build_release()

    output = runner.run_benchmarks(
        quick_mode=args.quick,
        save_baseline=args.save_baseline,
        compare_baseline=args.compare_baseline,
    )
    print(output)

    # Only parse the output when something downstream consumes it.
    if args.validate_thresholds or args.save_results:
        results = runner.parse_benchmark_results(output)

        if args.save_results:
            runner.save_results_to_file(results)

        if args.validate_thresholds and not runner.validate_thresholds(results):
            print("Validation failed - performance regression detected!")
            sys.exit(1)

    print("\nBenchmark run completed successfully!")
|
||||||
|
|
||||||
|
|
||||||
|
# Allow direct execution (as done from the CI workflow and the Makefile).
if __name__ == "__main__":
    main()
|
||||||
241
sgl-router/tests/benchmark_integration.rs
Normal file
241
sgl-router/tests/benchmark_integration.rs
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
// Integration test to ensure benchmarks compile and basic functionality works
|
||||||
|
// This prevents benchmarks from breaking in CI
|
||||||
|
|
||||||
|
use serde_json::{from_str, to_string};
|
||||||
|
use sglang_router_rs::openai_api_types::{
|
||||||
|
ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateParameters, GenerateRequest,
|
||||||
|
SamplingParams, StringOrArray, UserMessageContent,
|
||||||
|
};
|
||||||
|
use sglang_router_rs::request_adapter::{RouteableRequest, ToPdRequest};
|
||||||
|
|
||||||
|
#[test]
fn test_benchmark_request_creation() {
    // Ensure all benchmark request types can be created without panicking

    // Native generate request with both `parameters` and `sampling_params`
    // populated; unspecified fields fall back to Default::default().
    let generate_req = GenerateRequest {
        text: Some("Test prompt".to_string()),
        input_ids: None,
        prompt: None,
        parameters: Some(GenerateParameters {
            max_new_tokens: Some(100),
            temperature: Some(0.8),
            top_p: Some(0.9),
            top_k: Some(50),
            repetition_penalty: Some(1.0),
            ..Default::default()
        }),
        sampling_params: Some(SamplingParams {
            temperature: Some(0.8),
            top_p: Some(0.9),
            top_k: Some(50),
            frequency_penalty: Some(0.0),
            presence_penalty: Some(0.0),
            repetition_penalty: Some(1.0),
            ..Default::default()
        }),
        stream: false,
        return_logprob: false,
    };

    // OpenAI-style chat completion request with a single user message.
    let chat_req = ChatCompletionRequest {
        model: "test-model".to_string(),
        messages: vec![ChatMessage::User {
            role: "user".to_string(),
            content: UserMessageContent::Text("Test message".to_string()),
            name: None,
        }],
        max_tokens: Some(150),
        max_completion_tokens: Some(150),
        temperature: Some(0.7),
        top_p: Some(1.0),
        n: Some(1),
        stream: false,
        stop: None,
        presence_penalty: Some(0.0),
        frequency_penalty: Some(0.0),
        logit_bias: None,
        logprobs: false,
        top_logprobs: None,
        user: None,
        response_format: None,
        seed: None,
        tools: None,
        tool_choice: None,
        parallel_tool_calls: Some(true),
        function_call: None,
        functions: None,
    };

    // OpenAI-style (legacy) completion request with a plain string prompt.
    let completion_req = CompletionRequest {
        model: "test-model".to_string(),
        prompt: StringOrArray::String("Test prompt".to_string()),
        suffix: None,
        max_tokens: Some(50),
        temperature: Some(0.8),
        top_p: Some(1.0),
        n: Some(1),
        stream: false,
        logprobs: None,
        echo: false,
        stop: None,
        presence_penalty: Some(0.0),
        frequency_penalty: Some(0.0),
        best_of: Some(1),
        logit_bias: None,
        user: None,
        seed: None,
    };

    // Test serialization works — each request type must be JSON-encodable,
    // since the benchmarks serialize these exact shapes.
    assert!(to_string(&generate_req).is_ok());
    assert!(to_string(&chat_req).is_ok());
    assert!(to_string(&completion_req).is_ok());
}
|
||||||
|
|
||||||
|
#[test]
fn test_benchmark_serialization_roundtrip() {
    // A minimal GenerateRequest must survive a JSON encode/decode cycle
    // with its fields intact — the benchmarks rely on this round trip.
    let original = GenerateRequest {
        text: Some("Test prompt".to_string()),
        input_ids: None,
        prompt: None,
        parameters: None,
        sampling_params: None,
        stream: false,
        return_logprob: false,
    };

    // Encode to JSON, then decode back into the same type.
    let encoded = to_string(&original).expect("Serialization should work");
    let decoded: GenerateRequest = from_str(&encoded).expect("Deserialization should work");

    // Spot-check that the fields round-tripped unchanged.
    assert_eq!(original.text, decoded.text);
    assert_eq!(original.stream, decoded.stream);
    assert_eq!(original.return_logprob, decoded.return_logprob);
}
|
||||||
|
|
||||||
|
#[test]
fn test_benchmark_request_adaptation() {
    // Test that PD request adaptation works for benchmark types

    // Minimal generate request — no parameters or sampling params.
    let generate_req = GenerateRequest {
        text: Some("Test prompt".to_string()),
        input_ids: None,
        prompt: None,
        parameters: None,
        sampling_params: None,
        stream: false,
        return_logprob: false,
    };

    // Chat completion request matching the shape used by the benchmarks.
    let chat_req = ChatCompletionRequest {
        model: "test-model".to_string(),
        messages: vec![ChatMessage::User {
            role: "user".to_string(),
            content: UserMessageContent::Text("Test message".to_string()),
            name: None,
        }],
        max_tokens: Some(150),
        max_completion_tokens: Some(150),
        temperature: Some(0.7),
        top_p: Some(1.0),
        n: Some(1),
        stream: false,
        stop: None,
        presence_penalty: Some(0.0),
        frequency_penalty: Some(0.0),
        logit_bias: None,
        logprobs: false,
        top_logprobs: None,
        user: None,
        response_format: None,
        seed: None,
        tools: None,
        tool_choice: None,
        parallel_tool_calls: Some(true),
        function_call: None,
        functions: None,
    };

    // Legacy completion request with a plain string prompt.
    let completion_req = CompletionRequest {
        model: "test-model".to_string(),
        prompt: StringOrArray::String("Test prompt".to_string()),
        suffix: None,
        max_tokens: Some(50),
        temperature: Some(0.8),
        top_p: Some(1.0),
        n: Some(1),
        stream: false,
        logprobs: None,
        echo: false,
        stop: None,
        presence_penalty: Some(0.0),
        frequency_penalty: Some(0.0),
        best_of: Some(1),
        logit_bias: None,
        user: None,
        seed: None,
    };

    // Test PD adaptation (should not panic) — results are discarded; only
    // the conversion itself is under test here.
    let _pd_generate = generate_req.to_pd_request();
    let _pd_chat = chat_req.to_pd_request();
    let _pd_completion = completion_req.to_pd_request();
}
|
||||||
|
|
||||||
|
#[test]
fn test_benchmark_regular_routing() {
    // Exercise the RouteableRequest conversions used by the regular
    // (non-PD) routing path on a minimal request.
    let req = GenerateRequest {
        text: Some("Test prompt".to_string()),
        input_ids: None,
        prompt: None,
        parameters: None,
        sampling_params: None,
        stream: false,
        return_logprob: false,
    };

    // Neither conversion should panic; the values themselves are unused.
    let _ = req.to_json();
    let _ = req.to_bytes();
}
|
||||||
|
|
||||||
|
#[test]
fn test_benchmark_performance_baseline() {
    // Basic performance sanity check — ensure hot-path operations complete
    // quickly. The bounds are intentionally loose (100ms vs the ~µs numbers
    // the real benchmarks measure) so this still catches pathological
    // regressions without being flaky: a hard 1ms wall-clock cap can fail
    // spuriously on loaded CI runners, in debug builds, or due to one-time
    // lazy initialization on the first call.
    use std::time::Instant;

    let generate_req = GenerateRequest {
        text: Some("Short test prompt".to_string()),
        input_ids: None,
        prompt: None,
        parameters: None,
        sampling_params: None,
        stream: false,
        return_logprob: false,
    };

    // Warm up once so one-time setup cost is not attributed to the timed run.
    let _ = to_string(&generate_req).unwrap();

    // Serialization should be fast.
    let start = Instant::now();
    let _json = to_string(&generate_req).unwrap();
    let serialize_duration = start.elapsed();
    assert!(
        serialize_duration.as_millis() < 100,
        "Serialization took too long: {:?}",
        serialize_duration
    );

    // PD adaptation should be fast as well.
    let start = Instant::now();
    let _pd_req = generate_req.to_pd_request();
    let adapt_duration = start.elapsed();
    assert!(
        adapt_duration.as_millis() < 100,
        "PD adaptation took too long: {:?}",
        adapt_duration
    );
}
|
||||||
Reference in New Issue
Block a user