[router] PD Router Simplification and Reorganization (#8838)

2025-08-05 21:20:38 -07:00
parent ca47e24f5d
commit 8c7bb39dfb
8 changed files with 1220 additions and 2677 deletions
--- a/sgl-router/benches/request_processing.rs
+++ b/sgl-router/benches/request_processing.rs
@@ -1,12 +1,22 @@
 use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
-use serde_json::{from_str, to_string, to_vec};
+use serde_json::{from_str, to_string, to_value, to_vec};
 use std::time::Instant;

+use sglang_router_rs::core::{BasicWorker, WorkerType};
 use sglang_router_rs::openai_api_types::{
    ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateParameters, GenerateRequest,
    SamplingParams, StringOrArray, UserMessageContent,
 };
-use sglang_router_rs::routers::request_adapter::{RouteableRequest, ToPdRequest};
+use sglang_router_rs::routers::bootstrap_injector::inject_bootstrap_fields;
+
+fn create_test_worker() -> BasicWorker {
+    BasicWorker::new(
+        "http://test-server:8000".to_string(),
+        WorkerType::Prefill {
+            bootstrap_port: Some(5678),
+        },
+    )
+}

 /// Create a default GenerateRequest for benchmarks with minimal fields set
 fn default_generate_request() -> GenerateRequest {
@@ -312,49 +322,54 @@ fn bench_json_deserialization(c: &mut Criterion) {
    group.finish();
 }

-// Benchmark request adaptation from OpenAI to PD format
-fn bench_request_adaptation(c: &mut Criterion) {
-    let mut group = c.benchmark_group("request_adaptation");
+// Benchmark bootstrap injection (replaces request adaptation)
+fn bench_bootstrap_injection(c: &mut Criterion) {
+    let mut group = c.benchmark_group("bootstrap_injection");

    let generate_req = create_sample_generate_request();
    let chat_req = create_sample_chat_completion_request();
    let completion_req = create_sample_completion_request();
    let large_chat_req = create_large_chat_completion_request();
+    let worker = create_test_worker();

-    group.bench_function("generate_to_pd", |b| {
+    group.bench_function("generate_bootstrap_injection", |b| {
        b.iter(|| {
-            let pd_req = black_box(generate_req.clone()).to_pd_request();
-            black_box(pd_req);
+            let mut json = to_value(black_box(&generate_req)).unwrap();
+            inject_bootstrap_fields(&mut json, &worker).unwrap();
+            black_box(json);
        });
    });

-    group.bench_function("chat_completion_to_pd", |b| {
+    group.bench_function("chat_completion_bootstrap_injection", |b| {
        b.iter(|| {
-            let pd_req = black_box(chat_req.clone()).to_pd_request();
-            black_box(pd_req);
+            let mut json = to_value(black_box(&chat_req)).unwrap();
+            inject_bootstrap_fields(&mut json, &worker).unwrap();
+            black_box(json);
        });
    });

-    group.bench_function("completion_to_pd", |b| {
+    group.bench_function("completion_bootstrap_injection", |b| {
        b.iter(|| {
-            let pd_req = black_box(completion_req.clone()).to_pd_request();
-            black_box(pd_req);
+            let mut json = to_value(black_box(&completion_req)).unwrap();
+            inject_bootstrap_fields(&mut json, &worker).unwrap();
+            black_box(json);
        });
    });

-    group.bench_function("large_chat_completion_to_pd", |b| {
+    group.bench_function("large_chat_completion_bootstrap_injection", |b| {
        b.iter(|| {
-            let pd_req = black_box(large_chat_req.clone()).to_pd_request();
-            black_box(pd_req);
+            let mut json = to_value(black_box(&large_chat_req)).unwrap();
+            inject_bootstrap_fields(&mut json, &worker).unwrap();
+            black_box(json);
        });
    });

    group.finish();
 }

-// Benchmark regular routing (RouteableRequest methods)
-fn bench_regular_routing(c: &mut Criterion) {
-    let mut group = c.benchmark_group("regular_routing");
+// Benchmark direct JSON routing (replaces regular routing)
+fn bench_direct_json_routing(c: &mut Criterion) {
+    let mut group = c.benchmark_group("direct_json_routing");

    let generate_req = create_sample_generate_request();
    let chat_req = create_sample_chat_completion_request();
@@ -362,35 +377,42 @@ fn bench_regular_routing(c: &mut Criterion) {

    group.bench_function("generate_to_json", |b| {
        b.iter(|| {
-            let json = black_box(&generate_req).to_json().unwrap();
+            let json = to_value(black_box(&generate_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("generate_to_json_string", |b| {
+        b.iter(|| {
+            let json = to_string(black_box(&generate_req)).unwrap();
            black_box(json);
        });
    });

    group.bench_function("generate_to_bytes", |b| {
        b.iter(|| {
-            let bytes = black_box(&generate_req).to_bytes().unwrap();
+            let bytes = to_vec(black_box(&generate_req)).unwrap();
            black_box(bytes);
        });
    });

    group.bench_function("chat_completion_to_json", |b| {
        b.iter(|| {
-            let json = black_box(&chat_req).to_json().unwrap();
+            let json = to_value(black_box(&chat_req)).unwrap();
            black_box(json);
        });
    });

-    group.bench_function("chat_completion_to_bytes", |b| {
+    group.bench_function("chat_completion_to_json_string", |b| {
        b.iter(|| {
-            let bytes = black_box(&chat_req).to_bytes().unwrap();
-            black_box(bytes);
+            let json = to_string(black_box(&chat_req)).unwrap();
+            black_box(json);
        });
    });

    group.bench_function("completion_to_json", |b| {
        b.iter(|| {
-            let json = black_box(&completion_req).to_json().unwrap();
+            let json = to_value(black_box(&completion_req)).unwrap();
            black_box(json);
        });
    });
@@ -418,6 +440,8 @@ fn bench_throughput_by_size(c: &mut Criterion) {
        ..default_generate_request()
    };

+    let worker = create_test_worker();
+
    for (name, req) in [
        ("small", &small_generate),
        ("medium", &medium_generate),
@@ -445,33 +469,41 @@ fn bench_throughput_by_size(c: &mut Criterion) {
            },
        );

-        group.bench_with_input(BenchmarkId::new("adapt_to_pd", name), &req, |b, req| {
-            b.iter(|| {
-                let pd_req = (*req).clone().to_pd_request();
-                black_box(pd_req);
-            });
-        });
+        group.bench_with_input(
+            BenchmarkId::new("bootstrap_inject", name),
+            &req,
+            |b, req| {
+                b.iter(|| {
+                    let mut json = to_value(req).unwrap();
+                    inject_bootstrap_fields(&mut json, &worker).unwrap();
+                    black_box(json);
+                });
+            },
+        );
    }

    group.finish();
 }

-// Benchmark full round-trip: deserialize -> adapt -> serialize
+// Benchmark full round-trip: deserialize -> inject bootstrap -> serialize
 fn bench_full_round_trip(c: &mut Criterion) {
    let mut group = c.benchmark_group("full_round_trip");

    let generate_json = to_string(&create_sample_generate_request()).unwrap();
    let chat_json = to_string(&create_sample_chat_completion_request()).unwrap();
    let completion_json = to_string(&create_sample_completion_request()).unwrap();
+    let worker = create_test_worker();

    group.bench_function("generate_openai_to_pd_pipeline", |b| {
        b.iter(|| {
            // Deserialize OpenAI request
            let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
-            // Adapt to PD format
-            let pd_req = req.to_pd_request();
-            // Serialize PD request
-            let pd_json = to_string(&pd_req).unwrap();
+            // Convert to JSON Value
+            let mut json = to_value(&req).unwrap();
+            // Inject bootstrap fields
+            inject_bootstrap_fields(&mut json, &worker).unwrap();
+            // Serialize final request
+            let pd_json = to_string(&json).unwrap();
            black_box(pd_json);
        });
    });
@@ -479,8 +511,9 @@ fn bench_full_round_trip(c: &mut Criterion) {
    group.bench_function("chat_completion_openai_to_pd_pipeline", |b| {
        b.iter(|| {
            let req: ChatCompletionRequest = from_str(black_box(&chat_json)).unwrap();
-            let pd_req = req.to_pd_request();
-            let pd_json = to_string(&pd_req).unwrap();
+            let mut json = to_value(&req).unwrap();
+            inject_bootstrap_fields(&mut json, &worker).unwrap();
+            let pd_json = to_string(&json).unwrap();
            black_box(pd_json);
        });
    });
@@ -488,19 +521,21 @@ fn bench_full_round_trip(c: &mut Criterion) {
    group.bench_function("completion_openai_to_pd_pipeline", |b| {
        b.iter(|| {
            let req: CompletionRequest = from_str(black_box(&completion_json)).unwrap();
-            let pd_req = req.to_pd_request();
-            let pd_json = to_string(&pd_req).unwrap();
+            let mut json = to_value(&req).unwrap();
+            inject_bootstrap_fields(&mut json, &worker).unwrap();
+            let pd_json = to_string(&json).unwrap();
            black_box(pd_json);
        });
    });

-    group.bench_function("generate_regular_routing_pipeline", |b| {
+    group.bench_function("generate_direct_json_pipeline", |b| {
        b.iter(|| {
            // Deserialize OpenAI request
            let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
-            // Convert to JSON for regular routing
-            let routing_json = req.to_json().unwrap();
-            black_box(routing_json);
+            // Convert to JSON for direct routing (no bootstrap injection)
+            let routing_json = to_value(&req).unwrap();
+            let json_string = to_string(&routing_json).unwrap();
+            black_box(json_string);
        });
    });

@@ -515,6 +550,7 @@ fn benchmark_summary(c: &mut Criterion) {

    // Quick performance overview
    let generate_req = create_sample_generate_request();
+    let worker = create_test_worker();

    println!("\nQuick Performance Overview:");

@@ -538,32 +574,39 @@ fn benchmark_summary(c: &mut Criterion) {
        deserialize_time
    );

-    // Measure adaptation
+    // Measure bootstrap injection (replaces adaptation)
    let start = Instant::now();
    for _ in 0..1000 {
-        let _ = black_box(generate_req.clone().to_pd_request());
+        let mut json = to_value(&generate_req).unwrap();
+        let _ = black_box(inject_bootstrap_fields(&mut json, &worker));
    }
-    let adapt_time = start.elapsed().as_nanos() / 1000;
-    println!("  * PD Adaptation (avg):     {:>8} ns/req", adapt_time);
+    let inject_time = start.elapsed().as_nanos() / 1000;
+    println!("  * Bootstrap Injection (avg): {:>6} ns/req", inject_time);

    // Calculate ratios
-    let total_pipeline = serialize_time + deserialize_time + adapt_time;
+    let total_pipeline = serialize_time + deserialize_time + inject_time;
    println!("  * Total Pipeline (avg):    {:>8} ns/req", total_pipeline);

    println!("\nPerformance Insights:");
    if deserialize_time > serialize_time * 2 {
        println!("  • Deserialization is significantly faster than serialization");
    }
-    if adapt_time < serialize_time / 10 {
+    if inject_time < serialize_time / 10 {
        println!(
-            "  • PD adaptation overhead is negligible ({:.1}% of serialization)",
-            (adapt_time as f64 / serialize_time as f64) * 100.0
+            "  • Bootstrap injection overhead is negligible ({:.1}% of serialization)",
+            (inject_time as f64 / serialize_time as f64) * 100.0
        );
    }
-    if total_pipeline < 10_000 {
-        println!("  • Total pipeline latency is excellent (< 10μs)");
+    if total_pipeline < 100_000 {
+        println!("  • Total pipeline latency is excellent (< 100μs)");
    }

+    println!("\nSimplification Benefits:");
+    println!("  • Eliminated complex type conversion layer");
+    println!("  • Reduced memory allocations");
+    println!("  • Automatic field preservation (no manual mapping)");
+    println!("  • Direct JSON manipulation improves performance");
+
    println!("\nRecommendations:");
    if serialize_time > deserialize_time {
        println!("  • Focus optimization efforts on serialization rather than deserialization");
@@ -581,8 +624,8 @@ criterion_group!(
    benchmark_summary,
    bench_json_serialization,
    bench_json_deserialization,
-    bench_request_adaptation,
-    bench_regular_routing,
+    bench_bootstrap_injection,
+    bench_direct_json_routing,
    bench_throughput_by_size,
    bench_full_round_trip
 );