[router] fix req handling order, improve serialization, remove retry (#8888)

This commit is contained in:
Simo Lin
2025-08-06 23:24:39 -07:00
committed by GitHub
parent 2d120f8b18
commit a69b637014
10 changed files with 432 additions and 856 deletions

View File

@@ -9,7 +9,6 @@ use sglang_router_rs::openai_api_types::{
ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateParameters, GenerateRequest,
SamplingParams, StringOrArray, UserMessageContent,
};
use sglang_router_rs::routers::bootstrap_injector::inject_bootstrap_fields;
/// Create a default GenerateRequest for benchmarks with minimal fields set
fn default_generate_request() -> GenerateRequest {
@@ -208,63 +207,6 @@ fn test_benchmark_serialization_roundtrip() {
assert_eq!(generate_req.return_logprob, deserialized.return_logprob);
}
#[test]
fn test_benchmark_bootstrap_injection() {
// Test that bootstrap injection works for benchmark types (replaces PD request adaptation)
let generate_req = GenerateRequest {
text: Some("Test prompt".to_string()),
..default_generate_request()
};
let chat_req = ChatCompletionRequest {
model: "test-model".to_string(),
messages: vec![ChatMessage::User {
role: "user".to_string(),
content: UserMessageContent::Text("Test message".to_string()),
name: None,
}],
max_tokens: Some(150),
max_completion_tokens: Some(150),
temperature: Some(0.7),
top_p: Some(1.0),
n: Some(1),
presence_penalty: Some(0.0),
frequency_penalty: Some(0.0),
parallel_tool_calls: Some(true),
..default_chat_completion_request()
};
let completion_req = CompletionRequest {
model: "test-model".to_string(),
prompt: StringOrArray::String("Test prompt".to_string()),
max_tokens: Some(50),
temperature: Some(0.8),
top_p: Some(1.0),
n: Some(1),
presence_penalty: Some(0.0),
frequency_penalty: Some(0.0),
best_of: Some(1),
..default_completion_request()
};
let worker = create_test_worker();
// Test bootstrap injection (should not panic)
let mut generate_json = to_value(&generate_req).unwrap();
let mut chat_json = to_value(&chat_req).unwrap();
let mut completion_json = to_value(&completion_req).unwrap();
assert!(inject_bootstrap_fields(&mut generate_json, &worker).is_ok());
assert!(inject_bootstrap_fields(&mut chat_json, &worker).is_ok());
assert!(inject_bootstrap_fields(&mut completion_json, &worker).is_ok());
// Verify bootstrap fields were added
assert!(generate_json.get("bootstrap_host").is_some());
assert!(generate_json.get("bootstrap_port").is_some());
assert!(generate_json.get("bootstrap_room").is_some());
}
#[test]
fn test_benchmark_direct_json_routing() {
// Test direct JSON routing functionality for benchmark types (replaces regular routing)
@@ -283,47 +225,3 @@ fn test_benchmark_direct_json_routing() {
assert!(!json_string.is_empty());
assert!(!bytes.is_empty());
}
#[test]
fn test_benchmark_performance_baseline() {
// Basic performance sanity check - ensure operations complete quickly
use std::time::Instant;
let generate_req = GenerateRequest {
text: Some("Short test prompt".to_string()),
..default_generate_request()
};
// Test the actual simplified pipeline: to_value + bootstrap injection
let start = Instant::now();
let worker = create_test_worker();
// This mirrors the actual router pipeline
let mut json = to_value(&generate_req).unwrap();
let _ = inject_bootstrap_fields(&mut json, &worker);
let total_duration = start.elapsed();
assert!(
total_duration.as_millis() < 5,
"Simplified pipeline took too long: {:?} (should be faster than old adapter approach)",
total_duration
);
// Individual components should also be fast
let start = Instant::now();
let _json = to_value(&generate_req).unwrap();
let to_value_duration = start.elapsed();
let start = Instant::now();
let mut json = to_value(&generate_req).unwrap();
let _ = inject_bootstrap_fields(&mut json, &worker);
let inject_duration = start.elapsed();
// Bootstrap injection should be faster than the JSON conversion
assert!(
inject_duration <= to_value_duration * 3,
"Bootstrap injection ({:?}) should not be much slower than JSON conversion ({:?})",
inject_duration,
to_value_duration
);
}