diff --git a/sgl-router/benches/request_processing.rs b/sgl-router/benches/request_processing.rs index 3579d9c67..3d2d55713 100644 --- a/sgl-router/benches/request_processing.rs +++ b/sgl-router/benches/request_processing.rs @@ -48,50 +48,15 @@ fn default_generate_request() -> GenerateRequest { } /// Create a default ChatCompletionRequest for benchmarks with minimal fields set +#[allow(deprecated)] fn default_chat_completion_request() -> ChatCompletionRequest { ChatCompletionRequest { - model: String::new(), + // Required fields in OpenAI order messages: vec![], - max_tokens: None, - max_completion_tokens: None, - temperature: None, - top_p: None, - n: None, - stream: false, - stream_options: None, - stop: None, - presence_penalty: None, - frequency_penalty: None, - logit_bias: None, - logprobs: false, - top_logprobs: None, - user: None, - response_format: None, - seed: None, - tools: None, - tool_choice: None, - parallel_tool_calls: None, - function_call: None, - functions: None, - // SGLang Extensions - top_k: None, - min_p: None, - min_tokens: None, - repetition_penalty: None, - regex: None, - ebnf: None, - stop_token_ids: None, - no_stop_trim: false, - ignore_eos: false, - continue_final_message: false, - skip_special_tokens: true, - // SGLang Extensions - lora_path: None, - session_params: None, - separate_reasoning: true, - stream_reasoning: true, - chat_template_kwargs: None, - return_hidden_states: false, + model: String::new(), + + // Use default for all other fields + ..Default::default() } } @@ -161,6 +126,7 @@ fn create_sample_generate_request() -> GenerateRequest { } } +#[allow(deprecated)] fn create_sample_chat_completion_request() -> ChatCompletionRequest { ChatCompletionRequest { model: "gpt-3.5-turbo".to_string(), @@ -205,6 +171,7 @@ fn create_sample_completion_request() -> CompletionRequest { } } +#[allow(deprecated)] fn create_large_chat_completion_request() -> ChatCompletionRequest { let mut messages = vec![ChatMessage::System { role: "system".to_string(), @@ -240,7 +207,6 @@ fn create_large_chat_completion_request() -> ChatCompletionRequest { presence_penalty: Some(0.1), frequency_penalty: Some(0.1), top_logprobs: Some(5), - user: Some("benchmark_user".to_string()), seed: Some(42), parallel_tool_calls: Some(true), ..default_chat_completion_request() diff --git a/sgl-router/src/protocols/spec.rs b/sgl-router/src/protocols/spec.rs index 4760626b5..cb1f0a992 100644 --- a/sgl-router/src/protocols/spec.rs +++ b/sgl-router/src/protocols/spec.rs @@ -179,26 +179,94 @@ pub struct FunctionCallDelta { // ============= Request ============= -#[derive(Debug, Clone, Deserialize, Serialize)] +#[derive(Debug, Clone, Deserialize, Serialize, Default)] pub struct ChatCompletionRequest { - /// ID of the model to use - pub model: String, - /// A list of messages comprising the conversation so far pub messages: Vec, - /// What sampling temperature to use, between 0 and 2 - #[serde(skip_serializing_if = "Option::is_none")] - pub temperature: Option, + /// ID of the model to use + pub model: String, - /// An alternative to sampling with temperature + /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far #[serde(skip_serializing_if = "Option::is_none")] - pub top_p: Option, + pub frequency_penalty: Option, + + /// Deprecated: Replaced by tool_choice + #[serde(skip_serializing_if = "Option::is_none")] + #[deprecated(note = "Use tool_choice instead")] + pub function_call: Option, + + /// Deprecated: Replaced by tools + #[serde(skip_serializing_if = "Option::is_none")] + #[deprecated(note = "Use tools instead")] + pub functions: Option>, + + /// Modify the likelihood of specified tokens appearing in the completion + #[serde(skip_serializing_if = "Option::is_none")] + pub logit_bias: Option>, + + /// Whether to return log probabilities of the output tokens + #[serde(default)] + pub logprobs: bool, + + /// Deprecated: Replaced by max_completion_tokens + #[serde(skip_serializing_if = "Option::is_none")] + #[deprecated(note = "Use max_completion_tokens instead")] + pub max_tokens: Option, + + /// An upper bound for the number of tokens that can be generated for a completion + #[serde(skip_serializing_if = "Option::is_none")] + pub max_completion_tokens: Option, + + /// Developer-defined tags and values used for filtering completions in the dashboard + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option>, + + /// Output types that you would like the model to generate for this request + #[serde(skip_serializing_if = "Option::is_none")] + pub modalities: Option>, /// How many chat completion choices to generate for each input message #[serde(skip_serializing_if = "Option::is_none")] pub n: Option, + /// Whether to enable parallel function calling during tool use + #[serde(skip_serializing_if = "Option::is_none")] + pub parallel_tool_calls: Option, + + /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far + #[serde(skip_serializing_if = "Option::is_none")] + pub presence_penalty: Option, + + /// Cache key for prompts (beta feature) + #[serde(skip_serializing_if = "Option::is_none")] + pub prompt_cache_key: Option, + + /// Effort level for reasoning models (low, medium, high) + #[serde(skip_serializing_if = "Option::is_none")] + pub reasoning_effort: Option, + + /// An object specifying the format that the model must output + #[serde(skip_serializing_if = "Option::is_none")] + pub response_format: Option, + + /// Safety identifier for content moderation + #[serde(skip_serializing_if = "Option::is_none")] + pub safety_identifier: Option, + + /// Deprecated: This feature is in Legacy mode + #[serde(skip_serializing_if = "Option::is_none")] + #[deprecated(note = "This feature is in Legacy mode")] + pub seed: Option, + + /// The service tier to use for this request + #[serde(skip_serializing_if = "Option::is_none")] + pub service_tier: Option, + + /// Up to 4 sequences where the API will stop generating further tokens + #[serde(skip_serializing_if = "Option::is_none")] + pub stop: Option, + /// If set, partial message deltas will be sent #[serde(default)] pub stream: bool, @@ -207,69 +275,29 @@ pub struct ChatCompletionRequest { #[serde(skip_serializing_if = "Option::is_none")] pub stream_options: Option, - /// Up to 4 sequences where the API will stop generating further tokens + /// What sampling temperature to use, between 0 and 2 #[serde(skip_serializing_if = "Option::is_none")] - pub stop: Option, - - /// The maximum number of tokens to generate - #[serde(skip_serializing_if = "Option::is_none")] - pub max_tokens: Option, - - /// An upper bound for the number of tokens that can be generated for a completion - #[serde(skip_serializing_if = "Option::is_none")] - pub max_completion_tokens: Option, - - /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far - #[serde(skip_serializing_if = "Option::is_none")] - pub presence_penalty: Option, - - /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far - #[serde(skip_serializing_if = "Option::is_none")] - pub frequency_penalty: Option, - - /// Modify the likelihood of specified tokens appearing in the completion - #[serde(skip_serializing_if = "Option::is_none")] - pub logit_bias: Option>, - - /// A unique identifier representing your end-user - #[serde(skip_serializing_if = "Option::is_none")] - pub user: Option, - - /// If specified, our system will make a best effort to sample deterministically - #[serde(skip_serializing_if = "Option::is_none")] - pub seed: Option, - - /// Whether to return log probabilities of the output tokens - #[serde(default)] - pub logprobs: bool, - - /// An integer between 0 and 20 specifying the number of most likely tokens to return - #[serde(skip_serializing_if = "Option::is_none")] - pub top_logprobs: Option, - - /// An object specifying the format that the model must output - #[serde(skip_serializing_if = "Option::is_none")] - pub response_format: Option, - - /// A list of tools the model may call - #[serde(skip_serializing_if = "Option::is_none")] - pub tools: Option>, + pub temperature: Option, /// Controls which (if any) tool is called by the model #[serde(skip_serializing_if = "Option::is_none")] pub tool_choice: Option, - /// Whether to enable parallel function calling during tool use + /// A list of tools the model may call #[serde(skip_serializing_if = "Option::is_none")] - pub parallel_tool_calls: Option, + pub tools: Option>, - /// Deprecated: use tools instead + /// An integer between 0 and 20 specifying the number of most likely tokens to return #[serde(skip_serializing_if = "Option::is_none")] - pub functions: Option>, + pub top_logprobs: Option, - /// Deprecated: use tool_choice instead + /// An alternative to sampling with temperature #[serde(skip_serializing_if = "Option::is_none")] - pub function_call: Option, + pub top_p: Option, + + /// Verbosity level for debugging + #[serde(skip_serializing_if = "Option::is_none")] + pub verbosity: Option, // ============= SGLang Extensions ============= /// Top-k sampling parameter (-1 to disable) @@ -316,7 +344,6 @@ pub struct ChatCompletionRequest { #[serde(default = "default_true")] pub skip_special_tokens: bool, - // ============= SGLang Extensions ============= /// Path to LoRA adapter(s) for model customization #[serde(skip_serializing_if = "Option::is_none")] pub lora_path: Option, diff --git a/sgl-router/src/protocols/validation.rs b/sgl-router/src/protocols/validation.rs index 460ce2148..ee702f7db 100644 --- a/sgl-router/src/protocols/validation.rs +++ b/sgl-router/src/protocols/validation.rs @@ -563,6 +563,7 @@ impl StopConditionsProvider for ChatCompletionRequest { } impl TokenLimitsProvider for ChatCompletionRequest { + #[allow(deprecated)] fn get_max_tokens(&self) -> Option { // Prefer max_completion_tokens over max_tokens if both are set self.max_completion_tokens.or(self.max_tokens) @@ -656,19 +657,13 @@ impl ChatCompletionRequest { /// Validate chat API specific logprobs requirements pub fn validate_chat_logprobs(&self) -> Result<(), ValidationError> { - // In chat API, if logprobs=true, top_logprobs must be specified - if self.logprobs && self.top_logprobs.is_none() { - return Err(ValidationError::MissingRequired { - parameter: "top_logprobs".to_string(), - }); - } - - // If top_logprobs is specified, logprobs should be true + // OpenAI rule: If top_logprobs is specified, logprobs must be true + // But logprobs=true without top_logprobs is valid (returns basic logprobs) if self.top_logprobs.is_some() && !self.logprobs { return Err(ValidationError::InvalidValue { - parameter: "logprobs".to_string(), - value: "false".to_string(), - reason: "must be true when top_logprobs is specified".to_string(), + parameter: "top_logprobs".to_string(), + value: self.top_logprobs.unwrap().to_string(), + reason: "top_logprobs is only allowed when logprobs is enabled".to_string(), }); } @@ -676,6 +671,7 @@ impl ChatCompletionRequest { } /// Validate cross-parameter relationships specific to chat completions + #[allow(deprecated)] pub fn validate_chat_cross_parameters(&self) -> Result<(), ValidationError> { // Validate that both max_tokens and max_completion_tokens aren't set utils::validate_conflicting_parameters( @@ -871,53 +867,24 @@ mod tests { mod chat_tests { use super::*; + #[allow(deprecated)] fn create_valid_chat_request() -> ChatCompletionRequest { ChatCompletionRequest { - model: "gpt-4".to_string(), messages: vec![ChatMessage::User { role: "user".to_string(), content: UserMessageContent::Text("Hello".to_string()), name: None, }], + model: "gpt-4".to_string(), + // Set specific fields we want to test temperature: Some(1.0), top_p: Some(0.9), n: Some(1), - stream: false, - stream_options: None, - stop: None, max_tokens: Some(100), - max_completion_tokens: None, - presence_penalty: Some(0.0), frequency_penalty: Some(0.0), - logit_bias: None, - user: None, - seed: None, - logprobs: false, - top_logprobs: None, - response_format: None, - tools: None, - tool_choice: None, - parallel_tool_calls: None, - functions: None, - function_call: None, - // SGLang extensions - top_k: None, - min_p: None, - min_tokens: None, - repetition_penalty: None, - regex: None, - ebnf: None, - stop_token_ids: None, - no_stop_trim: false, - ignore_eos: false, - continue_final_message: false, - skip_special_tokens: true, - lora_path: None, - session_params: None, - separate_reasoning: true, - stream_reasoning: true, - chat_template_kwargs: None, - return_hidden_states: false, + presence_penalty: Some(0.0), + // Use default for all other fields + ..Default::default() } } @@ -938,19 +905,47 @@ mod tests { } #[test] - fn test_chat_conflicts() { + #[allow(deprecated)] + fn test_chat_cross_parameter_conflicts() { let mut request = create_valid_chat_request(); - // Conflicting max_tokens + // Test 1: max_tokens vs max_completion_tokens conflict request.max_tokens = Some(100); request.max_completion_tokens = Some(200); - assert!(request.validate().is_err()); + assert!( + request.validate().is_err(), + "Should reject both max_tokens and max_completion_tokens" + ); - // Logprobs without top_logprobs + // Reset for next test request.max_tokens = None; + request.max_completion_tokens = None; + + // Test 2: tools vs functions conflict (deprecated) + request.tools = Some(vec![]); + request.functions = Some(vec![]); + assert!( + request.validate().is_err(), + "Should reject both tools and functions" + ); + + // Test 3: logprobs=true without top_logprobs should be valid + let mut request = create_valid_chat_request(); request.logprobs = true; request.top_logprobs = None; - assert!(request.validate().is_err()); + assert!( + request.validate().is_ok(), + "logprobs=true without top_logprobs should be valid" + ); + + // Test 4: top_logprobs without logprobs=true should fail (OpenAI rule) + let mut request = create_valid_chat_request(); + request.logprobs = false; + request.top_logprobs = Some(5); + assert!( + request.validate().is_err(), + "top_logprobs without logprobs=true should fail" + ); } #[test] @@ -1097,14 +1092,17 @@ mod tests { fn test_logprobs_validation() { let mut request = create_valid_chat_request(); - // Valid logprobs configuration + // Valid logprobs configuration with top_logprobs request.logprobs = true; request.top_logprobs = Some(10); assert!(request.validate().is_ok()); - // logprobs=true without top_logprobs should fail + // logprobs=true without top_logprobs should be valid (OpenAI behavior) request.top_logprobs = None; - assert!(request.validate().is_err()); + assert!( + request.validate().is_ok(), + "logprobs=true without top_logprobs should be valid" + ); // top_logprobs without logprobs=true should fail request.logprobs = false; @@ -1137,6 +1135,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_min_max_tokens_validation() { let mut request = create_valid_chat_request();