router-spec: Reorder ChatCompletionRequest and fix validation logic (#10675)

2025-09-19 16:41:21 -07:00
parent 00eb5eb721
commit 03ce92e594
3 changed files with 150 additions and 158 deletions
--- a/sgl-router/benches/request_processing.rs
+++ b/sgl-router/benches/request_processing.rs
@@ -48,50 +48,15 @@ fn default_generate_request() -> GenerateRequest {
 }

 /// Create a default ChatCompletionRequest for benchmarks with minimal fields set
+#[allow(deprecated)]
 fn default_chat_completion_request() -> ChatCompletionRequest {
    ChatCompletionRequest {
-        model: String::new(),
+        // Required fields in OpenAI order
        messages: vec![],
-        max_tokens: None,
-        max_completion_tokens: None,
-        temperature: None,
-        top_p: None,
-        n: None,
-        stream: false,
-        stream_options: None,
-        stop: None,
-        presence_penalty: None,
-        frequency_penalty: None,
-        logit_bias: None,
-        logprobs: false,
-        top_logprobs: None,
-        user: None,
-        response_format: None,
-        seed: None,
-        tools: None,
-        tool_choice: None,
-        parallel_tool_calls: None,
-        function_call: None,
-        functions: None,
-        // SGLang Extensions
-        top_k: None,
-        min_p: None,
-        min_tokens: None,
-        repetition_penalty: None,
-        regex: None,
-        ebnf: None,
-        stop_token_ids: None,
-        no_stop_trim: false,
-        ignore_eos: false,
-        continue_final_message: false,
-        skip_special_tokens: true,
-        // SGLang Extensions
-        lora_path: None,
-        session_params: None,
-        separate_reasoning: true,
-        stream_reasoning: true,
-        chat_template_kwargs: None,
-        return_hidden_states: false,
+        model: String::new(),
+
+        // Use default for all other fields
+        ..Default::default()
    }
 }

@@ -161,6 +126,7 @@ fn create_sample_generate_request() -> GenerateRequest {
    }
 }

+#[allow(deprecated)]
 fn create_sample_chat_completion_request() -> ChatCompletionRequest {
    ChatCompletionRequest {
        model: "gpt-3.5-turbo".to_string(),
@@ -205,6 +171,7 @@ fn create_sample_completion_request() -> CompletionRequest {
    }
 }

+#[allow(deprecated)]
 fn create_large_chat_completion_request() -> ChatCompletionRequest {
    let mut messages = vec![ChatMessage::System {
        role: "system".to_string(),
@@ -240,7 +207,6 @@ fn create_large_chat_completion_request() -> ChatCompletionRequest {
        presence_penalty: Some(0.1),
        frequency_penalty: Some(0.1),
        top_logprobs: Some(5),
-        user: Some("benchmark_user".to_string()),
        seed: Some(42),
        parallel_tool_calls: Some(true),
        ..default_chat_completion_request()
--- a/sgl-router/src/protocols/spec.rs
+++ b/sgl-router/src/protocols/spec.rs
@@ -179,26 +179,94 @@ pub struct FunctionCallDelta {

 // ============= Request =============

-#[derive(Debug, Clone, Deserialize, Serialize)]
+#[derive(Debug, Clone, Deserialize, Serialize, Default)]
 pub struct ChatCompletionRequest {
-    /// ID of the model to use
-    pub model: String,
-
    /// A list of messages comprising the conversation so far
    pub messages: Vec<ChatMessage>,

-    /// What sampling temperature to use, between 0 and 2
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub temperature: Option<f32>,
+    /// ID of the model to use
+    pub model: String,

-    /// An alternative to sampling with temperature
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far
    #[serde(skip_serializing_if = "Option::is_none")]
-    pub top_p: Option<f32>,
+    pub frequency_penalty: Option<f32>,
+
+    /// Deprecated: Replaced by tool_choice
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[deprecated(note = "Use tool_choice instead")]
+    pub function_call: Option<FunctionCall>,
+
+    /// Deprecated: Replaced by tools
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[deprecated(note = "Use tools instead")]
+    pub functions: Option<Vec<Function>>,
+
+    /// Modify the likelihood of specified tokens appearing in the completion
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logit_bias: Option<HashMap<String, f32>>,
+
+    /// Whether to return log probabilities of the output tokens
+    #[serde(default)]
+    pub logprobs: bool,
+
+    /// Deprecated: The maximum number of tokens to generate; replaced by max_completion_tokens
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[deprecated(note = "Use max_completion_tokens instead")]
+    pub max_tokens: Option<u32>,
+
+    /// An upper bound for the number of tokens that can be generated for a completion
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_completion_tokens: Option<u32>,
+
+    /// Developer-defined tags and values used for filtering completions in the dashboard
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<HashMap<String, String>>,
+
+    /// Output types that you would like the model to generate for this request
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modalities: Option<Vec<String>>,

    /// How many chat completion choices to generate for each input message
    #[serde(skip_serializing_if = "Option::is_none")]
    pub n: Option<u32>,

+    /// Whether to enable parallel function calling during tool use
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub parallel_tool_calls: Option<bool>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub presence_penalty: Option<f32>,
+
+    /// Cache key for prompts (beta feature)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub prompt_cache_key: Option<String>,
+
+    /// Effort level for reasoning models (low, medium, high)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub reasoning_effort: Option<String>,
+
+    /// An object specifying the format that the model must output
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub response_format: Option<ResponseFormat>,
+
+    /// Safety identifier for content moderation
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub safety_identifier: Option<String>,
+
+    /// Deprecated (legacy): If specified, the system makes a best effort to sample deterministically
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[deprecated(note = "This feature is in Legacy mode")]
+    pub seed: Option<i64>,
+
+    /// The service tier to use for this request
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub service_tier: Option<String>,
+
+    /// Up to 4 sequences where the API will stop generating further tokens
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop: Option<StringOrArray>,
+
    /// If set, partial message deltas will be sent
    #[serde(default)]
    pub stream: bool,
@@ -207,69 +275,29 @@ pub struct ChatCompletionRequest {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stream_options: Option<StreamOptions>,

-    /// Up to 4 sequences where the API will stop generating further tokens
+    /// What sampling temperature to use, between 0 and 2
    #[serde(skip_serializing_if = "Option::is_none")]
-    pub stop: Option<StringOrArray>,
-
-    /// The maximum number of tokens to generate
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub max_tokens: Option<u32>,
-
-    /// An upper bound for the number of tokens that can be generated for a completion
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub max_completion_tokens: Option<u32>,
-
-    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub presence_penalty: Option<f32>,
-
-    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub frequency_penalty: Option<f32>,
-
-    /// Modify the likelihood of specified tokens appearing in the completion
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub logit_bias: Option<HashMap<String, f32>>,
-
-    /// A unique identifier representing your end-user
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub user: Option<String>,
-
-    /// If specified, our system will make a best effort to sample deterministically
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub seed: Option<i64>,
-
-    /// Whether to return log probabilities of the output tokens
-    #[serde(default)]
-    pub logprobs: bool,
-
-    /// An integer between 0 and 20 specifying the number of most likely tokens to return
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub top_logprobs: Option<u32>,
-
-    /// An object specifying the format that the model must output
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub response_format: Option<ResponseFormat>,
-
-    /// A list of tools the model may call
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub tools: Option<Vec<Tool>>,
+    pub temperature: Option<f32>,

    /// Controls which (if any) tool is called by the model
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,

-    /// Whether to enable parallel function calling during tool use
+    /// A list of tools the model may call
    #[serde(skip_serializing_if = "Option::is_none")]
-    pub parallel_tool_calls: Option<bool>,
+    pub tools: Option<Vec<Tool>>,

-    /// Deprecated: use tools instead
+    /// An integer between 0 and 20 specifying the number of most likely tokens to return
    #[serde(skip_serializing_if = "Option::is_none")]
-    pub functions: Option<Vec<Function>>,
+    pub top_logprobs: Option<u32>,

-    /// Deprecated: use tool_choice instead
+    /// An alternative to sampling with temperature
    #[serde(skip_serializing_if = "Option::is_none")]
-    pub function_call: Option<FunctionCall>,
+    pub top_p: Option<f32>,
+
+    /// Constrains the verbosity of the model's response
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub verbosity: Option<i32>,

    // ============= SGLang Extensions =============
    /// Top-k sampling parameter (-1 to disable)
@@ -316,7 +344,6 @@ pub struct ChatCompletionRequest {
    #[serde(default = "default_true")]
    pub skip_special_tokens: bool,

-    // ============= SGLang Extensions =============
    /// Path to LoRA adapter(s) for model customization
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lora_path: Option<LoRAPath>,
--- a/sgl-router/src/protocols/validation.rs
+++ b/sgl-router/src/protocols/validation.rs
@@ -563,6 +563,7 @@ impl StopConditionsProvider for ChatCompletionRequest {
 }

 impl TokenLimitsProvider for ChatCompletionRequest {
+    #[allow(deprecated)]
    fn get_max_tokens(&self) -> Option<u32> {
        // Prefer max_completion_tokens over max_tokens if both are set
        self.max_completion_tokens.or(self.max_tokens)
@@ -656,19 +657,13 @@ impl ChatCompletionRequest {

    /// Validate chat API specific logprobs requirements
    pub fn validate_chat_logprobs(&self) -> Result<(), ValidationError> {
-        // In chat API, if logprobs=true, top_logprobs must be specified
-        if self.logprobs && self.top_logprobs.is_none() {
-            return Err(ValidationError::MissingRequired {
-                parameter: "top_logprobs".to_string(),
-            });
-        }
-
-        // If top_logprobs is specified, logprobs should be true
+        // OpenAI rule: top_logprobs may only be specified when logprobs is true.
+        // logprobs=true without top_logprobs is still valid (returns basic logprobs).
        if self.top_logprobs.is_some() && !self.logprobs {
            return Err(ValidationError::InvalidValue {
-                parameter: "logprobs".to_string(),
-                value: "false".to_string(),
-                reason: "must be true when top_logprobs is specified".to_string(),
+                parameter: "top_logprobs".to_string(),
+                value: self.top_logprobs.unwrap().to_string(),
+                reason: "top_logprobs is only allowed when logprobs is enabled".to_string(),
            });
        }

@@ -676,6 +671,7 @@ impl ChatCompletionRequest {
    }

    /// Validate cross-parameter relationships specific to chat completions
+    #[allow(deprecated)]
    pub fn validate_chat_cross_parameters(&self) -> Result<(), ValidationError> {
        // Validate that both max_tokens and max_completion_tokens aren't set
        utils::validate_conflicting_parameters(
@@ -871,53 +867,24 @@ mod tests {
    mod chat_tests {
        use super::*;

+        #[allow(deprecated)]
        fn create_valid_chat_request() -> ChatCompletionRequest {
            ChatCompletionRequest {
-                model: "gpt-4".to_string(),
                messages: vec![ChatMessage::User {
                    role: "user".to_string(),
                    content: UserMessageContent::Text("Hello".to_string()),
                    name: None,
                }],
+                model: "gpt-4".to_string(),
+                // Set specific fields we want to test
                temperature: Some(1.0),
                top_p: Some(0.9),
                n: Some(1),
-                stream: false,
-                stream_options: None,
-                stop: None,
                max_tokens: Some(100),
-                max_completion_tokens: None,
-                presence_penalty: Some(0.0),
                frequency_penalty: Some(0.0),
-                logit_bias: None,
-                user: None,
-                seed: None,
-                logprobs: false,
-                top_logprobs: None,
-                response_format: None,
-                tools: None,
-                tool_choice: None,
-                parallel_tool_calls: None,
-                functions: None,
-                function_call: None,
-                // SGLang extensions
-                top_k: None,
-                min_p: None,
-                min_tokens: None,
-                repetition_penalty: None,
-                regex: None,
-                ebnf: None,
-                stop_token_ids: None,
-                no_stop_trim: false,
-                ignore_eos: false,
-                continue_final_message: false,
-                skip_special_tokens: true,
-                lora_path: None,
-                session_params: None,
-                separate_reasoning: true,
-                stream_reasoning: true,
-                chat_template_kwargs: None,
-                return_hidden_states: false,
+                presence_penalty: Some(0.0),
+                // Use default for all other fields
+                ..Default::default()
            }
        }

@@ -938,19 +905,47 @@ mod tests {
        }

        #[test]
-        fn test_chat_conflicts() {
+        #[allow(deprecated)]
+        fn test_chat_cross_parameter_conflicts() {
            let mut request = create_valid_chat_request();

-            // Conflicting max_tokens
+            // Test 1: max_tokens vs max_completion_tokens conflict
            request.max_tokens = Some(100);
            request.max_completion_tokens = Some(200);
-            assert!(request.validate().is_err());
+            assert!(
+                request.validate().is_err(),
+                "Should reject both max_tokens and max_completion_tokens"
+            );

-            // Logprobs without top_logprobs
+            // Reset for next test
            request.max_tokens = None;
+            request.max_completion_tokens = None;
+
+            // Test 2: tools vs functions conflict (deprecated)
+            request.tools = Some(vec![]);
+            request.functions = Some(vec![]);
+            assert!(
+                request.validate().is_err(),
+                "Should reject both tools and functions"
+            );
+
+            // Test 3: logprobs=true without top_logprobs should be valid
+            let mut request = create_valid_chat_request();
            request.logprobs = true;
            request.top_logprobs = None;
-            assert!(request.validate().is_err());
+            assert!(
+                request.validate().is_ok(),
+                "logprobs=true without top_logprobs should be valid"
+            );
+
+            // Test 4: top_logprobs without logprobs=true should fail (OpenAI rule)
+            let mut request = create_valid_chat_request();
+            request.logprobs = false;
+            request.top_logprobs = Some(5);
+            assert!(
+                request.validate().is_err(),
+                "top_logprobs without logprobs=true should fail"
+            );
        }

        #[test]
@@ -1097,14 +1092,17 @@ mod tests {
        fn test_logprobs_validation() {
            let mut request = create_valid_chat_request();

-            // Valid logprobs configuration
+            // Valid logprobs configuration with top_logprobs
            request.logprobs = true;
            request.top_logprobs = Some(10);
            assert!(request.validate().is_ok());

-            // logprobs=true without top_logprobs should fail
+            // logprobs=true without top_logprobs should be valid (OpenAI behavior)
            request.top_logprobs = None;
-            assert!(request.validate().is_err());
+            assert!(
+                request.validate().is_ok(),
+                "logprobs=true without top_logprobs should be valid"
+            );

            // top_logprobs without logprobs=true should fail
            request.logprobs = false;
@@ -1137,6 +1135,7 @@ mod tests {
        }

        #[test]
+        #[allow(deprecated)]
        fn test_min_max_tokens_validation() {
            let mut request = create_valid_chat_request();