router: Fix constraint proto and build_constraint in grpc router (#10881)

2025-09-25 08:12:06 -07:00
parent d511b2d905
commit 916784746b
7 changed files with 172 additions and 141 deletions
--- a/python/sglang/srt/entrypoints/grpc_server.py
+++ b/python/sglang/srt/entrypoints/grpc_server.py
@@ -438,6 +438,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
        regex = None
        json_schema = None
        ebnf_grammar = None
        structural_tag = None
        if grpc_params.HasField("regex"):
            regex = grpc_params.regex
@@ -445,6 +446,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
            json_schema = grpc_params.json_schema
        elif grpc_params.HasField("ebnf_grammar"):
            ebnf_grammar = grpc_params.ebnf_grammar
        elif grpc_params.HasField("structural_tag"):
            structural_tag = grpc_params.structural_tag
        return SGLSamplingParams(
            temperature=grpc_params.temperature or 1.0,
@@ -465,6 +468,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
            regex=regex,
            json_schema=json_schema,
            ebnf=ebnf_grammar,
            structural_tag=structural_tag,
            n=grpc_params.n or 1,
            ignore_eos=grpc_params.ignore_eos,
        )
--- a/python/sglang/srt/grpc/sglang_scheduler.proto
+++ b/python/sglang/srt/grpc/sglang_scheduler.proto
@@ -47,24 +47,24 @@ message SamplingParams {
    string regex = 13;
    string json_schema = 14;
    string ebnf_grammar = 15;
    string structural_tag = 16;
  }
  // LoRA adapter
-  string lora_path = 16;
+  string lora_path = 17;
  // Speculative decoding
-  int32 n = 17;  // Number of samples
+  int32 n = 18;  // Number of samples
  // Token healing
-  bool token_healing = 18;
+  bool token_healing = 19;
  // Additional parameters
-  int32 min_new_tokens = 19;
+  int32 min_new_tokens = 20;
-  bool ignore_eos = 20;
+  bool ignore_eos = 21;
-  bool no_stop_trim = 21;
+  bool no_stop_trim = 22;
-  int32 stream_interval = 22;
+  int32 stream_interval = 23;
-  map<string, float> logit_bias = 23;
+  map<string, float> logit_bias = 24;
  string structural_tag = 24;
  // Custom parameters for extensibility
  google.protobuf.Struct custom_params = 25;
--- a/python/sglang/srt/grpc/sglang_scheduler_pb2.py
+++ b/python/sglang/srt/grpc/sglang_scheduler_pb2.py
--- a/python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
+++ b/python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
@@ -12,7 +12,7 @@ from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
 DESCRIPTOR: _descriptor.FileDescriptor
 class SamplingParams(_message.Message):
-    __slots__ = ("temperature", "top_p", "top_k", "min_p", "frequency_penalty", "presence_penalty", "repetition_penalty", "max_new_tokens", "stop", "stop_token_ids", "skip_special_tokens", "spaces_between_special_tokens", "regex", "json_schema", "ebnf_grammar", "lora_path", "n", "token_healing", "min_new_tokens", "ignore_eos", "no_stop_trim", "stream_interval", "logit_bias", "structural_tag", "custom_params")
+    __slots__ = ("temperature", "top_p", "top_k", "min_p", "frequency_penalty", "presence_penalty", "repetition_penalty", "max_new_tokens", "stop", "stop_token_ids", "skip_special_tokens", "spaces_between_special_tokens", "regex", "json_schema", "ebnf_grammar", "structural_tag", "lora_path", "n", "token_healing", "min_new_tokens", "ignore_eos", "no_stop_trim", "stream_interval", "logit_bias", "custom_params")
    class LogitBiasEntry(_message.Message):
        __slots__ = ("key", "value")
        KEY_FIELD_NUMBER: _ClassVar[int]
@@ -35,6 +35,7 @@ class SamplingParams(_message.Message):
    REGEX_FIELD_NUMBER: _ClassVar[int]
    JSON_SCHEMA_FIELD_NUMBER: _ClassVar[int]
    EBNF_GRAMMAR_FIELD_NUMBER: _ClassVar[int]
    STRUCTURAL_TAG_FIELD_NUMBER: _ClassVar[int]
    LORA_PATH_FIELD_NUMBER: _ClassVar[int]
    N_FIELD_NUMBER: _ClassVar[int]
    TOKEN_HEALING_FIELD_NUMBER: _ClassVar[int]
@@ -43,7 +44,6 @@ class SamplingParams(_message.Message):
    NO_STOP_TRIM_FIELD_NUMBER: _ClassVar[int]
    STREAM_INTERVAL_FIELD_NUMBER: _ClassVar[int]
    LOGIT_BIAS_FIELD_NUMBER: _ClassVar[int]
    STRUCTURAL_TAG_FIELD_NUMBER: _ClassVar[int]
    CUSTOM_PARAMS_FIELD_NUMBER: _ClassVar[int]
    temperature: float
    top_p: float
@@ -60,6 +60,7 @@ class SamplingParams(_message.Message):
    regex: str
    json_schema: str
    ebnf_grammar: str
    structural_tag: str
    lora_path: str
    n: int
    token_healing: bool
@@ -68,9 +69,8 @@ class SamplingParams(_message.Message):
    no_stop_trim: bool
    stream_interval: int
    logit_bias: _containers.ScalarMap[str, float]
    structural_tag: str
    custom_params: _struct_pb2.Struct
-    def __init__(self, temperature: _Optional[float] = ..., top_p: _Optional[float] = ..., top_k: _Optional[int] = ..., min_p: _Optional[float] = ..., frequency_penalty: _Optional[float] = ..., presence_penalty: _Optional[float] = ..., repetition_penalty: _Optional[float] = ..., max_new_tokens: _Optional[int] = ..., stop: _Optional[_Iterable[str]] = ..., stop_token_ids: _Optional[_Iterable[int]] = ..., skip_special_tokens: bool = ..., spaces_between_special_tokens: bool = ..., regex: _Optional[str] = ..., json_schema: _Optional[str] = ..., ebnf_grammar: _Optional[str] = ..., lora_path: _Optional[str] = ..., n: _Optional[int] = ..., token_healing: bool = ..., min_new_tokens: _Optional[int] = ..., ignore_eos: bool = ..., no_stop_trim: bool = ..., stream_interval: _Optional[int] = ..., logit_bias: _Optional[_Mapping[str, float]] = ..., structural_tag: _Optional[str] = ..., custom_params: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...) -> None: ...
+    def __init__(self, temperature: _Optional[float] = ..., top_p: _Optional[float] = ..., top_k: _Optional[int] = ..., min_p: _Optional[float] = ..., frequency_penalty: _Optional[float] = ..., presence_penalty: _Optional[float] = ..., repetition_penalty: _Optional[float] = ..., max_new_tokens: _Optional[int] = ..., stop: _Optional[_Iterable[str]] = ..., stop_token_ids: _Optional[_Iterable[int]] = ..., skip_special_tokens: bool = ..., spaces_between_special_tokens: bool = ..., regex: _Optional[str] = ..., json_schema: _Optional[str] = ..., ebnf_grammar: _Optional[str] = ..., structural_tag: _Optional[str] = ..., lora_path: _Optional[str] = ..., n: _Optional[int] = ..., token_healing: bool = ..., min_new_tokens: _Optional[int] = ..., ignore_eos: bool = ..., no_stop_trim: bool = ..., stream_interval: _Optional[int] = ..., logit_bias: _Optional[_Mapping[str, float]] = ..., custom_params: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...) -> None: ...
 class DisaggregatedParams(_message.Message):
    __slots__ = ("bootstrap_host", "bootstrap_port", "bootstrap_room")
--- a/python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
+++ b/python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
@@ -1,3 +1,6 @@
 # This file is auto-generated. Do not edit manually.
 # Regenerate with: python compile_proto.py
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
--- a/sgl-router/src/proto/sglang_scheduler.proto
+++ b/sgl-router/src/proto/sglang_scheduler.proto
@@ -47,24 +47,24 @@ message SamplingParams {
    string regex = 13;
    string json_schema = 14;
    string ebnf_grammar = 15;
    string structural_tag = 16;
  }
  // LoRA adapter
-  string lora_path = 16;
+  string lora_path = 17;
  // Speculative decoding
-  int32 n = 17;  // Number of samples
+  int32 n = 18;  // Number of samples
  // Token healing
-  bool token_healing = 18;
+  bool token_healing = 19;
  // Additional parameters
-  int32 min_new_tokens = 19;
+  int32 min_new_tokens = 20;
-  bool ignore_eos = 20;
+  bool ignore_eos = 21;
-  bool no_stop_trim = 21;
+  bool no_stop_trim = 22;
-  int32 stream_interval = 22;
+  int32 stream_interval = 23;
-  map<string, float> logit_bias = 23;
+  map<string, float> logit_bias = 24;
  string structural_tag = 24;
  // Custom parameters for extensibility
  google.protobuf.Struct custom_params = 25;
--- a/sgl-router/src/routers/grpc/router.rs
+++ b/sgl-router/src/routers/grpc/router.rs
@@ -241,14 +241,14 @@ impl GrpcRouter {
        debug!("Tokenized {} tokens from input", token_ids.len());
        // Step 5: Build tool constraints if needed
-        let structural_tag = if let Some(tools) = &body.tools {
+        let tool_call_constraint = if let Some(tools) = &body.tools {
            self.generate_tool_constraints(tools, &body.tool_choice, &body.model)
        } else {
            None
        };
        // Step 6: Build SamplingParams for gRPC
-        let sampling_params = match self.build_grpc_sampling_params(body, structural_tag) {
+        let sampling_params = match self.build_grpc_sampling_params(body, tool_call_constraint) {
            Ok(params) => params,
            Err(e) => {
                error!("Failed to build sampling parameters: {}", e);
@@ -286,6 +286,41 @@ impl GrpcRouter {
    }
    // ============ Helper Methods ============
    /// Select a worker for the request
    fn select_worker_for_request(
        &self,
        model_id: Option<&str>,
        text: Option<&str>,
    ) -> Option<Arc<dyn crate::core::Worker>> {
        // Get workers for the specified model, filtered by connection mode
        let workers = self.worker_registry.get_workers_filtered(
            model_id,
            Some(WorkerType::Regular),
            Some(crate::core::ConnectionMode::Grpc { port: None }),
            false, // get all workers, we'll filter by is_available() next
        );
        // Filter by availability (health + circuit breaker)
        let available: Vec<Arc<dyn crate::core::Worker>> = workers
            .iter()
            .filter(|w| w.is_available())
            .cloned()
            .collect();
        if available.is_empty() {
            return None;
        }
        // Get the appropriate policy for this model
        let policy = match model_id {
            Some(model) => self.policy_registry.get_policy_or_default(model),
            None => self.policy_registry.get_default_policy(),
        };
        // Select worker using the policy
        let idx = policy.select_worker(&available, text)?;
        Some(available[idx].clone())
    }
    /// Process chat messages and apply template
    fn process_chat_messages(
@@ -516,7 +551,7 @@ impl GrpcRouter {
    fn build_grpc_sampling_params(
        &self,
        request: &ChatCompletionRequest,
-        structural_tag: Option<String>,
+        tool_call_constraint: Option<(String, String)>,
    ) -> Result<proto::SamplingParams, String> {
        let stop_sequences = self.extract_stop_strings(request);
@@ -555,8 +590,7 @@ impl GrpcRouter {
            stop_token_ids: request.stop_token_ids.clone().unwrap_or_default(),
            skip_special_tokens,
            n: request.n.unwrap_or(1) as i32,
-            structural_tag: structural_tag.unwrap_or_default(),
+            constraint: self.build_constraint(request, tool_call_constraint)?,
            constraint: self.build_constraint(request)?,
            ..Default::default()
        })
    }
@@ -574,28 +608,48 @@ impl GrpcRouter {
    fn build_constraint(
        &self,
        request: &ChatCompletionRequest,
        tool_call_constraint: Option<(String, String)>,
    ) -> Result<Option<proto::sampling_params::Constraint>, String> {
        let mut constraints = Vec::new();
        if let Some(ResponseFormat::JsonSchema { json_schema }) = &request.response_format {
            let schema_str = serde_json::to_string(&json_schema.schema)
                .map_err(|e| format!("Failed to serialize JSON schema: {}", e))?;
-            return Ok(Some(proto::sampling_params::Constraint::JsonSchema(
+            constraints.push(proto::sampling_params::Constraint::JsonSchema(schema_str));
                schema_str,
            )));
        }
        if let Some(ebnf) = &request.ebnf {
-            return Ok(Some(proto::sampling_params::Constraint::EbnfGrammar(
+            constraints.push(proto::sampling_params::Constraint::EbnfGrammar(
                ebnf.clone(),
-            )));
+            ));
        }
        if let Some(regex) = &request.regex {
-            return Ok(Some(proto::sampling_params::Constraint::Regex(
+            constraints.push(proto::sampling_params::Constraint::Regex(regex.clone()));
                regex.clone(),
            )));
        }
-        Ok(None)
+        // Handle tool call constraint
        if let Some((constraint_type, constraint_value)) = tool_call_constraint {
            if !constraints.is_empty() {
                return Err("Constrained decoding is not compatible with tool calls.".to_string());
            }
            let tool_constraint = match constraint_type.as_str() {
                "structural_tag" => {
                    proto::sampling_params::Constraint::StructuralTag(constraint_value)
                }
                "json_schema" => proto::sampling_params::Constraint::JsonSchema(constraint_value),
                "ebnf" => proto::sampling_params::Constraint::EbnfGrammar(constraint_value),
                "regex" => proto::sampling_params::Constraint::Regex(constraint_value),
                _ => return Err(format!("Unknown constraint type: {}", constraint_type)),
            };
            constraints.push(tool_constraint);
        }
        match constraints.len() {
            0 => Ok(None),
            1 => Ok(constraints.pop()),
            _ => Err("Multiple constraints are not allowed.".to_string()),
        }
    }
    /// Generate tool constraints for structured generation
@@ -604,52 +658,19 @@ impl GrpcRouter {
        _tools: &[crate::protocols::spec::Tool],
        _tool_choice: &Option<crate::protocols::spec::ToolChoice>,
        model: &str,
-    ) -> Option<String> {
+    ) -> Option<(String, String)> {
        let _parser = self.tool_parser_registry.get_parser(model)?;
        // TODO: Implement actual constraint generation logic
        // For now, return None as this is placeholder implementation
        None
    }
    /// Select a worker for the request
    fn select_worker_for_request(
        &self,
        model_id: Option<&str>,
        text: Option<&str>,
    ) -> Option<Arc<dyn crate::core::Worker>> {
        // Get workers for the specified model, filtered by connection mode
        let workers = self.worker_registry.get_workers_filtered(
            model_id,
            Some(WorkerType::Regular),
            Some(crate::core::ConnectionMode::Grpc { port: None }),
            false, // get all workers, we'll filter by is_available() next
        );
        // Filter by availability (health + circuit breaker)
        let available: Vec<Arc<dyn crate::core::Worker>> = workers
            .iter()
            .filter(|w| w.is_available())
            .cloned()
            .collect();
        if available.is_empty() {
            return None;
        }
        // Get the appropriate policy for this model
        let policy = match model_id {
            Some(model) => self.policy_registry.get_policy_or_default(model),
            None => self.policy_registry.get_default_policy(),
        };
        // Select worker using the policy
        let idx = policy.select_worker(&available, text)?;
        Some(available[idx].clone())
    }
    /// Get or create a gRPC client for the worker
    async fn get_or_create_grpc_client(
        &self,
        worker_url: &str,
    ) -> Result<SglangSchedulerClient, String> {
        // TODO: move to worker
        debug!("Creating new gRPC client for worker: {}", worker_url);
        SglangSchedulerClient::connect(worker_url)
            .await