// [router] Add spec for sglang scheduler (#9322)
// New file: sgl-router/src/proto/sglang_scheduler.proto (541 lines)
syntax = "proto3";

package sglang.grpc.scheduler;

import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

// Service definition for SGLang scheduler communication.
// This protocol bridges the Rust router and Python scheduler.
service SGLangScheduler {
  // Initialize connection and get model info.
  rpc Initialize(InitializeRequest) returns (InitializeResponse);

  // Submit a generation request; results stream back incrementally.
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request.
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check and metrics.
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request.
  rpc AbortRequest(AbortRequest) returns (AbortResponse);

  // Flush KV cache.
  rpc FlushCache(FlushCacheRequest) returns (FlushCacheResponse);
}
|
||||
|
||||
// =====================
// Common Types
// =====================

// Sampling parameters matching SGLang's SamplingParams.
message SamplingParams {
  // Core token-sampling controls.
  float temperature = 1;
  float top_p = 2;
  int32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  // Output length and stop conditions.
  int32 max_new_tokens = 8;
  repeated string stop = 9;
  repeated int32 stop_token_ids = 10;
  bool skip_special_tokens = 11;
  bool spaces_between_special_tokens = 12;

  // Structured generation: at most one constraint kind may be set.
  oneof constraint {
    string regex = 13;
    string json_schema = 14;
    string ebnf_grammar = 15;
  }

  // LoRA adapter
  string lora_path = 16;

  // Speculative decoding
  int32 n = 17;  // Number of samples

  // Token healing
  bool token_healing = 18;

  // Additional parameters
  int32 min_new_tokens = 19;
  bool ignore_eos = 20;
  bool no_stop_trim = 21;
  int32 stream_interval = 22;
  // Per-token bias; keys presumably stringified token IDs — TODO confirm
  // against the Python scheduler's SamplingParams.
  map<string, float> logit_bias = 23;
  string structural_tag = 24;

  // Custom parameters for extensibility.
  google.protobuf.Struct custom_params = 25;
}
|
||||
|
||||
// Session parameters for continual prompting.
message SessionParams {
  string session_id = 1;
  string request_id = 2;
  int32 offset = 3;
  bool replace = 4;
  bool drop_previous_output = 5;
}
|
||||
|
||||
// Disaggregated serving parameters (prefill/decode bootstrap rendezvous).
message DisaggregatedParams {
  string bootstrap_host = 1;
  int32 bootstrap_port = 2;
  int32 bootstrap_room = 3;
}
|
||||
|
||||
// =====================
// Initialize
// =====================

message InitializeRequest {
  string client_id = 1;
  string client_version = 2;

  // Operating mode.
  // NOTE(review): values lack a MODE_ prefix and a *_UNSPECIFIED zero value
  // (proto3 convention); REGULAR doubles as the default. Renaming/renumbering
  // now would break generated code, so this is flagged rather than changed.
  enum Mode {
    REGULAR = 0;  // Normal mode with local scheduler
    PREFILL = 1;  // Prefill-only mode for disaggregated serving
    DECODE = 2;   // Decode-only mode for disaggregated serving
  }
  Mode mode = 3;
}
|
||||
|
||||
message InitializeResponse {
  bool success = 1;
  string scheduler_version = 2;

  // Model information
  ModelInfo model_info = 3;

  // Server capabilities
  ServerCapabilities capabilities = 4;

  // Error message if success is false
  string error_message = 5;
}
|
||||
|
||||
// Static description of the model served by the scheduler.
message ModelInfo {
  string model_name = 1;
  int32 max_context_length = 2;
  int32 vocab_size = 3;
  bool supports_tool_calling = 4;
  bool supports_vision = 5;
  repeated string special_tokens = 6;

  // Additional model metadata
  string model_type = 7;
  int32 num_layers = 8;
  int32 hidden_size = 9;
  int32 num_attention_heads = 10;
  int32 num_key_value_heads = 11;

  // Tokenizer info
  string tokenizer_type = 12;
  repeated int32 eos_token_ids = 13;
  int32 pad_token_id = 14;
  int32 bos_token_id = 15;
}
|
||||
|
||||
// Feature and capacity flags advertised by the scheduler at initialization.
message ServerCapabilities {
  bool continuous_batching = 1;
  bool disaggregated_serving = 2;
  bool speculative_decoding = 3;
  int32 max_batch_size = 4;
  int32 max_num_batched_tokens = 5;
  int32 max_prefill_tokens = 6;
  string attention_backend = 7;  // "flashinfer", "triton", "torch"

  // Additional capabilities
  bool supports_lora = 8;
  bool supports_grammar = 9;
  bool supports_multimodal = 10;
  repeated string supported_modalities = 11;  // ["image", "video", "audio"]
  bool supports_custom_logit_processor = 12;
  bool supports_session = 13;

  // Hardware info
  int32 num_gpus = 14;
  string gpu_type = 15;
  int64 total_gpu_memory = 16;  // presumably bytes — TODO confirm units

  // Parallelism info
  int32 tensor_parallel_size = 17;
  int32 pipeline_parallel_size = 18;
  int32 data_parallel_size = 19;
}
|
||||
|
||||
// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Input can be either raw text or pre-tokenized IDs.
  oneof input {
    string text = 2;
    TokenizedInput tokenized = 3;
  }

  // Multimodal inputs
  MultimodalInputs mm_inputs = 4;

  // Generation parameters
  SamplingParams sampling_params = 5;

  // Return options
  bool return_logprob = 6;
  int32 logprob_start_len = 7;
  int32 top_logprobs_num = 8;
  repeated int32 token_ids_logprob = 9;
  bool return_hidden_states = 10;

  // Session management
  SessionParams session_params = 11;

  // For disaggregated serving
  DisaggregatedParams disaggregated_params = 12;

  // Custom logit processor (serialized)
  string custom_logit_processor = 13;

  // Request metadata
  google.protobuf.Timestamp timestamp = 14;
  bool log_metrics = 15;

  // Input embeddings (alternative to text/tokens).
  // NOTE(review): flat float list; the shape contract (tokens x hidden_size?)
  // is not expressed here — confirm with the scheduler implementation.
  repeated float input_embeds = 16;

  // LoRA adapter ID (if pre-loaded)
  string lora_id = 17;

  // Data parallel routing
  int32 data_parallel_rank = 18;

  // For load balancing
  int32 dp_balance_id = 19;
}
|
||||
|
||||
// Pre-tokenized input, carrying both the IDs and the source text.
message TokenizedInput {
  string original_text = 1;  // For reference
  repeated int32 input_ids = 2;
}
|
||||
|
||||
message MultimodalInputs {
  // Simplified multimodal handling - actual data processed by tokenizer
  repeated string image_urls = 1;
  repeated string video_urls = 2;
  repeated string audio_urls = 3;

  // Pre-processed multimodal features (if available)
  google.protobuf.Struct processed_features = 4;

  // Raw data for direct processing
  repeated bytes image_data = 5;
  repeated bytes video_data = 6;
  repeated bytes audio_data = 7;

  // Modality metadata
  repeated string modalities = 8;
}
|
||||
|
||||
// =====================
// Generate Response
// =====================

// One streamed message of a generation; exactly one of chunk/complete/error
// is set per message.
message GenerateResponse {
  string request_id = 1;

  // Response type
  oneof response {
    GenerateStreamChunk chunk = 2;
    GenerateComplete complete = 3;
    GenerateError error = 4;
  }
}
|
||||
|
||||
// Incremental generation output for one token.
message GenerateStreamChunk {
  // Generated token
  int32 token_id = 1;
  string text = 2;

  // Cumulative counts
  int32 prompt_tokens = 3;
  int32 completion_tokens = 4;
  int32 cached_tokens = 5;

  // Logprobs (if requested)
  LogProbs logprobs = 6;

  // Hidden states (if requested)
  repeated float hidden_states = 7;

  // Metadata
  // NOTE(review): units are not specified for either time field, and
  // queue_time is int32 while generation_time is float — confirm units
  // (seconds vs. ms) with the scheduler before relying on them.
  float generation_time = 8;  // Time to generate this token
  int32 queue_time = 9;       // Time spent in queue
}
|
||||
|
||||
// Final message of a generation stream.
message GenerateComplete {
  // Final output
  repeated int32 output_ids = 1;
  string output_text = 2;

  // Finish reason.
  // NOTE(review): STOP occupies the zero value, so an unset field is
  // indistinguishable from "stopped normally"; proto3 convention would be a
  // *_UNSPECIFIED zero value. Renumbering now would be wire-breaking.
  enum FinishReason {
    // The model generated a stop sequence.
    STOP = 0;
    // The model reached the maximum generation length.
    LENGTH = 1;
    // The model generated an end-of-sequence (EOS) token.
    EOS_TOKEN = 2;
    // The model generated a user-provided stop string.
    STOP_STR = 3;
    // The request was aborted by the user or system.
    ABORT = 4;
  }
  FinishReason finish_reason = 3;

  // Final counts
  int32 prompt_tokens = 4;
  int32 completion_tokens = 5;
  int32 cached_tokens = 6;

  // Performance metrics
  float total_generation_time = 7;
  float time_to_first_token = 8;
  float tokens_per_second = 9;

  // Spec decode metrics
  int32 spec_verify_count = 10;

  // All logprobs if requested
  repeated LogProbs all_logprobs = 11;

  // All hidden states if requested
  repeated HiddenStates all_hidden_states = 12;
}
|
||||
|
||||
// Terminal error for a generation stream.
message GenerateError {
  string message = 1;
  // NOTE(review): typed as string rather than int32 despite the name —
  // confirm the intended representation with the Python scheduler.
  string http_status_code = 2;
  string details = 3;
}
|
||||
|
||||
// Per-position log-probabilities. The repeated fields are parallel arrays
// indexed by token position.
message LogProbs {
  repeated float token_logprobs = 1;
  repeated int32 token_ids = 2;

  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;

  // Decoded text for tokens
  repeated string token_texts = 4;
}
|
||||
|
||||
// Top-k alternatives at a single position (parallel arrays).
message TopLogProbs {
  repeated float values = 1;
  repeated int32 token_ids = 2;
  repeated string token_texts = 3;
}
|
||||
|
||||
// Hidden-state vector for one (layer, position) pair.
message HiddenStates {
  repeated float values = 1;
  int32 layer = 2;
  int32 position = 3;
}
|
||||
|
||||
// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;

  oneof input {
    string text = 2;
    TokenizedInput tokenized = 3;
  }

  // Multimodal inputs
  MultimodalInputs mm_inputs = 4;

  // Dummy sampling params for compatibility;
  // EmbedRequest doesn't use sampling_params.
  SamplingParams sampling_params = 5;

  bool log_metrics = 6;

  // Token type IDs for models that require them
  repeated int32 token_type_ids = 7;

  // Data parallel routing
  int32 data_parallel_rank = 8;

  // For cross-encoder requests
  bool is_cross_encoder = 9;
  repeated string texts = 10;  // For cross-encoder batch
}
|
||||
|
||||
// Embedding result: exactly one of complete/error is set.
message EmbedResponse {
  string request_id = 1;

  oneof response {
    EmbedComplete complete = 2;
    EmbedError error = 3;
  }
}
|
||||
|
||||
// Successful embedding result.
message EmbedComplete {
  repeated float embedding = 1;
  int32 prompt_tokens = 2;
  int32 cached_tokens = 3;

  // Additional metadata
  int32 embedding_dim = 4;
  float generation_time = 5;  // units unspecified — TODO confirm (seconds?)

  // For batch embeddings
  repeated Embedding batch_embeddings = 6;
}

// One embedding in a batch, tagged with its input index.
message Embedding {
  repeated float values = 1;
  int32 index = 2;
}
|
||||
|
||||
// Terminal error for an embedding request.
message EmbedError {
  string message = 1;
  string code = 2;
  string details = 3;
}
|
||||
|
||||
// =====================
// Management Operations
// =====================

message HealthCheckRequest {
  bool include_detailed_metrics = 1;
}
|
||||
|
||||
message HealthCheckResponse {
  bool healthy = 1;

  // Current load metrics
  int32 num_requests_running = 2;
  int32 num_requests_waiting = 3;
  float gpu_cache_usage = 4;
  float gpu_memory_usage = 5;

  // KV cache metrics
  int32 kv_cache_total_blocks = 6;
  int32 kv_cache_used_blocks = 7;
  float kv_cache_hit_rate = 8;

  // Additional metrics
  int32 num_grammar_queue_requests = 9;
  float generation_throughput = 10;    // tokens/sec
  float average_queue_time = 11;       // seconds
  float average_generation_time = 12;  // seconds

  // System metrics
  float cpu_usage = 13;
  int64 memory_usage = 14;  // presumably bytes — TODO confirm units

  // Disaggregation metrics
  int32 num_prefill_requests = 15;
  int32 num_decode_requests = 16;

  // Detailed metrics (populated when
  // HealthCheckRequest.include_detailed_metrics is set)
  google.protobuf.Struct detailed_metrics = 17;
}
|
||||
|
||||
message AbortRequest {
  string request_id = 1;
  string reason = 2;
}

message AbortResponse {
  bool success = 1;
  string message = 2;
}
|
||||
|
||||
message FlushCacheRequest {
  bool flush_all = 1;
  repeated string session_ids = 2;  // Flush specific sessions
}

message FlushCacheResponse {
  bool success = 1;
  int32 num_entries_flushed = 2;
  int64 memory_freed = 3;  // bytes
  string message = 4;
}
|
||||
|
||||
// =====================
// Additional Operations (Future)
// =====================
// NOTE(review): none of the messages below are referenced by an RPC in
// SGLangScheduler yet; they are reserved for future service methods.

// Load LoRA adapter
message LoadLoRARequest {
  string adapter_id = 1;
  string adapter_path = 2;
  int32 rank = 3;
}

message LoadLoRAResponse {
  bool success = 1;
  string adapter_id = 2;
  string message = 3;
}

// Unload LoRA adapter
message UnloadLoRARequest {
  string adapter_id = 1;
}

message UnloadLoRAResponse {
  bool success = 1;
  string message = 2;
}
|
||||
|
||||
// Update weights; exactly one source may be set.
message UpdateWeightsRequest {
  oneof source {
    string disk_path = 1;
    bytes tensor_data = 2;
    string remote_url = 3;
  }
  string weight_name = 4;
}

message UpdateWeightsResponse {
  bool success = 1;
  string message = 2;
}
|
||||
|
||||
// Get internal state for debugging
message GetInternalStateRequest {
  repeated string state_keys = 1;
}

message GetInternalStateResponse {
  google.protobuf.Struct state = 1;
}

// Set internal state for testing
message SetInternalStateRequest {
  google.protobuf.Struct state = 1;
}

message SetInternalStateResponse {
  bool success = 1;
  string message = 2;
}
|
||||
// (end of file)