syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication.
// This protocol is designed for efficient binary communication between
// the Rust router and the vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submit a generation request (supports streaming).
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request.
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check.
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort one or more running requests.
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information.
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information.
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters for text generation.
message SamplingParams {
  // Softmax temperature; explicit presence distinguishes "unset" from 0.0.
  optional float temperature = 1;

  // Nucleus sampling cumulative-probability cutoff.
  float top_p = 2;

  // Top-k sampling cutoff (0 typically means disabled -- confirm against engine).
  uint32 top_k = 3;

  // Minimum token probability relative to the most likely token.
  float min_p = 4;

  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  // Maximum number of tokens to generate; unset lets the engine decide.
  optional uint32 max_tokens = 8;

  // Minimum number of tokens to generate before stop conditions apply.
  uint32 min_tokens = 9;

  // Stop strings and stop token IDs that terminate generation.
  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  // Number of parallel samples.
  uint32 n = 15;

  // Logprobs configuration.
  optional int32 logprobs = 22;         // Number of log probabilities per output token (-1 for all).
  optional int32 prompt_logprobs = 23;  // Number of log probabilities per prompt token (-1 for all).

  // Additional vLLM fields.
  optional int32 seed = 24;  // Random seed for reproducibility.

  // Whether to include stop strings in output.
  bool include_stop_str_in_output = 25;

  // Token ID to bias mapping (bias values in [-100, 100]).
  // FIX: the original `map logit_bias = 26;` was invalid proto3 -- map fields
  // require explicit key/value types. Keys are token IDs (uint32, consistent
  // with input_ids / stop_token_ids); values are float biases.
  map<uint32, float> logit_bias = 26;

  // Prompt truncation (-1 for model max).
  optional int32 truncate_prompt_tokens = 27;

  // Structured outputs (mutually exclusive) -- matches vLLM's
  // StructuredOutputsParams.
  oneof constraint {
    string json_schema = 16;             // JSON schema for structured output.
    string regex = 17;                   // Regex pattern.
    string grammar = 18;                 // Grammar/EBNF for structured output.
    string structural_tag = 19;          // Structural tag (e.g., Harmony models).
    bool json_object = 20;               // Force JSON object output.
    ChoiceConstraint choice = 21;        // List of allowed choices.
  }
}

// Choice constraint for structured outputs.
// (Wrapper message: repeated fields cannot be oneof members directly.)
message ChoiceConstraint {
  repeated string choices = 1;
}

// Pre-tokenized input from the Rust router.
message TokenizedInput {
  string original_text = 1;    // For reference/debugging.
  repeated uint32 input_ids = 2;  // Actual token IDs to process.
}

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Prompt input: either pre-tokenized IDs or raw text.
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config).
  SamplingParams sampling_params = 4;

  // Whether to stream incremental chunks.
  bool stream = 5;
}

// =====================
// Generate Response
// =====================

message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1;   // For streaming.
    GenerateComplete complete = 2;   // For final/non-streaming.
  }
}

message GenerateStreamChunk {
  repeated uint32 token_ids = 1;  // Incremental tokens.
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4).
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6;  // Only in first chunk.
}

message GenerateComplete {
  repeated uint32 output_ids = 1;  // All output tokens.
  string finish_reason = 2;        // "stop", "length", "abort".
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4).
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;
  TokenizedInput tokenized = 2;
}

message EmbedResponse {
  repeated float embedding = 1;
  uint32 prompt_tokens = 2;
  uint32 embedding_dim = 3;
}

// =====================
// Management Operations
// =====================

message HealthCheckRequest {}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}

message AbortRequest {
  repeated string request_ids = 1;
}

// Intentionally empty; fields can be added later without breaking the RPC.
message AbortResponse {}

// =====================
// Model and Server Info
// =====================

message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  bool is_generation = 2;
  uint32 max_context_length = 3;
  uint32 vocab_size = 4;
  bool supports_vision = 5;
}

message GetServerInfoRequest {}

message GetServerInfoResponse {
  uint32 active_requests = 1;
  bool is_paused = 2;
  double last_receive_timestamp = 3;  // NOTE(review): presumably Unix epoch seconds -- confirm with server impl.
  double uptime_seconds = 4;
  string server_type = 5;  // "vllm-grpc".
}