syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication
// This protocol is designed for efficient binary communication between
// the Rust router and vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submit a generation request (supports streaming)
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}
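
// Illustrative call sketch (not part of the schema): one way to exercise
// Generate from the command line with grpcurl, assuming the engine listens
// on localhost:50051 (address and port are deployment-specific) and that
// server reflection is enabled (otherwise pass the schema via -proto):
//
//   grpcurl -plaintext \
//     -d '{"request_id": "req-1", "text": "Hello", "stream": true,
//          "sampling_params": {"max_tokens": 16}}' \
//     localhost:50051 vllm.grpc.engine.VllmEngine/Generate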

// =====================
// Common Types
// =====================

// Sampling parameters for text generation
message SamplingParams {
  optional float temperature = 1;
  float top_p = 2;
  uint32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;

  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  uint32 n = 15; // Number of parallel samples

  // Logprobs configuration
  optional int32 logprobs = 22; // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23; // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  optional int32 seed = 24; // Random seed for reproducibility
  bool include_stop_str_in_output = 25; // Whether to include stop strings in output
  map<int32, float> logit_bias = 26; // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27; // Prompt truncation (-1 for model max)

  // Structured outputs (oneof) - matches vLLM's StructuredOutputsParams
  oneof constraint {
    string json_schema = 16; // JSON schema for structured output
    string regex = 17; // Regex pattern
    string grammar = 18; // Grammar/EBNF for structured output
    string structural_tag = 19; // Structural tag (e.g., Harmony models)
    bool json_object = 20; // Force JSON object output
    ChoiceConstraint choice = 21; // List of allowed choices
  }
}
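
// Example SamplingParams (textproto sketch, illustrative values only).
// At most one constraint field may be set, per oneof semantics:
//
//   temperature: 0.7
//   top_p: 0.95
//   top_k: 40
//   max_tokens: 256
//   n: 1
//   stop: "</s>"
//   json_schema: "{\"type\": \"object\"}"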

// Choice constraint for structured outputs
message ChoiceConstraint {
  repeated string choices = 1;
}

// Pre-tokenized input from Rust router
message TokenizedInput {
  string original_text = 1; // For reference/debugging
  repeated uint32 input_ids = 2; // Actual token IDs to process
}
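
// Example TokenizedInput (textproto sketch; the token IDs below are made up
// and depend entirely on the tokenizer the router uses):
//
//   original_text: "Hello world"
//   input_ids: 9906
//   input_ids: 1917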

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Prompt input
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming
  bool stream = 5;
}
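
// Example GenerateRequest (textproto sketch, illustrative values only;
// exactly one of `tokenized` and `text` may be set, per oneof semantics):
//
//   request_id: "req-1"
//   tokenized {
//     original_text: "Hello"
//     input_ids: 9906
//   }
//   sampling_params { temperature: 0.7 max_tokens: 128 }
//   stream: true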

// =====================
// Generate Response
// =====================

message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1; // For streaming
    GenerateComplete complete = 2; // For final/non-streaming
  }
}
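
// Expected framing (an assumption based on the field comments above, not
// enforced by the schema): with stream=true the server sends zero or more
// `chunk` messages followed by one final `complete` message; with
// stream=false it sends a single `complete` message.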

message GenerateStreamChunk {
  repeated uint32 token_ids = 1; // Incremental tokens
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6; // Only in first chunk
}

message GenerateComplete {
  repeated uint32 output_ids = 1; // All output tokens
  string finish_reason = 2; // "stop", "length", "abort"
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}
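
// Example GenerateComplete (textproto sketch, illustrative values only):
//
//   output_ids: 9906
//   output_ids: 0
//   finish_reason: "stop"
//   prompt_tokens: 5
//   completion_tokens: 2
//   cached_tokens: 0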

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;
  TokenizedInput tokenized = 2;
}

message EmbedResponse {
  repeated float embedding = 1;
  uint32 prompt_tokens = 2;
  uint32 embedding_dim = 3;
}
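
// Example EmbedResponse (textproto sketch; `embedding` carries the full
// vector, so its length should equal embedding_dim):
//
//   embedding: 0.12
//   embedding: -0.03
//   prompt_tokens: 4
//   embedding_dim: 2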

// =====================
// Management Operations
// =====================

message HealthCheckRequest {}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}
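
// Illustrative health probe (same assumptions about the listen address and
// reflection as the Generate sketch above):
//
//   grpcurl -plaintext localhost:50051 vllm.grpc.engine.VllmEngine/HealthCheck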

message AbortRequest {
  repeated string request_ids = 1;
}

message AbortResponse {}

// =====================
// Model and Server Info
// =====================

message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  bool is_generation = 2;
  uint32 max_context_length = 3;
  uint32 vocab_size = 4;
  bool supports_vision = 5;
}

message GetServerInfoRequest {}

message GetServerInfoResponse {
  uint32 active_requests = 1;
  bool is_paused = 2;
  double last_receive_timestamp = 3;
  double uptime_seconds = 4;
  string server_type = 5; // "vllm-grpc"
}