Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
195
vllm/grpc/vllm_engine.proto
Normal file
195
vllm/grpc/vllm_engine.proto
Normal file
@@ -0,0 +1,195 @@
syntax = "proto3";

package vllm.grpc.engine;
// Service definition for vLLM engine communication
// This protocol is designed for efficient binary communication between
// the Rust router and vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submit a generation request (supports streaming).
  // The response is always declared as a stream; NOTE(review): presumably
  // a non-streaming request (GenerateRequest.stream == false) yields a
  // single GenerateComplete item — confirm against the engine side.
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request (pre-tokenized input only; see EmbedRequest).
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request (accepts a batch of request IDs).
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information (static properties of the loaded model).
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information (runtime state: active requests, pause flag, uptime).
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}
|
||||
|
||||
// =====================
// Common Types
// =====================
|
||||
|
||||
// Sampling parameters for text generation.
//
// Mirrors vLLM's SamplingParams. Scalar fields declared without `optional`
// have proto3 implicit presence: an unset field is indistinguishable from
// its zero value, so the engine must treat 0/false as "use vLLM default"
// where that is the intent.
message SamplingParams {
  // Sampling temperature. Explicit presence so "unset" (engine default)
  // can be distinguished from 0.0 (greedy decoding).
  optional float temperature = 1;
  // Nucleus sampling mass. NOTE(review): implicit presence — 0.0 cannot be
  // told apart from "unset"; confirm the engine maps 0 to its default.
  float top_p = 2;
  // Top-k cutoff; 0 reads as "unset/disabled" (implicit presence).
  uint32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  // Maximum number of tokens to generate; unset lets the engine decide.
  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;

  // Stop conditions: literal strings and/or token IDs.
  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  uint32 n = 15;  // Number of parallel samples

  // Logprobs configuration
  optional int32 logprobs = 22;         // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23;  // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  // Widened from int32: vLLM accepts arbitrary integer seeds and 64-bit
  // seeds are common; int32 <-> int64 is varint wire-compatible, so this
  // does not change the encoding of values already in range.
  optional int64 seed = 24;              // Random seed for reproducibility
  bool include_stop_str_in_output = 25;  // Whether to include stop strings in output
  // NOTE(review): key type is int32 while token IDs elsewhere in this file
  // are uint32 — confirm this is intentional before the schema is frozen.
  map<int32, float> logit_bias = 26;     // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27;  // Prompt truncation (-1 for model max)

  // Structured outputs (one of) - matches vLLM's StructuredOutputsParams
  // Setting one member clears the others.
  oneof constraint {
    string json_schema = 16;       // JSON schema for structured output
    string regex = 17;             // Regex pattern
    string grammar = 18;           // Grammar/EBNF for structured output
    string structural_tag = 19;    // Structural tag (e.g., Harmony models)
    bool json_object = 20;         // Force JSON object output
    ChoiceConstraint choice = 21;  // List of allowed choices
  }
}
|
||||
|
||||
// Choice constraint for structured outputs.
// Wrapper message because repeated fields cannot be oneof members.
message ChoiceConstraint {
  // The allowed output strings; generation is constrained to one of these.
  repeated string choices = 1;
}
|
||||
|
||||
// Pre-tokenized input from Rust router.
// Tokenization happens router-side; the engine consumes input_ids directly.
message TokenizedInput {
  string original_text = 1;       // For reference/debugging
  repeated uint32 input_ids = 2;  // Actual token IDs to process
}
|
||||
|
||||
// =====================
// Generate Request
// =====================
|
||||
|
||||
// A single generation request submitted by the router.
message GenerateRequest {
  // Unique ID for this request; AbortRequest.request_ids refers to
  // these values.
  string request_id = 1;

  // Prompt input: either pre-tokenized IDs or raw text.
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming: when true, output arrives as incremental
  // GenerateStreamChunk messages.
  bool stream = 5;
}
|
||||
|
||||
// =====================
// Generate Response
// =====================
|
||||
|
||||
// One item in the Generate() response stream.
message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1;   // For streaming
    GenerateComplete complete = 2;   // For final/non-streaming
  }
}
|
||||
|
||||
// Incremental output for a streaming generation.
message GenerateStreamChunk {
  repeated uint32 token_ids = 1;  // Incremental tokens
  // Token accounting for the request.
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  // NOTE(review): presumably prompt tokens served from a prefix cache —
  // confirm against the engine implementation.
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6; // Only in first chunk
}
|
||||
|
||||
// Final result of a generation; sole payload for non-streaming requests.
message GenerateComplete {
  repeated uint32 output_ids = 1;  // All output tokens
  string finish_reason = 2;        // "stop", "length", "abort"
  // Token accounting for the whole request.
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}
|
||||
|
||||
// =====================
// Embedding Request
// =====================
|
||||
|
||||
// Request for a single embedding computation.
message EmbedRequest {
  string request_id = 1;
  // Pre-tokenized prompt; unlike Generate(), raw text input is not
  // supported here.
  TokenizedInput tokenized = 2;
}
|
||||
|
||||
// Embedding result.
message EmbedResponse {
  // The embedding vector; length should match embedding_dim.
  // NOTE(review): convention is a plural name for repeated fields, but
  // renaming now would change JSON keys and generated accessors.
  repeated float embedding = 1;
  uint32 prompt_tokens = 2;
  uint32 embedding_dim = 3;
}
|
||||
|
||||
// =====================
// Management Operations
// =====================
|
||||
|
||||
// Empty request; dedicated type so fields can be added compatibly later.
message HealthCheckRequest {}
|
||||
|
||||
// Health status of the engine.
message HealthCheckResponse {
  bool healthy = 1;
  // Human-readable status detail.
  string message = 2;
}
|
||||
|
||||
// Request to abort one or more in-flight generations.
message AbortRequest {
  // IDs of the requests to abort (GenerateRequest.request_id values).
  repeated string request_ids = 1;
}
|
||||
|
||||
// Empty response; kept as a dedicated type so per-request abort status
// can be added later without breaking the RPC signature.
message AbortResponse {
}
|
||||
|
||||
// =====================
// Model and Server Info
// =====================
|
||||
|
||||
// Empty request; dedicated type so fields can be added compatibly later.
message GetModelInfoRequest {}
|
||||
|
||||
// Static properties of the loaded model.
message GetModelInfoResponse {
  string model_path = 1;
  // NOTE(review): presumably false for embedding-only models — confirm.
  bool is_generation = 2;
  uint32 max_context_length = 3;
  uint32 vocab_size = 4;
  bool supports_vision = 5;
}
|
||||
|
||||
// Empty request; dedicated type so fields can be added compatibly later.
message GetServerInfoRequest {}
|
||||
|
||||
// Runtime state of the engine server.
message GetServerInfoResponse {
  uint32 active_requests = 1;
  bool is_paused = 2;
  // NOTE(review): presumably Unix-epoch seconds as a double — confirm.
  // google.protobuf.Timestamp would be the conventional type, but changing
  // it now would break the wire format (fixed64 vs. length-delimited).
  double last_receive_timestamp = 3;
  double uptime_seconds = 4;
  string server_type = 5;  // "vllm-grpc"
}
|
||||
Reference in New Issue
Block a user