syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication
// This protocol is designed for efficient binary communication between
// the Rust router and vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submit a generation request (supports streaming)
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}
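
// Illustrative call sketch (not part of the schema): one way to exercise
// Generate from the command line with grpcurl, assuming the engine listens
// on localhost:50051 (address and port are deployment-specific) and that
// server reflection is enabled (otherwise pass the schema via -proto):
//
//   grpcurl -plaintext \
//     -d '{"request_id": "req-1", "text": "Hello", "stream": true,
//          "sampling_params": {"max_tokens": 16}}' \
//     localhost:50051 vllm.grpc.engine.VllmEngine/Generate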

// =====================
// Common Types
// =====================

// Sampling parameters for text generation
message SamplingParams {
  optional float temperature = 1;
  float top_p = 2;
  uint32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;

  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  uint32 n = 15; // Number of parallel samples

  // Logprobs configuration
  optional int32 logprobs = 22; // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23; // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  optional int32 seed = 24; // Random seed for reproducibility
  bool include_stop_str_in_output = 25; // Whether to include stop strings in output
  map<int32, float> logit_bias = 26; // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27; // Prompt truncation (-1 for model max)

  // Structured outputs (oneof) - matches vLLM's StructuredOutputsParams
  oneof constraint {
    string json_schema = 16; // JSON schema for structured output
    string regex = 17; // Regex pattern
    string grammar = 18; // Grammar/EBNF for structured output
    string structural_tag = 19; // Structural tag (e.g., Harmony models)
    bool json_object = 20; // Force JSON object output
    ChoiceConstraint choice = 21; // List of allowed choices
  }
}
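
// Example SamplingParams (textproto sketch, illustrative values only).
// At most one constraint field may be set, per oneof semantics:
//
//   temperature: 0.7
//   top_p: 0.95
//   top_k: 40
//   max_tokens: 256
//   n: 1
//   stop: "</s>"
//   json_schema: "{\"type\": \"object\"}"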

// Choice constraint for structured outputs
message ChoiceConstraint {
  repeated string choices = 1;
}

// Pre-tokenized input from Rust router
message TokenizedInput {
  string original_text = 1; // For reference/debugging
  repeated uint32 input_ids = 2; // Actual token IDs to process
}
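
// Example TokenizedInput (textproto sketch; the token IDs below are made up
// and depend entirely on the tokenizer the router uses):
//
//   original_text: "Hello world"
//   input_ids: 9906
//   input_ids: 1917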

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Prompt input
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming
  bool stream = 5;
}
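
// Example GenerateRequest (textproto sketch, illustrative values only;
// exactly one of `tokenized` and `text` may be set, per oneof semantics):
//
//   request_id: "req-1"
//   tokenized {
//     original_text: "Hello"
//     input_ids: 9906
//   }
//   sampling_params { temperature: 0.7 max_tokens: 128 }
//   stream: true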

// =====================
// Generate Response
// =====================

message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1; // For streaming
    GenerateComplete complete = 2; // For final/non-streaming
  }
}
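
// Expected framing (an assumption based on the field comments above, not
// enforced by the schema): with stream=true the server sends zero or more
// `chunk` messages followed by one final `complete` message; with
// stream=false it sends a single `complete` message.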

message GenerateStreamChunk {
  repeated uint32 token_ids = 1; // Incremental tokens
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6; // Only in first chunk
}

message GenerateComplete {
  repeated uint32 output_ids = 1; // All output tokens
  string finish_reason = 2; // "stop", "length", "abort"
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}
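
// Example GenerateComplete (textproto sketch, illustrative values only):
//
//   output_ids: 9906
//   output_ids: 0
//   finish_reason: "stop"
//   prompt_tokens: 5
//   completion_tokens: 2
//   cached_tokens: 0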

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;
  TokenizedInput tokenized = 2;
}

message EmbedResponse {
  repeated float embedding = 1;
  uint32 prompt_tokens = 2;
  uint32 embedding_dim = 3;
}
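
// Example EmbedResponse (textproto sketch; `embedding` carries the full
// vector, so its length should equal embedding_dim):
//
//   embedding: 0.12
//   embedding: -0.03
//   prompt_tokens: 4
//   embedding_dim: 2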

// =====================
// Management Operations
// =====================

message HealthCheckRequest {}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}
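
// Illustrative health probe (same assumptions about the listen address and
// reflection as the Generate sketch above):
//
//   grpcurl -plaintext localhost:50051 vllm.grpc.engine.VllmEngine/HealthCheck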

message AbortRequest {
  repeated string request_ids = 1;
}

message AbortResponse {}

// =====================
// Model and Server Info
// =====================

message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  bool is_generation = 2;
  uint32 max_context_length = 3;
  uint32 vocab_size = 4;
  bool supports_vision = 5;
}

message GetServerInfoRequest {}

message GetServerInfoResponse {
  uint32 active_requests = 1;
  bool is_paused = 2;
  double last_receive_timestamp = 3;
  double uptime_seconds = 4;
  string server_type = 5; // "vllm-grpc"
}