Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
17
vllm/grpc/__init__.py
Normal file
17
vllm/grpc/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
vLLM gRPC protocol definitions.
|
||||
|
||||
This module contains the protocol buffer definitions for vLLM's gRPC API.
|
||||
The protobuf files are compiled into Python code using grpcio-tools.
|
||||
"""
|
||||
|
||||
# These imports will be available after protobuf compilation
|
||||
# from vllm.grpc import vllm_engine_pb2
|
||||
# from vllm.grpc import vllm_engine_pb2_grpc
|
||||
|
||||
__all__ = [
|
||||
"vllm_engine_pb2",
|
||||
"vllm_engine_pb2_grpc",
|
||||
]
|
||||
94
vllm/grpc/compile_protos.py
Normal file
94
vllm/grpc/compile_protos.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Compile vLLM protobuf definitions into Python code.
|
||||
|
||||
This script uses grpcio-tools to generate *_pb2.py, *_pb2_grpc.py, and
|
||||
*_pb2.pyi (type stubs) files from the vllm_engine.proto definition.
|
||||
|
||||
NOTE: Proto compilation happens automatically during package build (via setup.py).
|
||||
This script is provided for developers who want to regenerate protos manually,
|
||||
e.g., after modifying vllm_engine.proto.
|
||||
|
||||
Usage:
|
||||
python vllm/grpc/compile_protos.py
|
||||
|
||||
Requirements:
|
||||
pip install grpcio-tools
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def compile_protos():
    """Compile the vLLM protobuf definitions into Python code.

    Runs grpcio-tools' protoc over ``vllm_engine.proto``, generating
    ``vllm_engine_pb2.py``, ``vllm_engine_pb2_grpc.py`` and
    ``vllm_engine_pb2.pyi`` (type stubs) next to the .proto file, then
    prepends SPDX license and mypy headers to each generated file.

    Returns:
        int: 0 on success; protoc's non-zero exit code, or 1 on any other
        failure (missing proto file, grpcio-tools not installed, or an
        unexpected error during compilation).
    """
    # Resolve paths relative to this script so the tool works regardless of
    # the current working directory.
    script_dir = Path(__file__).parent
    # vllm/vllm/grpc -> vllm/.  Using the package root as the proto_path
    # makes the generated modules importable as vllm.grpc.*.
    vllm_package_root = script_dir.parent.parent

    proto_file = script_dir / "vllm_engine.proto"
    if not proto_file.exists():
        # Diagnostics go to stderr so scripted callers can separate them
        # from normal progress output.
        print(f"Error: Proto file not found at {proto_file}", file=sys.stderr)
        return 1

    print(f"Compiling protobuf: {proto_file}")
    print(f"Output directory: {script_dir}")

    try:
        from grpc_tools import protoc

        result = protoc.main(
            [
                "grpc_tools.protoc",
                f"--proto_path={vllm_package_root}",
                f"--python_out={vllm_package_root}",
                f"--grpc_python_out={vllm_package_root}",
                f"--pyi_out={vllm_package_root}",  # Generate type stubs
                str(proto_file),  # reuse the path validated above
            ]
        )

        if result != 0:
            print(f"Error: protoc returned {result}", file=sys.stderr)
            return result

        # protoc does not emit license headers; stamp each generated file
        # with SPDX tags and silence mypy on the machine-written code.
        header = (
            "# SPDX-License-Identifier: Apache-2.0\n"
            "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
            "# mypy: ignore-errors\n"
        )
        generated_files = [
            script_dir / "vllm_engine_pb2.py",
            script_dir / "vllm_engine_pb2_grpc.py",
            script_dir / "vllm_engine_pb2.pyi",
        ]
        for generated_file in generated_files:
            if generated_file.exists():
                content = generated_file.read_text()
                # Skip files that already carry the header so repeated
                # runs stay idempotent.
                if not content.startswith("# SPDX-License-Identifier"):
                    generated_file.write_text(header + content)

        print("✓ Protobuf compilation successful!")
        print(f"  Generated: {generated_files[0]}")
        print(f"  Generated: {generated_files[1]}")
        print(f"  Generated: {generated_files[2]} (type stubs)")
        return 0

    except ImportError:
        print("Error: grpcio-tools not installed", file=sys.stderr)
        print("Install with: pip install grpcio-tools", file=sys.stderr)
        return 1
    except Exception as e:  # CLI boundary: report any failure and exit non-zero
        print(f"Error during compilation: {e}", file=sys.stderr)
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # sys.exit(code) simply raises SystemExit(code); raise it directly.
    raise SystemExit(compile_protos())
|
||||
195
vllm/grpc/vllm_engine.proto
Normal file
195
vllm/grpc/vllm_engine.proto
Normal file
@@ -0,0 +1,195 @@
|
||||
syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication
// This protocol is designed for efficient binary communication between
// the Rust router and vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submit a generation request (supports streaming)
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters for text generation
// NOTE: field numbers 16-21 are taken by the `constraint` oneof below, so
// later scalar fields continue from 22.
message SamplingParams {
  optional float temperature = 1;
  float top_p = 2;
  uint32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;

  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  uint32 n = 15; // Number of parallel samples

  // Logprobs configuration
  optional int32 logprobs = 22; // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23; // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  optional int32 seed = 24; // Random seed for reproducibility
  bool include_stop_str_in_output = 25; // Whether to include stop strings in output
  map<int32, float> logit_bias = 26; // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27; // Prompt truncation (-1 for model max)

  // Structured outputs (one of) - matches vLLM's StructuredOutputsParams
  oneof constraint {
    string json_schema = 16; // JSON schema for structured output
    string regex = 17; // Regex pattern
    string grammar = 18; // Grammar/EBNF for structured output
    string structural_tag = 19; // Structural tag (e.g., Harmony models)
    bool json_object = 20; // Force JSON object output
    ChoiceConstraint choice = 21; // List of allowed choices
  }
}

// Choice constraint for structured outputs
// (wrapper message because `repeated` fields cannot appear directly in a oneof)
message ChoiceConstraint {
  repeated string choices = 1;
}

// Pre-tokenized input from Rust router
message TokenizedInput {
  string original_text = 1; // For reference/debugging
  repeated uint32 input_ids = 2; // Actual token IDs to process
}

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Prompt input: either token IDs produced by the router, or raw text
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming
  bool stream = 5;
}

// =====================
// Generate Response
// =====================

// Each streamed message is either an incremental chunk or the final result
message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1; // For streaming
    GenerateComplete complete = 2; // For final/non-streaming
  }
}

message GenerateStreamChunk {
  repeated uint32 token_ids = 1; // Incremental tokens
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6; // Only in first chunk
}

message GenerateComplete {
  repeated uint32 output_ids = 1; // All output tokens
  string finish_reason = 2; // "stop", "length", "abort"
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;
  TokenizedInput tokenized = 2;
}

message EmbedResponse {
  repeated float embedding = 1;
  uint32 prompt_tokens = 2;
  uint32 embedding_dim = 3;
}

// =====================
// Management Operations
// =====================

// Empty request: health status carries no parameters
message HealthCheckRequest {}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}

message AbortRequest {
  repeated string request_ids = 1;
}

// Empty payload: a successful RPC completion signals the abort was accepted
message AbortResponse {
}

// =====================
// Model and Server Info
// =====================

message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  bool is_generation = 2;
  uint32 max_context_length = 3;
  uint32 vocab_size = 4;
  bool supports_vision = 5;
}

message GetServerInfoRequest {}

message GetServerInfoResponse {
  uint32 active_requests = 1;
  bool is_paused = 2;
  double last_receive_timestamp = 3;
  double uptime_seconds = 4;
  string server_type = 5; // "vllm-grpc"
}
|
||||
Reference in New Issue
Block a user