Add minimal vLLM 0.16.1 build repo for BI-V150

This commit is contained in:
2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions

17
vllm/grpc/__init__.py Normal file
View File

@@ -0,0 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM gRPC protocol definitions.

This module contains the protocol buffer definitions for vLLM's gRPC API.
The protobuf files are compiled into Python code using grpcio-tools.
"""
# These imports will be available after protobuf compilation
# (run vllm/grpc/compile_protos.py, or build the package via setup.py):
# from vllm.grpc import vllm_engine_pb2
# from vllm.grpc import vllm_engine_pb2_grpc

# Public API of this package; the names resolve only once the generated
# *_pb2 modules exist alongside this file.
__all__ = [
    "vllm_engine_pb2",
    "vllm_engine_pb2_grpc",
]

View File

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Compile vLLM protobuf definitions into Python code.
This script uses grpcio-tools to generate *_pb2.py, *_pb2_grpc.py, and
*_pb2.pyi (type stubs) files from the vllm_engine.proto definition.
NOTE: Proto compilation happens automatically during package build (via setup.py).
This script is provided for developers who want to regenerate protos manually,
e.g., after modifying vllm_engine.proto.
Usage:
python vllm/grpc/compile_protos.py
Requirements:
pip install grpcio-tools
"""
import sys
from pathlib import Path
def _add_spdx_headers(script_dir):
    """Prepend SPDX license headers to freshly generated protobuf modules.

    Adds the Apache-2.0 SPDX header plus a ``# mypy: ignore-errors``
    directive to each generated file that does not already carry one.
    Missing files are skipped silently (e.g. if stub generation was off).
    """
    spdx_header = (
        "# SPDX-License-Identifier: Apache-2.0\n"
        "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
    )
    for generated_file in [
        script_dir / "vllm_engine_pb2.py",
        script_dir / "vllm_engine_pb2_grpc.py",
        script_dir / "vllm_engine_pb2.pyi",
    ]:
        if generated_file.exists():
            content = generated_file.read_text()
            if not content.startswith("# SPDX-License-Identifier"):
                # Generated code is not type-clean; silence mypy wholesale.
                header = spdx_header + "# mypy: ignore-errors\n"
                generated_file.write_text(header + content)


def compile_protos():
    """Compile vllm_engine.proto into Python modules next to this script.

    Invokes grpcio-tools' protoc to generate ``vllm_engine_pb2.py``,
    ``vllm_engine_pb2_grpc.py``, and ``vllm_engine_pb2.pyi`` (type stubs),
    then prepends SPDX headers to the generated files.

    Returns:
        0 on success; protoc's non-zero return code on compiler failure;
        1 when the proto file is missing, grpcio-tools is not installed,
        or an unexpected error occurs.
    """
    # Directory that holds this script and vllm_engine.proto.
    script_dir = Path(__file__).parent
    # protoc's --proto_path must be two levels up (grpc -> vllm -> repo root)
    # so the generated code imports as the vllm.grpc package.
    vllm_package_root = script_dir.parent.parent
    proto_file = script_dir / "vllm_engine.proto"

    if not proto_file.exists():
        print(f"Error: Proto file not found at {proto_file}")
        return 1

    print(f"Compiling protobuf: {proto_file}")
    print(f"Output directory: {script_dir}")

    try:
        from grpc_tools import protoc

        result = protoc.main(
            [
                "grpc_tools.protoc",
                f"--proto_path={vllm_package_root}",
                f"--python_out={vllm_package_root}",
                f"--grpc_python_out={vllm_package_root}",
                f"--pyi_out={vllm_package_root}",  # Generate type stubs
                # Reuse the path validated above instead of rebuilding it.
                str(proto_file),
            ]
        )
        if result != 0:
            print(f"Error: protoc returned {result}")
            return result

        _add_spdx_headers(script_dir)
        print("✓ Protobuf compilation successful!")
        print(f"  Generated: {script_dir / 'vllm_engine_pb2.py'}")
        print(f"  Generated: {script_dir / 'vllm_engine_pb2_grpc.py'}")
        print(f"  Generated: {script_dir / 'vllm_engine_pb2.pyi'} (type stubs)")
        return 0
    except ImportError:
        print("Error: grpcio-tools not installed")
        print("Install with: pip install grpcio-tools")
        return 1
    except Exception as e:
        print(f"Error during compilation: {e}")
        return 1
# Script entry point: propagate the compiler's status as the process exit code.
if __name__ == "__main__":
    sys.exit(compile_protos())

195
vllm/grpc/vllm_engine.proto Normal file
View File

@@ -0,0 +1,195 @@
syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication.
// This protocol is designed for efficient binary communication between
// the Rust router and vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submit a generation request (supports streaming)
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters for text generation.
// Field numbers 1-15 are the core sampling knobs; 16-21 live in the
// structured-output oneof; 22+ are later additions (logprobs, seed, etc.).
message SamplingParams {
  optional float temperature = 1;
  float top_p = 2;
  uint32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;
  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;
  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;
  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;
  uint32 n = 15; // Number of parallel samples

  // Logprobs configuration
  optional int32 logprobs = 22; // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23; // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  optional int32 seed = 24; // Random seed for reproducibility
  bool include_stop_str_in_output = 25; // Whether to include stop strings in output
  map<int32, float> logit_bias = 26; // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27; // Prompt truncation (-1 for model max)

  // Structured outputs (one of) - matches vLLM's StructuredOutputsParams.
  // At most one constraint may be set per request.
  oneof constraint {
    string json_schema = 16; // JSON schema for structured output
    string regex = 17; // Regex pattern
    string grammar = 18; // Grammar/EBNF for structured output
    string structural_tag = 19; // Structural tag (e.g., Harmony models)
    bool json_object = 20; // Force JSON object output
    ChoiceConstraint choice = 21; // List of allowed choices
  }
}

// Choice constraint for structured outputs
message ChoiceConstraint {
  repeated string choices = 1;
}

// Pre-tokenized input from Rust router
message TokenizedInput {
  string original_text = 1; // For reference/debugging
  repeated uint32 input_ids = 2; // Actual token IDs to process
}

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Prompt input: either pre-tokenized IDs or raw text (server tokenizes).
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming
  bool stream = 5;
}

// =====================
// Generate Response
// =====================

// Each streamed message carries either an incremental chunk or the
// final completion record.
message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1; // For streaming
    GenerateComplete complete = 2; // For final/non-streaming
  }
}

message GenerateStreamChunk {
  repeated uint32 token_ids = 1; // Incremental tokens
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6; // Only in first chunk
}

message GenerateComplete {
  repeated uint32 output_ids = 1; // All output tokens
  string finish_reason = 2; // "stop", "length", "abort"
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;
  TokenizedInput tokenized = 2;
}

message EmbedResponse {
  repeated float embedding = 1;
  uint32 prompt_tokens = 2;
  uint32 embedding_dim = 3;
}

// =====================
// Management Operations
// =====================

message HealthCheckRequest {}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}

message AbortRequest {
  repeated string request_ids = 1;
}

// Intentionally empty: success/failure is conveyed via gRPC status.
message AbortResponse {
}

// =====================
// Model and Server Info
// =====================

message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  bool is_generation = 2;
  uint32 max_context_length = 3;
  uint32 vocab_size = 4;
  bool supports_vision = 5;
}

message GetServerInfoRequest {}

message GetServerInfoResponse {
  uint32 active_requests = 1;
  bool is_paused = 2;
  double last_receive_timestamp = 3;
  double uptime_seconds = 4;
  string server_type = 5; // "vllm-grpc"
}