Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
17
vllm/grpc/__init__.py
Normal file
17
vllm/grpc/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
vLLM gRPC protocol definitions.
|
||||
|
||||
This module contains the protocol buffer definitions for vLLM's gRPC API.
|
||||
The protobuf files are compiled into Python code using grpcio-tools.
|
||||
"""
|
||||
|
||||
# These imports will be available after protobuf compilation
|
||||
# from vllm.grpc import vllm_engine_pb2
|
||||
# from vllm.grpc import vllm_engine_pb2_grpc
|
||||
|
||||
__all__ = [
|
||||
"vllm_engine_pb2",
|
||||
"vllm_engine_pb2_grpc",
|
||||
]
|
||||
94
vllm/grpc/compile_protos.py
Normal file
94
vllm/grpc/compile_protos.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Compile vLLM protobuf definitions into Python code.
|
||||
|
||||
This script uses grpcio-tools to generate *_pb2.py, *_pb2_grpc.py, and
|
||||
*_pb2.pyi (type stubs) files from the vllm_engine.proto definition.
|
||||
|
||||
NOTE: Proto compilation happens automatically during package build (via setup.py).
|
||||
This script is provided for developers who want to regenerate protos manually,
|
||||
e.g., after modifying vllm_engine.proto.
|
||||
|
||||
Usage:
|
||||
python vllm/grpc/compile_protos.py
|
||||
|
||||
Requirements:
|
||||
pip install grpcio-tools
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def compile_protos():
    """Compile the vLLM protobuf definitions into Python code.

    Runs grpcio-tools' protoc over ``vllm_engine.proto``, generating
    ``vllm_engine_pb2.py``, ``vllm_engine_pb2_grpc.py`` and
    ``vllm_engine_pb2.pyi`` (type stubs) next to the .proto file, then
    prepends SPDX license and mypy headers to each generated file.

    Returns:
        int: 0 on success; protoc's non-zero exit code, or 1 on any other
        failure (missing proto file, grpcio-tools not installed, or an
        unexpected error during compilation).
    """
    # Resolve paths relative to this script so the tool works regardless of
    # the current working directory.
    script_dir = Path(__file__).parent
    # vllm/vllm/grpc -> vllm/.  Using the package root as the proto_path
    # makes the generated modules importable as vllm.grpc.*.
    vllm_package_root = script_dir.parent.parent

    proto_file = script_dir / "vllm_engine.proto"
    if not proto_file.exists():
        # Diagnostics go to stderr so scripted callers can separate them
        # from normal progress output.
        print(f"Error: Proto file not found at {proto_file}", file=sys.stderr)
        return 1

    print(f"Compiling protobuf: {proto_file}")
    print(f"Output directory: {script_dir}")

    try:
        from grpc_tools import protoc

        result = protoc.main(
            [
                "grpc_tools.protoc",
                f"--proto_path={vllm_package_root}",
                f"--python_out={vllm_package_root}",
                f"--grpc_python_out={vllm_package_root}",
                f"--pyi_out={vllm_package_root}",  # Generate type stubs
                str(proto_file),  # reuse the path validated above
            ]
        )

        if result != 0:
            print(f"Error: protoc returned {result}", file=sys.stderr)
            return result

        # protoc does not emit license headers; stamp each generated file
        # with SPDX tags and silence mypy on the machine-written code.
        header = (
            "# SPDX-License-Identifier: Apache-2.0\n"
            "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
            "# mypy: ignore-errors\n"
        )
        generated_files = [
            script_dir / "vllm_engine_pb2.py",
            script_dir / "vllm_engine_pb2_grpc.py",
            script_dir / "vllm_engine_pb2.pyi",
        ]
        for generated_file in generated_files:
            if generated_file.exists():
                content = generated_file.read_text()
                # Skip files that already carry the header so repeated
                # runs stay idempotent.
                if not content.startswith("# SPDX-License-Identifier"):
                    generated_file.write_text(header + content)

        print("✓ Protobuf compilation successful!")
        print(f"  Generated: {generated_files[0]}")
        print(f"  Generated: {generated_files[1]}")
        print(f"  Generated: {generated_files[2]} (type stubs)")
        return 0

    except ImportError:
        print("Error: grpcio-tools not installed", file=sys.stderr)
        print("Install with: pip install grpcio-tools", file=sys.stderr)
        return 1
    except Exception as e:  # CLI boundary: report any failure and exit non-zero
        print(f"Error during compilation: {e}", file=sys.stderr)
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # sys.exit(code) simply raises SystemExit(code); raise it directly.
    raise SystemExit(compile_protos())
|
||||
195
vllm/grpc/vllm_engine.proto
Normal file
195
vllm/grpc/vllm_engine.proto
Normal file
@@ -0,0 +1,195 @@
|
||||
syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication
// This protocol is designed for efficient binary communication between
// the Rust router and vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submit a generation request (supports streaming)
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters for text generation
// NOTE: field numbers 16-21 are taken by the `constraint` oneof below, so
// later scalar fields continue from 22.
message SamplingParams {
  optional float temperature = 1;
  float top_p = 2;
  uint32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;

  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  uint32 n = 15; // Number of parallel samples

  // Logprobs configuration
  optional int32 logprobs = 22; // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23; // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  optional int32 seed = 24; // Random seed for reproducibility
  bool include_stop_str_in_output = 25; // Whether to include stop strings in output
  map<int32, float> logit_bias = 26; // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27; // Prompt truncation (-1 for model max)

  // Structured outputs (one of) - matches vLLM's StructuredOutputsParams
  oneof constraint {
    string json_schema = 16; // JSON schema for structured output
    string regex = 17; // Regex pattern
    string grammar = 18; // Grammar/EBNF for structured output
    string structural_tag = 19; // Structural tag (e.g., Harmony models)
    bool json_object = 20; // Force JSON object output
    ChoiceConstraint choice = 21; // List of allowed choices
  }
}

// Choice constraint for structured outputs
// (wrapper message because `repeated` fields cannot appear directly in a oneof)
message ChoiceConstraint {
  repeated string choices = 1;
}

// Pre-tokenized input from Rust router
message TokenizedInput {
  string original_text = 1; // For reference/debugging
  repeated uint32 input_ids = 2; // Actual token IDs to process
}

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Prompt input: either token IDs produced by the router, or raw text
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming
  bool stream = 5;
}

// =====================
// Generate Response
// =====================

// Each streamed message is either an incremental chunk or the final result
message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1; // For streaming
    GenerateComplete complete = 2; // For final/non-streaming
  }
}

message GenerateStreamChunk {
  repeated uint32 token_ids = 1; // Incremental tokens
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6; // Only in first chunk
}

message GenerateComplete {
  repeated uint32 output_ids = 1; // All output tokens
  string finish_reason = 2; // "stop", "length", "abort"
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;
  TokenizedInput tokenized = 2;
}

message EmbedResponse {
  repeated float embedding = 1;
  uint32 prompt_tokens = 2;
  uint32 embedding_dim = 3;
}

// =====================
// Management Operations
// =====================

// Empty request: health status carries no parameters
message HealthCheckRequest {}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}

message AbortRequest {
  repeated string request_ids = 1;
}

// Empty payload: a successful RPC completion signals the abort was accepted
message AbortResponse {
}

// =====================
// Model and Server Info
// =====================

message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  bool is_generation = 2;
  uint32 max_context_length = 3;
  uint32 vocab_size = 4;
  bool supports_vision = 5;
}

message GetServerInfoRequest {}

message GetServerInfoResponse {
  uint32 active_requests = 1;
  bool is_paused = 2;
  double last_receive_timestamp = 3;
  double uptime_seconds = 4;
  string server_type = 5; // "vllm-grpc"
}
|
||||
Reference in New Issue
Block a user