Sync from v0.13
This commit is contained in:
58
examples/online_serving/structured_outputs/README.md
Normal file
58
examples/online_serving/structured_outputs/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Structured Outputs
|
||||
|
||||
This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
|
||||
It can run individual constraint types or all of them.
|
||||
It supports both streaming responses and concurrent non-streaming requests.
|
||||
|
||||
To use this example, you must start a vLLM server with any model of your choice.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-3B-Instruct
|
||||
```
|
||||
|
||||
To serve a reasoning model, you can use the following command:
|
||||
|
||||
```bash
|
||||
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
|
||||
--reasoning-parser deepseek_r1
|
||||
```
|
||||
|
||||
If you want to run this script standalone with `uv`, you can use the following:
|
||||
|
||||
```bash
|
||||
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
|
||||
structured-outputs
|
||||
```
|
||||
|
||||
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
|
||||
|
||||
!!! tip
|
||||
If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
|
||||
|
||||
## Usage
|
||||
|
||||
Run all constraints, non-streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py
|
||||
```
|
||||
|
||||
Run all constraints, streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py --stream
|
||||
```
|
||||
|
||||
Run certain constraints, for example `structural_tag` and `regex`, streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py \
|
||||
--constraint structural_tag regex \
|
||||
--stream
|
||||
```
|
||||
|
||||
Run all constraints, with reasoning models and streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py --reasoning --stream
|
||||
```
|
||||
@@ -0,0 +1,8 @@
|
||||
[project]
|
||||
name = "examples-online-structured-outputs"
|
||||
requires-python = ">=3.10, <3.14"
|
||||
dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
|
||||
version = "0.0.0"
|
||||
|
||||
[project.scripts]
|
||||
structured-outputs = "structured_outputs:main"
|
||||
268
examples/online_serving/structured_outputs/structured_outputs.py
Normal file
268
examples/online_serving/structured_outputs/structured_outputs.py
Normal file
@@ -0,0 +1,268 @@
|
||||
# ruff: noqa: E501
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import asyncio
|
||||
import enum
|
||||
import os
|
||||
from typing import Any, Literal
|
||||
|
||||
import openai
|
||||
import pydantic
|
||||
from openai.types.chat import ChatCompletionChunk
|
||||
|
||||
# Names of the structured-output constraint demos this script can run.
ConstraintsFormat = Literal["choice", "regex", "json", "grammar", "structural_tag"]
|
||||
|
||||
|
||||
async def print_stream_response(
    stream_response: openai.AsyncStream[ChatCompletionChunk],
    title: str,
    args: argparse.Namespace,
):
    """Consume one streaming chat completion and pretty-print it.

    Prints a ``Reasoning:`` section (only when ``--reasoning`` is on and the
    server emits a ``reasoning`` delta field) followed by a ``Content:``
    section, each header emitted once before its first chunk.
    """
    print(f"\n\n{title} (Streaming):")

    reasoning_started = False
    content_started = False

    async for chunk in stream_response:
        delta = chunk.choices[0].delta

        # `reasoning` is a vLLM extension on the delta, absent on plain models.
        reasoning_piece: str | None = getattr(delta, "reasoning", None)
        content_piece = delta.content

        if args.reasoning and reasoning_piece:
            if not reasoning_started:
                print(" Reasoning: ", end="")
                reasoning_started = True
            print(reasoning_piece, end="", flush=True)

        if content_piece:
            if not content_started:
                # Terminate the reasoning line before switching sections.
                if args.reasoning and reasoning_started:
                    print()
                print(" Content: ", end="")
                content_started = True
            print(content_piece, end="", flush=True)
    print()
|
||||
|
||||
|
||||
class CarType(str, enum.Enum):
    """Closed set of car body styles used by the JSON-schema demo."""

    SEDAN = "SEDAN"
    SUV = "SUV"
    TRUCK = "TRUCK"
    COUPE = "COUPE"
|
||||
|
||||
|
||||
class CarDescription(pydantic.BaseModel):
    """Pydantic model whose JSON schema drives the `json` constraint demo."""

    # Fields the LLM must emit; car_type is limited to the CarType enum values.
    brand: str
    model: str
    car_type: CarType
|
||||
|
||||
|
||||
# Request kwargs for each constraint demo, keyed by constraint name.
# Each value is splatted into `client.chat.completions.create(...)`:
# vLLM-specific knobs travel in `extra_body["structured_outputs"]`,
# OpenAI-standard ones in `response_format`.
PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
    # Restrict the answer to one of a fixed list of strings.
    "choice": {
        "messages": [
            {
                "role": "user",
                "content": "Classify this sentiment: vLLM is wonderful!",
            }
        ],
        "extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
    },
    # Force the answer to match a regular expression.
    "regex": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
            }
        ],
        "extra_body": {
            "structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
        },
    },
    # Constrain the answer to the JSON schema derived from CarDescription.
    "json": {
        "messages": [
            {
                "role": "user",
                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "car-description",
                "schema": CarDescription.model_json_schema(),
            },
        },
    },
    # Constrain the answer with an EBNF-style grammar.
    "grammar": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
        "extra_body": {
            "structured_outputs": {
                "grammar": """
                    root ::= select_statement

                    select_statement ::= "SELECT " column " from " table " where " condition

                    column ::= "col_1 " | "col_2 "

                    table ::= "table_1 " | "table_2 "

                    condition ::= column "= " number

                    number ::= "1 " | "2 "
                """,
            }
        },
    },
    # Constrain only the tagged spans (tool calls) of a free-form answer.
    "structural_tag": {
        "messages": [
            {
                "role": "user",
                "content": """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
            },
        ],
        "response_format": {
            "type": "structural_tag",
            # Anything between `begin` and `end` must satisfy `schema`;
            # generation of a structure is triggered by the `triggers` prefix.
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        },
    },
}
|
||||
|
||||
|
||||
async def cli():
    """Parse CLI flags, fan out one request per selected constraint, and print results.

    Flags:
        --constraint: one or more of PARAMS' keys, or "*" for all (default).
        --stream: stream responses instead of waiting for completion.
        --reasoning: also print reasoning traces when the server provides them.

    The server base URL comes from ``OPENAI_BASE_URL`` (defaults to the local
    vLLM server) and the model is whatever the server lists first.
    """
    parser = argparse.ArgumentParser(
        description="Run OpenAI Chat Completion with various structured outputs capabilities",
    )
    _ = parser.add_argument(
        "--constraint",
        type=str,
        nargs="+",
        choices=[*list(PARAMS), "*"],
        default=["*"],
        help="Specify which constraint(s) to run.",
    )
    _ = parser.add_argument(
        "--stream",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable streaming output",
    )
    _ = parser.add_argument(
        "--reasoning",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable printing of reasoning traces if available.",
    )
    args = parser.parse_args()

    base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")

    if "*" in args.constraint:
        constraints = list(PARAMS)
    else:
        # dict.fromkeys dedupes while preserving the order given on the CLI
        # (a set would make the output order vary across runs).
        constraints = list(dict.fromkeys(args.constraint))

    model = (await client.models.list()).data[0].id

    # Issue all requests concurrently; only the stream flag differs per mode.
    results = await asyncio.gather(
        *[
            client.chat.completions.create(
                model=model,
                max_tokens=1024,
                stream=args.stream,
                **PARAMS[name],
            )
            for name in constraints
        ]
    )

    if args.stream:
        for constraint, stream in zip(constraints, results):
            await print_stream_response(stream, constraint, args)
    else:
        for constraint, response in zip(constraints, results):
            print(f"\n\n{constraint}:")
            message = response.choices[0].message
            if args.reasoning and hasattr(message, "reasoning"):
                print(f" Reasoning: {message.reasoning or ''}")
            print(f" Content: {message.content!r}")
|
||||
|
||||
|
||||
def main():
    """Synchronous entry point (wired to the `structured-outputs` console script)."""
    asyncio.run(cli())
|
||||
|
||||
|
||||
# Allow running the script directly: `uv run structured_outputs.py ...`
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user