# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
An example that shows how to generate chat completions from reasoning models
like DeepSeek R1.

To run this example, you need to start the vLLM server with the reasoning
parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --reasoning-parser deepseek_r1
```

Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
streaming chat completions feature.

The streaming chat completions feature allows you to receive chat completions
in real-time as they are generated by the model. This is useful for scenarios
where you want to display chat completions to the user as they are generated
by the model.

Remember to check that content and reasoning exist in `ChatCompletionChunk`;
content may not exist, leading to errors if you try to access it.
"""
||||
from openai import OpenAI
|
||||
|
||||
# Point the OpenAI SDK at the local vLLM server. vLLM does not validate the
# API key, but the client library requires a non-empty value.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Single-turn conversation used to elicit a reasoning trace from the model.
messages = [dict(role="user", content="9.11 and 9.8, which is greater?")]
||||
def main():
    """Stream a chat completion from a reasoning model.

    Prints the reasoning trace and the final answer to stdout as the
    server produces them, each prefixed with a one-time label.
    """
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Use whichever model the server is currently serving.
    model = client.models.list().data[0].id

    # ruff: noqa: E501
    # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    stream = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=True,
    )

    print("client: Start streaming chat completions...")
    reasoning_label_shown = False
    content_label_shown = False

    for chunk in stream:
        delta = chunk.choices[0].delta
        # Either attribute may be missing or an empty string on a given
        # chunk; normalize both cases to None before branching.
        reasoning_piece = getattr(delta, "reasoning", None) or None
        content_piece = getattr(delta, "content", None) or None

        if reasoning_piece is not None:
            # Print the "reasoning:" label once, then stream the trace.
            if not reasoning_label_shown:
                reasoning_label_shown = True
                print("reasoning:", end="", flush=True)
            print(reasoning_piece, end="", flush=True)
        elif content_piece is not None:
            # Print the "content:" label once, then stream the answer.
            if not content_label_shown:
                content_label_shown = True
                print("\ncontent:", end="", flush=True)
            print(content_piece, end="", flush=True)
|
||||
# Run the streaming demo only when executed as a script (not on import).
if __name__ == "__main__":
    main()