# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
An example that shows how to generate chat completions from reasoning models
like DeepSeek R1.

To run this example, you need to start the vLLM server with the reasoning
parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --reasoning-parser deepseek_r1
```

Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
streaming chat completions feature.

The streaming chat completions feature allows you to receive chat completions
in real-time as they are generated by the model. This is useful for scenarios
where you want to display chat completions to the user as they are generated
by the model.

Remember to check that content and reasoning exist in `ChatCompletionChunk`;
content may not exist, leading to errors if you try to access it.
"""
||||
from openai import OpenAI
|
||||
|
||||
# Point the OpenAI SDK at the local vLLM server. vLLM does not validate the
# API key, but the client library requires a non-empty value.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Single-turn conversation used to elicit a reasoning trace from the model.
messages = [dict(role="user", content="9.11 and 9.8, which is greater?")]
||||
def main():
    """Stream a chat completion from a reasoning model.

    Prints the reasoning trace and the final answer to stdout as the
    server produces them, each prefixed with a one-time label.
    """
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Use whichever model the server is currently serving.
    model = client.models.list().data[0].id

    # ruff: noqa: E501
    # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    stream = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=True,
    )

    print("client: Start streaming chat completions...")
    reasoning_label_shown = False
    content_label_shown = False

    for chunk in stream:
        delta = chunk.choices[0].delta
        # Either attribute may be missing or an empty string on a given
        # chunk; normalize both cases to None before branching.
        reasoning_piece = getattr(delta, "reasoning", None) or None
        content_piece = getattr(delta, "content", None) or None

        if reasoning_piece is not None:
            # Print the "reasoning:" label once, then stream the trace.
            if not reasoning_label_shown:
                reasoning_label_shown = True
                print("reasoning:", end="", flush=True)
            print(reasoning_piece, end="", flush=True)
        elif content_piece is not None:
            # Print the "content:" label once, then stream the answer.
            if not content_label_shown:
                content_label_shown = True
                print("\ncontent:", end="", flush=True)
            print(content_piece, end="", flush=True)
|
||||
# Run the streaming demo only when executed as a script (not on import).
if __name__ == "__main__":
    main()