adapt to sglang v0.5.2rc1 on dcu

2025-09-04 15:56:33 +08:00
commit 909abb58f5
2320 changed files with 489411 additions and 0 deletions
--- a/examples/runtime/openai_chat_with_response_prefill.py
+++ b/examples/runtime/openai_chat_with_response_prefill.py
@@ -0,0 +1,53 @@
+"""
+Usage:
+1) Launch the server in one terminal:
+   python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
+
+2) Run this script in another terminal:
+   python openai_chat_with_response_prefill.py
+
+This example demonstrates two chat completion calls:
+- One with continue_final_message enabled (the final assistant message is used as a prefill).
+- One without continue_final_message (the final assistant message is kept as a completed turn, and the model starts a new one).
+"""
+
+import openai
+
+client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")  # local server on port 30000
+
+messages = [  # conversation passed unchanged to both requests below
+    {"role": "system", "content": "You are a helpful AI assistant."},
+    {
+        "role": "user",
+        "content": """
+Extract the name, size, price, and color from this product description as a JSON object:
+
+<description>
+The SmartHome Mini is a compact smart home assistant available in black or white for only $49.99.
+At just 5 inches wide, it lets you control lights, thermostats, and other connected devices via voice or app—
+no matter where you place it in your home.
+This affordable little hub brings convenient hands-free control to your smart devices.
+</description>
+""",
+    },
+    {"role": "assistant", "content": "{\n"},  # trailing assistant turn: just the opening brace of the JSON
+]
+
+# Request 1: continue_final_message=True, so the trailing assistant message is used as a prefill.
+print("=== Prefill with continue_final_message ===")
+response_with = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=messages,
+    temperature=0,
+    extra_body={"continue_final_message": True},  # extra field added to the request body
+)
+print(response_with.choices[0].message.content)
+
+# Baseline request: same model, messages and temperature, but no continue_final_message flag.
+section_title = "\n=== Prefill without continue_final_message ==="
+print(section_title)
+response_without = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct", messages=messages, temperature=0
+)
+baseline_text = response_without.choices[0].message.content
+print(baseline_text)