From a68ed76682ae78e589327f3e9ce7d6a7e9504493 Mon Sep 17 00:00:00 2001
From: mlmz <54172054+minleminzui@users.noreply.github.com>
Date: Tue, 6 May 2025 02:43:34 +0800
Subject: [PATCH] feat: append more comprehensive fields in messages instead of
 merely role and content (#5996)

---
 docs/backend/function_calling.ipynb      | 53 +++++++++++-------------
 python/sglang/srt/openai_api/adapter.py  | 36 +++++++++-------
 python/sglang/srt/openai_api/protocol.py | 36 +++++++++-------
 3 files changed, 66 insertions(+), 59 deletions(-)

diff --git a/docs/backend/function_calling.ipynb b/docs/backend/function_calling.ipynb
index 2fece950f..26a0024fa 100644
--- a/docs/backend/function_calling.ipynb
+++ b/docs/backend/function_calling.ipynb
@@ -38,7 +38,9 @@
     "    from patch import launch_server_cmd\n",
     "else:\n",
     "    from sglang.utils import launch_server_cmd\n",
+    "    import nest_asyncio\n",
     "\n",
+    "    nest_asyncio.apply()\n",
     "\n",
     "server_process, port = launch_server_cmd(\n",
     "    \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\"  # qwen25\n",
@@ -164,7 +166,7 @@
     "response_non_stream = client.chat.completions.create(\n",
     "    model=model_name,\n",
     "    messages=messages,\n",
-    "    temperature=0.1,\n",
+    "    temperature=0,\n",
     "    top_p=0.95,\n",
     "    max_tokens=1024,\n",
     "    stream=False,  # Non-streaming\n",
@@ -219,7 +221,7 @@
     "response_stream = client.chat.completions.create(\n",
     "    model=model_name,\n",
     "    messages=messages,\n",
-    "    temperature=0.1,\n",
+    "    temperature=0,\n",
     "    top_p=0.95,\n",
     "    max_tokens=1024,\n",
     "    stream=True,  # Enable streaming\n",
@@ -309,22 +311,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "call_data = json.loads(full_arguments)\n",
-    "\n",
-    "messages.append(\n",
-    "    {\n",
-    "        \"role\": \"user\",\n",
-    "        \"content\": \"\",\n",
-    "        \"tool_calls\": {\"name\": \"get_current_weather\", \"arguments\": full_arguments},\n",
-    "    }\n",
-    ")\n",
+    "messages.append(response_non_stream.choices[0].message)\n",
     "\n",
     "# Call the corresponding tool function\n",
-    "tool_name = messages[-1][\"tool_calls\"][\"name\"]\n",
+    "tool_call = messages[-1].tool_calls[0]\n",
+    "tool_name = tool_call.function.name\n",
     "tool_to_call = available_tools[tool_name]\n",
-    "result = tool_to_call(**call_data)\n",
+    "result = tool_to_call(**(json.loads(tool_call.function.arguments)))\n",
     "print_highlight(f\"Function call result: {result}\")\n",
-    "messages.append({\"role\": \"tool\", \"content\": result, \"name\": tool_name})\n",
+    "# messages.append({\"role\": \"tool\", \"content\": result, \"name\": tool_name})\n",
+    "messages.append(\n",
+    "    {\n",
+    "        \"role\": \"tool\",\n",
+    "        \"tool_call_id\": tool_call.id,\n",
+    "        \"content\": str(result),\n",
+    "        \"name\": tool_name,\n",
+    "    }\n",
+    ")\n",
     "\n",
     "print_highlight(f\"Updated message history: {messages}\")"
    ]
@@ -345,7 +348,7 @@
     "final_response = client.chat.completions.create(\n",
     "    model=model_name,\n",
     "    messages=messages,\n",
-    "    temperature=0.1,\n",
+    "    temperature=0,\n",
     "    top_p=0.95,\n",
     "    stream=False,\n",
     "    tools=tools,\n",
@@ -391,7 +394,7 @@
     "    \"sampling_params\": {\n",
     "        \"skip_special_tokens\": False,\n",
     "        \"max_new_tokens\": 1024,\n",
-    "        \"temperature\": 0.1,\n",
+    "        \"temperature\": 0,\n",
     "        \"top_p\": 0.95,\n",
     "    },\n",
     "}\n",
@@ -452,7 +455,7 @@
     "\n",
     "sampling_params = {\n",
     "    \"max_new_tokens\": 1024,\n",
-    "    \"temperature\": 0.1,\n",
+    "    \"temperature\": 0,\n",
     "    \"top_p\": 0.95,\n",
     "    \"skip_special_tokens\": False,\n",
     "}\n",
@@ -540,14 +543,6 @@
    "outputs": [],
    "source": [
     "import openai\n",
-    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
-    "from sglang.test.test_utils import is_in_ci\n",
-    "\n",
-    "\n",
-    "if is_in_ci():\n",
-    "    from patch import launch_server_cmd\n",
-    "else:\n",
-    "    from sglang.utils import launch_server_cmd\n",
     "\n",
     "server_process, port = launch_server_cmd(\n",
     "    \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1\"  # llama-3.2-1b-instruct\n",
@@ -624,8 +619,8 @@
     "response_non_stream = client.chat.completions.create(\n",
     "    model=model_name,\n",
     "    messages=messages,\n",
-    "    temperature=0.8,\n",
-    "    top_p=0.8,\n",
+    "    temperature=0,\n",
+    "    top_p=0.9,\n",
     "    stream=False,  # Non-streaming\n",
     "    tools=tools,\n",
     ")\n",
@@ -635,8 +630,8 @@
     "response_stream = client.chat.completions.create(\n",
     "    model=model_name,\n",
     "    messages=messages,\n",
-    "    temperature=0.8,\n",
-    "    top_p=0.8,\n",
+    "    temperature=0,\n",
+    "    top_p=0.9,\n",
     "    stream=True,\n",
     "    tools=tools,\n",
     ")\n",
diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index d4640e23b..ba10f2951 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -14,6 +14,7 @@
 """Conversion between OpenAI APIs and native SRT APIs"""
 
 import asyncio
+import base64
 import json
 import logging
 import os
@@ -970,17 +971,19 @@ def v1_chat_generate_request(
                 for message in request.messages:
                     if message.content is None:
                         message.content = ""
-                    if isinstance(message.content, str):
-                        openai_compatible_messages.append(
-                            {"role": message.role, "content": message.content}
-                        )
+                    msg_dict = message.dict()
+                    if isinstance(msg_dict.get("content"), list):
+                        for chunk in msg_dict["content"]:
+                            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                                new_msg = msg_dict.copy()
+                                new_msg["content"] = chunk["text"]
+                                new_msg = {
+                                    k: v for k, v in new_msg.items() if v is not None
+                                }
+                                openai_compatible_messages.append(new_msg)
                     else:
-                        content_list = message.dict()["content"]
-                        for content in content_list:
-                            if content["type"] == "text":
-                                openai_compatible_messages.append(
-                                    {"role": message.role, "content": content["text"]}
-                                )
+                        msg_dict = {k: v for k, v in msg_dict.items() if v is not None}
+                        openai_compatible_messages.append(msg_dict)
                 if (
                     openai_compatible_messages
                     and openai_compatible_messages[-1]["role"] == "assistant"
@@ -1290,7 +1293,8 @@ def v1_chat_generate_response(
                     text, call_info_list = parser.parse_non_stream(text)
                     tool_calls = [
                         ToolCall(
-                            id=str(call_info.tool_index),
+                            id=f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}",
+                            index=call_info.tool_index,
                             function=FunctionResponse(
                                 name=call_info.name, arguments=call_info.parameters
                             ),
@@ -1406,6 +1410,7 @@ async def v1_chat_completions(
         reasoning_parser_dict = {}
 
         async def generate_stream_resp():
+            tool_call_first = True
             is_firsts = {}
             stream_buffers = {}
             n_prev_tokens = {}
@@ -1572,7 +1577,6 @@ async def v1_chat_completions(
                         # 2) if we found calls, we output them as separate chunk(s)
                         for call_item in calls:
                             # transform call_item -> FunctionResponse + ToolCall
-
                             if finish_reason_type == "stop":
                                 latest_delta_len = 0
                                 if isinstance(call_item.parameters, str):
@@ -1595,15 +1599,19 @@ async def v1_chat_completions(
                                 call_item.parameters = remaining_call
 
                                 finish_reason_type = "tool_calls"
-
                             tool_call = ToolCall(
-                                id=str(call_item.tool_index),
+                                id=(
+                                    f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}"
+                                    if tool_call_first
+                                    else None
+                                ),
                                 index=call_item.tool_index,
                                 function=FunctionResponse(
                                     name=call_item.name,
                                     arguments=call_item.parameters,
                                 ),
                             )
+                            tool_call_first = False
                             choice_data = ChatCompletionResponseStreamChoice(
                                 index=index,
                                 delta=DeltaMessage(tool_calls=[tool_call]),
diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py
index 88d8873d1..c37442248 100644
--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -250,9 +250,29 @@ ChatCompletionMessageContentPart = Union[
 ]
 
 
+class FunctionResponse(BaseModel):
+    """Function response."""
+
+    name: Optional[str] = None
+    arguments: Optional[str] = None
+
+
+class ToolCall(BaseModel):
+    """Tool call response."""
+
+    id: Optional[str] = None
+    index: Optional[int] = None
+    type: Literal["function"] = "function"
+    function: FunctionResponse
+
+
 class ChatCompletionMessageGenericParam(BaseModel):
     role: Literal["system", "assistant", "tool"]
     content: Union[str, List[ChatCompletionMessageContentTextPart], None]
+    tool_call_id: Optional[str] = None
+    name: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
 
 class ChatCompletionMessageUserParam(BaseModel):
@@ -378,22 +398,6 @@ class ChatCompletionRequest(BaseModel):
     bootstrap_room: Optional[int] = None
 
 
-class FunctionResponse(BaseModel):
-    """Function response."""
-
-    name: Optional[str] = None
-    arguments: Optional[str] = None
-
-
-class ToolCall(BaseModel):
-    """Tool call response."""
-
-    id: str
-    index: Optional[int] = None
-    type: Literal["function"] = "function"
-    function: FunctionResponse
-
-
 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None