Fix serving failures on real-world requests: accept null message content and pass reasoning_content to the chat template as a dedicated field

2026-06-12 17:57:23 +08:00
parent 50e3a05fb0
commit 3b8a567e9e
2 changed files with 36 additions and 8 deletions
--- a/qwen3_6_scripts/chat_utils.py
+++ b/qwen3_6_scripts/chat_utils.py
@@ -113,6 +113,11 @@ class ConversationMessage(TypedDict, total=False):
    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
    """The tool calls generated by the model, such as function calls."""

+    reasoning_content: Optional[str]
+    """Reasoning / thinking content for assistant messages.
+    Passed directly to the chat template (Qwen3 reads message.reasoning_content
+    natively) instead of being manually wrapped in <think>...</think>."""
+

 ModalityStr = Literal["image", "audio", "video"]
 _T = TypeVar("_T")
@@ -480,15 +485,13 @@ def _parse_chat_message_content(
            if "tool_calls" in parsed_msg:
                result_msg["tool_calls"] = list(parsed_msg["tool_calls"])

-            # Prepend reasoning_content as <think>...</think> so the model
-            # sees its own chain-of-thought in multi-turn conversations.
-            reasoning = message.get("reasoning_content")  # type: ignore[arg-type]
+            # Pass reasoning content as a dedicated field so the chat template
+            # can render it natively (Qwen3: message.reasoning_content branch).
+            # Accept both "reasoning" (new vllm) and "reasoning_content" (ours).
+            reasoning = (message.get("reasoning")  # type: ignore[arg-type]
+                         or message.get("reasoning_content"))  # type: ignore[arg-type]
            if reasoning and isinstance(reasoning, str):
-                existing = result_msg.get("content") or ""
-                result_msg["content"] = (
-                    f"<think>{reasoning}</think>\n\n{existing}"
-                    if existing else f"<think>{reasoning}</think>"
-                )
+                result_msg["reasoning_content"] = reasoning

        elif role == "tool":
            parsed_msg = _ToolParser(message)
--- a/qwen3_6_scripts/protocol.py
+++ b/qwen3_6_scripts/protocol.py
@@ -373,6 +373,31 @@ class ChatCompletionRequest(OpenAIBaseModel):

        return None

+    @model_validator(mode="before")
+    @classmethod
+    def normalize_messages(cls, data):
+        """Normalize incoming messages before pydantic union validation.
+
+        Real-world clients (e.g. from other providers) send assistant tool_call
+        messages with content=null, which fails the strict Union type check.
+        Replace null content with "" so validation passes.
+        reasoning_content is intentionally kept — chat_utils.py forwards it to
+        the chat template as a dedicated message field (no <think> wrapping).
+        """
+        messages = data.get("messages")
+        if not isinstance(messages, list):
+            return data
+        normalized = []
+        for msg in messages:
+            if not isinstance(msg, dict):
+                normalized.append(msg)
+                continue
+            if msg.get("content") is None:
+                msg = {**msg, "content": ""}
+            normalized.append(msg)
+        data = {**data, "messages": normalized}
+        return data
+
    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):