Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

@@ -11,6 +11,7 @@ from vllm.entrypoints.openai.responses.protocol import (
     ResponsesRequest,
 )
 
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tokenizers import TokenizerLike
 
 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
@@ -33,6 +34,14 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
     it is stripped before extraction (non-streaming) or skipped (streaming).
     """
 
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+        # Qwen3 defaults to thinking enabled; only treat output as
+        # pure content when the user explicitly disables it.
+        self.thinking_enabled = chat_kwargs.get("enable_thinking", True)
+
     @property
     def start_token(self) -> str:
         """The token that starts reasoning content."""
@@ -54,8 +63,11 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         If <think> is present (e.g. from a different template), it is
         stripped before extraction.
 
-        When thinking is disabled (no </think> in output), returns
-        (None, model_output) to indicate all output is content.
+        When thinking is explicitly disabled and no </think> appears,
+        returns (None, model_output): all output is content.
+        Otherwise (thinking enabled, the default), a missing </think> means
+        the output was truncated and everything is reasoning:
+        returns (model_output, None).
 
         Returns:
             tuple[Optional[str], Optional[str]]: reasoning content and content
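Read together, the docstring describes three outcomes. A sketch of the expected returns, assuming the reasoning parser's extract_reasoning_content(model_output, request) entry point and a placeholder request object (both taken as assumptions here, not confirmed by this hunk):

# Normal case: reasoning closed by </think>, answer follows.
reasoning, content = parser.extract_reasoning_content(
    "Compare options.</think>Pick B.", request
)  # -> ("Compare options.", "Pick B.")

# Thinking enabled (the default) but generation cut off mid-reasoning.
reasoning, content = parser.extract_reasoning_content(
    "Compare opti", request
)  # -> ("Compare opti", None)

# Parser constructed with enable_thinking=False: plain content.
reasoning, content = parser.extract_reasoning_content(
    "Pick B.", request
)  # -> (None, "Pick B.")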
@@ -68,9 +80,12 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         )
 
         if self.end_token not in model_output:
-            # No end token means thinking is disabled or the model
-            # did not produce reasoning. Treat everything as content.
-            return None, model_output
+            if not self.thinking_enabled:
+                # Thinking explicitly disabled: treat everything as content.
+                return None, model_output
+            # Thinking enabled but no </think>: output was truncated.
+            # Everything generated so far is reasoning.
+            return model_output, None
 
         # Extract reasoning content from the model output.
         reasoning, _, content = model_output.partition(self.end_token)
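One subtlety worth noting: str.partition splits on the first occurrence of the end token only, so any later </think> stays in the visible content. A pure-stdlib illustration (the literal end token is an assumption here; the parser reads it from its end_token property):

end_token = "</think>"
model_output = "plan A vs plan B</think>Go with B.</think>trailing"

# partition returns (head, separator, tail) around the first match.
reasoning, _, content = model_output.partition(end_token)
assert reasoning == "plan A vs plan B"
assert content == "Go with B.</think>trailing"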