初始化项目，由ModelHub XC社区提供模型

Model: domyn/Domyn-Small-v1.0 Source: Original Platform
2026-06-01 12:09:16 +08:00
commit d1020dd6c0
15 changed files with 9709 additions and 0 deletions
--- a/reasoning_parser_plugin.py
+++ b/reasoning_parser_plugin.py
@@ -0,0 +1,297 @@
+"""Reasoning parser plugin for Domyn-Small ``<think>...</think>`` outputs.
+
+Loaded into vLLM with ``--reasoning-parser-plugin <path>`` and selected via
+``--reasoning-parser think_block``. The parser splits each model output on
+the literal ``</think>`` marker: everything before it is reasoning,
+everything after is final content.
+
+See :class:`ThinkBlockReasoningParser` for the streaming state machine and
+how per-request thinking-on/off is discovered.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable, Sequence
+from typing import TYPE_CHECKING
+
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
+# Literal markers emitted by the Domyn-Small chat template. `<think>` is
+# pre-emitted by the prompt, so model output never starts with it; only `</think>`
+# actually has to be detected at runtime.
+START = "<think>"
+END = "</think>"
+
+
+def _max_suffix_prefix(s: str, marker: str) -> str:
+    """Longest non-empty suffix of ``s`` that is also a prefix of ``marker``.
+
+    Used to decide how many trailing bytes of the streaming buffer must be
+    held back — if those bytes could still grow into ``marker`` on the next
+    delta, releasing them now would fragment the marker across deltas (e.g.
+    emitting ``</thi`` and then ``nk>``).
+    """
+    for i in range(min(len(marker) - 1, len(s)), 0, -1):
+        if s.endswith(marker[:i]):
+            return s[-i:]
+    return ""
+
+
+@ReasoningParserManager.register_module("think_block")
+class ThinkBlockReasoningParser(ReasoningParser):
+    """Splits model output on the literal ``</think>`` marker.
+
+    **Streaming.** Olmo3-style buffered state machine: incoming text is
+    accumulated in :attr:`_buffer` and only released when the marker is
+    either confirmed (split point reached) or ruled out (the buffer tail
+    can no longer be a prefix of ``</think>``). This guarantees the marker
+    is never fragmented across deltas.
+
+    **Per-request lane.** The initial lane (``"reasoning"`` vs
+    ``"content"``) is set from the request itself: ``True`` if
+    ``chat_template_kwargs.enable_thinking`` (or ``.thinking``) is truthy,
+    or if any system message contains the literal ``"thinking on"``
+    directive — mirroring the chat template's own detection.
+
+    **Request discovery.** vLLM instantiates the parser per request from
+    inside ``create_chat_completion(self, request, ...)``, but does not
+    pass the request to the constructor. We recover it by walking the call
+    stack at ``__init__`` time, inspecting only each frame's *function
+    arguments* (so we don't accidentally match request-shaped objects in
+    module globals or unrelated locals). If no request is found we fall
+    back to ``thinking=off``, which keeps tool-call streaming working out
+    of the box.
+    """
+
+    def __init__(self, tokenizer, *args, **kwargs) -> None:
+        # Base ReasoningParser only accepts `tokenizer`; swallow any extras so
+        # the registration signature stays compatible across vLLM versions.
+        super().__init__(tokenizer)
+        self._buffer: str = ""
+        # Current lane for streaming output: "reasoning" while inside
+        # <think>...</think>, "content" otherwise. Locked to "content" once
+        # `</think>` is observed.
+        self._state: str = "content"
+        # Tracks whether we have applied per-request configuration yet —
+        # stack-walking covers the streaming path; `extract_reasoning` also
+        # configures on the first non-streaming call as a safety net.
+        self._configured: bool = False
+
+        request = self._find_request_in_stack()
+        if request is not None:
+            self._configure_for_request(request)
+
+    @staticmethod
+    def _looks_like_request(obj) -> bool:
+        """Duck-typed check for ChatCompletionRequest / ResponsesRequest.
+
+        Avoids importing vLLM's protocol module, which differs across forks
+        and isn't guaranteed to be importable at plugin load time.
+        """
+        return hasattr(obj, "messages") and (
+            hasattr(obj, "chat_template_kwargs") or hasattr(obj, "stream")
+        )
+
+    @classmethod
+    def _find_request_in_stack(cls, max_depth: int = 12):
+        """Locate the in-flight request by scanning caller-frame arguments.
+
+        Walks a bounded number of caller frames via ``sys._getframe`` /
+        ``frame.f_back`` and inspects only each frame's *function
+        arguments* — never its full locals. This matches vLLM's
+        ``create_chat_completion(self, request, ...)`` signature and avoids
+        matching request-shaped objects that happen to live in module
+        globals or unrelated locals (e.g. test fixtures).
+
+        We deliberately avoid :func:`inspect.stack`, which reads source
+        files via ``linecache`` and builds ``FrameInfo`` objects for the
+        whole stack on every call — measurable overhead per request under
+        high concurrency, since parser construction is per-request and
+        runs under the GIL on the serving event loop.
+        """
+        import sys
+        try:
+            frame = sys._getframe(1)
+        except Exception:
+            return None
+        depth = 0
+        while frame is not None and depth < max_depth:
+            code = frame.f_code
+            n_args = code.co_argcount + code.co_kwonlyargcount
+            for name in code.co_varnames[:n_args]:
+                value = frame.f_locals.get(name)
+                if cls._looks_like_request(value):
+                    return value
+            frame = frame.f_back
+            depth += 1
+        return None
+
+    def _configure_for_request(self, request) -> None:
+        """Set initial streaming lane from the request's thinking flag."""
+        self._state = "reasoning" if self._thinking_was_enabled(request) else "content"
+        self._configured = True
+
+    def _decode(self, ids: Sequence[int]) -> str:
+        # `skip_special_tokens=False` is required: `</think>` may be tokenized
+        # as (or contain) special tokens that the default decode would strip,
+        # which would silently break marker detection.
+        try:
+            return self.model_tokenizer.decode(list(ids), skip_special_tokens=False)
+        except Exception:
+            return ""
+
+    @property
+    def reasoning_start_str(self) -> str | None:
+        return START
+
+    @property
+    def reasoning_end_str(self) -> str | None:
+        return END
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        return END in self._decode(input_ids)
+
+    def is_reasoning_end_streaming(
+        self, input_ids: Sequence[int], delta_ids: Iterable[int]
+    ) -> bool:
+        # Decode a 64-token tail window so the marker is detected even when
+        # it straddles the previous-vs-delta token boundary (BPE may split
+        # `</think>` across multiple tokens, especially around punctuation).
+        tail = list(input_ids)[-64:]
+        return END in self._decode(tail)
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        text = self._decode(input_ids)
+        idx = text.rfind(END)
+        if idx < 0:
+            return []
+        try:
+            return self.model_tokenizer.encode(
+                text[idx + len(END):], add_special_tokens=False
+            )
+        except Exception:
+            return []
+
+    def count_reasoning_tokens(self, token_ids: Sequence[int]) -> int:
+        text = self._decode(token_ids)
+        idx = text.find(END)
+        prefix = text if idx < 0 else text[:idx]
+        try:
+            return len(self.model_tokenizer.encode(prefix, add_special_tokens=False))
+        except Exception:
+            return 0
+
+    def extract_reasoning(
+        self,
+        model_output: str,
+        request: "ChatCompletionRequest | ResponsesRequest",
+    ) -> tuple[str | None, str | None]:
+        """Split a full (non-streaming) output into ``(reasoning, content)``.
+
+        Returns ``(None, content)`` when the request has thinking disabled
+        and the output contains no marker — the chat template pre-emits
+        ``<think></think>`` in the prompt in that case, so a marker-less
+        output is purely the answer.
+        """
+        # Configure streaming state as a side effect: a fork's serving layer
+        # may call this before streaming starts, and we don't want the
+        # streaming path to fall back to the `thinking=off` default if the
+        # request actually had thinking enabled.
+        if not self._configured:
+            self._configure_for_request(request)
+
+        s = model_output
+        if s.startswith(START):
+            s = s[len(START):]
+        if END in s:
+            reasoning, _, content = s.partition(END)
+            return (reasoning.strip("\n") or None, content.lstrip("\n") or None)
+        # No `</think>` in output: only treat the text as truncated reasoning
+        # if we have positive evidence that thinking was enabled — otherwise
+        # it is the final answer.
+        if self._thinking_was_enabled(request):
+            return (s.strip("\n") or None, None)
+        return (None, s.lstrip("\n") or None)
+
+    @staticmethod
+    def _thinking_was_enabled(request) -> bool:
+        """Whether ``request`` asked for reasoning to be emitted.
+
+        Mirrors the chat template's own detection so the parser stays in
+        lockstep with prompt construction: enabled iff
+        ``chat_template_kwargs.enable_thinking`` (or ``.thinking``) is
+        truthy, or any system message contains the literal ``"thinking on"``
+        directive (case-insensitive).
+        """
+        kwargs = getattr(request, "chat_template_kwargs", None) or {}
+        if kwargs.get("enable_thinking") or kwargs.get("thinking"):
+            return True
+        messages = getattr(request, "messages", None) or []
+        for m in messages:
+            role = m.get("role") if isinstance(m, dict) else getattr(m, "role", None)
+            if role != "system":
+                continue
+            content = m.get("content") if isinstance(m, dict) else getattr(m, "content", None)
+            if isinstance(content, str) and "thinking on" in content.lower():
+                return True
+        return False
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> "DeltaMessage | None":
+        """Emit one ``DeltaMessage`` per delta, routed to reasoning or content.
+
+        The marker ``</think>`` is never emitted to the client. Trailing
+        bytes of the buffer that *could* still grow into the marker on the
+        next delta are held back, so the marker is never fragmented across
+        deltas (e.g. ``</thi`` ... ``nk>``). When the marker is observed,
+        pre-marker bytes go to the current lane and post-marker bytes go
+        to ``content``; the lane is then locked to ``content``.
+        """
+        from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+
+        self._buffer += delta_text
+
+        # Case 1 — marker fully present in the buffer: split and switch lane.
+        # The pre-marker chunk stays on the *current* lane (reasoning if we
+        # were inside <think>, content otherwise); the post-marker chunk
+        # always goes to content; the lane is locked to content afterwards.
+        idx = self._buffer.find(END)
+        if idx >= 0:
+            pre = self._buffer[:idx]
+            post = self._buffer[idx + len(END):]
+            self._buffer = ""
+            pre_lane = self._state
+            self._state = "content"
+            if not pre and not post:
+                return None
+            fields: dict = {}
+            if pre:
+                fields[pre_lane] = pre
+            if post:
+                # `.get` covers the edge case where pre_lane is already
+                # "content" and both pre and post are non-empty — they get
+                # concatenated into a single content delta.
+                fields["content"] = fields.get("content", "") + post
+            return DeltaMessage(**fields)
+
+        # Case 2 — no marker yet: release everything except a possible
+        # partial-marker tail, which we retain for the next delta.
+        held = _max_suffix_prefix(self._buffer, END)
+        safe_end = len(self._buffer) - len(held)
+        if safe_end == 0:
+            return None
+        chunk = self._buffer[:safe_end]
+        self._buffer = self._buffer[safe_end:]
+        return DeltaMessage(**{self._state: chunk})