Fix Harmony reasoning parser and auto-separation for gpt-oss models (#9190)
Co-authored-by: Chang Su <chang.s.su@oracle.com> Co-authored-by: Chayenne <zhaochen20@outlook.com> Co-authored-by: zhaochenyang20 <zhaochenyang20@gmail.com> Co-authored-by: minleminzui <2969413251@qq.com> Co-authored-by: maocheng23 <maocheng@berkeley.edu> Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
@@ -148,6 +148,16 @@ class OpenAIServingChat(OpenAIServingBase):
|
||||
self, request: ChatCompletionRequest, is_multimodal: bool
|
||||
) -> MessageProcessingResult:
|
||||
"""Process chat messages and apply chat template"""
|
||||
is_gpt_oss = (
|
||||
hasattr(self.tokenizer_manager.model_config, "hf_config")
|
||||
and hasattr(self.tokenizer_manager.model_config.hf_config, "model_type")
|
||||
and self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
|
||||
)
|
||||
|
||||
# GptOss model needs to keep special tokens for harmony parsing
|
||||
if is_gpt_oss:
|
||||
request.skip_special_tokens = False
|
||||
|
||||
tool_call_constraint = None
|
||||
|
||||
# Apply chat template and its stop strings
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
|
||||
from sglang.srt.entrypoints.openai.protocol import Tool
|
||||
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
||||
@@ -10,60 +10,31 @@ from sglang.srt.function_call.core_types import (
|
||||
ToolCallItem,
|
||||
_GetInfoFunc,
|
||||
)
|
||||
from sglang.srt.harmony_parser import HarmonyParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GptOssDetector(BaseFormatDetector):
|
||||
"""
|
||||
Detector for T4-style function calls with channel format.
|
||||
Detector for T4-style function calls using HarmonyParser.
|
||||
|
||||
Supports two formats:
|
||||
1. Direct function call: <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
|
||||
2. Commentary with action plan: <|channel|>commentary<|message|>{content}<|end|>
|
||||
|
||||
For parallel function calls, each call is self-contained and starts with its own channel:
|
||||
<|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"SF"}<|call|>
|
||||
<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"SF attractions"}<|call|>
|
||||
|
||||
Examples:
|
||||
Single: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"San Francisco"}<|call|>commentary
|
||||
Multiple: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"Paris"}<|call|>commentary<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"Paris tourism"}<|call|>
|
||||
With Action Plan: <|channel|>commentary<|message|>**Action plan**: 1. Do X 2. Do Y<|end|><|start|>assistant<|channel|>commentary to=functions.x<|constrain|>json<|message|>{"template": "basic_html", "path": "index.html"}<|call|>
|
||||
Handles tool calls in the format:
|
||||
<|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.harmony_parser = HarmonyParser()
|
||||
self.bot_token = "<|start|>assistant<|channel|>commentary"
|
||||
self.eot_token = "<|call|>"
|
||||
# TODO: no clear indication how parallel tool call response format is
|
||||
self.tool_call_separator = ""
|
||||
|
||||
# Pattern for complete function calls with to= parameter
|
||||
# Handles both <|call|> and <|call|>commentary endings
|
||||
# Also handles optional <|start|>assistant prefix and whitespace after function name
|
||||
self.function_call_pattern = re.compile(
|
||||
r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
|
||||
r"<\|constrain\|>json<\|message\|>(.*?)<\|call\|>(?:commentary)?",
|
||||
# Pattern to extract function name and JSON from tool_call event content
|
||||
self.tool_extract_pattern = re.compile(
|
||||
r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
# Pattern for streaming function calls (incomplete)
|
||||
# Also handles optional whitespace after function name
|
||||
self.streaming_pattern = re.compile(
|
||||
r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
|
||||
r"<\|constrain\|>json<\|message\|>(.*)",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
# Pattern for commentary with action plan (no to= parameter)
|
||||
self.commentary_pattern = re.compile(
|
||||
r"<\|channel\|>commentary<\|message\|>(.*?)<\|end\|>",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
self._last_arguments = ""
|
||||
|
||||
def has_tool_call(self, text: str) -> bool:
    """Return True when `text` contains the commentary-channel bot token.

    The bot token (`<|start|>assistant<|channel|>commentary`) marks the
    beginning of a tool-call block in the Harmony output stream.
    """
    return self.bot_token in text
|
||||
@@ -73,259 +44,176 @@ class GptOssDetector(BaseFormatDetector):
|
||||
if not self.has_tool_call(text):
|
||||
return StreamingParseResult(normal_text=text, calls=[])
|
||||
|
||||
tool_indices = self._get_tool_indices(tools)
|
||||
# Parse with HarmonyParser
|
||||
events = self.harmony_parser.parse(text)
|
||||
# Flush buffer for complete parsing
|
||||
events += self.harmony_parser.parse("")
|
||||
|
||||
tool_indices = self._get_tool_indices(tools)
|
||||
calls = []
|
||||
normal_parts = []
|
||||
tool_index = 0
|
||||
|
||||
# Process the entire text to handle mixed commentary and tool calls
|
||||
normal_text_parts = []
|
||||
|
||||
# Find all commentary sections (both with and without to=)
|
||||
all_commentary_pattern = re.compile(
|
||||
r"<\|channel\|>commentary(?:\s+to=[^<]*)?<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
# Track processed positions to avoid double-processing
|
||||
processed_ranges = []
|
||||
|
||||
# First, extract all tool calls
|
||||
for match in self.function_call_pattern.finditer(text):
|
||||
full_function_name = match.group(1)
|
||||
args_content = match.group(2)
|
||||
processed_ranges.append((match.start(), match.end()))
|
||||
|
||||
function_name = (
|
||||
full_function_name.split(".")[-1]
|
||||
if "." in full_function_name
|
||||
else full_function_name
|
||||
)
|
||||
|
||||
try:
|
||||
arguments = json.loads(args_content) if args_content.strip() else {}
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if function_name in tool_indices:
|
||||
calls.append(
|
||||
ToolCallItem(
|
||||
tool_index=tool_index,
|
||||
name=function_name,
|
||||
parameters=json.dumps(arguments, ensure_ascii=False),
|
||||
)
|
||||
for event in events:
|
||||
if event.event_type == "tool_call":
|
||||
# Extract tool call from event content
|
||||
tool_call = self._extract_tool_call_from_event(
|
||||
event.raw_text if event.raw_text else event.content,
|
||||
tool_indices,
|
||||
tool_index,
|
||||
)
|
||||
tool_index += 1
|
||||
if tool_call:
|
||||
calls.append(tool_call)
|
||||
tool_index += 1
|
||||
elif event.event_type == "normal":
|
||||
normal_parts.append(event.content)
|
||||
# Ignore reasoning events in function call context
|
||||
|
||||
# Then, find non-tool-call commentary sections for normal text
|
||||
for match in all_commentary_pattern.finditer(text):
|
||||
# Check if this match overlaps with any processed tool call
|
||||
match_start, match_end = match.start(), match.end()
|
||||
is_tool_call = any(
|
||||
start <= match_start < end or start < match_end <= end
|
||||
for start, end in processed_ranges
|
||||
)
|
||||
|
||||
# If this commentary is not part of a tool call, include it in normal text
|
||||
if not is_tool_call:
|
||||
content = match.group(1).strip()
|
||||
if content:
|
||||
normal_text_parts.append(content)
|
||||
|
||||
# Handle remaining text after all matches
|
||||
if processed_ranges:
|
||||
last_match_end = max(end for _, end in processed_ranges)
|
||||
if last_match_end < len(text):
|
||||
remaining_text = text[last_match_end:]
|
||||
|
||||
# Clean up <|start|>assistant prefixes and extract final content
|
||||
# Remove standalone <|start|>assistant prefixes
|
||||
remaining_text = re.sub(r"<\|start\|>assistant(?!\w)", "", remaining_text)
|
||||
|
||||
# Extract content from final channel if present
|
||||
final_pattern = re.compile(
|
||||
r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)", re.DOTALL
|
||||
)
|
||||
final_match = final_pattern.search(remaining_text)
|
||||
|
||||
if final_match:
|
||||
# Get everything before final channel + final channel content
|
||||
before_final = remaining_text[: final_match.start()].strip()
|
||||
final_content = final_match.group(1).strip()
|
||||
|
||||
parts = []
|
||||
if before_final:
|
||||
parts.append(before_final)
|
||||
if final_content:
|
||||
parts.append(final_content)
|
||||
remaining_text = " ".join(parts) if parts else ""
|
||||
|
||||
remaining_text = remaining_text.strip()
|
||||
|
||||
if remaining_text:
|
||||
normal_text_parts.append(remaining_text)
|
||||
|
||||
# Combine all normal text parts
|
||||
final_normal_text = " ".join(part for part in normal_text_parts if part).strip()
|
||||
return StreamingParseResult(normal_text=final_normal_text, calls=calls)
|
||||
normal_text = " ".join(normal_parts).strip()
|
||||
return StreamingParseResult(normal_text=normal_text, calls=calls)
|
||||
|
||||
def parse_streaming_increment(
|
||||
self, new_text: str, tools: List[Tool]
|
||||
) -> StreamingParseResult:
|
||||
"""Parse incremental streaming text for TypeScript-style function calls."""
|
||||
self._buffer += new_text
|
||||
current_text = self._buffer
|
||||
|
||||
# Check if we have a tool call
|
||||
has_tool_call = "<|channel|>commentary to=" in current_text
|
||||
# Always use HarmonyParser for parsing to ensure proper filtering
|
||||
events = self.harmony_parser.parse(new_text)
|
||||
|
||||
if not has_tool_call and current_text:
|
||||
# Check for commentary without function calls
|
||||
commentary_match = self.commentary_pattern.search(current_text)
|
||||
if commentary_match:
|
||||
commentary_content = commentary_match.group(1)
|
||||
self._buffer = current_text[commentary_match.end() :]
|
||||
return StreamingParseResult(normal_text=commentary_content, calls=[])
|
||||
# Quick check if we might have tool calls
|
||||
if (
|
||||
"<|channel|>commentary to=" not in self._buffer
|
||||
and not self.current_tool_name_sent
|
||||
):
|
||||
# No tool calls detected, check for final content
|
||||
if (
|
||||
"<|channel|>final" in self._buffer
|
||||
or "assistantfinal" in self._buffer.lower()
|
||||
):
|
||||
# Extract normal text from events
|
||||
normal_text = "".join(
|
||||
[e.content for e in events if e.event_type == "normal"]
|
||||
)
|
||||
if normal_text:
|
||||
self._buffer = ""
|
||||
return StreamingParseResult(normal_text=normal_text, calls=[])
|
||||
|
||||
# Check for final channel content
|
||||
final_pattern = re.compile(
|
||||
r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
|
||||
re.DOTALL,
|
||||
# For other content, extract normal text from events (with filtering applied)
|
||||
normal_text = "".join(
|
||||
[e.content for e in events if e.event_type == "normal"]
|
||||
)
|
||||
final_match = final_pattern.search(current_text)
|
||||
if final_match:
|
||||
final_content = final_match.group(1).strip()
|
||||
if normal_text or events:
|
||||
self._buffer = ""
|
||||
return StreamingParseResult(normal_text=final_content, calls=[])
|
||||
return StreamingParseResult(normal_text=normal_text, calls=[])
|
||||
else:
|
||||
# No events processed, continue buffering
|
||||
return StreamingParseResult(normal_text="", calls=[])
|
||||
|
||||
self._buffer = ""
|
||||
return StreamingParseResult(normal_text=new_text, calls=[])
|
||||
if not events:
|
||||
# No complete events yet
|
||||
return StreamingParseResult(normal_text="", calls=[])
|
||||
|
||||
# Initialize state if needed
|
||||
if not hasattr(self, "_tool_indices"):
|
||||
self._tool_indices = self._get_tool_indices(tools)
|
||||
|
||||
calls = []
|
||||
try:
|
||||
# Check for streaming function call
|
||||
match = self.streaming_pattern.search(current_text)
|
||||
if match:
|
||||
full_function_name = match.group(1)
|
||||
args_content = match.group(2)
|
||||
normal_text = ""
|
||||
|
||||
function_name = (
|
||||
full_function_name.split(".")[-1]
|
||||
if "." in full_function_name
|
||||
else full_function_name
|
||||
for event in events:
|
||||
if event.event_type == "tool_call":
|
||||
# We got a complete tool call from HarmonyParser
|
||||
tool_call_info = self._extract_tool_call_from_event(
|
||||
event.raw_text if event.raw_text else event.content,
|
||||
self._tool_indices,
|
||||
self.current_tool_id if self.current_tool_id >= 0 else 0,
|
||||
)
|
||||
|
||||
# Initialize state if this is the first tool call
|
||||
if self.current_tool_id == -1:
|
||||
self.current_tool_id = 0
|
||||
self.prev_tool_call_arr = []
|
||||
self.streamed_args_for_tool = [""]
|
||||
if tool_call_info:
|
||||
# Initialize state if first tool
|
||||
if self.current_tool_id == -1:
|
||||
self.current_tool_id = 0
|
||||
self.prev_tool_call_arr = []
|
||||
self.streamed_args_for_tool = [""]
|
||||
|
||||
# Ensure we have enough entries in tracking arrays
|
||||
while len(self.prev_tool_call_arr) <= self.current_tool_id:
|
||||
self.prev_tool_call_arr.append({})
|
||||
while len(self.streamed_args_for_tool) <= self.current_tool_id:
|
||||
self.streamed_args_for_tool.append("")
|
||||
# Ensure arrays are large enough
|
||||
while len(self.prev_tool_call_arr) <= self.current_tool_id:
|
||||
self.prev_tool_call_arr.append({})
|
||||
while len(self.streamed_args_for_tool) <= self.current_tool_id:
|
||||
self.streamed_args_for_tool.append("")
|
||||
|
||||
if not self.current_tool_name_sent:
|
||||
calls.append(
|
||||
ToolCallItem(
|
||||
tool_index=self.current_tool_id,
|
||||
name=function_name,
|
||||
parameters="",
|
||||
)
|
||||
)
|
||||
self.current_tool_name_sent = True
|
||||
# Store the tool call info
|
||||
# Store tool call info
|
||||
self.prev_tool_call_arr[self.current_tool_id] = {
|
||||
"name": function_name,
|
||||
"arguments": {},
|
||||
"name": tool_call_info.name,
|
||||
"arguments": json.loads(tool_call_info.parameters),
|
||||
}
|
||||
self.streamed_args_for_tool[self.current_tool_id] = ""
|
||||
|
||||
# Check if we have a complete function call
|
||||
complete_match = self.function_call_pattern.search(current_text)
|
||||
if complete_match:
|
||||
args_content = complete_match.group(2)
|
||||
# Emit the complete tool call at once
|
||||
# (Could be modified to emit name first, then args, if needed)
|
||||
calls.append(tool_call_info)
|
||||
|
||||
try:
|
||||
parsed_args = json.loads(args_content)
|
||||
self.prev_tool_call_arr[self.current_tool_id][
|
||||
"arguments"
|
||||
] = parsed_args
|
||||
|
||||
# Send complete arguments if we haven't sent them yet
|
||||
if not self.streamed_args_for_tool[self.current_tool_id]:
|
||||
# Send the complete arguments as JSON string
|
||||
calls.append(
|
||||
ToolCallItem(
|
||||
tool_index=self.current_tool_id,
|
||||
name=None,
|
||||
parameters=json.dumps(
|
||||
parsed_args, ensure_ascii=False
|
||||
),
|
||||
)
|
||||
)
|
||||
self.streamed_args_for_tool[self.current_tool_id] = (
|
||||
json.dumps(parsed_args, ensure_ascii=False)
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Remove the completed function call from buffer
|
||||
remaining_after_call = current_text[complete_match.end() :]
|
||||
|
||||
# Clean up <|start|>assistant prefixes and extract final content
|
||||
remaining_after_call = re.sub(
|
||||
r"<\|start\|>assistant(?!\w)", "", remaining_after_call
|
||||
# Mark as streamed
|
||||
self.streamed_args_for_tool[self.current_tool_id] = (
|
||||
tool_call_info.parameters
|
||||
)
|
||||
|
||||
# Extract content from final channel if present
|
||||
final_pattern = re.compile(
|
||||
r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
|
||||
re.DOTALL,
|
||||
)
|
||||
final_match = final_pattern.search(remaining_after_call)
|
||||
|
||||
if final_match:
|
||||
before_final = remaining_after_call[
|
||||
: final_match.start()
|
||||
].strip()
|
||||
final_content = final_match.group(1).strip()
|
||||
|
||||
parts = []
|
||||
if before_final:
|
||||
parts.append(before_final)
|
||||
if final_content:
|
||||
parts.append(final_content)
|
||||
remaining_after_call = " ".join(parts) if parts else ""
|
||||
|
||||
self._buffer = remaining_after_call.strip()
|
||||
|
||||
# Reset state for next tool call
|
||||
self.current_tool_name_sent = False
|
||||
# Move to next tool
|
||||
self.current_tool_id += 1
|
||||
self.current_tool_name_sent = False
|
||||
|
||||
# Return final content if available
|
||||
final_text = ""
|
||||
if final_match and final_content:
|
||||
final_text = final_content
|
||||
elif remaining_after_call:
|
||||
final_text = remaining_after_call
|
||||
elif event.event_type == "normal":
|
||||
normal_text += event.content
|
||||
|
||||
return StreamingParseResult(normal_text=final_text, calls=calls)
|
||||
# Clear buffer since HarmonyParser handles buffering
|
||||
self._buffer = ""
|
||||
|
||||
return StreamingParseResult(normal_text="", calls=calls)
|
||||
return StreamingParseResult(normal_text=normal_text, calls=calls)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in parse_streaming_increment: {e}")
|
||||
return StreamingParseResult(normal_text=current_text, calls=[])
|
||||
def _extract_tool_call_from_event(
    self, content: str, tool_indices: dict, tool_index: int
) -> Optional[ToolCallItem]:
    """
    Extract tool call information from HarmonyParser event content.

    Content format: "commentary to=functions.get_weather<|constrain|>json<|message|>{...}"

    Returns a ToolCallItem, or None when the content does not match the
    expected pattern, names an unknown tool, or carries invalid JSON.
    """
    matched = self.tool_extract_pattern.search(content)
    if not matched:
        logger.debug(f"Could not extract tool call from: {content[:100]}")
        return None

    full_function_name = matched.group(1)
    json_content = matched.group(2)

    # The short name is the last dotted component, e.g. "functions.get_weather".
    function_name = (
        full_function_name.split(".")[-1]
        if "." in full_function_name
        else full_function_name
    )

    # Only emit calls for tools the request actually declared.
    if function_name not in tool_indices:
        logger.debug(f"Function {function_name} not in available tools")
        return None

    try:
        arguments = json.loads(json_content) if json_content.strip() else {}
    except json.JSONDecodeError as e:
        logger.debug(f"Failed to parse JSON arguments: {e}")
        return None

    return ToolCallItem(
        tool_index=tool_index,
        name=function_name,
        parameters=json.dumps(arguments, ensure_ascii=False),
    )
|
||||
|
||||
def structure_info(self) -> _GetInfoFunc:
    """Not supported: parsing is driven by HarmonyParser, not structure info.

    Raises:
        NotImplementedError: always. (The merged diff left a duplicate bare
        raise before the descriptive one; only the descriptive raise is kept.)
    """
    raise NotImplementedError("structure_info not used with HarmonyParser")
|
||||
|
||||
def build_ebnf(self, tools: List[Tool]) -> str:
    """Not supported: EBNF constraints are not generated for HarmonyParser.

    Raises:
        NotImplementedError: always. (The merged diff left a duplicate bare
        raise before the descriptive one; only the descriptive raise is kept.)
    """
    raise NotImplementedError("build_ebnf not used with HarmonyParser")
|
||||
|
||||
588
python/sglang/srt/harmony_parser.py
Normal file
588
python/sglang/srt/harmony_parser.py
Normal file
@@ -0,0 +1,588 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterator, List, Optional, Tuple
|
||||
|
||||
|
||||
@dataclass
class Event:
    """Represents a parsed event from the Harmony stream."""

    # Event kind as emitted by the parsing strategies: "normal",
    # "reasoning", or "tool_call".
    event_type: str
    # Payload text with structural markers stripped.
    content: str
    # Original text including structural markers (set for tool_call events).
    # Fixed annotation: the default is None, so the type must be Optional[str].
    raw_text: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class Token:
    """A structural token in the Harmony format.

    Instances carry only offsets into the source text; the text itself is
    never copied here.
    """

    # Token kind, e.g. "CHANNEL", "MESSAGE", "CALL", or "TEXT" for plain spans.
    type: str
    # Start offset (inclusive) into the source text.
    start: int
    # End offset (exclusive) into the source text.
    end: int
|
||||
|
||||
|
||||
def prefix_hold(text: str, tokens: List[str]) -> Tuple[str, str]:
    """
    Holds back the longest suffix of `text` that could be a prefix of any token.
    Returns (emit_now, keep_for_later).

    A suffix equal to a *complete* token is never held (only strict prefixes,
    hence the len(tok) - 1 bound), so finished markers flow through.
    """
    if not text:
        return "", ""

    hold = 0
    for marker in tokens:
        if not marker:
            continue
        # Longest suffix of `text` (up to a strict prefix of `marker`)
        # that matches the start of `marker`.
        limit = min(len(marker) - 1, len(text))
        k = next(
            (n for n in range(limit, 0, -1) if marker.startswith(text[-n:])),
            0,
        )
        hold = max(hold, k)

    if hold == 0:
        return text, ""
    return text[:-hold], text[-hold:]
|
||||
|
||||
|
||||
def iter_tokens(text: str, start_pos: int = 0) -> Iterator[Token]:
    """Yield structural tokens (and TEXT spans between them) left to right.

    Unknown `<|...|>` tokens are surfaced as TEXT so downstream consumers can
    decide how to treat them; a partial known token at the end of the input is
    emitted as TEXT so streaming callers can hold it for the next chunk.
    """
    literals = {
        "<|start|>": "START",
        "<|channel|>": "CHANNEL",
        "<|message|>": "MESSAGE",
        "<|constrain|>": "CONSTRAIN",
        "<|end|>": "END",
        "<|call|>": "CALL",
        "<|return|>": "RETURN",
    }

    cursor = start_pos
    saw_unknown = False
    while cursor < len(text):
        # Locate the next potential marker.
        at = text.find("<|", cursor)
        if at == -1:
            break

        # Plain text before the marker.
        if at > cursor:
            yield Token("TEXT", cursor, at)

        # Try to match a known structural token at this position.
        matched = next((lit for lit in literals if text.startswith(lit, at)), None)
        if matched is not None:
            yield Token(literals[matched], at, at + len(matched))
            cursor = at + len(matched)
            continue

        tail = text[at:]
        if any(lit.startswith(tail) for lit in literals):
            # Possible partial token at the end of the chunk: hold whole tail.
            yield Token("TEXT", at, len(text))
            cursor = len(text)
            break

        # Unknown token like <|weird|>: emit the "<|" as TEXT first.
        saw_unknown = True
        yield Token("TEXT", at, at + 2)

        close = text.find("|>", at + 2)
        if close == -1:
            # No closing "|>"; step past "<|" and keep scanning.
            cursor = at + 2
            continue

        # Emit the unknown body plus any following plain text, up to the
        # next structural marker (or end of input).
        nxt = text.find("<|", close + 2)
        if nxt != -1:
            yield Token("TEXT", at + 2, nxt)
            cursor = nxt
        else:
            yield Token("TEXT", at + 2, len(text))
            cursor = len(text)
            break

    # Trailing plain text after the last marker.
    if cursor < len(text):
        yield Token("TEXT", cursor, len(text))
    elif cursor == len(text) and saw_unknown:
        # Empty trailing TEXT only when unknown tokens were seen and the
        # input ends with a known structural token (matches expected tests).
        if any(text.endswith(lit) for lit in literals):
            yield Token("TEXT", cursor, cursor)
|
||||
|
||||
|
||||
class CanonicalStrategy:
    """Parses the canonical Harmony format with channel markers."""

    def __init__(self):
        # Markers whose partial prefixes must be held back at the end of a
        # streaming chunk (see prefix_hold).
        self.guard_tokens = [
            "<|start|>",
            "<|channel|>",
            "<|message|>",
            "<|constrain|>",
            "<|end|>",
            "<|call|>",
            "<|return|>",
        ]

    def parse(self, text: str) -> Tuple[List[Event], str]:
        """Parse `text` into events; return (events, unconsumed_tail)."""
        events: List[Event] = []
        toks = list(iter_tokens(text))
        if not toks:
            return events, ""

        i = 0
        n = len(toks)
        while i < n:
            tok = toks[i]

            if tok.type == "TEXT":
                if i == n - 1:
                    # Last token: trailing text may end in a partial marker,
                    # so hold that suffix for the next chunk.
                    emit, hold = prefix_hold(
                        text[tok.start : tok.end], self.guard_tokens
                    )
                    if emit:
                        events.append(Event("normal", emit))
                    return events, hold
                if self._is_commentary_filler_between_blocks(text, toks, i):
                    # Filler between blocks is dropped, not emitted.
                    i += 1
                    continue
                chunk = text[tok.start : tok.end]
                # Stray structural tokens are never surfaced as normal text.
                if not self._is_standalone_structural_token(chunk):
                    events.append(Event("normal", chunk))
                i += 1

            elif tok.type in ("START", "CHANNEL"):
                # A channel block begins here.
                parsed = self._parse_block(text, toks, i)
                if parsed is None:
                    # Incomplete block: try streaming partial analysis content.
                    partial = self._parse_partial_analysis(text, toks, i)
                    if partial:
                        event, rest = partial
                        events.append(event)
                        return events, rest
                    # Nothing partial to emit; hold the remaining text.
                    return events, text[toks[i].start :]
                event, nxt = parsed
                if event:
                    events.append(event)
                i = nxt

            else:
                if self._is_commentary_filler_between_blocks(text, toks, i):
                    i += 1
                    continue
                # Unexpected token: surface it only if it is not a lone marker.
                chunk = text[tok.start : tok.end]
                if not self._is_standalone_structural_token(chunk):
                    events.append(Event("normal", chunk))
                i += 1

        return events, ""

    def _parse_partial_analysis(
        self, text: str, tokens: List[Token], start_pos: int
    ) -> Optional[Tuple[Event, str]]:
        """Try to parse partial analysis content for incremental streaming."""
        i = start_pos
        if i < len(tokens) and tokens[i].type == "START":
            i += 1

        # Locate <|channel|> followed by <|message|>.
        channel_at = None
        message_at = None
        for j in range(i, len(tokens)):
            if tokens[j].type == "CHANNEL" and channel_at is None:
                channel_at = j
            elif tokens[j].type == "MESSAGE":
                message_at = j
                break
        if channel_at is None or message_at is None:
            return None

        hdr_start = (
            tokens[channel_at + 1].start
            if channel_at + 1 < len(tokens)
            else tokens[channel_at].end
        )
        header = text[hdr_start : tokens[message_at].start]

        # Only analysis content is streamed early; tool calls wait for
        # completion.
        if self._extract_channel_type(header) != "analysis":
            return None

        body_start = tokens[message_at].end
        # Emit the partial reasoning and keep the channel header so the next
        # parse still sees the full structure.
        return (
            Event("reasoning", text[body_start:]),
            text[tokens[start_pos].start : body_start],
        )

    def _extract_channel_type(self, header_text: str) -> Optional[str]:
        """Extract channel type from header, ignoring other attributes like to=... or <|constrain|>..."""
        head = header_text.strip().lower()
        # Check in the same precedence order as the channel kinds appear.
        for kind in ("analysis", "commentary", "final"):
            if head.startswith(kind):
                return kind
        return None  # Unknown channel type

    def _parse_block(
        self, text: str, tokens: List[Token], start_pos: int
    ) -> Optional[Tuple[Optional[Event], int]]:
        """Parse a channel block. Returns (event, next_pos) or None if incomplete."""
        i = start_pos
        if i < len(tokens) and tokens[i].type == "START":
            i += 1

        # Look for <|channel|> or <|message|> (tool responses go straight to
        # message with no channel header).
        channel_at = None
        message_at = None
        for j in range(i, len(tokens)):
            if tokens[j].type == "CHANNEL" and channel_at is None:
                channel_at = j
            elif tokens[j].type == "MESSAGE":
                message_at = j
                break
        if message_at is None:
            return None  # No message token found

        if channel_at is None:
            # Tool response: emit its body as normal text.
            body_start = tokens[message_at].end
            closer = next(
                (
                    j
                    for j in range(message_at + 1, len(tokens))
                    if tokens[j].type in ("END", "CALL", "RETURN")
                ),
                None,
            )
            if closer is None:
                return None  # Incomplete
            return (
                Event("normal", text[body_start : tokens[closer].start]),
                closer + 1,
            )

        # Standard channel block: read the header between CHANNEL and MESSAGE.
        i = channel_at + 1
        hdr_start = tokens[i].start if i < len(tokens) else tokens[i - 1].end
        header = text[hdr_start : tokens[message_at].start]
        channel_type = self._extract_channel_type(header)
        if not channel_type:
            return None  # Unknown or malformed channel

        body_start = tokens[message_at].end
        end_at = message_at + 1

        # Each channel type has its own valid closing tokens.
        closers = ("RETURN",) if channel_type == "final" else ("END", "CALL")
        while end_at < len(tokens) and tokens[end_at].type not in closers:
            end_at += 1

        if end_at >= len(tokens):
            if channel_type == "final":
                # Final blocks may end at end-of-input without <|return|>.
                return Event("normal", text[body_start:]), end_at
            return None  # Analysis and commentary need proper end tokens

        closer = tokens[end_at]
        body = text[body_start : closer.start]

        if channel_type in ("analysis", "commentary"):
            if closer.type == "CALL":
                # Tool calls (built-in tools use the analysis channel) keep
                # their raw text including structural markers.
                raw = text[tokens[start_pos].start : closer.end]
                return Event("tool_call", body.strip(), raw), end_at + 1
            if channel_type == "analysis":
                return Event("reasoning", body), end_at + 1
            return Event("normal", body), end_at + 1

        # final: include any trailing TEXT immediately after <|return|>.
        if closer.type == "RETURN" and end_at + 1 < len(tokens):
            follow = tokens[end_at + 1]
            if follow.type == "TEXT":
                return (
                    Event("normal", body + text[follow.start : follow.end]),
                    end_at + 2,
                )
        return Event("normal", body), end_at + 1

    def _is_commentary_filler_between_blocks(
        self, text: str, tokens: List[Token], pos: int
    ) -> bool:
        """Check if this is commentary filler text or problematic structural tokens in malformed sequences."""
        tok = tokens[pos]
        stripped = text[tok.start : tok.end].strip()

        # CALL -> TEXT("commentary") -> CHANNEL is inter-block filler.
        if 0 < pos < len(tokens) - 1:
            if (
                tokens[pos - 1].type == "CALL"
                and tokens[pos + 1].type == "CHANNEL"
                and stripped.lower() == "commentary"
            ):
                return True

        # Tokens directly after CALL in malformed sequences are noise.
        if pos > 0 and tokens[pos - 1].type == "CALL":
            # MESSAGE should never follow CALL in well-formed content.
            if tok.type == "MESSAGE":
                return True
            if tok.type == "TEXT" and stripped.lower() == "commentary":
                return True

        return False

    def _is_standalone_structural_token(self, content: str) -> bool:
        """Check if content is just a standalone structural token that should be filtered."""
        return content.strip() in {
            "<|start|>",
            "<|channel|>",
            "<|message|>",
            "<|constrain|>",
            "<|end|>",
            "<|call|>",
            "<|return|>",
        }
|
||||
|
||||
|
||||
class TextStrategy:
|
||||
"""Parses the text-based Harmony fallback format."""
|
||||
|
||||
def __init__(self):
|
||||
self.buffer_context = ""
|
||||
self.patterns = {
|
||||
"analysis_then_final": re.compile(
|
||||
r"^\s*(?:assistant)?\s*(analysis|commentary)(.*?)\s*assistantfinal\s*(.*)\s*$",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
),
|
||||
"final_only": re.compile(
|
||||
r"^\s*assistantfinal\s*(.*)\s*$", re.IGNORECASE | re.DOTALL
|
||||
),
|
||||
"analysis_only": re.compile(
|
||||
r"^\s*(?:assistant)?\s*(analysis|commentary)(.*)\s*$",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
),
|
||||
}
|
||||
|
||||
def set_buffer_context(self, buffer: str):
|
||||
self.buffer_context = buffer
|
||||
|
||||
def parse(self, text: str) -> Tuple[List[Event], str]:
|
||||
events = []
|
||||
|
||||
m = self.patterns["analysis_then_final"].match(text)
|
||||
if m:
|
||||
channel, reasoning, final = m.groups()
|
||||
if channel.lower() == "analysis" and reasoning.strip():
|
||||
events.append(Event("reasoning", reasoning.strip()))
|
||||
elif channel.lower() == "commentary" and reasoning.strip():
|
||||
events.append(Event("normal", reasoning.strip()))
|
||||
if final.strip():
|
||||
events.append(Event("normal", final.strip()))
|
||||
return events, ""
|
||||
|
||||
# If assistantfinal appears to be incomplete (e.g., 'assistantfin'), hold entire buffer
|
||||
if re.search(
|
||||
r"(?:^|\s)(?:assistant)?\s*(analysis|commentary)", text, re.IGNORECASE
|
||||
):
|
||||
low = text.lower()
|
||||
if "assistantfin" in low and "assistantfinal" not in low:
|
||||
return events, text
|
||||
|
||||
m = self.patterns["final_only"].match(text)
|
||||
if m:
|
||||
final = m.group(1)
|
||||
if final.strip():
|
||||
events.append(Event("normal", final.strip()))
|
||||
return events, ""
|
||||
|
||||
m = self.patterns["analysis_only"].match(text)
|
||||
if m:
|
||||
channel, content = m.groups()
|
||||
emit, hold = prefix_hold(content, ["assistantfinal"])
|
||||
if channel.lower() == "analysis" and emit:
|
||||
# Stream reasoning content as-is based on structural markers only.
|
||||
events.append(Event("reasoning", emit))
|
||||
# Keep the channel header in the remaining buffer to continue parsing
|
||||
# subsequent chunks in the text fallback format. Preserve any held
|
||||
# prefix that may complete into "assistantfinal".
|
||||
if hold:
|
||||
return events, text[: m.start(2)] + hold
|
||||
else:
|
||||
return events, channel
|
||||
elif channel.lower() == "commentary" and emit:
|
||||
# For commentary, stream as normal text. Preserve spaces unless holding.
|
||||
content_out = emit if hold else emit.strip()
|
||||
events.append(Event("normal", content_out))
|
||||
if hold:
|
||||
return events, text[: m.start(2)] + hold
|
||||
else:
|
||||
return events, ""
|
||||
# If no emit, just return the held content
|
||||
return events, text[: m.start(2)] + hold
|
||||
|
||||
emit, hold = prefix_hold(text, ["analysis", "commentary", "assistantfinal"])
|
||||
if emit:
|
||||
events.append(Event("normal", emit))
|
||||
return events, hold
|
||||
|
||||
|
||||
class HarmonyParser:
|
||||
"""Facade for parsing Harmony format, switching between strategies."""
|
||||
|
||||
def __init__(self):
|
||||
self.strategy = None
|
||||
self._buffer = ""
|
||||
self._should_filter_commentary = (
|
||||
False # Track if we should filter commentary in next chunks
|
||||
)
|
||||
self._partial_commentary = (
|
||||
"" # Track partial commentary being built across chunks
|
||||
)
|
||||
|
||||
def parse(self, chunk: str) -> List[Event]:
|
||||
self._buffer += chunk
|
||||
|
||||
if self.strategy is None:
|
||||
if "<|channel|>" in self._buffer or "<|start|>" in self._buffer:
|
||||
self.strategy = CanonicalStrategy()
|
||||
elif re.search(
|
||||
r"(?:^|\s)(?:assistant)?\s*(analysis|commentary|assistantfinal)",
|
||||
self._buffer,
|
||||
re.IGNORECASE,
|
||||
):
|
||||
self.strategy = TextStrategy()
|
||||
else:
|
||||
# Not yet determined, hold
|
||||
return []
|
||||
|
||||
if hasattr(self.strategy, "set_buffer_context"):
|
||||
# Provide full buffer context to strategy for smarter whitespace handling
|
||||
self.strategy.set_buffer_context(self._buffer)
|
||||
|
||||
events, remaining = self.strategy.parse(self._buffer)
|
||||
|
||||
# Check if we should start filtering commentary (after <|call|> token or tool_call event)
|
||||
buffer_has_call_token = self._buffer.rstrip().endswith("<|call|>")
|
||||
|
||||
self._buffer = remaining
|
||||
|
||||
# Filter events for streaming case
|
||||
filtered_events = []
|
||||
for event in events:
|
||||
should_filter = False
|
||||
|
||||
if event.event_type == "normal":
|
||||
# Check if we're in a commentary filtering state
|
||||
if self._should_filter_commentary or self._partial_commentary:
|
||||
# Try to build partial commentary
|
||||
potential_commentary = (
|
||||
self._partial_commentary + event.content.strip().lower()
|
||||
)
|
||||
|
||||
if potential_commentary == "commentary":
|
||||
# Complete commentary found - filter it
|
||||
should_filter = True
|
||||
self._partial_commentary = "" # Reset
|
||||
self._should_filter_commentary = False # Done filtering
|
||||
elif "commentary".startswith(potential_commentary):
|
||||
# Partial match - accumulate and filter this chunk
|
||||
should_filter = True
|
||||
self._partial_commentary = potential_commentary
|
||||
else:
|
||||
# Not commentary - reset and keep the event
|
||||
self._partial_commentary = ""
|
||||
self._should_filter_commentary = False
|
||||
else:
|
||||
# Not in commentary filtering state - reset partial state
|
||||
self._partial_commentary = ""
|
||||
|
||||
if should_filter:
|
||||
# Skip this commentary filler
|
||||
continue
|
||||
|
||||
# Update filtering state based on events and buffer state
|
||||
if event.event_type == "tool_call":
|
||||
self._should_filter_commentary = (
|
||||
True # Filter commentary after tool calls
|
||||
)
|
||||
self._partial_commentary = "" # Reset on tool call
|
||||
elif buffer_has_call_token:
|
||||
self._should_filter_commentary = (
|
||||
True # Filter commentary after <|call|> token
|
||||
)
|
||||
|
||||
filtered_events.append(event)
|
||||
|
||||
return filtered_events
|
||||
@@ -106,6 +106,8 @@ class DetokenizerManager:
|
||||
]
|
||||
)
|
||||
|
||||
self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
|
||||
|
||||
def event_loop(self):
|
||||
"""The event loop that handles requests"""
|
||||
while True:
|
||||
@@ -133,6 +135,9 @@ class DetokenizerManager:
|
||||
|
||||
# Trim stop token.
|
||||
if isinstance(matched, int) and isinstance(output, list):
|
||||
# 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
|
||||
if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
|
||||
return output
|
||||
assert len(output) > 0
|
||||
return output[:-1]
|
||||
return output
|
||||
|
||||
@@ -1,13 +1,19 @@
|
||||
import re
|
||||
from typing import Dict, Optional, Tuple, Type
|
||||
|
||||
from sglang.srt.harmony_parser import HarmonyParser
|
||||
|
||||
|
||||
class StreamingParseResult:
|
||||
"""Result of streaming incremental parsing."""
|
||||
|
||||
def __init__(self, normal_text: str = "", reasoning_text: str = ""):
|
||||
self.normal_text = normal_text
|
||||
self.reasoning_text = reasoning_text
|
||||
def __init__(
|
||||
self,
|
||||
normal_text: Optional[str] = None,
|
||||
reasoning_text: Optional[str] = None,
|
||||
):
|
||||
self.normal_text = normal_text or ""
|
||||
self.reasoning_text = reasoning_text or ""
|
||||
|
||||
|
||||
class BaseReasoningFormatDetector:
|
||||
@@ -188,316 +194,60 @@ class KimiDetector(BaseReasoningFormatDetector):
|
||||
|
||||
class GptOssDetector(BaseReasoningFormatDetector):
|
||||
"""
|
||||
Detector for T4-style reasoning format.
|
||||
|
||||
Assumes reasoning format with two channels:
|
||||
<|channel|>analysis<|message|>...reasoning content...<|end|>
|
||||
<|start|>assistant<|channel|>final<|message|>...final answer...<|return|>
|
||||
|
||||
Returns content from 'analysis' channel as reasoning_text
|
||||
and content from 'final' channel as normal_text.
|
||||
|
||||
Args:
|
||||
stream_reasoning (bool): If False, accumulates reasoning content until complete.
|
||||
If True, streams reasoning content as it arrives.
|
||||
Detector for T4-style reasoning format (GPT-OSS), using the HarmonyParser.
|
||||
"""
|
||||
|
||||
def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
|
||||
# TypeScript uses channel tokens instead of simple start/end tokens
|
||||
super().__init__(
|
||||
"<|channel|>analysis<|message|>",
|
||||
"<|end|>",
|
||||
force_reasoning=True,
|
||||
force_reasoning=force_reasoning,
|
||||
stream_reasoning=stream_reasoning,
|
||||
)
|
||||
self.final_channel_start = "<|start|>assistant<|channel|>final<|message|>"
|
||||
self.final_channel_end = "<|return|>"
|
||||
self._in_final_channel = False
|
||||
self._analysis_complete = False
|
||||
self._in_reasoning = True
|
||||
self.parser = HarmonyParser()
|
||||
|
||||
def detect_and_parse(self, text: str) -> StreamingParseResult:
|
||||
"""
|
||||
One-time parsing: Detects and parses both analysis and final channels.
|
||||
Tool call channels are preserved in normal_text for downstream processing.
|
||||
events = self.parser.parse(text)
|
||||
# Flush the buffer for one-shot parsing
|
||||
events += self.parser.parse("")
|
||||
|
||||
HACK: Also handles simplified format where text starts with "analysis" and transitions
|
||||
to "assistantfinal" without full channel markers.
|
||||
"""
|
||||
# HACK: Handle simplified format (analysis...assistantfinal) without channel markers
|
||||
if (
|
||||
text.startswith("analysis")
|
||||
and "assistantfinal" in text
|
||||
and "<|channel|>" not in text
|
||||
):
|
||||
# Split on "assistantfinal"
|
||||
parts = text.split("assistantfinal", 1)
|
||||
self._in_reasoning = False
|
||||
if len(parts) == 2:
|
||||
reasoning_text = parts[0][
|
||||
len("analysis") :
|
||||
].strip() # Remove "analysis" prefix
|
||||
normal_text = parts[1].strip()
|
||||
return StreamingParseResult(
|
||||
normal_text=normal_text, reasoning_text=reasoning_text
|
||||
)
|
||||
|
||||
reasoning_parts = []
|
||||
normal_parts = []
|
||||
current_pos = 0
|
||||
|
||||
# Process text sequentially to preserve tool calls between analysis sections
|
||||
while current_pos < len(text):
|
||||
# Look for next analysis channel
|
||||
analysis_start_idx = text.find(self.think_start_token, current_pos)
|
||||
|
||||
if analysis_start_idx == -1:
|
||||
# No more analysis channels, rest goes to remaining
|
||||
break
|
||||
|
||||
# Preserve any content before this analysis channel (could include tool calls)
|
||||
if analysis_start_idx > current_pos:
|
||||
between_content = text[current_pos:analysis_start_idx]
|
||||
# This content will be added to normal_parts later
|
||||
normal_parts.append(between_content)
|
||||
|
||||
# Extract analysis content
|
||||
analysis_content_start = analysis_start_idx + len(self.think_start_token)
|
||||
analysis_end_idx = text.find(self.think_end_token, analysis_content_start)
|
||||
|
||||
if analysis_end_idx != -1:
|
||||
reasoning_parts.append(
|
||||
text[analysis_content_start:analysis_end_idx].strip()
|
||||
)
|
||||
current_pos = analysis_end_idx + len(self.think_end_token)
|
||||
else:
|
||||
# Analysis not complete
|
||||
reasoning_parts.append(text[analysis_content_start:].strip())
|
||||
reasoning_text = "".join(reasoning_parts)
|
||||
return StreamingParseResult(reasoning_text=reasoning_text)
|
||||
|
||||
# Add any remaining text after all analysis sections
|
||||
if current_pos < len(text):
|
||||
remaining = text[current_pos:]
|
||||
normal_parts.append(remaining)
|
||||
|
||||
# Process non-analysis content for commentary sections
|
||||
full_normal_text = "".join(normal_parts)
|
||||
|
||||
# Extract reasoning from non-tool-call commentary sections
|
||||
# Tool calls have "to=" in their header, regular commentary does not
|
||||
commentary_pattern = re.compile(
|
||||
r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
|
||||
re.DOTALL,
|
||||
reasoning_text = "".join(
|
||||
[e.content for e in events if e.event_type == "reasoning"]
|
||||
)
|
||||
|
||||
cleaned_text = full_normal_text
|
||||
for match in reversed(list(commentary_pattern.finditer(full_normal_text))):
|
||||
# Check if this commentary is a tool call by looking at the text before <|message|>
|
||||
match_start = match.start()
|
||||
# Find where "<|channel|>commentary" starts within the matched pattern
|
||||
# The pattern starts with "<|start|>assistant<|channel|>commentary"
|
||||
# So we look for the text between "commentary" and "<|message|>" in the match
|
||||
match_text = full_normal_text[match_start : match.end()]
|
||||
commentary_idx = match_text.find("<|channel|>commentary")
|
||||
if commentary_idx != -1:
|
||||
message_idx = match_text.find("<|message|>", commentary_idx)
|
||||
if message_idx != -1:
|
||||
between_text = match_text[commentary_idx:message_idx]
|
||||
# If no "to=" found, this is regular commentary (reasoning content)
|
||||
if " to=" not in between_text:
|
||||
content = match.group(1).strip()
|
||||
reasoning_parts.append(content)
|
||||
# Remove this commentary section from normal text
|
||||
cleaned_text = (
|
||||
cleaned_text[: match.start()] + cleaned_text[match.end() :]
|
||||
)
|
||||
|
||||
full_normal_text = cleaned_text
|
||||
|
||||
# Combine all reasoning parts
|
||||
reasoning_text = "".join(reasoning_parts)
|
||||
|
||||
# Process full_normal_text for final output
|
||||
normal_text = ""
|
||||
if self.final_channel_start in full_normal_text:
|
||||
final_start = full_normal_text.find(self.final_channel_start)
|
||||
final_content_start = final_start + len(self.final_channel_start)
|
||||
final_end = full_normal_text.find(
|
||||
self.final_channel_end, final_content_start
|
||||
)
|
||||
|
||||
if final_end != -1:
|
||||
# Extract content before final channel (includes tool calls)
|
||||
before_final = full_normal_text[:final_start].strip()
|
||||
# Extract ONLY the final channel content (not the channel markers)
|
||||
final_text = full_normal_text[final_content_start:final_end].strip()
|
||||
# Extract content after final channel
|
||||
after_final = full_normal_text[
|
||||
final_end + len(self.final_channel_end) :
|
||||
].strip()
|
||||
|
||||
# For tool calls + final answer: concatenate tool calls with final text
|
||||
parts = []
|
||||
if before_final:
|
||||
parts.append(before_final)
|
||||
if final_text:
|
||||
parts.append(final_text)
|
||||
if after_final:
|
||||
parts.append(after_final)
|
||||
normal_text = " ".join(parts)
|
||||
else:
|
||||
# Final channel not complete - extract what we have
|
||||
# Look for just <|channel|>final<|message|> without <|return|>
|
||||
alt_final_start = full_normal_text.find("<|channel|>final<|message|>")
|
||||
if alt_final_start != -1:
|
||||
before_alt_final = full_normal_text[:alt_final_start].strip()
|
||||
alt_final_content = full_normal_text[
|
||||
alt_final_start + len("<|channel|>final<|message|>") :
|
||||
].strip()
|
||||
|
||||
parts = []
|
||||
if before_alt_final:
|
||||
parts.append(before_alt_final)
|
||||
if alt_final_content:
|
||||
parts.append(alt_final_content)
|
||||
normal_text = " ".join(parts)
|
||||
else:
|
||||
normal_text = full_normal_text.strip()
|
||||
else:
|
||||
# No final channel, treat all as normal text (includes tool calls)
|
||||
normal_text = full_normal_text.strip()
|
||||
normal_parts = []
|
||||
for e in events:
|
||||
if e.event_type == "normal":
|
||||
normal_parts.append(e.content)
|
||||
elif e.event_type == "tool_call":
|
||||
# Use raw_text to preserve structural markers for function call detector
|
||||
normal_parts.append(e.raw_text if e.raw_text else e.content)
|
||||
normal_text = "".join(normal_parts)
|
||||
# Tool call events preserve raw text with structural markers
|
||||
|
||||
return StreamingParseResult(
|
||||
normal_text=normal_text, reasoning_text=reasoning_text
|
||||
normal_text=normal_text,
|
||||
reasoning_text=reasoning_text,
|
||||
)
|
||||
|
||||
def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
|
||||
"""
|
||||
Streaming incremental parsing for GPT-OSS format.
|
||||
events = self.parser.parse(new_text)
|
||||
|
||||
This is a simplified streaming implementation that accumulates content
|
||||
and delegates to the non-streaming parser for complex multi-channel parsing.
|
||||
TODO: Implement proper incremental parsing for better streaming performance.
|
||||
"""
|
||||
self._buffer += new_text
|
||||
|
||||
if not self._in_reasoning:
|
||||
return StreamingParseResult(normal_text=new_text)
|
||||
|
||||
# Check if we have complete sections to process
|
||||
# For GPT-OSS, we need to wait for complete channel sections
|
||||
# HACK: For now, use simplified approach - wait for key markers before processing
|
||||
key_markers = ["<|end|>", "<|call|>", "<|return|>", "assistantfinal"]
|
||||
has_complete_section = any(marker in self._buffer for marker in key_markers)
|
||||
|
||||
if not has_complete_section:
|
||||
# Still accumulating, don't process yet
|
||||
return StreamingParseResult()
|
||||
|
||||
# Handle simplified format (analysis...assistantfinal) with true incremental streaming
|
||||
if (
|
||||
"<|channel|>" not in self._buffer
|
||||
): # Simplified format without channel markers
|
||||
if self._buffer.startswith("analysis"):
|
||||
# Check if we have the transition to assistantfinal
|
||||
if "assistantfinal" in self._buffer:
|
||||
self._in_reasoning = False
|
||||
# Complete reasoning section - extract and stream it
|
||||
parts = self._buffer.split("assistantfinal", 1)
|
||||
reasoning_text = parts[0][len("analysis") :].strip()
|
||||
final_content = parts[1].strip()
|
||||
|
||||
# Clear buffer and return both reasoning and final content
|
||||
self._buffer = ""
|
||||
return StreamingParseResult(
|
||||
reasoning_text=reasoning_text if self.stream_reasoning else "",
|
||||
normal_text=final_content,
|
||||
)
|
||||
elif self.stream_reasoning:
|
||||
# Stream reasoning content incrementally as it arrives
|
||||
current_reasoning = self._buffer[len("analysis") :].strip()
|
||||
self._buffer = ""
|
||||
return StreamingParseResult(reasoning_text=current_reasoning)
|
||||
else:
|
||||
# Wait for assistantfinal
|
||||
return StreamingParseResult()
|
||||
elif self._buffer.startswith("assistantfinal"):
|
||||
# Direct final content without analysis
|
||||
final_content = self._buffer[len("assistantfinal") :].strip()
|
||||
self._buffer = ""
|
||||
return StreamingParseResult(normal_text=final_content)
|
||||
|
||||
# For full channel format, process sections as they complete
|
||||
result = StreamingParseResult()
|
||||
|
||||
# Process complete analysis sections
|
||||
while (
|
||||
self.think_start_token in self._buffer
|
||||
and self.think_end_token in self._buffer
|
||||
):
|
||||
start_idx = self._buffer.find(self.think_start_token)
|
||||
start_pos = start_idx + len(self.think_start_token)
|
||||
end_pos = self._buffer.find(self.think_end_token, start_pos)
|
||||
|
||||
if end_pos != -1:
|
||||
reasoning_content = self._buffer[start_pos:end_pos].strip()
|
||||
if self.stream_reasoning and reasoning_content:
|
||||
result.reasoning_text += reasoning_content
|
||||
|
||||
# Remove processed analysis section
|
||||
self._buffer = (
|
||||
self._buffer[:start_idx]
|
||||
+ self._buffer[end_pos + len(self.think_end_token) :]
|
||||
)
|
||||
else:
|
||||
break
|
||||
|
||||
# Process complete commentary sections
|
||||
commentary_pattern = re.compile(
|
||||
r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
|
||||
re.DOTALL,
|
||||
reasoning_text = "".join(
|
||||
[e.content for e in events if e.event_type == "reasoning"]
|
||||
)
|
||||
normal_parts = []
|
||||
for e in events:
|
||||
if e.event_type == "normal":
|
||||
normal_parts.append(e.content)
|
||||
elif e.event_type == "tool_call":
|
||||
# Use raw_text to preserve structural markers for function call detector
|
||||
normal_parts.append(e.raw_text if e.raw_text else e.content)
|
||||
normal_text = "".join(normal_parts)
|
||||
|
||||
for match in reversed(list(commentary_pattern.finditer(self._buffer))):
|
||||
# Check if this is a tool call
|
||||
start_pos = match.start()
|
||||
commentary_content = match.group(1).strip()
|
||||
if self.stream_reasoning and commentary_content:
|
||||
result.reasoning_text += commentary_content
|
||||
|
||||
# Remove this commentary section
|
||||
self._buffer = self._buffer[: match.start()] + self._buffer[match.end() :]
|
||||
# Clean up any standalone <|start|>assistant
|
||||
self._buffer = re.sub(
|
||||
r"<\|start\|>assistant(?=<\|start\|>assistant)", "", self._buffer
|
||||
)
|
||||
|
||||
# Handle final channel completion
|
||||
if self.final_channel_start in self._buffer:
|
||||
final_start = self._buffer.find(self.final_channel_start)
|
||||
final_content_start = final_start + len(self.final_channel_start)
|
||||
|
||||
# Check if final channel is complete
|
||||
final_end = self._buffer.find(self.final_channel_end, final_content_start)
|
||||
if final_end != -1:
|
||||
# Complete final channel - process everything
|
||||
final_result = self.detect_and_parse(self._buffer)
|
||||
self._buffer = ""
|
||||
return StreamingParseResult(
|
||||
normal_text=final_result.normal_text,
|
||||
reasoning_text=result.reasoning_text + final_result.reasoning_text,
|
||||
)
|
||||
else:
|
||||
# Extract content before final channel (e.g. tool calls)
|
||||
before_final = self._buffer[:final_start]
|
||||
if before_final:
|
||||
# Output tool calls for processing
|
||||
result.normal_text += before_final
|
||||
# Keep the final channel part in buffer
|
||||
self._buffer = self._buffer[final_start:]
|
||||
|
||||
return result
|
||||
return StreamingParseResult(
|
||||
normal_text=normal_text,
|
||||
reasoning_text=reasoning_text,
|
||||
)
|
||||
|
||||
|
||||
class ReasoningParser:
|
||||
@@ -526,7 +276,7 @@ class ReasoningParser:
|
||||
self,
|
||||
model_type: Optional[str] = None,
|
||||
stream_reasoning: bool = True,
|
||||
force_reasoning: bool = False,
|
||||
force_reasoning: Optional[bool] = None,
|
||||
):
|
||||
if not model_type:
|
||||
raise ValueError("Model type must be specified")
|
||||
@@ -535,19 +285,25 @@ class ReasoningParser:
|
||||
if not detector_class:
|
||||
raise ValueError(f"Unsupported model type: {model_type}")
|
||||
|
||||
if model_type.lower() == "qwen3-thinking":
|
||||
# Special cases where we override force_reasoning
|
||||
if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
|
||||
force_reasoning = True
|
||||
|
||||
self.detector = detector_class(
|
||||
stream_reasoning=stream_reasoning, force_reasoning=force_reasoning
|
||||
)
|
||||
# Only pass force_reasoning if explicitly set, let detectors use their defaults
|
||||
kwargs = {"stream_reasoning": stream_reasoning}
|
||||
if force_reasoning is not None:
|
||||
kwargs["force_reasoning"] = force_reasoning
|
||||
|
||||
def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
|
||||
self.detector = detector_class(**kwargs)
|
||||
|
||||
def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Non-streaming call: one-time parsing"""
|
||||
ret = self.detector.detect_and_parse(full_text)
|
||||
return ret.reasoning_text, ret.normal_text
|
||||
|
||||
def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, str]:
|
||||
def parse_stream_chunk(
|
||||
self, chunk_text: str
|
||||
) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Streaming call: incremental parsing"""
|
||||
ret = self.detector.parse_streaming_increment(chunk_text)
|
||||
return ret.reasoning_text, ret.normal_text
|
||||
|
||||
@@ -2271,6 +2271,7 @@ class ServerArgs:
|
||||
if is_mxfp4_quant_format:
|
||||
# use bf16 for mxfp4 triton kernels
|
||||
self.dtype = "bfloat16"
|
||||
|
||||
elif "Llama4" in model_arch:
|
||||
assert self.attention_backend in {
|
||||
"fa3",
|
||||
|
||||
Reference in New Issue
Block a user