refactor: centralize CoT parsing in backend for streaming mode (#16394)

* refactor: unify reasoning handling via backend reasoning_content, drop frontend tag parsing

- Updated the chat message component to surface backend-supplied reasoning via message.thinking while showing the raw assistant content without inline tag scrubbing
- Simplified chat streaming to append content chunks directly, stream reasoning into the message model, and persist any partial reasoning when generation stops (see the sketch after this list)
- Refactored the chat service SSE handler to rely on server-provided reasoning_content, removing legacy <think> parsing logic
- Refreshed Storybook data and streaming flows to populate the thinking field explicitly for static and streaming assistant messages
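
In streaming mode the client no longer inspects tags at all. A minimal sketch of the resulting flow, assuming hypothetical streamChatCompletion and persistMessage helpers (the onChunk/onComplete callback shapes match the service code in the diff below):

// Minimal sketch, not the actual webui code (streamChatCompletion and
// persistMessage are invented names): content and reasoning arrive already
// separated by the backend, so the client only appends.
const message = { content: '', thinking: '' };

streamChatCompletion(request, {
  onChunk: (content: string) => {
    message.content += content; // raw content, no <think> scrubbing needed
  },
  onReasoningChunk: (reasoning: string) => {
    message.thinking += reasoning; // backend-supplied reasoning_content
  },
  onComplete: (content: string, reasoning?: string) => {
    persistMessage({ ...message, content, thinking: reasoning ?? message.thinking });
  }
});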

* refactor: implement streaming-aware universal reasoning parser

Remove the streaming mode limitation from --reasoning-format by refactoring
try_parse_reasoning() to handle incremental parsing of <think> tags across
all formats.

- Rework try_parse_reasoning() to track whitespace, partial tags, and
  multiple reasoning segments, allowing proper separation of reasoning_content
  and content in streaming mode
- Parse reasoning tags before tool call handling in content-only and Llama 3.x
  formats to ensure inline <think> blocks are captured correctly
- Change default reasoning_format from 'auto' to 'deepseek' for consistent
  behavior
- Add 'deepseek-legacy' option to preserve old inline behavior when needed
- Update CLI help and documentation to reflect streaming support
- Add parser tests for inline <think>...</think> segments

The parser now continues processing content after </think> closes instead of
stopping, enabling proper message.reasoning_content and message.content
separation in both streaming and non-streaming modes.

Fixes the issue where streaming responses would dump everything (including
post-thinking content) into reasoning_content while leaving content empty.
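
To make the incremental behavior concrete, here is an illustrative TypeScript model of the approach. The real implementation is the C++ try_parse_reasoning(); the class and helper names below are invented for the sketch. The two streaming-specific details are carrying a possible partial tag across chunk boundaries and continuing to route text after </think>:

// Illustrative sketch only, not the actual C++ parser. A chunk may end in the
// middle of a tag, so a partial-tag suffix is held back; parsing continues
// after </think> so later text lands in content, not reasoning_content.
class IncrementalThinkParser {
  private insideThink = false;
  private pending = ''; // possible partial tag carried across chunks

  feed(chunk: string): { reasoning: string; content: string } {
    let buf = this.pending + chunk;
    this.pending = '';
    let reasoning = '';
    let content = '';

    while (buf.length > 0) {
      const tag = this.insideThink ? '</think>' : '<think>';
      const idx = buf.indexOf(tag);

      if (idx !== -1) {
        // Text before the tag belongs to the current mode; then flip modes.
        if (this.insideThink) reasoning += buf.slice(0, idx);
        else content += buf.slice(0, idx);
        this.insideThink = !this.insideThink;
        buf = buf.slice(idx + tag.length);
        continue;
      }

      // No complete tag: hold back any suffix that could be the start of one.
      const keep = this.partialTagSuffix(buf, tag);
      const emit = buf.slice(0, buf.length - keep);
      if (this.insideThink) reasoning += emit;
      else content += emit;
      this.pending = buf.slice(buf.length - keep);
      break;
    }

    return { reasoning, content };
  }

  // Length of the longest suffix of buf that is a proper prefix of tag.
  private partialTagSuffix(buf: string, tag: string): number {
    for (let len = Math.min(buf.length, tag.length - 1); len > 0; len--) {
      if (buf.endsWith(tag.slice(0, len))) return len;
    }
    return 0;
  }
}

Feeding '<thi' and then 'nk>plan</think>answer' through feed() yields reasoning 'plan' and content 'answer', mirroring the reasoning_content/content split described above.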

* refactor: address review feedback from allozaur

- Passed the assistant message content directly to ChatMessageAssistant to drop the redundant derived state in the chat message component
- Simplified chat streaming updates by removing unused partial-thinking handling and persisting partial responses straight from currentResponse
- Refreshed the ChatMessage stories to cover standard and reasoning scenarios without the old THINK-tag parsing examples

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>

* refactor: restore forced reasoning prefix to pass test-chat ([chat] All tests passed)

- Store the exact sequence seen on input when 'thinking_forced_open' enforces a reasoning block
- Inject this prefix before the first accumulated segment in 'reasoning_content', then clear it to avoid duplication
- Repeat the capture on every new 'start_think' detection to properly handle partial/streaming flows (see the sketch after this list)
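
A compact sketch of that logic, with invented names (the actual change lives in the C++ chat parser): the literal opening sequence is captured whenever a reasoning block starts, injected once in front of the first accumulated segment, and cleared so later segments do not repeat it.

// Hedged sketch, not the real parser code.
let forcedOpenPrefix = ''; // exact sequence seen on input, e.g. '<think>\n'

// Called whenever a start_think tag is detected; re-captured each time so
// partial/streaming flows keep working.
function captureStartThink(consumedText: string, thinkingForcedOpen: boolean): void {
  if (thinkingForcedOpen) forcedOpenPrefix = consumedText;
}

// Called when a reasoning segment is accumulated: the prefix is injected
// before the first segment only, then cleared to avoid duplication.
function appendReasoningSegment(msg: { reasoning_content: string }, segment: string): void {
  msg.reasoning_content += forcedOpenPrefix + segment;
  forcedOpenPrefix = '';
}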

* refactor: address review feedback from ngxson

* debug: say goodbye to curl -N, hello one-click raw stream

- Adds a new checkbox in the WebUI to display raw LLM output without backend parsing or frontend Markdown rendering (request-side wiring sketched below)
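
Condensed from the diff below, the request-side effect of the checkbox is a single parameter switch; disableReasoningFormat is the config field used there:

// When the raw-output checkbox is ticked, ask the server to skip reasoning
// parsing entirely; otherwise keep the default auto-detection.
requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';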

* Update tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>

* webui: add Storybook example for raw LLM output and scope reasoning format toggle per story

- Added a Storybook example that showcases the chat message component in raw LLM output mode with the provided trace sample
- Updated every ChatMessage story to toggle the disableReasoningFormat setting so the raw-output rendering remains scoped to its own example

* npm run format

* chat-parser: address review feedback from ngxson

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
Author: Pascal
Date: 2025-10-08 22:18:41 +02:00
Committed by: GitHub
Parent: 9d0882840e
Commit: 12bbc3fa50
14 changed files with 276 additions and 431 deletions


@@ -78,6 +78,8 @@ export class ChatService {
       timings_per_token
     } = options;
 
+    const currentConfig = config();
+
     // Cancel any ongoing request and create a new abort controller
     this.abort();
     this.abortController = new AbortController();
@@ -117,7 +119,7 @@ export class ChatService {
       stream
     };
 
-    requestBody.reasoning_format = 'auto';
+    requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';
 
     if (temperature !== undefined) requestBody.temperature = temperature;
 
     // Set max_tokens to -1 (infinite) if not provided or empty
@@ -161,7 +163,6 @@ export class ChatService {
     }
 
     try {
-      const currentConfig = config();
       const apiKey = currentConfig.apiKey?.toString().trim();
 
       const response = await fetch(`./v1/chat/completions`, {
@@ -256,10 +257,8 @@ export class ChatService {
     }
 
     const decoder = new TextDecoder();
-    let fullResponse = '';
+    let aggregatedContent = '';
     let fullReasoningContent = '';
-    let regularContent = '';
-    let insideThinkTag = false;
     let hasReceivedData = false;
     let lastTimings: ChatMessageTimings | undefined;
@@ -277,7 +276,7 @@ export class ChatService {
         if (line.startsWith('data: ')) {
           const data = line.slice(6);
 
           if (data === '[DONE]') {
-            if (!hasReceivedData && fullResponse.length === 0) {
+            if (!hasReceivedData && aggregatedContent.length === 0) {
              const contextError = new Error(
                'The request exceeds the available context size. Try increasing the context size or enable context shift.'
              );
@@ -286,7 +285,7 @@ export class ChatService {
              return;
            }
 
-            onComplete?.(regularContent, fullReasoningContent || undefined, lastTimings);
+            onComplete?.(aggregatedContent, fullReasoningContent || undefined, lastTimings);
 
            return;
          }
@@ -310,27 +309,8 @@ export class ChatService {
           if (content) {
             hasReceivedData = true;
-            fullResponse += content;
-
-            // Track the regular content before processing this chunk
-            const regularContentBefore = regularContent;
-
-            // Process content character by character to handle think tags
-            insideThinkTag = this.processContentForThinkTags(
-              content,
-              insideThinkTag,
-              () => {
-                // Think content is ignored - we don't include it in API requests
-              },
-              (regularChunk) => {
-                regularContent += regularChunk;
-              }
-            );
-
-            const newRegularContent = regularContent.slice(regularContentBefore.length);
-
-            if (newRegularContent) {
-              onChunk?.(newRegularContent);
-            }
+            aggregatedContent += content;
+            onChunk?.(content);
           }
 
           if (reasoningContent) {
@@ -345,7 +325,7 @@ export class ChatService {
         }
       }
 
-      if (!hasReceivedData && fullResponse.length === 0) {
+      if (!hasReceivedData && aggregatedContent.length === 0) {
        const contextError = new Error(
          'The request exceeds the available context size. Try increasing the context size or enable context shift.'
        );
@@ -552,51 +532,6 @@ export class ChatService {
     }
   }
 
-  /**
-   * Processes content to separate thinking tags from regular content.
-   * Parses <think> and </think> tags to route content to appropriate handlers.
-   *
-   * @param content - The content string to process
-   * @param currentInsideThinkTag - Current state of whether we're inside a think tag
-   * @param addThinkContent - Callback to handle content inside think tags
-   * @param addRegularContent - Callback to handle regular content outside think tags
-   * @returns Boolean indicating if we're still inside a think tag after processing
-   * @private
-   */
-  private processContentForThinkTags(
-    content: string,
-    currentInsideThinkTag: boolean,
-    addThinkContent: (chunk: string) => void,
-    addRegularContent: (chunk: string) => void
-  ): boolean {
-    let i = 0;
-    let insideThinkTag = currentInsideThinkTag;
-
-    while (i < content.length) {
-      if (!insideThinkTag && content.substring(i, i + 7) === '<think>') {
-        insideThinkTag = true;
-        i += 7; // Skip the <think> tag
-        continue;
-      }
-
-      if (insideThinkTag && content.substring(i, i + 8) === '</think>') {
-        insideThinkTag = false;
-        i += 8; // Skip the </think> tag
-        continue;
-      }
-
-      if (insideThinkTag) {
-        addThinkContent(content[i]);
-      } else {
-        addRegularContent(content[i]);
-      }
-
-      i++;
-    }
-
-    return insideThinkTag;
-  }
-
   /**
    * Aborts any ongoing chat completion request.
    * Cancels the current request and cleans up the abort controller.