diff --git a/docs/backend/structured_outputs_for_reasoning_models.ipynb b/docs/backend/structured_outputs_for_reasoning_models.ipynb new file mode 100644 index 000000000..447e5a7cc --- /dev/null +++ b/docs/backend/structured_outputs_for_reasoning_models.ipynb @@ -0,0 +1,833 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Structured Outputs For Reasoning Models\n", + "\n", + "When working with reasoning models that use special tokens like `<think>...</think>` to denote reasoning sections, you might want to allow free-form text within these sections while still enforcing grammar constraints on the rest of the output.\n", + "\n", + "SGLang provides a feature to disable grammar restrictions within reasoning sections. This is particularly useful for models that need to perform complex reasoning steps before providing a structured output.\n", + "\n", + "To enable this feature, specify the reasoning parser with the `--reasoning-parser` flag when launching the server. The chosen parser determines the think_end_token, such as `</think>`.\n", + "\n", + "## Supported Models\n", + "\n", + "Currently, SGLang supports the following reasoning models:\n", + "- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.\n", + "- [QwQ](https://huggingface.co/Qwen/QwQ-32B): The reasoning content is wrapped with `<think>` and `</think>` tags.\n", + "\n", + "\n", + "## Usage\n", + "\n", + "## OpenAI Compatible API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the `--grammar-backend` and `--reasoning-parser` options." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "import os\n", + "from sglang.test.test_utils import is_in_ci\n", + "\n", + "if is_in_ci():\n", + " from patch import launch_server_cmd\n", + "else:\n", + " from sglang.utils import launch_server_cmd\n", + "\n", + "from sglang.utils import wait_for_server, print_highlight, terminate_process\n", + "\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "\n", + "\n", + "server_process, port = launch_server_cmd(\n", + " \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n", + ")\n", + "\n", + "wait_for_server(f\"http://localhost:{port}\")\n", + "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### JSON\n", + "\n", + "you can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Using Pydantic**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import BaseModel, Field\n", + "\n", + "\n", + "# Define the schema using Pydantic\n", + "class CapitalInfo(BaseModel):\n", + " name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n", + " population: int = Field(..., description=\"Population of the capital city\")\n", + "\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Please generate the information of the capital of France in the JSON format.\",\n", + " },\n", + " ],\n", + " temperature=0,\n", + " max_tokens=2048,\n", + " response_format={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\n", + " \"name\": \"foo\",\n", + " # convert the pydantic model to json schema\n", + " \"schema\": CapitalInfo.model_json_schema(),\n", + " },\n", + " },\n", + ")\n", + "\n", + "print_highlight(\n", + " f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**JSON Schema Directly**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "json_schema = json.dumps(\n", + " {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n", + " \"population\": {\"type\": \"integer\"},\n", + " },\n", + " \"required\": [\"name\", \"population\"],\n", + " }\n", + ")\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n", + " messages=[\n", + " {\n", + " 
\"role\": \"user\",\n", + " \"content\": \"Give me the information of the capital of France in the JSON format.\",\n", + " },\n", + " ],\n", + " temperature=0,\n", + " max_tokens=2048,\n", + " response_format={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n", + " },\n", + ")\n", + "\n", + "print_highlight(\n", + " f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### EBNF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ebnf_grammar = \"\"\"\n", + "root ::= city | description\n", + "city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n", + "description ::= city \" is \" status\n", + "status ::= \"the capital of \" country\n", + "country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n", + "\"\"\"\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Give me the information of the capital of France.\",\n", + " },\n", + " ],\n", + " temperature=0,\n", + " max_tokens=2048,\n", + " extra_body={\"ebnf\": ebnf_grammar},\n", + ")\n", + "\n", + "print_highlight(\n", + " f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = client.chat.completions.create(\n", + " model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n", + " messages=[\n", + " {\"role\": 
\"user\", \"content\": \"What is the capital of France?\"},\n", + " ],\n", + " temperature=0,\n", + " max_tokens=2048,\n", + " extra_body={\"regex\": \"(Paris|London)\"},\n", + ")\n", + "\n", + "print_highlight(\n", + " f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structural Tag" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tool_get_current_weather = {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"get_current_weather\",\n", + " \"description\": \"Get the current weather in a given location\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"city\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n", + " },\n", + " \"state\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"the two-letter abbreviation for the state that the city is\"\n", + " \" in, e.g. 'CA' which would mean 'California'\",\n", + " },\n", + " \"unit\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The unit to fetch the temperature in\",\n", + " \"enum\": [\"celsius\", \"fahrenheit\"],\n", + " },\n", + " },\n", + " \"required\": [\"city\", \"state\", \"unit\"],\n", + " },\n", + " },\n", + "}\n", + "\n", + "tool_get_current_date = {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"get_current_date\",\n", + " \"description\": \"Get the current date and time for a given timezone\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"timezone\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The timezone to fetch the current date and time for, e.g. 
'America/New_York'\",\n", + " }\n", + " },\n", + " \"required\": [\"timezone\"],\n", + " },\n", + " },\n", + "}\n", + "\n", + "schema_get_current_weather = tool_get_current_weather[\"function\"][\"parameters\"]\n", + "schema_get_current_date = tool_get_current_date[\"function\"][\"parameters\"]\n", + "\n", + "\n", + "def get_messages():\n", + " return [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"\"\"\n", + "# Tool Instructions\n", + "- Always execute python code in messages that you share.\n", + "- When looking for real time information use relevant functions if available else fallback to brave_search\n", + "You have access to the following functions:\n", + "Use the function 'get_current_weather' to: Get the current weather in a given location\n", + "{tool_get_current_weather[\"function\"]}\n", + "Use the function 'get_current_date' to: Get the current date and time for a given timezone\n", + "{tool_get_current_date[\"function\"]}\n", + "If a you choose to call a function ONLY reply in the following format:\n", + "<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}\n", + "where\n", + "start_tag => ` a JSON dict with the function argument name as key and function argument value as value.\n", + "end_tag => ``\n", + "Here is an example,\n", + "{{\"example_name\": \"example_value\"}}\n", + "Reminder:\n", + "- Function calls MUST follow the specified format\n", + "- Required parameters MUST be specified\n", + "- Only call one function at a time\n", + "- Put the entire function call reply on one line\n", + "- Always add your sources when using search results to answer the user query\n", + "You are a helpful assistant.\"\"\",\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"You are in New York. 
Please get the current date and time, and the weather.\",\n", + " },\n", + " ]\n", + "\n", + "\n", + "messages = get_messages()\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n", + " messages=messages,\n", + " response_format={\n", + " \"type\": \"structural_tag\",\n", + " \"max_new_tokens\": 2048,\n", + " \"structures\": [\n", + " {\n", + " \"begin\": \"\",\n", + " \"schema\": schema_get_current_weather,\n", + " \"end\": \"\",\n", + " },\n", + " {\n", + " \"begin\": \"\",\n", + " \"schema\": schema_get_current_date,\n", + " \"end\": \"\",\n", + " },\n", + " ],\n", + " \"triggers\": [\"\")[0]\n", + "content = json.loads(response.json()[\"text\"].split(\"\")[1])\n", + "print_highlight(f\"reasoing_content: {reasoing_content}\\n\\ncontent: {content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**JSON Schema Directly**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "json_schema = json.dumps(\n", + " {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n", + " \"population\": {\"type\": \"integer\"},\n", + " },\n", + " \"required\": [\"name\", \"population\"],\n", + " }\n", + ")\n", + "\n", + "# JSON\n", + "text = tokenizer.apply_chat_template(text, tokenize=False, add_generation_prompt=True)\n", + "response = requests.post(\n", + " f\"http://localhost:{port}/generate\",\n", + " json={\n", + " \"text\": text,\n", + " \"sampling_params\": {\n", + " \"temperature\": 0,\n", + " \"max_new_tokens\": 2048,\n", + " \"json_schema\": json_schema,\n", + " },\n", + " },\n", + ")\n", + "\n", + "print_highlight(response.json())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### EBNF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = 
requests.post(\n", + " f\"http://localhost:{port}/generate\",\n", + " json={\n", + " \"text\": \"Give me the information of the capital of France.\",\n", + " \"sampling_params\": {\n", + " \"max_new_tokens\": 2048,\n", + " \"temperature\": 0,\n", + " \"n\": 3,\n", + " \"ebnf\": (\n", + " \"root ::= city | description\\n\"\n", + " 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n", + " 'description ::= city \" is \" status\\n'\n", + " 'status ::= \"the capital of \" country\\n'\n", + " 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n", + " ),\n", + " },\n", + " \"stream\": False,\n", + " \"return_logprob\": False,\n", + " },\n", + ")\n", + "\n", + "print(response.json())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = requests.post(\n", + " f\"http://localhost:{port}/generate\",\n", + " json={\n", + " \"text\": \"Paris is the capital of\",\n", + " \"sampling_params\": {\n", + " \"temperature\": 0,\n", + " \"max_new_tokens\": 2048,\n", + " \"regex\": \"(France|England)\",\n", + " },\n", + " },\n", + ")\n", + "print(response.json())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structural Tag" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = tokenizer.apply_chat_template(\n", + " messages, tokenize=False, add_generation_prompt=True\n", + ")\n", + "payload = {\n", + " \"text\": text,\n", + " \"sampling_params\": {\n", + " \"max_new_tokens\": 2048,\n", + " \"structural_tag\": json.dumps(\n", + " {\n", + " \"type\": \"structural_tag\",\n", + " \"structures\": [\n", + " {\n", + " \"begin\": \"\",\n", + " \"schema\": schema_get_current_weather,\n", + " \"end\": \"\",\n", + " },\n", + " {\n", + " \"begin\": \"\",\n", + " \"schema\": schema_get_current_date,\n", + 
" \"end\": \"\",\n", + " },\n", + " ],\n", + " \"triggers\": [\"\",\n", + " \"schema\": schema_get_current_weather,\n", + " \"end\": \"\",\n", + " },\n", + " {\n", + " \"begin\": \"\",\n", + " \"schema\": schema_get_current_date,\n", + " \"end\": \"\",\n", + " },\n", + " ],\n", + " \"triggers\": [\" Optional[Tuple[List[int], str]]: """ @@ -59,6 +71,13 @@ class BaseGrammarObject(ABC): """ raise NotImplementedError + @abstractmethod + def accept_token(self, token: int) -> None: + """ + Accept a token in the grammar. + """ + raise NotImplementedError + @abstractmethod def allocate_vocab_mask( self, vocab_size: int, batch_size: int, device @@ -90,7 +109,7 @@ class CacheEntry: event: Event -class BaseGrammarBackend(ABC): +class BaseGrammarBackend: def __init__(self): self.executor = ThreadPoolExecutor() self.cache: Dict[Tuple[str, str], CacheEntry] = {} @@ -107,19 +126,15 @@ class BaseGrammarBackend(ABC): """ raise ValueError(f"Invalid key_type: {key_type}={key_string}") - @abstractmethod def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]: return self._not_supported("json", key_string) - @abstractmethod def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]: return self._not_supported("regex", key_string) - @abstractmethod def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]: return self._not_supported("ebnf", key_string) - @abstractmethod def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]: return self._not_supported("structural_tag", key_string) @@ -195,4 +210,10 @@ def create_grammar_backend( else: raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}") + if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"): + from .reasoner_grammar_backend import ReasonerGrammarBackend + + grammar_backend = ReasonerGrammarBackend( + grammar_backend, tokenizer.think_end_id + ) return grammar_backend diff --git 
a/python/sglang/srt/constrained/llguidance_backend.py b/python/sglang/srt/constrained/llguidance_backend.py index 24893a49d..6926e1c30 100644 --- a/python/sglang/srt/constrained/llguidance_backend.py +++ b/python/sglang/srt/constrained/llguidance_backend.py @@ -33,6 +33,7 @@ class GuidanceGrammar(BaseGrammarObject): def __init__( self, llguidance_tokenizer: llguidance.LLTokenizer, serialized_grammar: str ): + super().__init__() self.llguidance_tokenizer = llguidance_tokenizer self.serialized_grammar = serialized_grammar diff --git a/python/sglang/srt/constrained/outlines_backend.py b/python/sglang/srt/constrained/outlines_backend.py index 7a05bb2c5..41128108a 100644 --- a/python/sglang/srt/constrained/outlines_backend.py +++ b/python/sglang/srt/constrained/outlines_backend.py @@ -44,6 +44,7 @@ class OutlinesGrammar(BaseGrammarObject): guide: RegexGuide, jump_forward_map: Union[OutlinesJumpForwardMap, None], ) -> None: + super().__init__() self.guide = guide self.jump_forward_map = jump_forward_map self.state = 0 diff --git a/python/sglang/srt/constrained/reasoner_grammar_backend.py b/python/sglang/srt/constrained/reasoner_grammar_backend.py new file mode 100644 index 000000000..3f6f59e5b --- /dev/null +++ b/python/sglang/srt/constrained/reasoner_grammar_backend.py @@ -0,0 +1,101 @@ +# Copyright 2023-2024 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ==============================================================================
"""Grammar backend wrapper for reasoning models.

The classes here wrap another grammar backend so that grammar constraints are
not applied while the model is still inside its reasoning section, and start
being applied once the end-of-thinking token has been accepted.
"""

from concurrent.futures import Future
from typing import List, Optional, Tuple

import torch

from .base_grammar_backend import BaseGrammarBackend, BaseGrammarObject


class ReasonerGrammarObject(BaseGrammarObject):
    """Grammar object that defers to a wrapped grammar only after reasoning ends.

    Every instance starts in the "reasoning" state. In that state
    ``fill_vocab_mask`` leaves the mask untouched and ``accept_token`` does not
    advance the wrapped grammar. The state ends for good the first time
    ``accept_token`` receives ``think_end_id``.

    Args:
        grammar: The grammar object that enforces the real constraint.
        think_end_id: Token id that marks the end of the reasoning section.
    """

    def __init__(self, grammar: BaseGrammarObject, think_end_id: int):
        super().__init__()
        self.grammar = grammar
        self.think_end_id = think_end_id
        # Generation is assumed to begin inside the reasoning section.
        self.is_in_reasoning = True

    @property
    def finished(self):
        """Mirror the ``finished`` flag of the wrapped grammar."""
        return self.grammar.finished

    @finished.setter
    def finished(self, finished):
        self.grammar.finished = finished

    def allocate_vocab_mask(
        self, vocab_size: int, batch_size: int, device
    ) -> torch.Tensor:
        """Allocate the vocabulary mask using the wrapped grammar."""
        return self.grammar.allocate_vocab_mask(vocab_size, batch_size, device)

    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
        """Fill row ``idx`` of ``vocab_mask``, but only once reasoning is over.

        While reasoning, the mask is deliberately left as allocated.
        NOTE(review): this presumes a freshly allocated mask row allows every
        token -- confirm against the wrapped backend's ``allocate_vocab_mask``.
        """
        if self.is_in_reasoning:
            return
        self.grammar.fill_vocab_mask(vocab_mask, idx)

    def move_vocab_mask(self, vocab_mask: torch.Tensor, device) -> torch.Tensor:
        """Move ``vocab_mask`` to ``device`` using the wrapped grammar."""
        return self.grammar.move_vocab_mask(vocab_mask, device)

    @property
    def apply_vocab_mask(self):
        """Return the wrapped grammar's ``apply_vocab_mask`` callable."""
        return self.grammar.apply_vocab_mask

    def accept_token(self, token: int) -> None:
        """Consume one generated token.

        The end-of-thinking token switches the object out of the reasoning
        state and is never forwarded. Any other token is forwarded to the
        wrapped grammar only after reasoning has ended.
        """
        if token == self.think_end_id:
            self.is_in_reasoning = False
        elif not self.is_in_reasoning:
            self.grammar.accept_token(token)

    def try_jump_forward(self, tokenizer):
        """Delegate jump-forward detection to the wrapped grammar.

        NOTE(review): this delegates even while ``is_in_reasoning`` is True;
        confirm callers do not apply a jump-forward inside the reasoning
        section.
        """
        return self.grammar.try_jump_forward(tokenizer)

    def jump_forward_str_state(self, helper):
        """Delegate to the wrapped grammar."""
        return self.grammar.jump_forward_str_state(helper)

    def jump_and_retokenize(
        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ):
        """Delegate to the wrapped grammar."""
        return self.grammar.jump_and_retokenize(
            old_output_ids, new_output_ids, next_state
        )

    def copy(self) -> BaseGrammarObject:
        """Return a new wrapper around a copy of the wrapped grammar.

        The copy starts in the reasoning state again, regardless of the state
        of this instance.
        """
        return ReasonerGrammarObject(self.grammar.copy(), self.think_end_id)


class ReasonerGrammarBackend(BaseGrammarBackend):
    """Backend that wraps every grammar of another backend for reasoning models.

    Args:
        grammar_backend: The backend that compiles and caches the grammars.
        think_end_id: Token id that marks the end of the reasoning section.
    """

    def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id: int):
        # Initialise the base class so the attributes it creates (executor,
        # cache) exist on this wrapper as well.
        super().__init__()
        self.grammar_backend = grammar_backend
        self.think_end_id = think_end_id

    def _wrap(
        self, grammar: Optional[BaseGrammarObject]
    ) -> Optional[ReasonerGrammarObject]:
        """Wrap ``grammar`` in a ReasonerGrammarObject, passing falsy values as None."""
        if not grammar:
            return None
        return ReasonerGrammarObject(grammar, self.think_end_id)

    def get_cached_value(self, key: Tuple[str, str]) -> Optional[ReasonerGrammarObject]:
        """Return the wrapped backend's cached grammar for ``key``, wrapped, or None."""
        return self._wrap(self.grammar_backend.get_cached_value(key))

    def get_future_value(self, key: Tuple[str, str]) -> Future:
        """Return a future resolving to the wrapped grammar for ``key``.

        The returned future mirrors the wrapped backend's future: its result is
        wrapped in a ReasonerGrammarObject (or None), and a failure or
        cancellation is forwarded so that waiters are never left blocked.
        """
        wrapped: Future = Future()

        def _forward(inner: Future) -> None:
            if inner.cancelled():
                wrapped.cancel()
                return
            try:
                value = self._wrap(inner.result())
            except Exception as exc:
                # Not swallowed: handed to whoever waits on ``wrapped``.
                # Exceptions raised inside a done-callback are otherwise only
                # logged, which would leave ``wrapped`` pending forever.
                wrapped.set_exception(exc)
            else:
                wrapped.set_result(value)

        self.grammar_backend.get_future_value(key).add_done_callback(_forward)
        return wrapped

    def reset(self):
        """Reset the wrapped backend."""
        self.grammar_backend.reset()
sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter @@ -232,6 +233,15 @@ class Scheduler( # Init tokenizer self.init_tokenizer() + # Set reasoning_parser and think_end_id if --reasoning_parser is enabled + if self.server_args.reasoning_parser and self.tokenizer: + reasoning_parser = ReasoningParser( + model_type=self.server_args.reasoning_parser, stream_reasoning=False + ) + self.tokenizer.think_end_id = self.tokenizer.encode( + reasoning_parser.detector.think_end_token, add_special_tokens=False + )[0] + # Check whether overlap can be enabled if not self.is_generation: self.enable_overlap = False