Signed-off-by: ybyang <ybyang7@iflytek.com>
Co-authored-by: YorkSu <york_su@qq.com>
@@ -235,6 +235,44 @@ Important Notes:
2. To receive more consistent tool call results, it is recommended to use `--chat-template examples/chat_template/tool_chat_template_deepseekv3.jinja`, which provides an improved unified prompt; a sample launch command is shown below.
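
For example, a launch command using this template might look like the following (a sketch; the model path and `--tp` size are placeholders to adapt to your deployment):

```
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --chat-template examples/chat_template/tool_chat_template_deepseekv3.jinja
```
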
### Thinking Budget for DeepSeek R1

In SGLang, we can implement a thinking budget with a `CustomLogitProcessor`: once the thinking stage reaches the token budget, the processor forces a newline followed by the thinking-end token, so the model stops reasoning and produces its final answer.

Launch a server with the `--enable-custom-logit-processor` flag enabled:

```
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-R1 --tp 8 --port 30000 --host 0.0.0.0 --mem-fraction-static 0.9 --disable-cuda-graph --reasoning-parser deepseek-r1 --enable-custom-logit-processor
```

Sample Request:

```python
import openai
from rich.pretty import pprint
from sglang.srt.sampling.custom_logit_processor import DeepSeekR1ThinkingBudgetLogitProcessor


client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="*")
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1",
    messages=[
        {
            "role": "user",
            "content": "Question: Is Paris the Capital of France?",
        }
    ],
    max_tokens=1024,
    extra_body={
        "custom_logit_processor": DeepSeekR1ThinkingBudgetLogitProcessor().to_str(),
        "custom_params": {
            "thinking_budget": 512,
        },
    },
)
pprint(response)
```
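
Since the launch command above enables `--reasoning-parser deepseek-r1`, the response should expose the reasoning separately (commonly as a `reasoning_content` field on the message); if so, the budget can be sanity-checked by counting its tokens. A sketch, assuming `response` from the request above:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
reasoning = getattr(response.choices[0].message, "reasoning_content", None) or ""
# The processor lets thinking run up to the budget, then forces "\n</think>",
# so the count should not exceed the budget by more than a couple of tokens.
print(len(tokenizer.encode(reasoning, add_special_tokens=False)))
```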

## FAQ

**Q: Model loading is taking too long, and I'm encountering an NCCL timeout. What should I do?**

@@ -319,3 +319,27 @@ response = requests.post(
)
print(response.json())
```

Send an OpenAI chat completion request:

```python
import openai
from sglang.utils import print_highlight

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0.0,
    max_tokens=32,
    extra_body={
        # DeterministicLogitProcessor is assumed to be defined earlier in
        # this document (see the custom logit processor example above).
        "custom_logit_processor": DeterministicLogitProcessor().to_str(),
        "custom_params": {"token_id": 5},
    },
)

print_highlight(f"Response: {response}")
```

@@ -243,6 +243,8 @@ class CompletionRequest(BaseModel):
    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
    session_params: Optional[Dict] = None
    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
    custom_params: Optional[Dict] = None
    custom_logit_processor: Optional[str] = None

    # For PD disaggregation
    bootstrap_host: Optional[Union[List[str], str]] = None

@@ -504,6 +506,10 @@ class ChatCompletionRequest(BaseModel):
    stream_reasoning: bool = True
    chat_template_kwargs: Optional[Dict] = None

    # Custom logit processor for advanced sampling control
    custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
    custom_params: Optional[Dict] = None

    # For request id
    rid: Optional[Union[List[str], str]] = None
    # Extra key for classifying the request (e.g. cache_salt)
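
These two fields ride along with the standard OpenAI-compatible payload, so they can also be set directly in a raw HTTP request. A sketch (assumes a local server started with `--enable-custom-logit-processor`):

```python
import requests

from sglang.srt.sampling.custom_logit_processor import (
    DeepSeekR1ThinkingBudgetLogitProcessor,
)

payload = {
    "model": "deepseek-ai/DeepSeek-R1",
    "messages": [{"role": "user", "content": "Is Paris the capital of France?"}],
    "max_tokens": 1024,
    # Serialized processor plus its parameters, as defined by the new fields.
    "custom_logit_processor": DeepSeekR1ThinkingBudgetLogitProcessor().to_str(),
    "custom_params": {"thinking_budget": 512},
}
resp = requests.post("http://127.0.0.1:30000/v1/chat/completions", json=payload)
print(resp.json())
```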

@@ -636,6 +642,7 @@ class ChatCompletionRequest(BaseModel):
            "ignore_eos": self.ignore_eos,
            "skip_special_tokens": self.skip_special_tokens,
            "logit_bias": self.logit_bias,
            "custom_params": self.custom_params,
        }

        if self.response_format and self.response_format.type == "json_schema":

@@ -196,6 +196,7 @@ class OpenAIServingChat(OpenAIServingBase):
            extra_key=self._compute_extra_key(request),
            priority=request.priority,
            custom_labels=custom_labels,
            custom_logit_processor=request.custom_logit_processor,
        )

        return adapted_request, request

@@ -121,6 +121,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
            extra_key=self._compute_extra_key(request),
            priority=request.priority,
            custom_labels=custom_labels,
            custom_logit_processor=request.custom_logit_processor,
        )

        return adapted_request, request

@@ -149,6 +150,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
            "ignore_eos": request.ignore_eos,
            "skip_special_tokens": request.skip_special_tokens,
            "logit_bias": request.logit_bias,
            "custom_params": request.custom_params,
        }

        # Handle response_format constraints

@@ -1,12 +1,15 @@
import json
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import TYPE_CHECKING, Any, Dict, List, Optional

import dill
import orjson
import torch

if TYPE_CHECKING:
    from sglang.srt.managers.schedule_batch import Req


@lru_cache(maxsize=None)
def _cache_from_str(json_str: str):

@@ -52,3 +55,74 @@ class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
        ), f"{custom_param_list=}"
        logits[..., disallowed_token_ids] = -float("inf")
        return logits


class ThinkingBudgetLogitProcessor(CustomLogitProcessor):
    """A logit processor that controls the length of thinking."""

    THINKING_START_TOKEN_ID: int
    THINKING_END_TOKEN_ID: int
    NEW_LINE_TOKEN_ID: int

    def __call__(self, logits, custom_param_list: list[dict[str, Any]]):
        if custom_param_list is None or not custom_param_list:
            return logits
        for i, param_dict in enumerate(custom_param_list):
            if param_dict is None:
                continue

            thinking_budget: int | None = param_dict.get("thinking_budget")

            # Skip if thinking_budget is unset, not an integer, or negative
            if (
                thinking_budget is None
                or not isinstance(thinking_budget, int)
                or thinking_budget < 0
            ):
                continue
            req: Req = param_dict.get("__req__")
            cur_ids: list[int] = [*req.origin_input_ids, *req.output_ids]

            # Check if out of thinking stage
            if (
                self.THINKING_START_TOKEN_ID not in cur_ids
                or self.THINKING_END_TOKEN_ID in cur_ids
            ):
                continue

            # Find the index of the thinking start token
            start_index = cur_ids.index(self.THINKING_START_TOKEN_ID)

            # Count the number of tokens after the thinking start token
            num_tokens_after_start = len(cur_ids) - start_index - 1

            if num_tokens_after_start < thinking_budget:
                continue

            # Ensure a newline token precedes the thinking end token
            if not req.output_ids or req.output_ids[-1] != self.NEW_LINE_TOKEN_ID:
                logits[i, :] = -float("inf")
                logits[i, self.NEW_LINE_TOKEN_ID] = 0.0
                continue

            # Assign highest probability to the thinking end token
            logits[i, :] = -float("inf")
            logits[i, self.THINKING_END_TOKEN_ID] = 0.0

        return logits


class Qwen3ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
    """A logit processor that controls the length of thinking for Qwen3 models."""

    THINKING_START_TOKEN_ID: int = 151667
    THINKING_END_TOKEN_ID: int = 151668
    NEW_LINE_TOKEN_ID: int = 198


class DeepSeekR1ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
    """A logit processor that controls the length of thinking for DeepSeek-R1 models."""

    THINKING_START_TOKEN_ID: int = 128798
    THINKING_END_TOKEN_ID: int = 128799
    NEW_LINE_TOKEN_ID: int = 201

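Supporting another reasoning model is just a matter of subclassing `ThinkingBudgetLogitProcessor` with that model's special token ids. A minimal sketch (the class name and ids below are hypothetical placeholders, not a real tokenizer's values):

```python
class MyModelThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
    """Thinking budget for a hypothetical model; replace the placeholder ids
    with the tokenizer's actual thinking-start, thinking-end, and newline ids."""

    THINKING_START_TOKEN_ID: int = 1000  # placeholder
    THINKING_END_TOKEN_ID: int = 1001  # placeholder
    NEW_LINE_TOKEN_ID: int = 12  # placeholder
```
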
@@ -6,13 +6,17 @@ python3 -m unittest openai_server.basic.test_openai_server.TestOpenAIServer.test
"""

import json
import random
import re
import unittest
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

import numpy as np
import openai
import requests

from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
from sglang.srt.utils import kill_process_tree
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
from sglang.test.runners import TEST_RERANK_QUERY_DOCS

@@ -848,6 +852,94 @@ class TestOpenAIV1Rerank(CustomTestCase):
        self.assertTrue(isinstance(response[1]["index"], int))


class TestOpenAIServerCustomLogitProcessor(CustomTestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=["--enable-custom-logit-processor"],
        )
        cls.base_url += "/v1"
        cls.tokenizer = get_tokenizer(cls.model)

    @classmethod
    def tearDownClass(cls) -> None:
        kill_process_tree(cls.process.pid)

    def run_custom_logit_processor(self, target_token_id: Optional[int] = None) -> None:
        """
        Test custom logit processor with custom params.

        If target_token_id is None, the custom logit processor won't be passed in.
        """

        class DeterministicLogitProcessor(CustomLogitProcessor):
            """A dummy logit processor that changes the logits to always sample the given token id."""

            CUSTOM_PARAM_KEY = "token_id"

            def __call__(self, logits, custom_param_list):
                assert logits.shape[0] == len(custom_param_list)

                for i, param_dict in enumerate(custom_param_list):
                    # Mask all other tokens
                    logits[i, :] = -float("inf")
                    # Assign highest probability to the specified token
                    logits[i, param_dict[self.CUSTOM_PARAM_KEY]] = 0.0

                return logits

        extra_body = {}

        if target_token_id is not None:
            extra_body["custom_logit_processor"] = (
                DeterministicLogitProcessor().to_str()
            )
            extra_body["custom_params"] = {
                "token_id": target_token_id,
            }

        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        max_tokens = 200

        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": "Question: Is Paris the Capital of France?",
                },
            ],
            temperature=0.0,
            max_tokens=max_tokens,
            extra_body=extra_body,
        )

        if target_token_id is not None:
            target_text = self.tokenizer.decode([target_token_id] * max_tokens)
            self.assertTrue(
                target_text == response.choices[0].message.content,
                f"{target_token_id=}\n{target_text=}\n{response.model_dump(mode='json')}",
            )

    def test_custom_logit_processor(self) -> None:
        """Test custom logit processor with a single request."""
        self.run_custom_logit_processor(target_token_id=5)

    def test_custom_logit_processor_batch_mixed(self) -> None:
        """Test a batch of requests mixed of requests with and without custom logit processor."""
        target_token_ids = list(range(32)) + [None] * 16
        random.shuffle(target_token_ids)
        with ThreadPoolExecutor(len(target_token_ids)) as executor:
            list(executor.map(self.run_custom_logit_processor, target_token_ids))


class TestOpenAIV1Score(CustomTestCase):
    @classmethod
    def setUpClass(cls):