diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 58b0d5fd9..911969859 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1099,7 +1099,13 @@ def v1_chat_generate_request(
 
         sampling_params = {
             "temperature": request.temperature,
-            "max_new_tokens": request.max_tokens,
+            # Fall back to max_completion_tokens only when max_tokens is absent;
+            # a plain `or` would also discard an explicit max_tokens=0.
+            "max_new_tokens": (
+                request.max_tokens
+                if request.max_tokens is not None
+                else request.max_completion_tokens
+            ),
             "min_new_tokens": request.min_tokens,
             "stop": stop,
             "stop_token_ids": request.stop_token_ids,
diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py
index 9075a1a69..39e25a57c 100644
--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -320,7 +320,16 @@ class ChatCompletionRequest(BaseModel):
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: bool = False
     top_logprobs: Optional[int] = None
-    max_tokens: Optional[int] = None
+    max_tokens: Optional[int] = Field(
+        default=None,
+        deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
+        description="The maximum number of tokens that can be generated in the chat completion. ",
+    )
+    max_completion_tokens: Optional[int] = Field(
+        default=None,
+        description="The maximum number of completion tokens for a chat completion request, "
+        "including visible output tokens and reasoning tokens. Input tokens are not included. ",
+    )
     n: int = 1
     presence_penalty: float = 0.0
     response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None