feat(oai refactor): Replace openai_api with entrypoints/openai (#7351)

Co-authored-by: Jin Pan <jpan236@wisc.edu>
Author: Chang Su
Date: 2025-06-21 13:21:06 -07:00
Committed by: GitHub
parent 02bf31ef29
commit 72676cd6c0
43 changed files with 674 additions and 4555 deletions


@@ -1,4 +1,3 @@
import base64
import json
import logging
import time
@@ -6,7 +5,7 @@ import uuid
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
from fastapi import Request
from fastapi.responses import StreamingResponse
from fastapi.responses import ORJSONResponse, StreamingResponse
from sglang.srt.conversation import generate_chat_conv
from sglang.srt.entrypoints.openai.protocol import (
@@ -28,13 +27,14 @@ from sglang.srt.entrypoints.openai.protocol import (
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
from sglang.srt.entrypoints.openai.utils import (
detect_template_content_format,
process_content_for_template_format,
process_hidden_states_from_ret,
to_openai_style_logprobs,
)
from sglang.srt.function_call.function_call_parser import FunctionCallParser
from sglang.srt.jinja_template_utils import process_content_for_template_format
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.template_manager import TemplateManager
from sglang.srt.managers.tokenizer_manager import TokenizerManager
from sglang.srt.reasoning_parser import ReasoningParser
from sglang.utils import convert_json_schema_to_str
@@ -42,13 +42,13 @@ logger = logging.getLogger(__name__)
class OpenAIServingChat(OpenAIServingBase):
"""Handler for chat completion requests"""
"""Handler for /v1/chat/completions requests"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Instance-specific cache for template content format detection
self._cached_chat_template = None
self._cached_template_format = None
def __init__(
self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
):
super().__init__(tokenizer_manager)
self.template_manager = template_manager
def _request_id_prefix(self) -> str:
return "chatcmpl-"
@@ -142,19 +142,14 @@ class OpenAIServingChat(OpenAIServingBase):
)
# Use chat template
if (
hasattr(self.tokenizer_manager, "chat_template_name")
and self.tokenizer_manager.chat_template_name is None
):
if self.template_manager.chat_template_name is None:
prompt, prompt_ids, image_data, audio_data, modalities, stop = (
self._apply_jinja_template(request, tools, is_multimodal)
)
else:
prompt, image_data, audio_data, modalities, stop = (
self._apply_conversation_template(request)
prompt, prompt_ids, image_data, audio_data, modalities, stop = (
self._apply_conversation_template(request, is_multimodal)
)
if not is_multimodal:
prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
else:
# Use raw prompt
prompt_ids = request.messages
@@ -181,23 +176,14 @@ class OpenAIServingChat(OpenAIServingBase):
is_multimodal: bool,
) -> tuple[str, List[int], Optional[Any], Optional[Any], List[str], List[str]]:
"""Apply Jinja chat template"""
prompt = ""
prompt_ids = []
openai_compatible_messages = []
image_data = []
audio_data = []
modalities = []
# Detect template content format
current_template = self.tokenizer_manager.tokenizer.chat_template
if current_template != self._cached_chat_template:
self._cached_chat_template = current_template
self._cached_template_format = detect_template_content_format(
current_template
)
logger.info(
f"Detected chat template content format: {self._cached_template_format}"
)
template_content_format = self._cached_template_format
template_content_format = self.template_manager.jinja_template_content_format
for message in request.messages:
if message.content is None:
@@ -262,14 +248,21 @@ class OpenAIServingChat(OpenAIServingBase):
if is_multimodal:
prompt = self.tokenizer_manager.tokenizer.decode(prompt_ids)
stop = request.stop or []
stop = request.stop
image_data = image_data if image_data else None
audio_data = audio_data if audio_data else None
modalities = modalities if modalities else []
return prompt, prompt_ids, image_data, audio_data, modalities, stop
def _apply_conversation_template(
self, request: ChatCompletionRequest
) -> tuple[str, Optional[Any], Optional[Any], List[str], List[str]]:
self,
request: ChatCompletionRequest,
is_multimodal: bool,
) -> tuple[str, Optional[Any], Optional[Any], List[str], List[str], List[str]]:
"""Apply conversation template"""
conv = generate_chat_conv(request, self.tokenizer_manager.chat_template_name)
prompt = ""
prompt_ids = []
conv = generate_chat_conv(request, self.template_manager.chat_template_name)
# If we should continue the final assistant message, adjust the conversation.
if (
@@ -296,9 +289,9 @@ class OpenAIServingChat(OpenAIServingBase):
else:
prompt = conv.get_prompt()
image_data = conv.image_data
audio_data = conv.audio_data
modalities = conv.modalities
image_data = conv.image_data if conv.image_data else None
audio_data = conv.audio_data if conv.audio_data else None
modalities = conv.modalities if conv.modalities else []
stop = conv.stop_str or [] if not request.ignore_eos else []
if request.stop:
@@ -307,7 +300,10 @@ class OpenAIServingChat(OpenAIServingBase):
else:
stop.extend(request.stop)
return prompt, image_data, audio_data, modalities, stop
if not is_multimodal:
prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
return prompt, prompt_ids, image_data, audio_data, modalities, stop
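# Illustrative sketch (not part of the diff): both template paths now return
# the same six-tuple and encode prompt_ids themselves for text-only requests,
# so a caller can dispatch and unpack uniformly. The wrapper name
# _prepare_prompt below is hypothetical.
def _prepare_prompt(self, request, tools, is_multimodal):
    if self.template_manager.chat_template_name is None:
        # Jinja path: prompt_ids are produced while rendering the template.
        return self._apply_jinja_template(request, tools, is_multimodal)
    # Conversation path: encodes prompt_ids itself unless the request is multimodal.
    return self._apply_conversation_template(request, is_multimodal)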
def _build_sampling_params(
self,
@@ -459,13 +455,9 @@ class OpenAIServingChat(OpenAIServingBase):
stream_buffers[index] = stream_buffer + delta
# Handle reasoning content
enable_thinking = getattr(request, "chat_template_kwargs", {}).get(
"enable_thinking", True
)
if (
self.tokenizer_manager.server_args.reasoning_parser
and request.separate_reasoning
and enable_thinking
):
reasoning_text, delta = self._process_reasoning_stream(
index, delta, reasoning_parser_dict, content, request
@@ -591,7 +583,7 @@ class OpenAIServingChat(OpenAIServingBase):
)
yield f"data: {usage_chunk.model_dump_json()}\n\n"
except Exception as e:
except ValueError as e:
error = self.create_streaming_error_response(str(e))
yield f"data: {error}\n\n"
@@ -602,7 +594,7 @@ class OpenAIServingChat(OpenAIServingBase):
adapted_request: GenerateReqInput,
request: ChatCompletionRequest,
raw_request: Request,
) -> Union[ChatCompletionResponse, ErrorResponse]:
) -> Union[ChatCompletionResponse, ErrorResponse, ORJSONResponse]:
"""Handle non-streaming chat completion request"""
try:
ret = await self.tokenizer_manager.generate_request(
@@ -627,7 +619,7 @@ class OpenAIServingChat(OpenAIServingBase):
request: ChatCompletionRequest,
ret: List[Dict[str, Any]],
created: int,
) -> ChatCompletionResponse:
) -> Union[ChatCompletionResponse, ORJSONResponse]:
"""Build chat completion response from generation results"""
choices = []
@@ -645,11 +637,8 @@ class OpenAIServingChat(OpenAIServingBase):
# Handle reasoning content
reasoning_text = None
enable_thinking = getattr(request, "chat_template_kwargs", {}).get(
"enable_thinking", True
)
reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
if reasoning_parser and request.separate_reasoning and enable_thinking:
if reasoning_parser and request.separate_reasoning:
try:
parser = ReasoningParser(
model_type=reasoning_parser, stream_reasoning=False
@@ -691,9 +680,10 @@ class OpenAIServingChat(OpenAIServingBase):
choices.append(choice_data)
# Calculate usage
cache_report = self.tokenizer_manager.server_args.enable_cache_report
usage = UsageProcessor.calculate_response_usage(
ret, n_choices=request.n, enable_cache_report=cache_report
ret,
n_choices=request.n,
enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
)
return ChatCompletionResponse(
@@ -821,6 +811,25 @@ class OpenAIServingChat(OpenAIServingBase):
reasoning_parser = reasoning_parser_dict[index]
return reasoning_parser.parse_stream_chunk(delta)
def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
NOTE: This parameter is only useful for models that support the
enable_thinking flag, such as Qwen3.
Args:
request: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if present, otherwise True.
"""
if (
hasattr(request, "chat_template_kwargs")
and request.chat_template_kwargs
and request.chat_template_kwargs.get("enable_thinking") is not None
):
return request.chat_template_kwargs.get("enable_thinking")
return True
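# Illustrative usage sketch (not part of the diff): the flag defaults to True
# and is only overridden when the client sends
# chat_template_kwargs={"enable_thinking": False} (e.g. for Qwen3). Assumes
# ChatCompletionRequest accepts chat_template_kwargs as the docstring above
# describes; the model name and message content are placeholders, and the
# helper is called directly here for brevity.
req_default = ChatCompletionRequest(
    model="qwen3", messages=[{"role": "user", "content": "hi"}]
)
req_no_thinking = ChatCompletionRequest(
    model="qwen3",
    messages=[{"role": "user", "content": "hi"}],
    chat_template_kwargs={"enable_thinking": False},
)
assert _get_enable_thinking_from_request(req_default) is True
assert _get_enable_thinking_from_request(req_no_thinking) is False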
async def _process_tool_call_stream(
self,
index: int,