feat(oai refactor): Replace openai_api with entrypoints/openai (#7351)
Co-authored-by: Jin Pan <jpan236@wisc.edu>
@@ -1,4 +1,3 @@
-import base64
 import json
 import logging
 import time
@@ -6,7 +5,7 @@ import uuid
 from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import ORJSONResponse, StreamingResponse
 
 from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import (
@@ -28,13 +27,14 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
 from sglang.srt.entrypoints.openai.utils import (
-    detect_template_content_format,
-    process_content_for_template_format,
     process_hidden_states_from_ret,
     to_openai_style_logprobs,
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.jinja_template_utils import process_content_for_template_format
 from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str
+
@@ -42,13 +42,13 @@ logger = logging.getLogger(__name__)
 
 
 class OpenAIServingChat(OpenAIServingBase):
-    """Handler for chat completion requests"""
+    """Handler for /v1/chat/completions requests"""
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # Instance-specific cache for template content format detection
-        self._cached_chat_template = None
-        self._cached_template_format = None
+    def __init__(
+        self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
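
The constructor contract is now explicit: the handler takes a TokenizerManager plus a TemplateManager instead of opaque `*args/**kwargs`, and template state moves off the handler entirely. A minimal construction sketch (stubs stand in for the real managers, which the server builds from ServerArgs during startup; attribute values below are illustrative only):

```python
from types import SimpleNamespace

from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat

# Stand-ins for the real managers; enough for the sketch since __init__
# only stores the two objects on the handler.
tokenizer_manager = SimpleNamespace(tokenizer=None, server_args=None)
template_manager = SimpleNamespace(
    chat_template_name=None, jinja_template_content_format="string"
)

chat_handler = OpenAIServingChat(tokenizer_manager, template_manager)
assert chat_handler.template_manager is template_manager
```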
@@ -142,19 +142,14 @@ class OpenAIServingChat(OpenAIServingBase):
             )
 
             # Use chat template
-            if (
-                hasattr(self.tokenizer_manager, "chat_template_name")
-                and self.tokenizer_manager.chat_template_name is None
-            ):
+            if self.template_manager.chat_template_name is None:
                 prompt, prompt_ids, image_data, audio_data, modalities, stop = (
                     self._apply_jinja_template(request, tools, is_multimodal)
                 )
             else:
-                prompt, image_data, audio_data, modalities, stop = (
-                    self._apply_conversation_template(request)
+                prompt, prompt_ids, image_data, audio_data, modalities, stop = (
+                    self._apply_conversation_template(request, is_multimodal)
                 )
-                if not is_multimodal:
-                    prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
         else:
             # Use raw prompt
             prompt_ids = request.messages
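
The template-selection branch now keys off the TemplateManager alone rather than probing the tokenizer manager with `hasattr`, and both helpers return `prompt_ids` themselves, so the caller no longer re-encodes the prompt. A stripped-down illustration of the resulting control flow (stub callables stand in for the real `_apply_*` methods):

```python
# Illustrative control flow only, not the real method: with no named chat
# template configured, fall back to the tokenizer's Jinja template;
# otherwise use SGLang's conversation-template registry. Both paths
# return the same (prompt, prompt_ids, images, audio, modalities, stop) tuple.
def build_prompt(template_manager, apply_jinja, apply_conversation, is_multimodal):
    if template_manager.chat_template_name is None:
        return apply_jinja(is_multimodal)      # Jinja chat template path
    return apply_conversation(is_multimodal)   # conversation template path
```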
@@ -181,23 +176,14 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal: bool,
     ) -> tuple[str, List[int], Optional[Any], Optional[Any], List[str], List[str]]:
         """Apply Jinja chat template"""
+        prompt = ""
+        prompt_ids = []
         openai_compatible_messages = []
         image_data = []
         audio_data = []
         modalities = []
 
-        # Detect template content format
-        current_template = self.tokenizer_manager.tokenizer.chat_template
-        if current_template != self._cached_chat_template:
-            self._cached_chat_template = current_template
-            self._cached_template_format = detect_template_content_format(
-                current_template
-            )
-            logger.info(
-                f"Detected chat template content format: {self._cached_template_format}"
-            )
-
-        template_content_format = self._cached_template_format
+        template_content_format = self.template_manager.jinja_template_content_format
 
         for message in request.messages:
             if message.content is None:
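
For context, the "content format" being looked up distinguishes templates that render message content as a plain string from those that iterate over OpenAI-style lists of typed parts; detection now happens once inside the TemplateManager instead of per request through the removed instance cache. Illustrative message shapes (the values are made up):

```python
# "string" content format: the template renders message["content"] directly.
string_format_msg = {"role": "user", "content": "What is in this image?"}

# "openai" content format: the template iterates over typed content parts.
openai_format_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is in this image?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ],
}
```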
@@ -262,14 +248,21 @@ class OpenAIServingChat(OpenAIServingBase):
         if is_multimodal:
             prompt = self.tokenizer_manager.tokenizer.decode(prompt_ids)
 
-        stop = request.stop or []
+        stop = request.stop
+        image_data = image_data if image_data else None
+        audio_data = audio_data if audio_data else None
+        modalities = modalities if modalities else []
         return prompt, prompt_ids, image_data, audio_data, modalities, stop
 
     def _apply_conversation_template(
-        self, request: ChatCompletionRequest
-    ) -> tuple[str, Optional[Any], Optional[Any], List[str], List[str]]:
+        self,
+        request: ChatCompletionRequest,
+        is_multimodal: bool,
+    ) -> tuple[str, Optional[Any], Optional[Any], List[str], List[str], List[str]]:
         """Apply conversation template"""
-        conv = generate_chat_conv(request, self.tokenizer_manager.chat_template_name)
+        prompt = ""
+        prompt_ids = []
+        conv = generate_chat_conv(request, self.template_manager.chat_template_name)
 
         # If we should continue the final assistant message, adjust the conversation.
         if (
@@ -296,9 +289,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt = conv.get_prompt()
 
-        image_data = conv.image_data
-        audio_data = conv.audio_data
-        modalities = conv.modalities
+        image_data = conv.image_data if conv.image_data else None
+        audio_data = conv.audio_data if conv.audio_data else None
+        modalities = conv.modalities if conv.modalities else []
         stop = conv.stop_str or [] if not request.ignore_eos else []
 
         if request.stop:
@@ -307,7 +300,10 @@ class OpenAIServingChat(OpenAIServingBase):
             else:
                 stop.extend(request.stop)
 
-        return prompt, image_data, audio_data, modalities, stop
+        if not is_multimodal:
+            prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
+
+        return prompt, prompt_ids, image_data, audio_data, modalities, stop
 
     def _build_sampling_params(
         self,
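
Both template paths now end with the same convention: return `prompt_ids` alongside `prompt`, encoding only when the request carries no image or audio inputs (multimodal requests are tokenized later, together with their media). A tiny self-contained sketch of that shared tail (a whitespace tokenizer stands in for the real one):

```python
# Hedged sketch of the tail both _apply_* helpers now implement.
def finalize_prompt(prompt: str, is_multimodal: bool):
    encode = lambda text: text.split()  # stand-in for tokenizer.encode
    prompt_ids = [] if is_multimodal else encode(prompt)
    return prompt, prompt_ids

print(finalize_prompt("Hello SGLang", is_multimodal=False))
# ('Hello SGLang', ['Hello', 'SGLang'])
```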
@@ -459,13 +455,9 @@ class OpenAIServingChat(OpenAIServingBase):
                 stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
-                enable_thinking = getattr(request, "chat_template_kwargs", {}).get(
-                    "enable_thinking", True
-                )
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
-                    and enable_thinking
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -591,7 +583,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 )
                 yield f"data: {usage_chunk.model_dump_json()}\n\n"
 
-        except Exception as e:
+        except ValueError as e:
             error = self.create_streaming_error_response(str(e))
             yield f"data: {error}\n\n"
 
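
Narrowing `except Exception` to `except ValueError` keeps the in-band SSE error path for expected validation failures while letting unexpected errors propagate instead of being swallowed. A minimal sketch of the error frame format (`json.dumps` stands in for the real `create_streaming_error_response` helper, and the payload shape is an assumption for the sketch):

```python
import json

def streaming_error_frame(message: str) -> str:
    # Serialize the error and emit it as one final SSE data: frame.
    payload = {"error": {"message": message, "type": "invalid_request_error"}}
    return f"data: {json.dumps(payload)}\n\n"

print(streaming_error_frame("stop sequence too long"), end="")
```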
@@ -602,7 +594,7 @@ class OpenAIServingChat(OpenAIServingBase):
         adapted_request: GenerateReqInput,
         request: ChatCompletionRequest,
         raw_request: Request,
-    ) -> Union[ChatCompletionResponse, ErrorResponse]:
+    ) -> Union[ChatCompletionResponse, ErrorResponse, ORJSONResponse]:
         """Handle non-streaming chat completion request"""
         try:
             ret = await self.tokenizer_manager.generate_request(
@@ -627,7 +619,7 @@ class OpenAIServingChat(OpenAIServingBase):
         request: ChatCompletionRequest,
         ret: List[Dict[str, Any]],
         created: int,
-    ) -> ChatCompletionResponse:
+    ) -> Union[ChatCompletionResponse, ORJSONResponse]:
         """Build chat completion response from generation results"""
         choices = []
 
@@ -645,11 +637,8 @@ class OpenAIServingChat(OpenAIServingBase):
 
             # Handle reasoning content
             reasoning_text = None
-            enable_thinking = getattr(request, "chat_template_kwargs", {}).get(
-                "enable_thinking", True
-            )
             reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
-            if reasoning_parser and request.separate_reasoning and enable_thinking:
+            if reasoning_parser and request.separate_reasoning:
                 try:
                     parser = ReasoningParser(
                         model_type=reasoning_parser, stream_reasoning=False
@@ -691,9 +680,10 @@ class OpenAIServingChat(OpenAIServingBase):
             choices.append(choice_data)
 
         # Calculate usage
-        cache_report = self.tokenizer_manager.server_args.enable_cache_report
         usage = UsageProcessor.calculate_response_usage(
-            ret, n_choices=request.n, enable_cache_report=cache_report
+            ret,
+            n_choices=request.n,
+            enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
         )
 
         return ChatCompletionResponse(
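
Inlining the `cache_report` flag leaves a single call that aggregates usage across the `n` generated choices. A hedged, self-contained sketch of the kind of aggregation `calculate_response_usage` performs (field names follow SGLang's per-generation `meta_info`; token counts are invented, and the real processor additionally handles cached-token reporting):

```python
# Two generations for one request with n=2.
rets = [
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 30}},
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 25}},
]
n_choices = 2

# Prompt tokens are counted once per request; completions sum over choices.
prompt_tokens = sum(r["meta_info"]["prompt_tokens"] for r in rets[::n_choices])
completion_tokens = sum(r["meta_info"]["completion_tokens"] for r in rets)
print(prompt_tokens, completion_tokens, prompt_tokens + completion_tokens)  # 12 55 67
```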
@@ -821,6 +811,25 @@ class OpenAIServingChat(OpenAIServingBase):
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)
 
+    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+        """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
+
+        NOTE: This parameter is only useful for models that support the
+        enable_thinking flag, such as Qwen3.
+
+        Args:
+            request: The request object (or an item from a list of requests).
+        Returns:
+            The boolean value of 'enable_thinking' if found, otherwise True.
+        """
+        if (
+            hasattr(request, "chat_template_kwargs")
+            and request.chat_template_kwargs
+            and request.chat_template_kwargs.get("enable_thinking") is not None
+        ):
+            return request.chat_template_kwargs.get("enable_thinking")
+        return True
+
     async def _process_tool_call_stream(
         self,
         index: int,
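
The new helper centralizes the `enable_thinking` lookup that the streaming and non-streaming paths previously duplicated inline (removed in the two hunks above). A runnable stand-alone illustration of its behavior (`SimpleNamespace` stands in for the pydantic ChatCompletionRequest):

```python
from types import SimpleNamespace

def get_enable_thinking(request) -> bool:
    # Mirrors _get_enable_thinking_from_request above: honor an explicit
    # value in chat_template_kwargs, default to True otherwise.
    kwargs = getattr(request, "chat_template_kwargs", None)
    if kwargs and kwargs.get("enable_thinking") is not None:
        return kwargs.get("enable_thinking")
    return True

assert get_enable_thinking(
    SimpleNamespace(chat_template_kwargs={"enable_thinking": False})
) is False
assert get_enable_thinking(SimpleNamespace(chat_template_kwargs={})) is True
assert get_enable_thinking(SimpleNamespace(chat_template_kwargs=None)) is True
```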