Sync from v0.13
This commit is contained in:
0
vllm/entrypoints/anthropic/__init__.py
Normal file
0
vllm/entrypoints/anthropic/__init__.py
Normal file
162
vllm/entrypoints/anthropic/protocol.py
Normal file
162
vllm/entrypoints/anthropic/protocol.py
Normal file
@@ -0,0 +1,162 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pydantic models for Anthropic API protocol"""
|
||||
|
||||
import time
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, field_validator
|
||||
|
||||
|
||||
class AnthropicError(BaseModel):
|
||||
"""Error structure for Anthropic API"""
|
||||
|
||||
type: str
|
||||
message: str
|
||||
|
||||
|
||||
class AnthropicErrorResponse(BaseModel):
|
||||
"""Error response structure for Anthropic API"""
|
||||
|
||||
type: Literal["error"] = "error"
|
||||
error: AnthropicError
|
||||
|
||||
|
||||
class AnthropicUsage(BaseModel):
|
||||
"""Token usage information"""
|
||||
|
||||
input_tokens: int
|
||||
output_tokens: int
|
||||
cache_creation_input_tokens: int | None = None
|
||||
cache_read_input_tokens: int | None = None
|
||||
|
||||
|
||||
class AnthropicContentBlock(BaseModel):
|
||||
"""Content block in message"""
|
||||
|
||||
type: Literal["text", "image", "tool_use", "tool_result"]
|
||||
text: str | None = None
|
||||
# For image content
|
||||
source: dict[str, Any] | None = None
|
||||
# For tool use/result
|
||||
id: str | None = None
|
||||
name: str | None = None
|
||||
input: dict[str, Any] | None = None
|
||||
content: str | list[dict[str, Any]] | None = None
|
||||
is_error: bool | None = None
|
||||
|
||||
|
||||
class AnthropicMessage(BaseModel):
|
||||
"""Message structure"""
|
||||
|
||||
role: Literal["user", "assistant"]
|
||||
content: str | list[AnthropicContentBlock]
|
||||
|
||||
|
||||
class AnthropicTool(BaseModel):
|
||||
"""Tool definition"""
|
||||
|
||||
name: str
|
||||
description: str | None = None
|
||||
input_schema: dict[str, Any]
|
||||
|
||||
@field_validator("input_schema")
|
||||
@classmethod
|
||||
def validate_input_schema(cls, v):
|
||||
if not isinstance(v, dict):
|
||||
raise ValueError("input_schema must be a dictionary")
|
||||
if "type" not in v:
|
||||
v["type"] = "object" # Default to object type
|
||||
return v
|
||||
|
||||
|
||||
class AnthropicToolChoice(BaseModel):
|
||||
"""Tool Choice definition"""
|
||||
|
||||
type: Literal["auto", "any", "tool"]
|
||||
name: str | None = None
|
||||
|
||||
|
||||
class AnthropicMessagesRequest(BaseModel):
|
||||
"""Anthropic Messages API request"""
|
||||
|
||||
model: str
|
||||
messages: list[AnthropicMessage]
|
||||
max_tokens: int
|
||||
metadata: dict[str, Any] | None = None
|
||||
stop_sequences: list[str] | None = None
|
||||
stream: bool | None = False
|
||||
system: str | list[AnthropicContentBlock] | None = None
|
||||
temperature: float | None = None
|
||||
tool_choice: AnthropicToolChoice | None = None
|
||||
tools: list[AnthropicTool] | None = None
|
||||
top_k: int | None = None
|
||||
top_p: float | None = None
|
||||
|
||||
@field_validator("model")
|
||||
@classmethod
|
||||
def validate_model(cls, v):
|
||||
if not v:
|
||||
raise ValueError("Model is required")
|
||||
return v
|
||||
|
||||
@field_validator("max_tokens")
|
||||
@classmethod
|
||||
def validate_max_tokens(cls, v):
|
||||
if v <= 0:
|
||||
raise ValueError("max_tokens must be positive")
|
||||
return v
|
||||
|
||||
|
||||
class AnthropicDelta(BaseModel):
|
||||
"""Delta for streaming responses"""
|
||||
|
||||
type: Literal["text_delta", "input_json_delta"] | None = None
|
||||
text: str | None = None
|
||||
partial_json: str | None = None
|
||||
|
||||
# Message delta
|
||||
stop_reason: (
|
||||
Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] | None
|
||||
) = None
|
||||
stop_sequence: str | None = None
|
||||
|
||||
|
||||
class AnthropicStreamEvent(BaseModel):
|
||||
"""Streaming event"""
|
||||
|
||||
type: Literal[
|
||||
"message_start",
|
||||
"message_delta",
|
||||
"message_stop",
|
||||
"content_block_start",
|
||||
"content_block_delta",
|
||||
"content_block_stop",
|
||||
"ping",
|
||||
"error",
|
||||
]
|
||||
message: Optional["AnthropicMessagesResponse"] = None
|
||||
delta: AnthropicDelta | None = None
|
||||
content_block: AnthropicContentBlock | None = None
|
||||
index: int | None = None
|
||||
error: AnthropicError | None = None
|
||||
usage: AnthropicUsage | None = None
|
||||
|
||||
|
||||
class AnthropicMessagesResponse(BaseModel):
|
||||
"""Anthropic Messages API response"""
|
||||
|
||||
id: str
|
||||
type: Literal["message"] = "message"
|
||||
role: Literal["assistant"] = "assistant"
|
||||
content: list[AnthropicContentBlock]
|
||||
model: str
|
||||
stop_reason: (
|
||||
Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] | None
|
||||
) = None
|
||||
stop_sequence: str | None = None
|
||||
usage: AnthropicUsage | None = None
|
||||
|
||||
def model_post_init(self, __context):
|
||||
if not self.id:
|
||||
self.id = f"msg_{int(time.time() * 1000)}"
|
||||
468
vllm/entrypoints/anthropic/serving_messages.py
Normal file
468
vllm/entrypoints/anthropic/serving_messages.py
Normal file
@@ -0,0 +1,468 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from
|
||||
# https://github.com/vllm/vllm/entrypoints/openai/serving_chat.py
|
||||
|
||||
"""Anthropic Messages API serving handler"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Any
|
||||
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.anthropic.protocol import (
|
||||
AnthropicContentBlock,
|
||||
AnthropicDelta,
|
||||
AnthropicError,
|
||||
AnthropicMessagesRequest,
|
||||
AnthropicMessagesResponse,
|
||||
AnthropicStreamEvent,
|
||||
AnthropicUsage,
|
||||
)
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionNamedToolChoiceParam,
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionStreamResponse,
|
||||
ChatCompletionToolsParam,
|
||||
ErrorResponse,
|
||||
StreamOptions,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def wrap_data_with_event(data: str, event: str):
|
||||
return f"event: {event}\ndata: {data}\n\n"
|
||||
|
||||
|
||||
class AnthropicServingMessages(OpenAIServingChat):
|
||||
"""Handler for Anthropic Messages API requests"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
models: OpenAIServingModels,
|
||||
response_role: str,
|
||||
*,
|
||||
request_logger: RequestLogger | None,
|
||||
chat_template: str | None,
|
||||
chat_template_content_format: ChatTemplateContentFormatOption,
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
reasoning_parser: str = "",
|
||||
enable_auto_tools: bool = False,
|
||||
tool_parser: str | None = None,
|
||||
enable_prompt_tokens_details: bool = False,
|
||||
enable_force_include_usage: bool = False,
|
||||
):
|
||||
super().__init__(
|
||||
engine_client=engine_client,
|
||||
models=models,
|
||||
response_role=response_role,
|
||||
request_logger=request_logger,
|
||||
chat_template=chat_template,
|
||||
chat_template_content_format=chat_template_content_format,
|
||||
return_tokens_as_token_ids=return_tokens_as_token_ids,
|
||||
reasoning_parser=reasoning_parser,
|
||||
enable_auto_tools=enable_auto_tools,
|
||||
tool_parser=tool_parser,
|
||||
enable_prompt_tokens_details=enable_prompt_tokens_details,
|
||||
enable_force_include_usage=enable_force_include_usage,
|
||||
)
|
||||
self.stop_reason_map = {
|
||||
"stop": "end_turn",
|
||||
"length": "max_tokens",
|
||||
"tool_calls": "tool_use",
|
||||
}
|
||||
|
||||
def _convert_anthropic_to_openai_request(
|
||||
self, anthropic_request: AnthropicMessagesRequest
|
||||
) -> ChatCompletionRequest:
|
||||
"""Convert Anthropic message format to OpenAI format"""
|
||||
openai_messages = []
|
||||
|
||||
# Add system message if provided
|
||||
if anthropic_request.system:
|
||||
if isinstance(anthropic_request.system, str):
|
||||
openai_messages.append(
|
||||
{"role": "system", "content": anthropic_request.system}
|
||||
)
|
||||
else:
|
||||
system_prompt = ""
|
||||
for block in anthropic_request.system:
|
||||
if block.type == "text" and block.text:
|
||||
system_prompt += block.text
|
||||
openai_messages.append({"role": "system", "content": system_prompt})
|
||||
|
||||
for msg in anthropic_request.messages:
|
||||
openai_msg: dict[str, Any] = {"role": msg.role} # type: ignore
|
||||
if isinstance(msg.content, str):
|
||||
openai_msg["content"] = msg.content
|
||||
else:
|
||||
# Handle complex content blocks
|
||||
content_parts: list[dict[str, Any]] = []
|
||||
tool_calls: list[dict[str, Any]] = []
|
||||
|
||||
for block in msg.content:
|
||||
if block.type == "text" and block.text:
|
||||
content_parts.append({"type": "text", "text": block.text})
|
||||
elif block.type == "image" and block.source:
|
||||
content_parts.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": block.source.get("data", "")},
|
||||
}
|
||||
)
|
||||
elif block.type == "tool_use":
|
||||
# Convert tool use to function call format
|
||||
tool_call = {
|
||||
"id": block.id or f"call_{int(time.time())}",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": block.name or "",
|
||||
"arguments": json.dumps(block.input or {}),
|
||||
},
|
||||
}
|
||||
tool_calls.append(tool_call)
|
||||
elif block.type == "tool_result":
|
||||
if msg.role == "user":
|
||||
openai_messages.append(
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": block.id or "",
|
||||
"content": str(block.content)
|
||||
if block.content
|
||||
else "",
|
||||
}
|
||||
)
|
||||
else:
|
||||
# Assistant tool result becomes regular text
|
||||
tool_result_text = (
|
||||
str(block.content) if block.content else ""
|
||||
)
|
||||
content_parts.append(
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"Tool result: {tool_result_text}",
|
||||
}
|
||||
)
|
||||
|
||||
# Add tool calls to the message if any
|
||||
if tool_calls:
|
||||
openai_msg["tool_calls"] = tool_calls # type: ignore
|
||||
|
||||
# Add content parts if any
|
||||
if content_parts:
|
||||
if len(content_parts) == 1 and content_parts[0]["type"] == "text":
|
||||
openai_msg["content"] = content_parts[0]["text"]
|
||||
else:
|
||||
openai_msg["content"] = content_parts # type: ignore
|
||||
elif not tool_calls:
|
||||
continue
|
||||
|
||||
openai_messages.append(openai_msg)
|
||||
|
||||
req = ChatCompletionRequest(
|
||||
model=anthropic_request.model,
|
||||
messages=openai_messages,
|
||||
max_tokens=anthropic_request.max_tokens,
|
||||
max_completion_tokens=anthropic_request.max_tokens,
|
||||
stop=anthropic_request.stop_sequences,
|
||||
temperature=anthropic_request.temperature,
|
||||
top_p=anthropic_request.top_p,
|
||||
top_k=anthropic_request.top_k,
|
||||
)
|
||||
|
||||
if anthropic_request.stream:
|
||||
req.stream = anthropic_request.stream
|
||||
req.stream_options = StreamOptions.validate(
|
||||
{"include_usage": True, "continuous_usage_stats": True}
|
||||
)
|
||||
|
||||
if anthropic_request.tool_choice is None:
|
||||
req.tool_choice = None
|
||||
elif anthropic_request.tool_choice.type == "auto":
|
||||
req.tool_choice = "auto"
|
||||
elif anthropic_request.tool_choice.type == "any":
|
||||
req.tool_choice = "required"
|
||||
elif anthropic_request.tool_choice.type == "tool":
|
||||
req.tool_choice = ChatCompletionNamedToolChoiceParam.model_validate(
|
||||
{
|
||||
"type": "function",
|
||||
"function": {"name": anthropic_request.tool_choice.name},
|
||||
}
|
||||
)
|
||||
|
||||
tools = []
|
||||
if anthropic_request.tools is None:
|
||||
return req
|
||||
for tool in anthropic_request.tools:
|
||||
tools.append(
|
||||
ChatCompletionToolsParam.model_validate(
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": tool.name,
|
||||
"description": tool.description,
|
||||
"parameters": tool.input_schema,
|
||||
},
|
||||
}
|
||||
)
|
||||
)
|
||||
if req.tool_choice is None:
|
||||
req.tool_choice = "auto"
|
||||
req.tools = tools
|
||||
return req
|
||||
|
||||
async def create_messages(
|
||||
self,
|
||||
request: AnthropicMessagesRequest,
|
||||
raw_request: Request | None = None,
|
||||
) -> AsyncGenerator[str, None] | AnthropicMessagesResponse | ErrorResponse:
|
||||
"""
|
||||
Messages API similar to Anthropic's API.
|
||||
|
||||
See https://docs.anthropic.com/en/api/messages
|
||||
for the API specification. This API mimics the Anthropic messages API.
|
||||
"""
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug("Received messages request %s", request.model_dump_json())
|
||||
chat_req = self._convert_anthropic_to_openai_request(request)
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug("Convert to OpenAI request %s", chat_req.model_dump_json())
|
||||
generator = await self.create_chat_completion(chat_req, raw_request)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return generator
|
||||
|
||||
elif isinstance(generator, ChatCompletionResponse):
|
||||
return self.messages_full_converter(generator)
|
||||
|
||||
return self.message_stream_converter(generator)
|
||||
|
||||
def messages_full_converter(
|
||||
self,
|
||||
generator: ChatCompletionResponse,
|
||||
) -> AnthropicMessagesResponse:
|
||||
result = AnthropicMessagesResponse(
|
||||
id=generator.id,
|
||||
content=[],
|
||||
model=generator.model,
|
||||
usage=AnthropicUsage(
|
||||
input_tokens=generator.usage.prompt_tokens,
|
||||
output_tokens=generator.usage.completion_tokens,
|
||||
),
|
||||
)
|
||||
if generator.choices[0].finish_reason == "stop":
|
||||
result.stop_reason = "end_turn"
|
||||
elif generator.choices[0].finish_reason == "length":
|
||||
result.stop_reason = "max_tokens"
|
||||
elif generator.choices[0].finish_reason == "tool_calls":
|
||||
result.stop_reason = "tool_use"
|
||||
|
||||
content: list[AnthropicContentBlock] = [
|
||||
AnthropicContentBlock(
|
||||
type="text",
|
||||
text=generator.choices[0].message.content
|
||||
if generator.choices[0].message.content
|
||||
else "",
|
||||
)
|
||||
]
|
||||
|
||||
for tool_call in generator.choices[0].message.tool_calls:
|
||||
anthropic_tool_call = AnthropicContentBlock(
|
||||
type="tool_use",
|
||||
id=tool_call.id,
|
||||
name=tool_call.function.name,
|
||||
input=json.loads(tool_call.function.arguments),
|
||||
)
|
||||
content += [anthropic_tool_call]
|
||||
|
||||
result.content = content
|
||||
|
||||
return result
|
||||
|
||||
async def message_stream_converter(
|
||||
self,
|
||||
generator: AsyncGenerator[str, None],
|
||||
) -> AsyncGenerator[str, None]:
|
||||
try:
|
||||
first_item = True
|
||||
finish_reason = None
|
||||
content_block_index = 0
|
||||
content_block_started = False
|
||||
|
||||
async for item in generator:
|
||||
if item.startswith("data:"):
|
||||
data_str = item[5:].strip().rstrip("\n")
|
||||
if data_str == "[DONE]":
|
||||
stop_message = AnthropicStreamEvent(
|
||||
type="message_stop",
|
||||
)
|
||||
data = stop_message.model_dump_json(
|
||||
exclude_unset=True, exclude_none=True
|
||||
)
|
||||
yield wrap_data_with_event(data, "message_stop")
|
||||
yield "data: [DONE]\n\n"
|
||||
else:
|
||||
origin_chunk = ChatCompletionStreamResponse.model_validate_json(
|
||||
data_str
|
||||
)
|
||||
|
||||
if first_item:
|
||||
chunk = AnthropicStreamEvent(
|
||||
type="message_start",
|
||||
message=AnthropicMessagesResponse(
|
||||
id=origin_chunk.id,
|
||||
content=[],
|
||||
model=origin_chunk.model,
|
||||
usage=AnthropicUsage(
|
||||
input_tokens=origin_chunk.usage.prompt_tokens
|
||||
if origin_chunk.usage
|
||||
else 0,
|
||||
output_tokens=0,
|
||||
),
|
||||
),
|
||||
)
|
||||
first_item = False
|
||||
data = chunk.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "message_start")
|
||||
continue
|
||||
|
||||
# last chunk including usage info
|
||||
if len(origin_chunk.choices) == 0:
|
||||
if content_block_started:
|
||||
stop_chunk = AnthropicStreamEvent(
|
||||
index=content_block_index,
|
||||
type="content_block_stop",
|
||||
)
|
||||
data = stop_chunk.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "content_block_stop")
|
||||
stop_reason = self.stop_reason_map.get(
|
||||
finish_reason or "stop"
|
||||
)
|
||||
chunk = AnthropicStreamEvent(
|
||||
type="message_delta",
|
||||
delta=AnthropicDelta(stop_reason=stop_reason),
|
||||
usage=AnthropicUsage(
|
||||
input_tokens=origin_chunk.usage.prompt_tokens
|
||||
if origin_chunk.usage
|
||||
else 0,
|
||||
output_tokens=origin_chunk.usage.completion_tokens
|
||||
if origin_chunk.usage
|
||||
else 0,
|
||||
),
|
||||
)
|
||||
data = chunk.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "message_delta")
|
||||
continue
|
||||
|
||||
if origin_chunk.choices[0].finish_reason is not None:
|
||||
finish_reason = origin_chunk.choices[0].finish_reason
|
||||
continue
|
||||
|
||||
# content
|
||||
if origin_chunk.choices[0].delta.content is not None:
|
||||
if not content_block_started:
|
||||
chunk = AnthropicStreamEvent(
|
||||
index=content_block_index,
|
||||
type="content_block_start",
|
||||
content_block=AnthropicContentBlock(
|
||||
type="text", text=""
|
||||
),
|
||||
)
|
||||
data = chunk.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "content_block_start")
|
||||
content_block_started = True
|
||||
|
||||
if origin_chunk.choices[0].delta.content == "":
|
||||
continue
|
||||
chunk = AnthropicStreamEvent(
|
||||
index=content_block_index,
|
||||
type="content_block_delta",
|
||||
delta=AnthropicDelta(
|
||||
type="text_delta",
|
||||
text=origin_chunk.choices[0].delta.content,
|
||||
),
|
||||
)
|
||||
data = chunk.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "content_block_delta")
|
||||
continue
|
||||
|
||||
# tool calls
|
||||
elif len(origin_chunk.choices[0].delta.tool_calls) > 0:
|
||||
tool_call = origin_chunk.choices[0].delta.tool_calls[0]
|
||||
if tool_call.id is not None:
|
||||
if content_block_started:
|
||||
stop_chunk = AnthropicStreamEvent(
|
||||
index=content_block_index,
|
||||
type="content_block_stop",
|
||||
)
|
||||
data = stop_chunk.model_dump_json(
|
||||
exclude_unset=True
|
||||
)
|
||||
yield wrap_data_with_event(
|
||||
data, "content_block_stop"
|
||||
)
|
||||
content_block_started = False
|
||||
content_block_index += 1
|
||||
|
||||
chunk = AnthropicStreamEvent(
|
||||
index=content_block_index,
|
||||
type="content_block_start",
|
||||
content_block=AnthropicContentBlock(
|
||||
type="tool_use",
|
||||
id=tool_call.id,
|
||||
name=tool_call.function.name
|
||||
if tool_call.function
|
||||
else None,
|
||||
input={},
|
||||
),
|
||||
)
|
||||
data = chunk.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "content_block_start")
|
||||
content_block_started = True
|
||||
|
||||
else:
|
||||
chunk = AnthropicStreamEvent(
|
||||
index=content_block_index,
|
||||
type="content_block_delta",
|
||||
delta=AnthropicDelta(
|
||||
type="input_json_delta",
|
||||
partial_json=tool_call.function.arguments
|
||||
if tool_call.function
|
||||
else None,
|
||||
),
|
||||
)
|
||||
data = chunk.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "content_block_delta")
|
||||
continue
|
||||
else:
|
||||
error_response = AnthropicStreamEvent(
|
||||
type="error",
|
||||
error=AnthropicError(
|
||||
type="internal_error",
|
||||
message="Invalid data format received",
|
||||
),
|
||||
)
|
||||
data = error_response.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "error")
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Error in message stream converter.")
|
||||
error_response = AnthropicStreamEvent(
|
||||
type="error",
|
||||
error=AnthropicError(type="internal_error", message=str(e)),
|
||||
)
|
||||
data = error_response.model_dump_json(exclude_unset=True)
|
||||
yield wrap_data_with_event(data, "error")
|
||||
yield "data: [DONE]\n\n"
|
||||
@@ -1,3 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
NOTE: This API server is used only for demonstrating usage of AsyncEngine
|
||||
and simple performance benchmarks. It is not intended for production use.
|
||||
@@ -6,22 +8,31 @@ We are also not going to accept PRs modifying this file, please
|
||||
change `vllm/entrypoints/openai/api_server.py` instead.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import ssl
|
||||
from typing import AsyncGenerator
|
||||
from argparse import Namespace
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Any
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import JSONResponse, Response, StreamingResponse
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.launcher import serve_http
|
||||
from vllm.entrypoints.utils import with_cancellation
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.system_utils import set_ulimit
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
logger = init_logger("vllm.entrypoints.api_server")
|
||||
|
||||
TIMEOUT_KEEP_ALIVE = 5 # seconds.
|
||||
app = FastAPI()
|
||||
engine = None
|
||||
|
||||
@@ -42,6 +53,11 @@ async def generate(request: Request) -> Response:
|
||||
- other fields: the sampling parameters (See `SamplingParams` for details).
|
||||
"""
|
||||
request_dict = await request.json()
|
||||
return await _generate(request_dict, raw_request=request)
|
||||
|
||||
|
||||
@with_cancellation
|
||||
async def _generate(request_dict: dict, raw_request: Request) -> Response:
|
||||
prompt = request_dict.pop("prompt")
|
||||
stream = request_dict.pop("stream", False)
|
||||
sampling_params = SamplingParams(**request_dict)
|
||||
@@ -54,66 +70,116 @@ async def generate(request: Request) -> Response:
|
||||
async def stream_results() -> AsyncGenerator[bytes, None]:
|
||||
async for request_output in results_generator:
|
||||
prompt = request_output.prompt
|
||||
text_outputs = [
|
||||
prompt + output.text for output in request_output.outputs
|
||||
]
|
||||
assert prompt is not None
|
||||
text_outputs = [prompt + output.text for output in request_output.outputs]
|
||||
ret = {"text": text_outputs}
|
||||
yield (json.dumps(ret) + "\0").encode("utf-8")
|
||||
yield (json.dumps(ret) + "\n").encode("utf-8")
|
||||
|
||||
if stream:
|
||||
return StreamingResponse(stream_results())
|
||||
|
||||
# Non-streaming case
|
||||
final_output = None
|
||||
async for request_output in results_generator:
|
||||
if await request.is_disconnected():
|
||||
# Abort the request if the client disconnects.
|
||||
await engine.abort(request_id)
|
||||
return Response(status_code=499)
|
||||
final_output = request_output
|
||||
try:
|
||||
async for request_output in results_generator:
|
||||
final_output = request_output
|
||||
except asyncio.CancelledError:
|
||||
return Response(status_code=499)
|
||||
|
||||
assert final_output is not None
|
||||
prompt = final_output.prompt
|
||||
assert prompt is not None
|
||||
text_outputs = [prompt + output.text for output in final_output.outputs]
|
||||
ret = {"text": text_outputs}
|
||||
return JSONResponse(ret)
|
||||
|
||||
|
||||
def build_app(args: Namespace) -> FastAPI:
|
||||
global app
|
||||
|
||||
app.root_path = args.root_path
|
||||
return app
|
||||
|
||||
|
||||
async def init_app(
|
||||
args: Namespace,
|
||||
llm_engine: AsyncLLMEngine | None = None,
|
||||
) -> FastAPI:
|
||||
app = build_app(args)
|
||||
|
||||
global engine
|
||||
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
engine = (
|
||||
llm_engine
|
||||
if llm_engine is not None
|
||||
else AsyncLLMEngine.from_engine_args(
|
||||
engine_args, usage_context=UsageContext.API_SERVER
|
||||
)
|
||||
)
|
||||
app.state.engine_client = engine
|
||||
app.state.args = args
|
||||
return app
|
||||
|
||||
|
||||
async def run_server(
|
||||
args: Namespace, llm_engine: AsyncLLMEngine | None = None, **uvicorn_kwargs: Any
|
||||
) -> None:
|
||||
logger.info("vLLM API server version %s", VLLM_VERSION)
|
||||
logger.info("args: %s", args)
|
||||
|
||||
set_ulimit()
|
||||
|
||||
app = await init_app(args, llm_engine)
|
||||
assert engine is not None
|
||||
|
||||
shutdown_task = await serve_http(
|
||||
app,
|
||||
sock=None,
|
||||
enable_ssl_refresh=args.enable_ssl_refresh,
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
log_level=args.log_level,
|
||||
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
|
||||
ssl_keyfile=args.ssl_keyfile,
|
||||
ssl_certfile=args.ssl_certfile,
|
||||
ssl_ca_certs=args.ssl_ca_certs,
|
||||
ssl_cert_reqs=args.ssl_cert_reqs,
|
||||
**uvicorn_kwargs,
|
||||
)
|
||||
|
||||
await shutdown_task
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--host", type=str, default=None)
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--port", type=parser.check_port, default=8000)
|
||||
parser.add_argument("--ssl-keyfile", type=str, default=None)
|
||||
parser.add_argument("--ssl-certfile", type=str, default=None)
|
||||
parser.add_argument("--ssl-ca-certs",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The CA certificates file")
|
||||
parser.add_argument(
|
||||
"--ssl-ca-certs", type=str, default=None, help="The CA certificates file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-ssl-refresh",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Refresh SSL Context when SSL certificate files change",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ssl-cert-reqs",
|
||||
type=int,
|
||||
default=int(ssl.CERT_NONE),
|
||||
help="Whether client certificate is required (see stdlib ssl module's)"
|
||||
help="Whether client certificate is required (see stdlib ssl module's)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--root-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="FastAPI root_path when app is behind a path based routing proxy")
|
||||
help="FastAPI root_path when app is behind a path based routing proxy",
|
||||
)
|
||||
parser.add_argument("--log-level", type=str, default="debug")
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
engine = AsyncLLMEngine.from_engine_args(
|
||||
engine_args, usage_context=UsageContext.API_SERVER)
|
||||
|
||||
app.root_path = args.root_path
|
||||
uvicorn.run(app,
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
log_level=args.log_level,
|
||||
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
|
||||
ssl_keyfile=args.ssl_keyfile,
|
||||
ssl_certfile=args.ssl_certfile,
|
||||
ssl_ca_certs=args.ssl_ca_certs,
|
||||
ssl_cert_reqs=args.ssl_cert_reqs)
|
||||
asyncio.run(run_server(args))
|
||||
|
||||
1903
vllm/entrypoints/chat_utils.py
Normal file
1903
vllm/entrypoints/chat_utils.py
Normal file
File diff suppressed because it is too large
Load Diff
15
vllm/entrypoints/cli/__init__.py
Normal file
15
vllm/entrypoints/cli/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
|
||||
from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand
|
||||
|
||||
__all__: list[str] = [
|
||||
"BenchmarkLatencySubcommand",
|
||||
"BenchmarkServingSubcommand",
|
||||
"BenchmarkStartupSubcommand",
|
||||
"BenchmarkSweepSubcommand",
|
||||
"BenchmarkThroughputSubcommand",
|
||||
]
|
||||
0
vllm/entrypoints/cli/benchmark/__init__.py
Normal file
0
vllm/entrypoints/cli/benchmark/__init__.py
Normal file
25
vllm/entrypoints/cli/benchmark/base.py
Normal file
25
vllm/entrypoints/cli/benchmark/base.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
|
||||
class BenchmarkSubcommandBase(CLISubcommand):
|
||||
"""The base class of subcommands for `vllm bench`."""
|
||||
|
||||
help: str
|
||||
|
||||
@classmethod
|
||||
def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
|
||||
"""Add the CLI arguments to the parser."""
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
"""Run the benchmark.
|
||||
|
||||
Args:
|
||||
args: The arguments to the command.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
21
vllm/entrypoints/cli/benchmark/latency.py
Normal file
21
vllm/entrypoints/cli/benchmark/latency.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.latency import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
|
||||
"""The `latency` subcommand for `vllm bench`."""
|
||||
|
||||
name = "latency"
|
||||
help = "Benchmark the latency of a single batch of requests."
|
||||
|
||||
@classmethod
|
||||
def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
|
||||
add_cli_args(parser)
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
main(args)
|
||||
56
vllm/entrypoints/cli/benchmark/main.py
Normal file
56
vllm/entrypoints/cli/benchmark/main.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import typing
|
||||
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
class BenchmarkSubcommand(CLISubcommand):
|
||||
"""The `bench` subcommand for the vLLM CLI."""
|
||||
|
||||
name = "bench"
|
||||
help = "vLLM bench subcommand."
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
args.dispatch_function(args)
|
||||
|
||||
def validate(self, args: argparse.Namespace) -> None:
|
||||
pass
|
||||
|
||||
def subparser_init(
|
||||
self, subparsers: argparse._SubParsersAction
|
||||
) -> FlexibleArgumentParser:
|
||||
bench_parser = subparsers.add_parser(
|
||||
self.name,
|
||||
description=self.help,
|
||||
usage=f"vllm {self.name} <bench_type> [options]",
|
||||
)
|
||||
bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type")
|
||||
|
||||
for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
|
||||
cmd_subparser = bench_subparsers.add_parser(
|
||||
cmd_cls.name,
|
||||
help=cmd_cls.help,
|
||||
description=cmd_cls.help,
|
||||
usage=f"vllm {self.name} {cmd_cls.name} [options]",
|
||||
)
|
||||
cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd)
|
||||
cmd_cls.add_cli_args(cmd_subparser)
|
||||
cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
|
||||
subcmd=f"{self.name} {cmd_cls.name}"
|
||||
)
|
||||
return bench_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
|
||||
return [BenchmarkSubcommand()]
|
||||
21
vllm/entrypoints/cli/benchmark/serve.py
Normal file
21
vllm/entrypoints/cli/benchmark/serve.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.serve import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
|
||||
"""The `serve` subcommand for `vllm bench`."""
|
||||
|
||||
name = "serve"
|
||||
help = "Benchmark the online serving throughput."
|
||||
|
||||
@classmethod
|
||||
def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
|
||||
add_cli_args(parser)
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
main(args)
|
||||
21
vllm/entrypoints/cli/benchmark/startup.py
Normal file
21
vllm/entrypoints/cli/benchmark/startup.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.startup import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkStartupSubcommand(BenchmarkSubcommandBase):
|
||||
"""The `startup` subcommand for `vllm bench`."""
|
||||
|
||||
name = "startup"
|
||||
help = "Benchmark the startup time of vLLM models."
|
||||
|
||||
@classmethod
|
||||
def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
|
||||
add_cli_args(parser)
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
main(args)
|
||||
21
vllm/entrypoints/cli/benchmark/sweep.py
Normal file
21
vllm/entrypoints/cli/benchmark/sweep.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.sweep.cli import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkSweepSubcommand(BenchmarkSubcommandBase):
|
||||
"""The `sweep` subcommand for `vllm bench`."""
|
||||
|
||||
name = "sweep"
|
||||
help = "Benchmark for a parameter sweep."
|
||||
|
||||
@classmethod
|
||||
def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
|
||||
add_cli_args(parser)
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
main(args)
|
||||
21
vllm/entrypoints/cli/benchmark/throughput.py
Normal file
21
vllm/entrypoints/cli/benchmark/throughput.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.throughput import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
|
||||
"""The `throughput` subcommand for `vllm bench`."""
|
||||
|
||||
name = "throughput"
|
||||
help = "Benchmark offline inference throughput."
|
||||
|
||||
@classmethod
|
||||
def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
|
||||
add_cli_args(parser)
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
main(args)
|
||||
38
vllm/entrypoints/cli/collect_env.py
Normal file
38
vllm/entrypoints/cli/collect_env.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import typing
|
||||
|
||||
from vllm.collect_env import main as collect_env_main
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
class CollectEnvSubcommand(CLISubcommand):
|
||||
"""The `collect-env` subcommand for the vLLM CLI."""
|
||||
|
||||
name = "collect-env"
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
"""Collect information about the environment."""
|
||||
collect_env_main()
|
||||
|
||||
def subparser_init(
|
||||
self, subparsers: argparse._SubParsersAction
|
||||
) -> FlexibleArgumentParser:
|
||||
return subparsers.add_parser(
|
||||
"collect-env",
|
||||
help="Start collecting environment information.",
|
||||
description="Start collecting environment information.",
|
||||
usage="vllm collect-env",
|
||||
)
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
|
||||
return [CollectEnvSubcommand()]
|
||||
79
vllm/entrypoints/cli/main.py
Normal file
79
vllm/entrypoints/cli/main.py
Normal file
@@ -0,0 +1,79 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""The CLI entrypoints of vLLM
|
||||
|
||||
Note that all future modules must be lazily loaded within main
|
||||
to avoid certain eager import breakage."""
|
||||
|
||||
import importlib.metadata
|
||||
import sys
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def main():
|
||||
import vllm.entrypoints.cli.benchmark.main
|
||||
import vllm.entrypoints.cli.collect_env
|
||||
import vllm.entrypoints.cli.openai
|
||||
import vllm.entrypoints.cli.run_batch
|
||||
import vllm.entrypoints.cli.serve
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
CMD_MODULES = [
|
||||
vllm.entrypoints.cli.openai,
|
||||
vllm.entrypoints.cli.serve,
|
||||
vllm.entrypoints.cli.benchmark.main,
|
||||
vllm.entrypoints.cli.collect_env,
|
||||
vllm.entrypoints.cli.run_batch,
|
||||
]
|
||||
|
||||
cli_env_setup()
|
||||
|
||||
# For 'vllm bench *': use CPU instead of UnspecifiedPlatform by default
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "bench":
|
||||
logger.debug(
|
||||
"Bench command detected, must ensure current platform is not "
|
||||
"UnspecifiedPlatform to avoid device type inference error"
|
||||
)
|
||||
from vllm import platforms
|
||||
|
||||
if platforms.current_platform.is_unspecified():
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
|
||||
platforms.current_platform = CpuPlatform()
|
||||
logger.info(
|
||||
"Unspecified platform detected, switching to CPU Platform instead."
|
||||
)
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="vLLM CLI",
|
||||
epilog=VLLM_SUBCMD_PARSER_EPILOG.format(subcmd="[subcommand]"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--version",
|
||||
action="version",
|
||||
version=importlib.metadata.version("vllm"),
|
||||
)
|
||||
subparsers = parser.add_subparsers(required=False, dest="subparser")
|
||||
cmds = {}
|
||||
for cmd_module in CMD_MODULES:
|
||||
new_cmds = cmd_module.cmd_init()
|
||||
for cmd in new_cmds:
|
||||
cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)
|
||||
cmds[cmd.name] = cmd
|
||||
args = parser.parse_args()
|
||||
if args.subparser in cmds:
|
||||
cmds[args.subparser].validate(args)
|
||||
|
||||
if hasattr(args, "dispatch_function"):
|
||||
args.dispatch_function(args)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
260
vllm/entrypoints/cli/openai.py
Normal file
260
vllm/entrypoints/cli/openai.py
Normal file
@@ -0,0 +1,260 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from openai import OpenAI
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
def _register_signal_handlers():
|
||||
def signal_handler(sig, frame):
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTSTP, signal_handler)
|
||||
|
||||
|
||||
def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]:
|
||||
_register_signal_handlers()
|
||||
|
||||
base_url = args.url
|
||||
api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY")
|
||||
openai_client = OpenAI(api_key=api_key, base_url=base_url)
|
||||
|
||||
if args.model_name:
|
||||
model_name = args.model_name
|
||||
else:
|
||||
available_models = openai_client.models.list()
|
||||
model_name = available_models.data[0].id
|
||||
|
||||
print(f"Using model: {model_name}")
|
||||
|
||||
return model_name, openai_client
|
||||
|
||||
|
||||
def _print_chat_stream(stream) -> str:
|
||||
output = ""
|
||||
for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.content:
|
||||
output += delta.content
|
||||
print(delta.content, end="", flush=True)
|
||||
print()
|
||||
return output
|
||||
|
||||
|
||||
def _print_completion_stream(stream) -> str:
|
||||
output = ""
|
||||
for chunk in stream:
|
||||
text = chunk.choices[0].text
|
||||
if text is not None:
|
||||
output += text
|
||||
print(text, end="", flush=True)
|
||||
print()
|
||||
return output
|
||||
|
||||
|
||||
def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
|
||||
conversation: list[ChatCompletionMessageParam] = []
|
||||
if system_prompt is not None:
|
||||
conversation.append({"role": "system", "content": system_prompt})
|
||||
|
||||
print("Please enter a message for the chat model:")
|
||||
while True:
|
||||
try:
|
||||
input_message = input("> ")
|
||||
except EOFError:
|
||||
break
|
||||
conversation.append({"role": "user", "content": input_message})
|
||||
|
||||
stream = client.chat.completions.create(
|
||||
model=model_name, messages=conversation, stream=True
|
||||
)
|
||||
output = _print_chat_stream(stream)
|
||||
conversation.append({"role": "assistant", "content": output})
|
||||
|
||||
|
||||
def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
parser.add_argument(
|
||||
"--url",
|
||||
type=str,
|
||||
default="http://localhost:8000/v1",
|
||||
help="url of the running OpenAI-Compatible RESTful API server",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The model name used in prompt completion, default to "
|
||||
"the first model in list models API call."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--api-key",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"API key for OpenAI services. If provided, this api key "
|
||||
"will overwrite the api key obtained through environment variables."
|
||||
" It is important to note that this option only applies to the "
|
||||
"OpenAI-compatible API endpoints and NOT other endpoints that may "
|
||||
"be present in the server. See the security guide in the vLLM docs "
|
||||
"for more details."
|
||||
),
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
class ChatCommand(CLISubcommand):
|
||||
"""The `chat` subcommand for the vLLM CLI."""
|
||||
|
||||
name = "chat"
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
model_name, client = _interactive_cli(args)
|
||||
system_prompt = args.system_prompt
|
||||
conversation: list[ChatCompletionMessageParam] = []
|
||||
|
||||
if system_prompt is not None:
|
||||
conversation.append({"role": "system", "content": system_prompt})
|
||||
|
||||
if args.quick:
|
||||
conversation.append({"role": "user", "content": args.quick})
|
||||
|
||||
stream = client.chat.completions.create(
|
||||
model=model_name, messages=conversation, stream=True
|
||||
)
|
||||
output = _print_chat_stream(stream)
|
||||
conversation.append({"role": "assistant", "content": output})
|
||||
return
|
||||
|
||||
print("Please enter a message for the chat model:")
|
||||
while True:
|
||||
try:
|
||||
input_message = input("> ")
|
||||
except EOFError:
|
||||
break
|
||||
conversation.append({"role": "user", "content": input_message})
|
||||
|
||||
stream = client.chat.completions.create(
|
||||
model=model_name, messages=conversation, stream=True
|
||||
)
|
||||
output = _print_chat_stream(stream)
|
||||
conversation.append({"role": "assistant", "content": output})
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
"""Add CLI arguments for the chat command."""
|
||||
_add_query_options(parser)
|
||||
parser.add_argument(
|
||||
"--system-prompt",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The system prompt to be added to the chat template, "
|
||||
"used for models that support system prompts."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"-q",
|
||||
"--quick",
|
||||
type=str,
|
||||
metavar="MESSAGE",
|
||||
help=("Send a single prompt as MESSAGE and print the response, then exit."),
|
||||
)
|
||||
return parser
|
||||
|
||||
def subparser_init(
|
||||
self, subparsers: argparse._SubParsersAction
|
||||
) -> FlexibleArgumentParser:
|
||||
parser = subparsers.add_parser(
|
||||
"chat",
|
||||
help="Generate chat completions via the running API server.",
|
||||
description="Generate chat completions via the running API server.",
|
||||
usage="vllm chat [options]",
|
||||
)
|
||||
return ChatCommand.add_cli_args(parser)
|
||||
|
||||
|
||||
class CompleteCommand(CLISubcommand):
|
||||
"""The `complete` subcommand for the vLLM CLI."""
|
||||
|
||||
name = "complete"
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
model_name, client = _interactive_cli(args)
|
||||
|
||||
kwargs = {
|
||||
"model": model_name,
|
||||
"stream": True,
|
||||
}
|
||||
if args.max_tokens:
|
||||
kwargs["max_tokens"] = args.max_tokens
|
||||
|
||||
if args.quick:
|
||||
stream = client.completions.create(prompt=args.quick, **kwargs)
|
||||
_print_completion_stream(stream)
|
||||
return
|
||||
|
||||
print("Please enter prompt to complete:")
|
||||
while True:
|
||||
try:
|
||||
input_prompt = input("> ")
|
||||
except EOFError:
|
||||
break
|
||||
stream = client.completions.create(prompt=input_prompt, **kwargs)
|
||||
_print_completion_stream(stream)
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
"""Add CLI arguments for the complete command."""
|
||||
_add_query_options(parser)
|
||||
parser.add_argument(
|
||||
"--max-tokens",
|
||||
type=int,
|
||||
help="Maximum number of tokens to generate per output sequence.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-q",
|
||||
"--quick",
|
||||
type=str,
|
||||
metavar="PROMPT",
|
||||
help="Send a single prompt and print the completion output, then exit.",
|
||||
)
|
||||
return parser
|
||||
|
||||
def subparser_init(
|
||||
self, subparsers: argparse._SubParsersAction
|
||||
) -> FlexibleArgumentParser:
|
||||
parser = subparsers.add_parser(
|
||||
"complete",
|
||||
help=(
|
||||
"Generate text completions based on the given prompt "
|
||||
"via the running API server."
|
||||
),
|
||||
description=(
|
||||
"Generate text completions based on the given prompt "
|
||||
"via the running API server."
|
||||
),
|
||||
usage="vllm complete [options]",
|
||||
)
|
||||
return CompleteCommand.add_cli_args(parser)
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
|
||||
return [ChatCommand(), CompleteCommand()]
|
||||
68
vllm/entrypoints/cli/run_batch.py
Normal file
68
vllm/entrypoints/cli/run_batch.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import importlib.metadata
|
||||
import typing
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.logger import init_logger
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RunBatchSubcommand(CLISubcommand):
|
||||
"""The `run-batch` subcommand for vLLM CLI."""
|
||||
|
||||
name = "run-batch"
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
from vllm.entrypoints.openai.run_batch import main as run_batch_main
|
||||
|
||||
logger.info(
|
||||
"vLLM batch processing API version %s", importlib.metadata.version("vllm")
|
||||
)
|
||||
logger.info("args: %s", args)
|
||||
|
||||
# Start the Prometheus metrics server.
|
||||
# LLMEngine uses the Prometheus client
|
||||
# to publish metrics at the /metrics endpoint.
|
||||
if args.enable_metrics:
|
||||
from prometheus_client import start_http_server
|
||||
|
||||
logger.info("Prometheus metrics enabled")
|
||||
start_http_server(port=args.port, addr=args.url)
|
||||
else:
|
||||
logger.info("Prometheus metrics disabled")
|
||||
|
||||
asyncio.run(run_batch_main(args))
|
||||
|
||||
def subparser_init(
|
||||
self, subparsers: argparse._SubParsersAction
|
||||
) -> FlexibleArgumentParser:
|
||||
from vllm.entrypoints.openai.run_batch import make_arg_parser
|
||||
|
||||
run_batch_parser = subparsers.add_parser(
|
||||
self.name,
|
||||
help="Run batch prompts and write results to file.",
|
||||
description=(
|
||||
"Run batch prompts using vLLM's OpenAI-compatible API.\n"
|
||||
"Supports local or HTTP input/output files."
|
||||
),
|
||||
usage="vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
|
||||
)
|
||||
run_batch_parser = make_arg_parser(run_batch_parser)
|
||||
run_batch_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name)
|
||||
return run_batch_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
|
||||
return [RunBatchSubcommand()]
|
||||
249
vllm/entrypoints/cli/serve.py
Normal file
249
vllm/entrypoints/cli/serve.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import signal
|
||||
|
||||
import uvloop
|
||||
|
||||
import vllm
|
||||
import vllm.envs as envs
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
run_server,
|
||||
run_server_worker,
|
||||
setup_server,
|
||||
)
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.logger import init_logger
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.network_utils import get_tcp_uri
|
||||
from vllm.utils.system_utils import decorate_logs, set_process_title
|
||||
from vllm.v1.engine.core import EngineCoreProc
|
||||
from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
|
||||
from vllm.v1.executor import Executor
|
||||
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
|
||||
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
|
||||
from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
DESCRIPTION = """Launch a local OpenAI-compatible API server to serve LLM
|
||||
completions via HTTP. Defaults to Qwen/Qwen3-0.6B if no model is specified.
|
||||
|
||||
Search by using: `--help=<ConfigGroup>` to explore options by section (e.g.,
|
||||
--help=ModelConfig, --help=Frontend)
|
||||
Use `--help=all` to show all available flags at once.
|
||||
"""
|
||||
|
||||
|
||||
class ServeSubcommand(CLISubcommand):
|
||||
"""The `serve` subcommand for the vLLM CLI."""
|
||||
|
||||
name = "serve"
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
# If model is specified in CLI (as positional arg), it takes precedence
|
||||
if hasattr(args, "model_tag") and args.model_tag is not None:
|
||||
args.model = args.model_tag
|
||||
|
||||
if args.headless or args.api_server_count < 1:
|
||||
run_headless(args)
|
||||
else:
|
||||
if args.api_server_count > 1:
|
||||
run_multi_api_server(args)
|
||||
else:
|
||||
# Single API server (this process).
|
||||
uvloop.run(run_server(args))
|
||||
|
||||
def validate(self, args: argparse.Namespace) -> None:
|
||||
validate_parsed_serve_args(args)
|
||||
|
||||
def subparser_init(
|
||||
self, subparsers: argparse._SubParsersAction
|
||||
) -> FlexibleArgumentParser:
|
||||
serve_parser = subparsers.add_parser(
|
||||
self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]"
|
||||
)
|
||||
|
||||
serve_parser = make_arg_parser(serve_parser)
|
||||
serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name)
|
||||
return serve_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
|
||||
return [ServeSubcommand()]
|
||||
|
||||
|
||||
def run_headless(args: argparse.Namespace):
|
||||
if args.api_server_count > 1:
|
||||
raise ValueError("api_server_count can't be set in headless mode")
|
||||
|
||||
# Create the EngineConfig.
|
||||
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
|
||||
usage_context = UsageContext.OPENAI_API_SERVER
|
||||
vllm_config = engine_args.create_engine_config(
|
||||
usage_context=usage_context, headless=True
|
||||
)
|
||||
|
||||
if engine_args.data_parallel_hybrid_lb:
|
||||
raise ValueError("data_parallel_hybrid_lb is not applicable in headless mode")
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
local_engine_count = parallel_config.data_parallel_size_local
|
||||
|
||||
if local_engine_count <= 0:
|
||||
raise ValueError("data_parallel_size_local must be > 0 in headless mode")
|
||||
|
||||
shutdown_requested = False
|
||||
|
||||
# Catch SIGTERM and SIGINT to allow graceful shutdown.
|
||||
def signal_handler(signum, frame):
|
||||
nonlocal shutdown_requested
|
||||
logger.debug("Received %d signal.", signum)
|
||||
if not shutdown_requested:
|
||||
shutdown_requested = True
|
||||
raise SystemExit
|
||||
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
if parallel_config.node_rank_within_dp > 0:
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
# Run headless workers (for multi-node PP/TP).
|
||||
host = parallel_config.master_addr
|
||||
head_node_address = f"{host}:{parallel_config.master_port}"
|
||||
logger.info(
|
||||
"Launching vLLM (v%s) headless multiproc executor, "
|
||||
"with head node address %s for torch.distributed process group.",
|
||||
VLLM_VERSION,
|
||||
head_node_address,
|
||||
)
|
||||
|
||||
executor = MultiprocExecutor(vllm_config, monitor_workers=False)
|
||||
executor.start_worker_monitor(inline=True)
|
||||
return
|
||||
|
||||
host = parallel_config.data_parallel_master_ip
|
||||
port = parallel_config.data_parallel_rpc_port
|
||||
handshake_address = get_tcp_uri(host, port)
|
||||
|
||||
logger.info(
|
||||
"Launching %d data parallel engine(s) in headless mode, "
|
||||
"with head node address %s.",
|
||||
local_engine_count,
|
||||
handshake_address,
|
||||
)
|
||||
|
||||
# Create the engines.
|
||||
engine_manager = CoreEngineProcManager(
|
||||
target_fn=EngineCoreProc.run_engine_core,
|
||||
local_engine_count=local_engine_count,
|
||||
start_index=vllm_config.parallel_config.data_parallel_rank,
|
||||
local_start_index=0,
|
||||
vllm_config=vllm_config,
|
||||
local_client=False,
|
||||
handshake_address=handshake_address,
|
||||
executor_class=Executor.get_class(vllm_config),
|
||||
log_stats=not engine_args.disable_log_stats,
|
||||
)
|
||||
|
||||
try:
|
||||
engine_manager.join_first()
|
||||
finally:
|
||||
logger.info("Shutting down.")
|
||||
engine_manager.close()
|
||||
|
||||
|
||||
def run_multi_api_server(args: argparse.Namespace):
|
||||
assert not args.headless
|
||||
num_api_servers: int = args.api_server_count
|
||||
assert num_api_servers > 0
|
||||
|
||||
if num_api_servers > 1:
|
||||
setup_multiprocess_prometheus()
|
||||
|
||||
listen_address, sock = setup_server(args)
|
||||
|
||||
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
|
||||
engine_args._api_process_count = num_api_servers
|
||||
engine_args._api_process_rank = -1
|
||||
|
||||
usage_context = UsageContext.OPENAI_API_SERVER
|
||||
vllm_config = engine_args.create_engine_config(usage_context=usage_context)
|
||||
|
||||
if num_api_servers > 1 and envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
|
||||
raise ValueError(
|
||||
"VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used with api_server_count > 1"
|
||||
)
|
||||
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
log_stats = not engine_args.disable_log_stats
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
dp_rank = parallel_config.data_parallel_rank
|
||||
external_dp_lb = parallel_config.data_parallel_external_lb
|
||||
hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb
|
||||
assert external_dp_lb or hybrid_dp_lb or dp_rank == 0
|
||||
|
||||
api_server_manager: APIServerProcessManager | None = None
|
||||
|
||||
with launch_core_engines(
|
||||
vllm_config, executor_class, log_stats, num_api_servers
|
||||
) as (local_engine_manager, coordinator, addresses):
|
||||
# Construct common args for the APIServerProcessManager up-front.
|
||||
api_server_manager_kwargs = dict(
|
||||
target_server_fn=run_api_server_worker_proc,
|
||||
listen_address=listen_address,
|
||||
sock=sock,
|
||||
args=args,
|
||||
num_servers=num_api_servers,
|
||||
input_addresses=addresses.inputs,
|
||||
output_addresses=addresses.outputs,
|
||||
stats_update_address=coordinator.get_stats_publish_address()
|
||||
if coordinator
|
||||
else None,
|
||||
)
|
||||
|
||||
# For dp ranks > 0 in external/hybrid DP LB modes, we must delay the
|
||||
# start of the API servers until the local engine is started
|
||||
# (after the launcher context manager exits),
|
||||
# since we get the front-end stats update address from the coordinator
|
||||
# via the handshake with the local engine.
|
||||
if dp_rank == 0 or not (external_dp_lb or hybrid_dp_lb):
|
||||
# Start API servers using the manager.
|
||||
api_server_manager = APIServerProcessManager(**api_server_manager_kwargs)
|
||||
|
||||
# Start API servers now if they weren't already started.
|
||||
if api_server_manager is None:
|
||||
api_server_manager_kwargs["stats_update_address"] = (
|
||||
addresses.frontend_stats_publish_address
|
||||
)
|
||||
api_server_manager = APIServerProcessManager(**api_server_manager_kwargs)
|
||||
|
||||
# Wait for API servers
|
||||
wait_for_completion_or_failure(
|
||||
api_server_manager=api_server_manager,
|
||||
engine_manager=local_engine_manager,
|
||||
coordinator=coordinator,
|
||||
)
|
||||
|
||||
|
||||
def run_api_server_worker_proc(
|
||||
listen_address, sock, args, client_config=None, **uvicorn_kwargs
|
||||
) -> None:
|
||||
"""Entrypoint for individual API server worker processes."""
|
||||
client_config = client_config or {}
|
||||
server_index = client_config.get("client_index", 0)
|
||||
|
||||
# Set process title and add process-specific prefix to stdout and stderr.
|
||||
set_process_title("APIServer", str(server_index))
|
||||
decorate_logs()
|
||||
|
||||
uvloop.run(
|
||||
run_server_worker(listen_address, sock, args, client_config, **uvicorn_kwargs)
|
||||
)
|
||||
29
vllm/entrypoints/cli/types.py
Normal file
29
vllm/entrypoints/cli/types.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import typing
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
class CLISubcommand:
|
||||
"""Base class for CLI argument handlers."""
|
||||
|
||||
name: str
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
raise NotImplementedError("Subclasses should implement this method")
|
||||
|
||||
def validate(self, args: argparse.Namespace) -> None:
|
||||
# No validation by default
|
||||
pass
|
||||
|
||||
def subparser_init(
|
||||
self, subparsers: argparse._SubParsersAction
|
||||
) -> FlexibleArgumentParser:
|
||||
raise NotImplementedError("Subclasses should implement this method")
|
||||
12
vllm/entrypoints/constants.py
Normal file
12
vllm/entrypoints/constants.py
Normal file
@@ -0,0 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Shared constants for vLLM entrypoints.
|
||||
"""
|
||||
|
||||
# HTTP header limits for h11 parser
|
||||
# These constants help mitigate header abuse attacks
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB
|
||||
H11_MAX_HEADER_COUNT_DEFAULT = 256
|
||||
|
||||
MCP_PREFIX = "mcp_"
|
||||
835
vllm/entrypoints/context.py
Normal file
835
vllm/entrypoints/context.py
Normal file
@@ -0,0 +1,835 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import contextlib
|
||||
import json
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Callable
|
||||
from contextlib import AsyncExitStack
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
from openai.types.responses.tool import Mcp
|
||||
from openai_harmony import Author, Message, Role, StreamState, TextContent
|
||||
|
||||
from vllm import envs
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
)
|
||||
from vllm.entrypoints.constants import MCP_PREFIX
|
||||
from vllm.entrypoints.openai.parser.harmony_utils import (
|
||||
get_encoding,
|
||||
get_streamable_parser_for_assistant,
|
||||
render_for_completion,
|
||||
)
|
||||
from vllm.entrypoints.openai.parser.responses_parser import (
|
||||
get_responses_parser_for_simple_context,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
FunctionCall,
|
||||
ResponseInputOutputItem,
|
||||
ResponseRawMessageAndToken,
|
||||
ResponsesRequest,
|
||||
)
|
||||
from vllm.entrypoints.responses_utils import construct_tool_dicts
|
||||
from vllm.entrypoints.tool import Tool
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
|
||||
from vllm.tokenizers.protocol import TokenizerLike
|
||||
from vllm.tool_parsers.abstract_tool_parser import ToolParser
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from mcp.client import ClientSession
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# This is currently needed as the tool type doesn't 1:1 match the
|
||||
# tool namespace, which is what is used to look up the
|
||||
# connection to the tool server
|
||||
_TOOL_NAME_TO_TYPE_MAP = {
|
||||
"browser": "web_search_preview",
|
||||
"python": "code_interpreter",
|
||||
"container": "container",
|
||||
}
|
||||
|
||||
|
||||
def _map_tool_name_to_tool_type(tool_name: str) -> str:
|
||||
if tool_name not in _TOOL_NAME_TO_TYPE_MAP:
|
||||
available_tools = ", ".join(_TOOL_NAME_TO_TYPE_MAP.keys())
|
||||
raise ValueError(
|
||||
f"Built-in tool name '{tool_name}' not defined in mapping. "
|
||||
f"Available tools: {available_tools}"
|
||||
)
|
||||
return _TOOL_NAME_TO_TYPE_MAP[tool_name]
|
||||
|
||||
|
||||
class TurnMetrics:
    """Tracks token and toolcall details for a single conversation turn."""

    def __init__(
        self,
        input_tokens: int = 0,
        output_tokens: int = 0,
        cached_input_tokens: int = 0,
        tool_output_tokens: int = 0,
    ) -> None:
        # All counters default to zero for a brand-new turn.
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens
        self.cached_input_tokens = cached_input_tokens
        self.tool_output_tokens = tool_output_tokens

    def reset(self) -> None:
        """Zero every counter so the instance can track a new turn."""
        self.input_tokens = self.output_tokens = 0
        self.cached_input_tokens = self.tool_output_tokens = 0

    def copy(self) -> "TurnMetrics":
        """Return an independent snapshot of this turn's token counts."""
        return TurnMetrics(
            input_tokens=self.input_tokens,
            output_tokens=self.output_tokens,
            cached_input_tokens=self.cached_input_tokens,
            tool_output_tokens=self.tool_output_tokens,
        )
|
||||
|
||||
|
||||
class ConversationContext(ABC):
    """Abstract interface for tracking one model conversation.

    Implementations accumulate model output, decide when a built-in tool
    call is needed, execute it, and render the running conversation back
    into token ids for the next generation step.
    """

    @abstractmethod
    def append_output(self, output: RequestOutput) -> None:
        """Record one engine output (tokens, usage) into the context."""
        pass

    @abstractmethod
    def append_tool_output(self, output) -> None:
        """Record the result messages produced by a tool invocation."""
        pass

    @abstractmethod
    async def call_tool(self) -> list[Message]:
        """Execute the pending built-in tool call and return its messages."""
        pass

    @abstractmethod
    def need_builtin_tool_call(self) -> bool:
        """Return True when the latest output requests a built-in tool."""
        pass

    @abstractmethod
    def render_for_completion(self) -> list[int]:
        """Render the conversation into prompt token ids for the next turn."""
        pass

    @abstractmethod
    async def init_tool_sessions(
        self,
        tool_server: ToolServer | None,
        exit_stack: AsyncExitStack,
        request_id: str,
        mcp_tools: dict[str, Mcp],
    ) -> None:
        """Open sessions for every available tool via ``tool_server``."""
        pass

    @abstractmethod
    async def cleanup_session(self) -> None:
        # Default contract: contexts that never open sessions must not be
        # asked to clean them up.
        raise NotImplementedError("Should not be called.")
|
||||
|
||||
|
||||
def _create_json_parse_error_messages(
    last_msg: Message, e: json.JSONDecodeError
) -> list[Message]:
    """Build a tool-role message reporting a JSON parse failure.

    The message is routed back to the assistant on the same channel so the
    model can retry with well-formed arguments.
    """
    feedback = (
        f"Error parsing tool arguments as JSON: {str(e)}. "
        "Please ensure the tool call arguments are valid JSON and try again."
    )
    reply = Message(
        author=Author(role=Role.TOOL, name=last_msg.recipient),
        content=[TextContent(text=feedback)],
        recipient=Role.ASSISTANT,
        channel=last_msg.channel,
    )
    return [reply]
|
||||
|
||||
|
||||
class SimpleContext(ConversationContext):
    """This is a context that cannot handle MCP tool calls"""

    def __init__(self):
        self.last_output = None
        # Aggregate token accounting for the request.
        self.num_prompt_tokens = 0
        self.num_output_tokens = 0
        self.num_cached_tokens = 0
        # todo num_reasoning_tokens is not implemented yet.
        self.num_reasoning_tokens = 0
        # not implemented yet for SimpleContext
        self.all_turn_metrics = []

        # Raw prompt/completion text plus their token ids.
        self.input_messages: list[ResponseRawMessageAndToken] = []
        self.output_messages: list[ResponseRawMessageAndToken] = []

    def append_output(self, output) -> None:
        """Record one RequestOutput, updating usage counters and messages."""
        self.last_output = output
        if not isinstance(output, RequestOutput):
            raise ValueError("SimpleContext only supports RequestOutput.")
        prompt_ids = output.prompt_token_ids or []
        completion = output.outputs[0]
        self.num_prompt_tokens = len(prompt_ids)
        self.num_cached_tokens = output.num_cached_tokens or 0
        self.num_output_tokens += len(completion.token_ids or [])

        # The prompt is captured only once, on the first appended output.
        if not self.input_messages:
            self.input_messages.append(
                ResponseRawMessageAndToken(
                    message=output.prompt or "",
                    tokens=prompt_ids,
                )
            )
        self.output_messages.append(
            ResponseRawMessageAndToken(
                message=completion.text,
                tokens=completion.token_ids,
            )
        )

    def append_tool_output(self, output) -> None:
        raise NotImplementedError("Should not be called.")

    def need_builtin_tool_call(self) -> bool:
        # A simple context never triggers built-in tool calls.
        return False

    async def call_tool(self) -> list[Message]:
        raise NotImplementedError("Should not be called.")

    def render_for_completion(self) -> list[int]:
        raise NotImplementedError("Should not be called.")

    async def init_tool_sessions(
        self,
        tool_server: ToolServer | None,
        exit_stack: AsyncExitStack,
        request_id: str,
        mcp_tools: dict[str, Mcp],
    ) -> None:
        # Nothing to initialize: this context has no tool sessions.
        pass

    async def cleanup_session(self) -> None:
        raise NotImplementedError("Should not be called.")
|
||||
|
||||
|
||||
class ParsableContext(ConversationContext):
    """Conversation context that parses model output into Responses items
    and dispatches built-in (MCP-backed) tool calls."""

    def __init__(
        self,
        *,
        response_messages: list[ResponseInputOutputItem],
        tokenizer: AnyTokenizer,
        reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser] | None,
        request: ResponsesRequest,
        available_tools: list[str] | None,
        tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
    ):
        # Aggregate token accounting for the whole request.
        self.num_prompt_tokens = 0
        self.num_output_tokens = 0
        self.num_cached_tokens = 0
        # TODO: num_reasoning_tokens is not implemented yet.
        self.num_reasoning_tokens = 0
        # not implemented yet for ParsableContext
        self.all_turn_metrics: list[TurnMetrics] = []

        if reasoning_parser_cls is None:
            raise ValueError("reasoning_parser_cls must be provided.")

        # The parser accumulates ResponseInputOutputItem entries in
        # self.parser.response_messages as output is processed.
        self.parser = get_responses_parser_for_simple_context(
            tokenizer=tokenizer,
            reasoning_parser_cls=reasoning_parser_cls,
            response_messages=response_messages,
            request=request,
            tool_parser_cls=tool_parser_cls,
        )
        self.tool_parser_cls = tool_parser_cls
        self.request = request
        self.tokenizer = tokenizer

        self.available_tools = available_tools or []
        # Sessions are keyed by tool namespace ("python", "browser", ...).
        self._tool_sessions: dict[str, ClientSession | Tool] = {}
        # Namespaces that were actually invoked; only these are cleaned up.
        self.called_tools: set[str] = set()

        self.tool_dicts = construct_tool_dicts(request.tools, request.tool_choice)
        self.chat_template = chat_template
        self.chat_template_content_format = chat_template_content_format

    def append_output(self, output: RequestOutput) -> None:
        """Record one engine output: update usage counters and feed the
        completion to the responses parser."""
        self.num_prompt_tokens = len(output.prompt_token_ids or [])
        self.num_cached_tokens = output.num_cached_tokens or 0
        self.num_output_tokens += len(output.outputs[0].token_ids or [])
        self.parser.process(output.outputs[0])

    def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
        """Append tool result items directly to the parsed message list."""
        self.parser.response_messages.extend(output)

    def need_builtin_tool_call(self) -> bool:
        """Return true if the last message is a MCP tool call"""
        last_message = self.parser.response_messages[-1]
        # TODO(qandrew): figure out which tools are MCP tools
        if last_message.type == "function_call":  # noqa: SIM102
            if last_message.name in (
                "code_interpreter",
                "python",
                "web_search_preview",
            ) or last_message.name.startswith("container"):
                return True

        return False

    async def call_python_tool(
        self, tool_session: Union["ClientSession", Tool], last_msg: FunctionCall
    ) -> list[ResponseInputOutputItem]:
        """Run the code-interpreter tool with the arguments of ``last_msg``.

        NOTE(review): unlike call_search_tool/call_container_tool, this path
        does not honor VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY — a malformed
        JSON payload raises instead of producing a retry message. Confirm
        whether that asymmetry is intentional.
        """
        self.called_tools.add("python")
        if isinstance(tool_session, Tool):
            return await tool_session.get_result_parsable_context(self)
        args = json.loads(last_msg.arguments)
        param = {
            "code": args["code"],
        }
        result = await tool_session.call_tool("python", param)
        result_str = result.content[0].text

        message = ResponseFunctionToolCallOutputItem(
            id=f"mcpo_{random_uuid()}",
            type="function_call_output",
            call_id=f"call_{random_uuid()}",
            output=result_str,
            status="completed",
        )

        return [message]

    async def call_search_tool(
        self, tool_session: Union["ClientSession", Tool], last_msg: FunctionCall
    ) -> list[ResponseInputOutputItem]:
        """Run the browser/search tool, optionally recovering from bad JSON
        arguments by returning a retry prompt to the model."""
        self.called_tools.add("browser")
        if isinstance(tool_session, Tool):
            return await tool_session.get_result_parsable_context(self)
        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
            try:
                args = json.loads(last_msg.arguments)
            except json.JSONDecodeError as e:
                # NOTE(review): _create_json_parse_error_messages reads
                # .recipient/.channel, which FunctionCall may not carry —
                # verify this error path against the actual message type.
                return _create_json_parse_error_messages(last_msg, e)
        else:
            args = json.loads(last_msg.arguments)
        result = await tool_session.call_tool("search", args)
        result_str = result.content[0].text

        message = ResponseFunctionToolCallOutputItem(
            id=f"fco_{random_uuid()}",
            type="function_call_output",
            call_id=f"call_{random_uuid()}",
            output=result_str,
            status="completed",
        )

        return [message]

    async def call_container_tool(
        self, tool_session: Union["ClientSession", Tool], last_msg: Message
    ) -> list[Message]:
        """
        Call container tool. Expect this to be run in a stateful docker
        with command line terminal.
        The official container tool would at least
        expect the following format:
        - for tool name: exec
        - args:
            {
                "cmd":List[str] "command to execute",
                "workdir":optional[str] "current working directory",
                "env":optional[object/dict] "environment variables",
                "session_name":optional[str] "session name",
                "timeout":optional[int] "timeout in seconds",
                "user":optional[str] "user name",
            }
        """
        self.called_tools.add("container")
        if isinstance(tool_session, Tool):
            return await tool_session.get_result_parsable_context(self)
        # tool_name = last_msg.recipient.split(".")[1].split(" ")[0]
        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
            try:
                args = json.loads(last_msg.arguments)
            except json.JSONDecodeError as e:
                return _create_json_parse_error_messages(last_msg, e)
        else:
            args = json.loads(last_msg.arguments)
        result = await tool_session.call_tool("exec", args)
        result_str = result.content[0].text

        message = ResponseFunctionToolCallOutputItem(
            id=f"fco_{random_uuid()}",
            type="function_call_output",
            call_id=f"call_{random_uuid()}",
            output=result_str,
            status="completed",
        )

        return [message]

    async def call_tool(self) -> list[ResponseInputOutputItem]:
        """Dispatch the pending tool call (if any) to the matching session.

        Returns an empty list when there is nothing to call."""
        if not self.parser.response_messages:
            return []
        last_msg = self.parser.response_messages[-1]
        # change this to a mcp_ function call
        last_msg.id = f"{MCP_PREFIX}{random_uuid()}"
        self.parser.response_messages[-1] = last_msg
        if last_msg.name == "code_interpreter":
            return await self.call_python_tool(self._tool_sessions["python"], last_msg)
        elif last_msg.name == "web_search_preview":
            return await self.call_search_tool(self._tool_sessions["browser"], last_msg)
        elif last_msg.name.startswith("container"):
            return await self.call_container_tool(
                self._tool_sessions["container"], last_msg
            )
        return []

    def render_for_completion(self):
        raise NotImplementedError("Should not be called.")

    async def init_tool_sessions(
        self,
        tool_server: ToolServer | None,
        exit_stack: AsyncExitStack,
        request_id: str,
        mcp_tools: dict[str, Mcp],
    ):
        """Open a session per available tool on ``exit_stack`` and register
        cleanup_session so sessions are torn down on exit."""
        if tool_server:
            for tool_name in self.available_tools:
                if tool_name in self._tool_sessions:
                    continue

                tool_type = _map_tool_name_to_tool_type(tool_name)
                # Per-tool headers come from the request's MCP tool config.
                headers = (
                    mcp_tools[tool_type].headers if tool_type in mcp_tools else None
                )
                tool_session = await exit_stack.enter_async_context(
                    tool_server.new_session(tool_name, request_id, headers)
                )
                self._tool_sessions[tool_name] = tool_session
            exit_stack.push_async_exit(self.cleanup_session)

    async def cleanup_session(self, *args, **kwargs) -> None:
        """Can be used as coro to used in __aexit__"""

        async def cleanup_tool_session(tool_session):
            # Demo Tool objects have no remote session to clean up.
            if not isinstance(tool_session, Tool):
                logger.info(
                    "Cleaning up tool session for %s", tool_session._client_info
                )
                # Best-effort: a failed cleanup must not mask the original exit.
                with contextlib.suppress(Exception):
                    await tool_session.call_tool("cleanup_session", {})

        await asyncio.gather(
            *(
                cleanup_tool_session(self._tool_sessions[tool])
                for tool in self.called_tools
            )
        )
|
||||
|
||||
|
||||
class HarmonyContext(ConversationContext):
    """Conversation context for Harmony-format models (non-streaming).

    Maintains the running Harmony message list, per-turn token metrics,
    and sessions for the built-in browser/python/container tools."""

    def __init__(
        self,
        messages: list,
        available_tools: list[str],
    ):
        self._messages = messages
        self.finish_reason: str | None = None
        self.available_tools = available_tools
        # Sessions keyed by tool namespace ("browser", "python", "container").
        self._tool_sessions: dict[str, ClientSession | Tool] = {}
        # Only namespaces actually invoked get cleaned up at exit.
        self.called_tools: set[str] = set()

        self.parser = get_streamable_parser_for_assistant()
        # Number of messages present before any generation; used to know
        # which parser messages are new.
        self.num_init_messages = len(messages)
        self.num_prompt_tokens = 0
        self.num_output_tokens = 0
        self.num_cached_tokens = 0
        self.num_reasoning_tokens = 0
        self.num_tool_output_tokens = 0

        # Turn tracking - replaces multiple individual tracking variables
        self.current_turn_metrics = TurnMetrics()
        # Track metrics for all turns
        self.all_turn_metrics: list[TurnMetrics] = []
        self.is_first_turn = True
        self.first_tok_of_message = True  # For streaming support

    def _update_num_reasoning_tokens(self):
        # Count all analysis and commentary channels as reasoning tokens
        if self.parser.current_channel in {"analysis", "commentary"}:
            self.num_reasoning_tokens += 1

    def append_output(self, output: RequestOutput) -> None:
        """Parse one complete engine output into Harmony messages and
        update all usage counters for this turn."""
        output_token_ids = output.outputs[0].token_ids
        # Fresh parser per output: each output is a complete message batch.
        self.parser = get_streamable_parser_for_assistant()
        for token_id in output_token_ids:
            self.parser.process(token_id)
            # Check if the current token is part of reasoning content
            self._update_num_reasoning_tokens()
        self._update_prefill_token_usage(output)
        self._update_decode_token_usage(output)
        # Append current turn to all turn list for next turn's calculations
        self.all_turn_metrics.append(self.current_turn_metrics.copy())
        self.current_turn_metrics.reset()
        # append_output is called only once before tool calling
        # in non-streaming case
        # so we can append all the parser messages to _messages
        output_msgs = self.parser.messages
        # The responses finish reason is set in the last message
        self.finish_reason = output.outputs[0].finish_reason
        self._messages.extend(output_msgs)

    def append_tool_output(self, output: list[Message]) -> None:
        """Append tool result messages to the conversation."""
        output_msgs = output
        self._messages.extend(output_msgs)

    def _update_prefill_token_usage(self, output: RequestOutput) -> None:
        """Update token usage statistics for the prefill phase of generation.

        The prefill phase processes the input prompt tokens. This method:
        1. Counts the prompt tokens for this turn
        2. Calculates tool output tokens for multi-turn conversations
        3. Updates cached token counts
        4. Tracks state for next turn calculations

        Tool output tokens are calculated as:
            current_prompt_tokens - last_turn_prompt_tokens -
            last_turn_output_tokens
        This represents tokens added between turns (typically tool responses).

        Args:
            output: The RequestOutput containing prompt token information
        """
        if output.prompt_token_ids is not None:
            this_turn_input_tokens = len(output.prompt_token_ids)
        else:
            this_turn_input_tokens = 0
            logger.error("RequestOutput appended contains no prompt_token_ids.")

        # Update current turn input tokens
        self.current_turn_metrics.input_tokens = this_turn_input_tokens
        self.num_prompt_tokens += this_turn_input_tokens

        # Calculate tool tokens (except on first turn)
        if self.is_first_turn:
            self.is_first_turn = False
        else:
            previous_turn = self.all_turn_metrics[-1]
            # start counting tool after first turn
            # tool tokens = this turn prefill - last turn prefill -
            # last turn decode
            this_turn_tool_tokens = (
                self.current_turn_metrics.input_tokens
                - previous_turn.input_tokens
                - previous_turn.output_tokens
            )

            # Handle negative tool token counts (shouldn't happen in normal
            # cases)
            if this_turn_tool_tokens < 0:
                logger.error(
                    "Negative tool output tokens calculated: %d "
                    "(current_input=%d, previous_input=%d, "
                    "previous_output=%d). Setting to 0.",
                    this_turn_tool_tokens,
                    self.current_turn_metrics.input_tokens,
                    previous_turn.input_tokens,
                    previous_turn.output_tokens,
                )
                this_turn_tool_tokens = 0

            self.num_tool_output_tokens += this_turn_tool_tokens
            self.current_turn_metrics.tool_output_tokens = this_turn_tool_tokens

        # Update cached tokens
        num_cached_token = output.num_cached_tokens
        if num_cached_token is not None:
            self.num_cached_tokens += num_cached_token
            self.current_turn_metrics.cached_input_tokens = num_cached_token

    def _update_decode_token_usage(self, output: RequestOutput) -> int:
        """Update token usage statistics for the decode phase of generation.

        The decode phase processes the generated output tokens. This method:
        1. Counts output tokens from all completion outputs
        2. Updates the total output token count
        3. Tracks tokens generated in the current turn

        In streaming mode, this is called for each token generated.
        In non-streaming mode, this is called once with all output tokens.

        Args:
            output: The RequestOutput containing generated token information

        Returns:
            int: Number of output tokens processed in this call
        """
        updated_output_token_count = 0
        if output.outputs:
            for completion_output in output.outputs:
                # only keep last round
                updated_output_token_count += len(completion_output.token_ids)
            self.num_output_tokens += updated_output_token_count
            self.current_turn_metrics.output_tokens += updated_output_token_count
        return updated_output_token_count

    @property
    def messages(self) -> list:
        return self._messages

    def need_builtin_tool_call(self) -> bool:
        """True when the last message is addressed to a built-in tool
        namespace (browser.*, python, container.*)."""
        last_msg = self.messages[-1]
        recipient = last_msg.recipient
        return recipient is not None and (
            recipient.startswith("browser.")
            or recipient.startswith("python")
            or recipient.startswith("container.")
        )

    async def call_tool(self) -> list[Message]:
        """Dispatch the pending tool call based on the last message's
        recipient namespace.

        Raises:
            ValueError: if the last message does not target a known tool.
        """
        if not self.messages:
            return []
        last_msg = self.messages[-1]
        recipient = last_msg.recipient
        if recipient is not None:
            if recipient.startswith("browser."):
                return await self.call_search_tool(
                    self._tool_sessions["browser"], last_msg
                )
            elif recipient.startswith("python"):
                return await self.call_python_tool(
                    self._tool_sessions["python"], last_msg
                )
            elif recipient.startswith("container."):
                return await self.call_container_tool(
                    self._tool_sessions["container"], last_msg
                )
        raise ValueError("No tool call found")

    def render_for_completion(self) -> list[int]:
        """Render the Harmony message list into prompt token ids."""
        return render_for_completion(self.messages)

    async def call_search_tool(
        self, tool_session: Union["ClientSession", Tool], last_msg: Message
    ) -> list[Message]:
        """Run a browser.* tool; the method name after the dot selects the
        concrete tool (e.g. browser.search -> "search")."""
        self.called_tools.add("browser")
        if isinstance(tool_session, Tool):
            return await tool_session.get_result(self)
        tool_name = last_msg.recipient.split(".")[1]
        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
            try:
                args = json.loads(last_msg.content[0].text)
            except json.JSONDecodeError as e:
                return _create_json_parse_error_messages(last_msg, e)
        else:
            args = json.loads(last_msg.content[0].text)
        result = await tool_session.call_tool(tool_name, args)
        result_str = result.content[0].text
        content = TextContent(text=result_str)
        author = Author(role=Role.TOOL, name=last_msg.recipient)
        return [
            Message(
                author=author,
                content=[content],
                recipient=Role.ASSISTANT,
                channel=last_msg.channel,
            )
        ]

    async def call_python_tool(
        self, tool_session: Union["ClientSession", Tool], last_msg: Message
    ) -> list[Message]:
        """Run the python tool; the message text is the code to execute."""
        self.called_tools.add("python")
        if isinstance(tool_session, Tool):
            return await tool_session.get_result(self)
        param = {
            "code": last_msg.content[0].text,
        }
        result = await tool_session.call_tool("python", param)
        result_str = result.content[0].text

        content = TextContent(text=result_str)
        author = Author(role=Role.TOOL, name="python")

        return [
            Message(
                author=author,
                content=[content],
                channel=last_msg.channel,
                recipient=Role.ASSISTANT,
            )
        ]

    async def init_tool_sessions(
        self,
        tool_server: ToolServer | None,
        exit_stack: AsyncExitStack,
        request_id: str,
        mcp_tools: dict[str, Mcp],
    ):
        """Open a session per available tool and register cleanup_session
        on ``exit_stack`` for teardown."""
        if tool_server:
            for tool_name in self.available_tools:
                if tool_name not in self._tool_sessions:
                    tool_type = _map_tool_name_to_tool_type(tool_name)
                    # Per-tool headers come from the request's MCP config.
                    headers = (
                        mcp_tools[tool_type].headers if tool_type in mcp_tools else None
                    )
                    tool_session = await exit_stack.enter_async_context(
                        tool_server.new_session(tool_name, request_id, headers)
                    )
                    self._tool_sessions[tool_name] = tool_session
            exit_stack.push_async_exit(self.cleanup_session)

    async def call_container_tool(
        self, tool_session: Union["ClientSession", Tool], last_msg: Message
    ) -> list[Message]:
        """
        Call container tool. Expect this to be run in a stateful docker
        with command line terminal.
        The official container tool would at least
        expect the following format:
        - for tool name: exec
        - args:
            {
                "cmd":List[str] "command to execute",
                "workdir":optional[str] "current working directory",
                "env":optional[object/dict] "environment variables",
                "session_name":optional[str] "session name",
                "timeout":optional[int] "timeout in seconds",
                "user":optional[str] "user name",
            }
        """
        self.called_tools.add("container")
        if isinstance(tool_session, Tool):
            return await tool_session.get_result(self)
        tool_name = last_msg.recipient.split(".")[1].split(" ")[0]
        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
            try:
                args = json.loads(last_msg.content[0].text)
            except json.JSONDecodeError as e:
                return _create_json_parse_error_messages(last_msg, e)
        else:
            args = json.loads(last_msg.content[0].text)
        result = await tool_session.call_tool(tool_name, args)
        result_str = result.content[0].text
        content = TextContent(text=result_str)
        author = Author(role=Role.TOOL, name=last_msg.recipient)
        return [
            Message(
                author=author,
                content=[content],
                recipient=Role.ASSISTANT,
                channel=last_msg.channel,
            )
        ]

    async def cleanup_session(self, *args, **kwargs) -> None:
        """Can be used as coro to used in __aexit__"""

        async def cleanup_tool_session(tool_session):
            # Demo Tool objects have no remote session to clean up.
            if not isinstance(tool_session, Tool):
                logger.info(
                    "Cleaning up tool session for %s", tool_session._client_info
                )
                # Best-effort: never let cleanup failure mask the exit cause.
                with contextlib.suppress(Exception):
                    await tool_session.call_tool("cleanup_session", {})

        await asyncio.gather(
            *(
                cleanup_tool_session(self._tool_sessions[tool])
                for tool in self.called_tools
            )
        )
|
||||
|
||||
|
||||
class StreamingHarmonyContext(HarmonyContext):
    """HarmonyContext variant fed one incremental output at a time.

    Tracks the last processed token so the next turn's rendered prefix
    can be replayed through the parser."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.last_output = None

        # One long-lived parser across the whole stream (unlike the
        # non-streaming parent, which rebuilds it per output).
        self.parser = get_streamable_parser_for_assistant()
        self.encoding = get_encoding()
        self.last_tok = None
        self.first_tok_of_message = True

    @property
    def messages(self) -> list:
        return self._messages

    def append_output(self, output: RequestOutput) -> None:
        """Process one streamed chunk of tokens and update usage/turns."""
        # append_output is called for each output token in streaming case,
        # so we only want to add the prompt tokens once for each message.
        if self.first_tok_of_message:
            self._update_prefill_token_usage(output)
        # Reset self.first_tok_of_message if needed:
        # if the current token is the last one of the current message
        # (finished=True), then the next token processed will mark the
        # beginning of a new message
        self.first_tok_of_message = output.finished
        for tok in output.outputs[0].token_ids:
            self.parser.process(tok)
        self._update_decode_token_usage(output)

        # For streaming, update previous turn when message is complete
        if output.finished:
            self.all_turn_metrics.append(self.current_turn_metrics.copy())
            self.current_turn_metrics.reset()
        # Check if the current token is part of reasoning content
        self._update_num_reasoning_tokens()
        self.last_tok = tok
        # Sync any newly completed parser messages into self._messages.
        if len(self._messages) - self.num_init_messages < len(self.parser.messages):
            self._messages.extend(
                self.parser.messages[len(self._messages) - self.num_init_messages :]
            )

    def append_tool_output(self, output: list[Message]) -> None:
        """Replay a single tool message through the parser token-by-token
        so streaming parser state stays consistent."""
        # Handle the case of tool output in direct message format
        assert len(output) == 1, "Tool output should be a single message"
        msg = output[0]
        # Sometimes the recipient is not set for tool messages,
        # so we set it to "assistant"
        if msg.author.role == Role.TOOL and msg.recipient is None:
            msg.recipient = "assistant"
        toks = self.encoding.render(msg)
        for tok in toks:
            self.parser.process(tok)
        self.last_tok = toks[-1]
        # TODO: add tool_output messages to self._messages

    def is_expecting_start(self) -> bool:
        # True when the parser is between messages (awaiting <|start|>).
        return self.parser.state == StreamState.EXPECT_START

    def is_assistant_action_turn(self) -> bool:
        # True when the last token is one of Harmony's assistant-action
        # stop tokens (i.e. the model yielded to a tool).
        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()

    def render_for_completion(self) -> list[int]:
        """Render the conversation and feed any suffix tokens that were
        appended after last_tok through the parser before returning."""
        # now this list of tokens as next turn's starting tokens
        # `<|start|>assistant`,
        # we need to process them in parser.
        rendered_tokens = super().render_for_completion()

        last_n = -1
        to_process = []
        while rendered_tokens[last_n] != self.last_tok:
            to_process.append(rendered_tokens[last_n])
            last_n -= 1
        for tok in reversed(to_process):
            self.parser.process(tok)

        return rendered_tokens
|
||||
175
vllm/entrypoints/launcher.py
Normal file
175
vllm/entrypoints/launcher.py
Normal file
@@ -0,0 +1,175 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import signal
|
||||
import socket
|
||||
from http import HTTPStatus
|
||||
from typing import Any
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, Request, Response
|
||||
|
||||
from vllm import envs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.constants import (
|
||||
H11_MAX_HEADER_COUNT_DEFAULT,
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
|
||||
)
|
||||
from vllm.entrypoints.ssl import SSLCertRefresher
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.network_utils import find_process_using_port
|
||||
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
async def serve_http(
    app: FastAPI,
    sock: socket.socket | None,
    enable_ssl_refresh: bool = False,
    **uvicorn_kwargs: Any,
):
    """
    Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
    options. Supports http header limits via h11_max_incomplete_event_size and
    h11_max_header_count.
    """
    logger.info("Available routes are:")
    for route in app.routes:
        methods = getattr(route, "methods", None)
        path = getattr(route, "path", None)

        if methods is None or path is None:
            continue

        logger.info("Route: %s, Methods: %s", path, ", ".join(methods))

    # Extract header limit options if present
    h11_max_incomplete_event_size = uvicorn_kwargs.pop(
        "h11_max_incomplete_event_size", None
    )
    h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None)

    # Set safe defaults if not provided
    if h11_max_incomplete_event_size is None:
        h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
    if h11_max_header_count is None:
        h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT

    config = uvicorn.Config(app, **uvicorn_kwargs)
    # Set header limits
    config.h11_max_incomplete_event_size = h11_max_incomplete_event_size
    config.h11_max_header_count = h11_max_header_count
    config.load()
    server = uvicorn.Server(config)
    _add_shutdown_handlers(app, server)

    loop = asyncio.get_running_loop()

    # Watchdog kills the server if the engine dies mid-stream.
    watchdog_task = loop.create_task(watchdog_loop(server, app.state.engine_client))
    server_task = loop.create_task(server.serve(sockets=[sock] if sock else None))

    ssl_cert_refresher = (
        None
        if not enable_ssl_refresh
        else SSLCertRefresher(
            ssl_context=config.ssl,
            key_path=config.ssl_keyfile,
            cert_path=config.ssl_certfile,
            ca_path=config.ssl_ca_certs,
        )
    )

    def signal_handler() -> None:
        # prevents the uvicorn signal handler to exit early
        server_task.cancel()
        watchdog_task.cancel()
        if ssl_cert_refresher:
            ssl_cert_refresher.stop()

    async def dummy_shutdown() -> None:
        pass

    loop.add_signal_handler(signal.SIGINT, signal_handler)
    loop.add_signal_handler(signal.SIGTERM, signal_handler)

    try:
        await server_task
        return dummy_shutdown()
    except asyncio.CancelledError:
        # NOTE(review): assumes the caller always passed "port" in
        # uvicorn_kwargs; a missing key would raise KeyError here — confirm.
        port = uvicorn_kwargs["port"]
        process = find_process_using_port(port)
        if process is not None:
            logger.warning(
                "port %s is used by process %s launched with command:\n%s",
                port,
                process,
                " ".join(process.cmdline()),
            )
        logger.info("Shutting down FastAPI HTTP server.")
        return server.shutdown()
    finally:
        watchdog_task.cancel()
|
||||
|
||||
|
||||
async def watchdog_loop(server: uvicorn.Server, engine: EngineClient):
|
||||
"""
|
||||
# Watchdog task that runs in the background, checking
|
||||
# for error state in the engine. Needed to trigger shutdown
|
||||
# if an exception arises is StreamingResponse() generator.
|
||||
"""
|
||||
VLLM_WATCHDOG_TIME_S = 5.0
|
||||
while True:
|
||||
await asyncio.sleep(VLLM_WATCHDOG_TIME_S)
|
||||
terminate_if_errored(server, engine)
|
||||
|
||||
|
||||
def terminate_if_errored(server: uvicorn.Server, engine: EngineClient):
|
||||
"""
|
||||
See discussions here on shutting down a uvicorn server
|
||||
https://github.com/encode/uvicorn/discussions/1103
|
||||
In this case we cannot await the server shutdown here
|
||||
because handler must first return to close the connection
|
||||
for this request.
|
||||
"""
|
||||
engine_errored = engine.errored and not engine.is_running
|
||||
if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored:
|
||||
server.should_exit = True
|
||||
|
||||
|
||||
def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
|
||||
"""
|
||||
VLLM V1 AsyncLLM catches exceptions and returns
|
||||
only two types: EngineGenerateError and EngineDeadError.
|
||||
|
||||
EngineGenerateError is raised by the per request generate()
|
||||
method. This error could be request specific (and therefore
|
||||
recoverable - e.g. if there is an error in input processing).
|
||||
|
||||
EngineDeadError is raised by the background output_handler
|
||||
method. This error is global and therefore not recoverable.
|
||||
|
||||
We register these @app.exception_handlers to return nice
|
||||
responses to the end user if they occur and shut down if needed.
|
||||
See https://fastapi.tiangolo.com/tutorial/handling-errors/
|
||||
for more details on how exception handlers work.
|
||||
|
||||
If an exception is encountered in a StreamingResponse
|
||||
generator, the exception is not raised, since we already sent
|
||||
a 200 status. Rather, we send an error message as the next chunk.
|
||||
Since the exception is not raised, this means that the server
|
||||
will not automatically shut down. Instead, we use the watchdog
|
||||
background task for check for errored state.
|
||||
"""
|
||||
|
||||
@app.exception_handler(RuntimeError)
|
||||
@app.exception_handler(EngineDeadError)
|
||||
@app.exception_handler(EngineGenerateError)
|
||||
async def runtime_exception_handler(request: Request, __):
|
||||
terminate_if_errored(
|
||||
server=server,
|
||||
engine=request.app.state.engine_client,
|
||||
)
|
||||
|
||||
return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||
File diff suppressed because it is too large
Load Diff
84
vllm/entrypoints/logger.py
Normal file
84
vllm/entrypoints/logger.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RequestLogger:
|
||||
def __init__(self, *, max_log_len: int | None) -> None:
|
||||
self.max_log_len = max_log_len
|
||||
|
||||
def log_inputs(
|
||||
self,
|
||||
request_id: str,
|
||||
prompt: str | None,
|
||||
prompt_token_ids: list[int] | None,
|
||||
prompt_embeds: torch.Tensor | None,
|
||||
params: SamplingParams | PoolingParams | BeamSearchParams | None,
|
||||
lora_request: LoRARequest | None,
|
||||
) -> None:
|
||||
max_log_len = self.max_log_len
|
||||
if max_log_len is not None:
|
||||
if prompt is not None:
|
||||
prompt = prompt[:max_log_len]
|
||||
|
||||
if prompt_token_ids is not None:
|
||||
prompt_token_ids = prompt_token_ids[:max_log_len]
|
||||
|
||||
logger.debug(
|
||||
"Request %s details: prompt: %r, "
|
||||
"prompt_token_ids: %s, "
|
||||
"prompt_embeds shape: %s.",
|
||||
request_id,
|
||||
prompt,
|
||||
prompt_token_ids,
|
||||
prompt_embeds.shape if prompt_embeds is not None else None,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Received request %s: params: %s, lora_request: %s.",
|
||||
request_id,
|
||||
params,
|
||||
lora_request,
|
||||
)
|
||||
|
||||
def log_outputs(
|
||||
self,
|
||||
request_id: str,
|
||||
outputs: str,
|
||||
output_token_ids: Sequence[int] | None,
|
||||
finish_reason: str | None = None,
|
||||
is_streaming: bool = False,
|
||||
delta: bool = False,
|
||||
) -> None:
|
||||
max_log_len = self.max_log_len
|
||||
if max_log_len is not None:
|
||||
if outputs is not None:
|
||||
outputs = outputs[:max_log_len]
|
||||
|
||||
if output_token_ids is not None:
|
||||
# Convert to list and apply truncation
|
||||
output_token_ids = list(output_token_ids)[:max_log_len]
|
||||
|
||||
stream_info = ""
|
||||
if is_streaming:
|
||||
stream_info = " (streaming delta)" if delta else " (streaming complete)"
|
||||
|
||||
logger.info(
|
||||
"Generated response %s%s: output: %r, "
|
||||
"output_token_ids: %s, finish_reason: %s",
|
||||
request_id,
|
||||
stream_info,
|
||||
outputs,
|
||||
output_token_ids,
|
||||
finish_reason,
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,3 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file contains the command line arguments for the vLLM's
|
||||
OpenAI-compatible server. It is kept in a separate file for documentation
|
||||
@@ -7,109 +9,294 @@ purposes.
|
||||
import argparse
|
||||
import json
|
||||
import ssl
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import field
|
||||
from typing import Literal
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
|
||||
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import config
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
validate_chat_template,
|
||||
)
|
||||
from vllm.entrypoints.constants import (
|
||||
H11_MAX_HEADER_COUNT_DEFAULT,
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tool_parsers import ToolParserManager
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class LoRAParserAction(argparse.Action):
|
||||
def __call__(
|
||||
self,
|
||||
parser: argparse.ArgumentParser,
|
||||
namespace: argparse.Namespace,
|
||||
values: str | Sequence[str] | None,
|
||||
option_string: str | None = None,
|
||||
):
|
||||
if values is None:
|
||||
values = []
|
||||
if isinstance(values, str):
|
||||
raise TypeError("Expected values to be a list")
|
||||
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
lora_list = []
|
||||
lora_list: list[LoRAModulePath] = []
|
||||
for item in values:
|
||||
name, path = item.split('=')
|
||||
lora_list.append(LoRAModulePath(name, path))
|
||||
if item in [None, ""]: # Skip if item is None or empty string
|
||||
continue
|
||||
if "=" in item and "," not in item: # Old format: name=path
|
||||
name, path = item.split("=")
|
||||
lora_list.append(LoRAModulePath(name, path))
|
||||
else: # Assume JSON format
|
||||
try:
|
||||
lora_dict = json.loads(item)
|
||||
lora = LoRAModulePath(**lora_dict)
|
||||
lora_list.append(lora)
|
||||
except json.JSONDecodeError:
|
||||
parser.error(f"Invalid JSON format for --lora-modules: {item}")
|
||||
except TypeError as e:
|
||||
parser.error(
|
||||
f"Invalid fields for --lora-modules: {item} - {str(e)}"
|
||||
)
|
||||
setattr(namespace, self.dest, lora_list)
|
||||
|
||||
|
||||
def make_arg_parser():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="vLLM OpenAI-Compatible RESTful API server.")
|
||||
parser.add_argument("--host",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="host name")
|
||||
parser.add_argument("--port", type=int, default=8000, help="port number")
|
||||
@config
|
||||
@dataclass
|
||||
class FrontendArgs:
|
||||
"""Arguments for the OpenAI-compatible frontend server."""
|
||||
|
||||
host: str | None = None
|
||||
"""Host name."""
|
||||
port: int = 8000
|
||||
"""Port number."""
|
||||
uds: str | None = None
|
||||
"""Unix domain socket path. If set, host and port arguments are ignored."""
|
||||
uvicorn_log_level: Literal[
|
||||
"debug", "info", "warning", "error", "critical", "trace"
|
||||
] = "info"
|
||||
"""Log level for uvicorn."""
|
||||
disable_uvicorn_access_log: bool = False
|
||||
"""Disable uvicorn access log."""
|
||||
allow_credentials: bool = False
|
||||
"""Allow credentials."""
|
||||
allowed_origins: list[str] = field(default_factory=lambda: ["*"])
|
||||
"""Allowed origins."""
|
||||
allowed_methods: list[str] = field(default_factory=lambda: ["*"])
|
||||
"""Allowed methods."""
|
||||
allowed_headers: list[str] = field(default_factory=lambda: ["*"])
|
||||
"""Allowed headers."""
|
||||
api_key: list[str] | None = None
|
||||
"""If provided, the server will require one of these keys to be presented in
|
||||
the header."""
|
||||
lora_modules: list[LoRAModulePath] | None = None
|
||||
"""LoRA modules configurations in either 'name=path' format or JSON format
|
||||
or JSON list format. Example (old format): `'name=path'` Example (new
|
||||
format): `{\"name\": \"name\", \"path\": \"lora_path\",
|
||||
\"base_model_name\": \"id\"}`"""
|
||||
chat_template: str | None = None
|
||||
"""The file path to the chat template, or the template in single-line form
|
||||
for the specified model."""
|
||||
chat_template_content_format: ChatTemplateContentFormatOption = "auto"
|
||||
"""The format to render message content within a chat template.
|
||||
|
||||
* "string" will render the content as a string. Example: `"Hello World"`
|
||||
* "openai" will render the content as a list of dictionaries, similar to
|
||||
OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
|
||||
trust_request_chat_template: bool = False
|
||||
"""Whether to trust the chat template provided in the request. If False,
|
||||
the server will always use the chat template specified by `--chat-template`
|
||||
or the ones from tokenizer."""
|
||||
response_role: str = "assistant"
|
||||
"""The role name to return if `request.add_generation_prompt=true`."""
|
||||
ssl_keyfile: str | None = None
|
||||
"""The file path to the SSL key file."""
|
||||
ssl_certfile: str | None = None
|
||||
"""The file path to the SSL cert file."""
|
||||
ssl_ca_certs: str | None = None
|
||||
"""The CA certificates file."""
|
||||
enable_ssl_refresh: bool = False
|
||||
"""Refresh SSL Context when SSL certificate files change"""
|
||||
ssl_cert_reqs: int = int(ssl.CERT_NONE)
|
||||
"""Whether client certificate is required (see stdlib ssl module's)."""
|
||||
root_path: str | None = None
|
||||
"""FastAPI root_path when app is behind a path based routing proxy."""
|
||||
middleware: list[str] = field(default_factory=lambda: [])
|
||||
"""Additional ASGI middleware to apply to the app. We accept multiple
|
||||
--middleware arguments. The value should be an import path. If a function
|
||||
is provided, vLLM will add it to the server using
|
||||
`@app.middleware('http')`. If a class is provided, vLLM will
|
||||
add it to the server using `app.add_middleware()`."""
|
||||
return_tokens_as_token_ids: bool = False
|
||||
"""When `--max-logprobs` is specified, represents single tokens as
|
||||
strings of the form 'token_id:{token_id}' so that tokens that are not
|
||||
JSON-encodable can be identified."""
|
||||
disable_frontend_multiprocessing: bool = False
|
||||
"""If specified, will run the OpenAI frontend server in the same process as
|
||||
the model serving engine."""
|
||||
enable_request_id_headers: bool = False
|
||||
"""If specified, API server will add X-Request-Id header to responses."""
|
||||
enable_auto_tool_choice: bool = False
|
||||
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
|
||||
to specify which parser to use."""
|
||||
exclude_tools_when_tool_choice_none: bool = False
|
||||
"""If specified, exclude tool definitions in prompts when
|
||||
tool_choice='none'."""
|
||||
tool_call_parser: str | None = None
|
||||
"""Select the tool call parser depending on the model that you're using.
|
||||
This is used to parse the model-generated tool call into OpenAI API format.
|
||||
Required for `--enable-auto-tool-choice`. You can choose any option from
|
||||
the built-in parsers or register a plugin via `--tool-parser-plugin`."""
|
||||
tool_parser_plugin: str = ""
|
||||
"""Special the tool parser plugin write to parse the model-generated tool
|
||||
into OpenAI API format, the name register in this plugin can be used in
|
||||
`--tool-call-parser`."""
|
||||
tool_server: str | None = None
|
||||
"""Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
|
||||
Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
|
||||
purpose."""
|
||||
log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
|
||||
"""Path to logging config JSON file for both vllm and uvicorn"""
|
||||
max_log_len: int | None = None
|
||||
"""Max number of prompt characters or prompt ID numbers being printed in
|
||||
log. The default of None means unlimited."""
|
||||
disable_fastapi_docs: bool = False
|
||||
"""Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."""
|
||||
enable_prompt_tokens_details: bool = False
|
||||
"""If set to True, enable prompt_tokens_details in usage."""
|
||||
enable_server_load_tracking: bool = False
|
||||
"""If set to True, enable tracking server_load_metrics in the app state."""
|
||||
enable_force_include_usage: bool = False
|
||||
"""If set to True, including usage on every request."""
|
||||
enable_tokenizer_info_endpoint: bool = False
|
||||
"""Enable the `/tokenizer_info` endpoint. May expose chat
|
||||
templates and other tokenizer configuration."""
|
||||
enable_log_outputs: bool = False
|
||||
"""If True, log model outputs (generations).
|
||||
Requires --enable-log-requests."""
|
||||
h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
|
||||
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
|
||||
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
|
||||
h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
|
||||
"""Maximum number of HTTP headers allowed in a request for h11 parser.
|
||||
Helps mitigate header abuse. Default: 256."""
|
||||
log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
|
||||
"""If set to True, log the stack trace of error responses"""
|
||||
tokens_only: bool = False
|
||||
"""
|
||||
If set to True, only enable the Tokens In<>Out endpoint.
|
||||
This is intended for use in a Disaggregated Everything setup.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
from vllm.engine.arg_utils import get_kwargs
|
||||
|
||||
frontend_kwargs = get_kwargs(FrontendArgs)
|
||||
|
||||
# Special case: allowed_origins, allowed_methods, allowed_headers all
|
||||
# need json.loads type
|
||||
# Should also remove nargs
|
||||
frontend_kwargs["allowed_origins"]["type"] = json.loads
|
||||
frontend_kwargs["allowed_methods"]["type"] = json.loads
|
||||
frontend_kwargs["allowed_headers"]["type"] = json.loads
|
||||
del frontend_kwargs["allowed_origins"]["nargs"]
|
||||
del frontend_kwargs["allowed_methods"]["nargs"]
|
||||
del frontend_kwargs["allowed_headers"]["nargs"]
|
||||
|
||||
# Special case: LoRA modules need custom parser action and
|
||||
# optional_type(str)
|
||||
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
|
||||
frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
|
||||
|
||||
# Special case: Middleware needs to append action
|
||||
frontend_kwargs["middleware"]["action"] = "append"
|
||||
frontend_kwargs["middleware"]["type"] = str
|
||||
if "nargs" in frontend_kwargs["middleware"]:
|
||||
del frontend_kwargs["middleware"]["nargs"]
|
||||
frontend_kwargs["middleware"]["default"] = []
|
||||
|
||||
# Special case: Tool call parser shows built-in options.
|
||||
valid_tool_parsers = list(ToolParserManager.list_registered())
|
||||
parsers_str = ",".join(valid_tool_parsers)
|
||||
frontend_kwargs["tool_call_parser"]["metavar"] = (
|
||||
f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
|
||||
)
|
||||
|
||||
frontend_group = parser.add_argument_group(
|
||||
title="Frontend",
|
||||
description=FrontendArgs.__doc__,
|
||||
)
|
||||
|
||||
for key, value in frontend_kwargs.items():
|
||||
frontend_group.add_argument(f"--{key.replace('_', '-')}", **value)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
"""Create the CLI argument parser used by the OpenAI API server.
|
||||
|
||||
We rely on the helper methods of `FrontendArgs` and `AsyncEngineArgs` to
|
||||
register all arguments instead of manually enumerating them here. This
|
||||
avoids code duplication and keeps the argument definitions in one place.
|
||||
"""
|
||||
parser.add_argument(
|
||||
"--uvicorn-log-level",
|
||||
"model_tag",
|
||||
type=str,
|
||||
default="info",
|
||||
choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
|
||||
help="log level for uvicorn")
|
||||
parser.add_argument("--allow-credentials",
|
||||
action="store_true",
|
||||
help="allow credentials")
|
||||
parser.add_argument("--allowed-origins",
|
||||
type=json.loads,
|
||||
default=["*"],
|
||||
help="allowed origins")
|
||||
parser.add_argument("--allowed-methods",
|
||||
type=json.loads,
|
||||
default=["*"],
|
||||
help="allowed methods")
|
||||
parser.add_argument("--allowed-headers",
|
||||
type=json.loads,
|
||||
default=["*"],
|
||||
help="allowed headers")
|
||||
parser.add_argument("--api-key",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="If provided, the server will require this key "
|
||||
"to be presented in the header.")
|
||||
parser.add_argument(
|
||||
"--lora-modules",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
nargs='+',
|
||||
action=LoRAParserAction,
|
||||
help="LoRA module configurations in the format name=path. "
|
||||
"Multiple modules can be specified.")
|
||||
parser.add_argument("--chat-template",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="The file path to the chat template, "
|
||||
"or the template in single-line form "
|
||||
"for the specified model")
|
||||
parser.add_argument("--response-role",
|
||||
type=nullable_str,
|
||||
default="assistant",
|
||||
help="The role name to return if "
|
||||
"`request.add_generation_prompt=true`.")
|
||||
parser.add_argument("--ssl-keyfile",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="The file path to the SSL key file")
|
||||
parser.add_argument("--ssl-certfile",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="The file path to the SSL cert file")
|
||||
parser.add_argument("--ssl-ca-certs",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="The CA certificates file")
|
||||
parser.add_argument(
|
||||
"--ssl-cert-reqs",
|
||||
type=int,
|
||||
default=int(ssl.CERT_NONE),
|
||||
help="Whether client certificate is required (see stdlib ssl module's)"
|
||||
nargs="?",
|
||||
help="The model tag to serve (optional if specified in config)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--root-path",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="FastAPI root_path when app is behind a path based routing proxy")
|
||||
"--headless",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Run in headless mode. See multi-node data parallel "
|
||||
"documentation for more details.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--middleware",
|
||||
type=nullable_str,
|
||||
action="append",
|
||||
default=[],
|
||||
help="Additional ASGI middleware to apply to the app. "
|
||||
"We accept multiple --middleware arguments. "
|
||||
"The value should be an import path. "
|
||||
"If a function is provided, vLLM will add it to the server "
|
||||
"using @app.middleware('http'). "
|
||||
"If a class is provided, vLLM will add it to the server "
|
||||
"using app.add_middleware(). ")
|
||||
|
||||
"--api-server-count",
|
||||
"-asc",
|
||||
type=int,
|
||||
default=1,
|
||||
help="How many API server processes to run.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
help="Read CLI options from a config file. "
|
||||
"Must be a YAML with the following options: "
|
||||
"https://docs.vllm.ai/en/latest/configuration/serve_args.html",
|
||||
)
|
||||
parser = FrontendArgs.add_cli_args(parser)
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def validate_parsed_serve_args(args: argparse.Namespace):
|
||||
"""Quick checks for model serve args that raise prior to loading."""
|
||||
if hasattr(args, "subparser") and args.subparser != "serve":
|
||||
return
|
||||
|
||||
# Ensure that the chat template is valid; raises if it likely isn't
|
||||
validate_chat_template(args.chat_template)
|
||||
|
||||
# Enable auto tool needs a tool call parser to be valid
|
||||
if args.enable_auto_tool_choice and not args.tool_call_parser:
|
||||
raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser")
|
||||
if args.enable_log_outputs and not args.enable_log_requests:
|
||||
raise TypeError("Error: --enable-log-outputs requires --enable-log-requests")
|
||||
|
||||
|
||||
def create_parser_for_docs() -> FlexibleArgumentParser:
|
||||
parser_for_docs = FlexibleArgumentParser(
|
||||
prog="-m vllm.entrypoints.openai.api_server"
|
||||
)
|
||||
return make_arg_parser(parser_for_docs)
|
||||
|
||||
120
vllm/entrypoints/openai/orca_metrics.py
Normal file
120
vllm/entrypoints/openai/orca_metrics.py
Normal file
@@ -0,0 +1,120 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Utility functions that create ORCA endpoint load report response headers.
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections.abc import Mapping
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.metrics.reader import Gauge, get_metrics_snapshot
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def create_orca_header(
|
||||
metrics_format: str, named_metrics: list[tuple[str, float]]
|
||||
) -> Mapping[str, str] | None:
|
||||
"""
|
||||
Creates ORCA headers named 'endpoint-load-metrics' in the specified format
|
||||
and adds custom metrics to named_metrics.
|
||||
ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0
|
||||
ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto
|
||||
|
||||
Parameters:
|
||||
- metrics_format (str): The format of the header ('TEXT', 'JSON').
|
||||
- named_metrics (List[Tuple[str, float]]): List of tuples with metric names
|
||||
and their corresponding double values.
|
||||
|
||||
Returns:
|
||||
- Optional[Mapping[str,str]]: A dictionary with header key as
|
||||
'endpoint-load-metrics' and values as the ORCA header strings with
|
||||
format prefix and data in with named_metrics in.
|
||||
"""
|
||||
|
||||
if metrics_format.lower() not in ["text", "json"]:
|
||||
logger.warning(
|
||||
"Warning: `%s` format is not supported in the ORCA response header",
|
||||
format,
|
||||
)
|
||||
return None
|
||||
|
||||
header = {}
|
||||
orca_report = {
|
||||
"named_metrics": {
|
||||
metric_name: value
|
||||
for metric_name, value in named_metrics
|
||||
if isinstance(metric_name, str) and isinstance(value, float)
|
||||
}
|
||||
}
|
||||
# output example:
|
||||
# endpoint-load-metrics: TEXT named_metrics.kv_cache_utilization=0.4
|
||||
if metrics_format.lower() == "text":
|
||||
native_http_header = ", ".join(
|
||||
[
|
||||
f"named_metrics.{metric_name}={value}"
|
||||
for metric_name, value in named_metrics
|
||||
if isinstance(metric_name, str) and isinstance(value, float)
|
||||
]
|
||||
)
|
||||
header["endpoint-load-metrics"] = f"TEXT {native_http_header}"
|
||||
|
||||
# output example:
|
||||
# endpoint-load-metrics: JSON “named_metrics”: {“custom-metric-util”: 0.4}
|
||||
elif metrics_format.lower() == "json":
|
||||
header["endpoint-load-metrics"] = f"JSON {json.dumps(orca_report)}"
|
||||
|
||||
logger.info("Created ORCA header %s", header)
|
||||
|
||||
return header
|
||||
|
||||
|
||||
def get_named_metrics_from_prometheus() -> list[tuple[str, float]]:
|
||||
"""
|
||||
Collects current metrics from Prometheus and returns some of them
|
||||
in the form of the `named_metrics` list for `create_orca_header()`.
|
||||
|
||||
Parameters:
|
||||
- None
|
||||
|
||||
Returns:
|
||||
- list[tuple[str, float]]: List of tuples of metric names and their values.
|
||||
"""
|
||||
named_metrics: list[tuple[str, float]] = []
|
||||
# Map from prometheus metric names to ORCA named metrics.
|
||||
prometheus_to_orca_metrics = {
|
||||
"vllm:kv_cache_usage_perc": "kv_cache_usage_perc",
|
||||
"vllm:num_requests_waiting": "num_requests_waiting",
|
||||
}
|
||||
metrics = get_metrics_snapshot()
|
||||
for metric in metrics:
|
||||
orca_name = prometheus_to_orca_metrics.get(metric.name)
|
||||
# If this metric is mapped into ORCA, then add it to the report.
|
||||
# Note: Only Gauge metrics are currently supported.
|
||||
if orca_name is not None and isinstance(metric, Gauge):
|
||||
named_metrics.append((str(orca_name), float(metric.value)))
|
||||
return named_metrics
|
||||
|
||||
|
||||
def metrics_header(metrics_format: str) -> Mapping[str, str] | None:
|
||||
"""
|
||||
Creates ORCA headers named 'endpoint-load-metrics' in the specified format.
|
||||
Metrics are collected from Prometheus using `get_named_metrics_from_prometheus()`.
|
||||
|
||||
ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0
|
||||
ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto
|
||||
|
||||
Parameters:
|
||||
- metrics_format (str): The format of the header ('TEXT', 'JSON').
|
||||
|
||||
Returns:
|
||||
- Optional[Mapping[str,str]]: A dictionary with header key as
|
||||
'endpoint-load-metrics' and values as the ORCA header strings with
|
||||
format prefix and data in with named_metrics in.
|
||||
"""
|
||||
if not metrics_format:
|
||||
return None
|
||||
# Get named metrics from prometheus.
|
||||
named_metrics = get_named_metrics_from_prometheus()
|
||||
return create_orca_header(metrics_format, named_metrics)
|
||||
0
vllm/entrypoints/openai/parser/__init__.py
Normal file
0
vllm/entrypoints/openai/parser/__init__.py
Normal file
825
vllm/entrypoints/openai/parser/harmony_utils.py
Normal file
825
vllm/entrypoints/openai/parser/harmony_utils.py
Normal file
@@ -0,0 +1,825 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
import json
|
||||
from collections.abc import Iterable, Sequence
|
||||
from typing import Literal
|
||||
|
||||
from openai.types.responses import (
|
||||
ResponseFunctionToolCall,
|
||||
ResponseOutputItem,
|
||||
ResponseOutputMessage,
|
||||
ResponseOutputText,
|
||||
ResponseReasoningItem,
|
||||
)
|
||||
from openai.types.responses.response_function_web_search import (
|
||||
ActionFind,
|
||||
ActionOpenPage,
|
||||
ActionSearch,
|
||||
ResponseFunctionWebSearch,
|
||||
)
|
||||
from openai.types.responses.response_output_item import McpCall
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content as ResponseReasoningTextContent,
|
||||
)
|
||||
from openai.types.responses.tool import Tool
|
||||
from openai_harmony import (
|
||||
Author,
|
||||
ChannelConfig,
|
||||
Conversation,
|
||||
DeveloperContent,
|
||||
HarmonyEncodingName,
|
||||
Message,
|
||||
ReasoningEffort,
|
||||
Role,
|
||||
StreamableParser,
|
||||
SystemContent,
|
||||
TextContent,
|
||||
ToolDescription,
|
||||
load_harmony_encoding,
|
||||
)
|
||||
from openai_harmony import Message as OpenAIHarmonyMessage
|
||||
from openai_harmony import Role as OpenAIHarmonyRole
|
||||
|
||||
from vllm import envs
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionToolsParam,
|
||||
ResponseInputOutputItem,
|
||||
ResponsesRequest,
|
||||
)
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
REASONING_EFFORT = {
|
||||
"high": ReasoningEffort.HIGH,
|
||||
"medium": ReasoningEffort.MEDIUM,
|
||||
"low": ReasoningEffort.LOW,
|
||||
}
|
||||
|
||||
_harmony_encoding = None
|
||||
|
||||
# Builtin tools that should be included in the system message when
|
||||
# they are available and requested by the user.
|
||||
# Tool args are provided by MCP tool descriptions. Output
|
||||
# of the tools are stringified.
|
||||
MCP_BUILTIN_TOOLS: set[str] = {
|
||||
"web_search_preview",
|
||||
"code_interpreter",
|
||||
"container",
|
||||
}
|
||||
|
||||
|
||||
def has_custom_tools(tool_types: set[str]) -> bool:
|
||||
"""
|
||||
Checks if the given tool types are custom tools
|
||||
(i.e. any tool other than MCP buildin tools)
|
||||
"""
|
||||
return not tool_types.issubset(MCP_BUILTIN_TOOLS)
|
||||
|
||||
|
||||
def get_encoding():
|
||||
global _harmony_encoding
|
||||
if _harmony_encoding is None:
|
||||
_harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
|
||||
return _harmony_encoding
|
||||
|
||||
|
||||
def get_system_message(
    model_identity: str | None = None,
    reasoning_effort: Literal["high", "medium", "low"] | None = None,
    start_date: str | None = None,
    browser_description: str | None = None,
    python_description: str | None = None,
    container_description: str | None = None,
    instructions: str | None = None,
    with_custom_tools: bool = False,
) -> Message:
    """Build the Harmony system message for a conversation.

    Args:
        model_identity: Optional override of the model identity string.
        reasoning_effort: Mapped through ``REASONING_EFFORT`` to the Harmony
            enum when provided.
        start_date: Conversation start date string; defaults to today's date
            (which is non-deterministic across requests).
        browser_description / python_description / container_description:
            Builtin-tool descriptions to advertise in the system message.
        instructions: Request-level instructions; only folded into the model
            identity when VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS is set
            (otherwise they go into the developer message — see
            get_developer_message).
        with_custom_tools: When False, the "commentary" channel is removed
            from the valid output channels.
    """
    sys_msg_content = SystemContent.new()
    if model_identity is not None:
        sys_msg_content = sys_msg_content.with_model_identity(model_identity)
    if instructions is not None and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
        # Append instructions to the model identity so they appear in the
        # system message rather than the developer message.
        current_identity = sys_msg_content.model_identity
        new_identity = (
            f"{current_identity}\n{instructions}" if current_identity else instructions
        )
        sys_msg_content = sys_msg_content.with_model_identity(new_identity)
    if reasoning_effort is not None:
        sys_msg_content = sys_msg_content.with_reasoning_effort(
            REASONING_EFFORT[reasoning_effort]
        )
    if start_date is None:
        # NOTE(woosuk): This brings non-determinism in vLLM. Be careful.
        start_date = datetime.datetime.now().strftime("%Y-%m-%d")
    sys_msg_content = sys_msg_content.with_conversation_start_date(start_date)
    if browser_description is not None:
        sys_msg_content = sys_msg_content.with_tools(browser_description)
    if python_description is not None:
        sys_msg_content = sys_msg_content.with_tools(python_description)
    if container_description is not None:
        sys_msg_content = sys_msg_content.with_tools(container_description)
    if not with_custom_tools:
        # Without custom tools the commentary channel (used for tool-call
        # preambles) is not a valid output channel.
        channel_config = sys_msg_content.channel_config
        invalid_channel = "commentary"
        new_config = ChannelConfig.require_channels(
            [c for c in channel_config.valid_channels if c != invalid_channel]
        )
        sys_msg_content = sys_msg_content.with_channel_config(new_config)
    sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
    return sys_msg
|
||||
|
||||
|
||||
def create_tool_definition(tool: ChatCompletionToolsParam | Tool):
    """Convert a Chat Completion or Responses API tool into a ToolDescription.

    Chat Completion tools nest the definition under ``.function``; Responses
    API tools carry name/description/parameters at the top level.
    """
    spec = tool.function if isinstance(tool, ChatCompletionToolsParam) else tool
    return ToolDescription.new(
        name=spec.name,
        description=spec.description,
        parameters=spec.parameters,
    )
|
||||
|
||||
|
||||
def get_developer_message(
    instructions: str | None = None,
    tools: list[Tool | ChatCompletionToolsParam] | None = None,
) -> Message:
    """Build the Harmony developer message carrying instructions and tools.

    Builtin tools (web_search_preview/code_interpreter/container) are skipped
    here because they are advertised in the system message instead.

    Raises:
        ValueError: If a tool has a type other than the builtin tools or
            "function".
    """
    dev_msg_content = DeveloperContent.new()
    if instructions is not None and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
        # Instructions live here unless they were folded into the system
        # message (see get_system_message).
        dev_msg_content = dev_msg_content.with_instructions(instructions)
    if tools is not None:
        function_tools: list[Tool | ChatCompletionToolsParam] = []
        for tool in tools:
            if tool.type in (
                "web_search_preview",
                "code_interpreter",
                "container",
            ):
                # Handled by the system message; nothing to add here.
                pass

            elif tool.type == "function":
                function_tools.append(tool)
            else:
                raise ValueError(f"tool type {tool.type} not supported")
        if function_tools:
            function_tool_descriptions = [
                create_tool_definition(tool) for tool in function_tools
            ]
            dev_msg_content = dev_msg_content.with_function_tools(
                function_tool_descriptions
            )
    dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
    return dev_msg
|
||||
|
||||
|
||||
def get_user_message(content: str) -> Message:
    """Wrap plain text in a Harmony user-role message."""
    return Message.from_role_and_content(Role.USER, content)
|
||||
|
||||
|
||||
def parse_response_input(
    response_msg: ResponseInputOutputItem,
    prev_responses: list[ResponseOutputItem | ResponseReasoningItem],
) -> Message:
    """Convert one Responses API input item into a Harmony message.

    Args:
        response_msg: The input item, as a dict or Pydantic model.
        prev_responses: Previously produced output items; searched (most
            recent first) to recover the function name for
            "function_call_output" items.

    Raises:
        ValueError: If a function_call_output has no matching call, or the
            item type is unknown.
    """
    if not isinstance(response_msg, dict):
        response_msg = response_msg.model_dump()
    if "type" not in response_msg or response_msg["type"] == "message":
        role = response_msg["role"]
        content = response_msg["content"]
        if role == "system":
            # User is trying to set a system message. Change it to:
            # <|start|>developer<|message|># Instructions
            # {instructions}<|end|>
            role = "developer"
            text_prefix = "Instructions:\n"
        else:
            text_prefix = ""
        if isinstance(content, str):
            msg = Message.from_role_and_content(role, text_prefix + content)
        else:
            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
            msg = Message.from_role_and_contents(role, contents)
        if role == "assistant":
            # Assistant history goes to the final channel.
            msg = msg.with_channel("final")
    elif response_msg["type"] == "function_call_output":
        call_id = response_msg["call_id"]
        call_response: ResponseFunctionToolCall | None = None
        # Search most-recent-first for the call that produced this output so
        # we can attribute the tool message to the right function.
        for prev_response in reversed(prev_responses):
            if (
                isinstance(prev_response, ResponseFunctionToolCall)
                and prev_response.call_id == call_id
            ):
                call_response = prev_response
                break
        if call_response is None:
            raise ValueError(f"No call message found for {call_id}")
        msg = Message.from_author_and_content(
            Author.new(Role.TOOL, f"functions.{call_response.name}"),
            response_msg["output"],
        )
    elif response_msg["type"] == "reasoning":
        content = response_msg["content"]
        assert len(content) == 1
        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
    elif response_msg["type"] == "function_call":
        # Tool calls are assistant messages on the commentary channel with a
        # functions.* recipient and JSON-typed content.
        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
        msg = msg.with_channel("commentary")
        msg = msg.with_recipient(f"functions.{response_msg['name']}")
        msg = msg.with_content_type("json")
    else:
        raise ValueError(f"Unknown input type: {response_msg['type']}")
    return msg
|
||||
|
||||
|
||||
def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]:
    """
    Parse a list of messages from request.messages in the Chat Completion API to
    Harmony messages.

    Fix: the tool-id collection pass previously called ``.get`` directly on the
    raw messages, which raises AttributeError for Pydantic model inputs even
    though parse_chat_input_to_harmony_message accepts them. Messages are now
    normalized to dicts once, up front.
    """
    msgs: list[Message] = []
    tool_id_names: dict[str, str] = {}

    # Normalize Pydantic models to dicts so both passes below can use plain
    # dict access.
    normalized_msgs = [
        chat_msg if isinstance(chat_msg, dict) else chat_msg.model_dump(exclude_none=True)
        for chat_msg in chat_msgs
    ]

    # Collect tool id to name mappings for tool response recipient values
    for chat_msg in normalized_msgs:
        for tool_call in chat_msg.get("tool_calls", []):
            tool_id_names[tool_call.get("id")] = tool_call.get("function", {}).get(
                "name"
            )

    for chat_msg in normalized_msgs:
        msgs.extend(parse_chat_input_to_harmony_message(chat_msg, tool_id_names))

    # Drop stale chain-of-thought from earlier turns.
    msgs = auto_drop_analysis_messages(msgs)
    return msgs
|
||||
|
||||
|
||||
def auto_drop_analysis_messages(msgs: list[Message]) -> list[Message]:
|
||||
"""
|
||||
Harmony models expect the analysis messages (representing raw chain of thought) to
|
||||
be dropped after an assistant message to the final channel is produced from the
|
||||
reasoning of those messages.
|
||||
|
||||
The openai-harmony library does this if the very last assistant message is to the
|
||||
final channel, but it does not handle the case where we're in longer multi-turn
|
||||
conversations and the client gave us reasoning content from previous turns of
|
||||
the conversation with multiple assistant messages to the final channel in the
|
||||
conversation.
|
||||
|
||||
So, we find the index of the last assistant message to the final channel and drop
|
||||
all analysis messages that precede it, leaving only the analysis messages that
|
||||
are relevant to the current part of the conversation.
|
||||
"""
|
||||
last_assistant_final_index = -1
|
||||
for i in range(len(msgs) - 1, -1, -1):
|
||||
msg = msgs[i]
|
||||
if msg.author.role == "assistant" and msg.channel == "final":
|
||||
last_assistant_final_index = i
|
||||
break
|
||||
|
||||
cleaned_msgs: list[Message] = []
|
||||
for i, msg in enumerate(msgs):
|
||||
if i < last_assistant_final_index and msg.channel == "analysis":
|
||||
continue
|
||||
cleaned_msgs.append(msg)
|
||||
|
||||
return cleaned_msgs
|
||||
|
||||
|
||||
def flatten_chat_text_content(content: str | list | None) -> str | None:
|
||||
"""
|
||||
Extract the text parts from a chat message content field and flatten them
|
||||
into a single string.
|
||||
"""
|
||||
if isinstance(content, list):
|
||||
return "".join(
|
||||
item.get("text", "")
|
||||
for item in content
|
||||
if isinstance(item, dict) and item.get("type") == "text"
|
||||
)
|
||||
return content
|
||||
|
||||
|
||||
def parse_chat_input_to_harmony_message(
    chat_msg, tool_id_names: dict[str, str] | None = None
) -> list[Message]:
    """
    Parse a message from request.messages in the Chat Completion API to
    Harmony messages.

    Args:
        chat_msg: One chat message, as a dict or Pydantic model.
        tool_id_names: Mapping from tool call id to function name, used to set
            the author of tool-role messages.

    Returns:
        A list of Harmony messages: one chat message may expand into several
        (e.g. commentary preamble + analysis + one message per tool call).

    Fix: removed the unreachable ``if content is None`` branch — the preceding
    ``chat_msg.get("content") or ""`` already maps None to "".
    """
    tool_id_names = tool_id_names or {}

    if not isinstance(chat_msg, dict):
        # Handle Pydantic models
        chat_msg = chat_msg.model_dump(exclude_none=True)

    role = chat_msg.get("role")
    msgs: list[Message] = []

    # Assistant message with tool calls
    tool_calls = chat_msg.get("tool_calls", [])

    if role == "assistant" and tool_calls:
        # Text alongside tool calls is a preamble -> commentary channel.
        content = flatten_chat_text_content(chat_msg.get("content"))
        if content:
            commentary_msg = Message.from_role_and_content(Role.ASSISTANT, content)
            commentary_msg = commentary_msg.with_channel("commentary")
            msgs.append(commentary_msg)

        reasoning_content = chat_msg.get("reasoning") or chat_msg.get(
            "reasoning_content"
        )
        if reasoning_content:
            analysis_msg = Message.from_role_and_content(
                Role.ASSISTANT, reasoning_content
            )
            analysis_msg = analysis_msg.with_channel("analysis")
            msgs.append(analysis_msg)

        for call in tool_calls:
            func = call.get("function", {})
            name = func.get("name", "")
            arguments = func.get("arguments", "") or ""
            msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
            msg = msg.with_channel("commentary")
            msg = msg.with_recipient(f"functions.{name}")
            # Officially, this should be `<|constrain|>json` but there is not clear
            # evidence that improves accuracy over `json` and some anecdotes to the
            # contrary. Further testing of the different content_types is needed.
            msg = msg.with_content_type("json")
            msgs.append(msg)
        return msgs

    # Tool role message (tool output)
    if role == "tool":
        tool_call_id = chat_msg.get("tool_call_id", "")
        name = tool_id_names.get(tool_call_id, "")
        content = chat_msg.get("content", "") or ""
        content = flatten_chat_text_content(content)

        msg = (
            Message.from_author_and_content(
                Author.new(Role.TOOL, f"functions.{name}"), content
            )
            .with_channel("commentary")
            .with_recipient("assistant")
        )
        return [msg]

    # Non-tool reasoning content
    reasoning_content = chat_msg.get("reasoning") or chat_msg.get("reasoning_content")
    if role == "assistant" and reasoning_content:
        analysis_msg = Message.from_role_and_content(Role.ASSISTANT, reasoning_content)
        analysis_msg = analysis_msg.with_channel("analysis")
        msgs.append(analysis_msg)

    # Default: user/assistant/system messages with content.
    # `or ""` maps both a missing key and an explicit None to "".
    content = chat_msg.get("content") or ""
    if isinstance(content, str):
        contents = [TextContent(text=content)]
    else:
        # TODO: Support refusal.
        contents = [TextContent(text=c.get("text", "")) for c in content]

    # Only add assistant messages if they have content, as reasoning or tool calling
    # assistant messages were already added above.
    if role == "assistant" and contents and contents[0].text:
        msg = Message.from_role_and_contents(role, contents)
        # Send non-tool assistant messages to the final channel
        msg = msg.with_channel("final")
        msgs.append(msg)
    # For user/system/developer messages, add them directly even if no content.
    elif role != "assistant":
        msg = Message.from_role_and_contents(role, contents)
        msgs.append(msg)

    return msgs
|
||||
|
||||
|
||||
def parse_input_to_harmony_message(chat_msg) -> list[Message]:
    """
    Parse a message from request.previous_input_messages in the Responses API to
    Harmony messages.
    """
    if not isinstance(chat_msg, dict):
        # Handle Pydantic models
        chat_msg = chat_msg.model_dump(exclude_none=True)

    role = chat_msg.get("role")

    # Assistant message with tool calls
    tool_calls = chat_msg.get("tool_calls")
    if role == "assistant" and tool_calls:
        # Each tool call becomes a commentary message with a functions.*
        # recipient and JSON-typed content.
        msgs: list[Message] = []
        for call in tool_calls:
            func = call.get("function", {})
            name = func.get("name", "")
            arguments = func.get("arguments", "") or ""
            msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
            msg = msg.with_channel("commentary")
            msg = msg.with_recipient(f"functions.{name}")
            msg = msg.with_content_type("json")
            msgs.append(msg)
        return msgs

    # Tool role message (tool output)
    if role == "tool":
        # NOTE: unlike the Chat Completion path, the function name is taken
        # from the message itself rather than a tool-id mapping.
        name = chat_msg.get("name", "")
        content = chat_msg.get("content", "") or ""
        content = flatten_chat_text_content(content)

        msg = Message.from_author_and_content(
            Author.new(Role.TOOL, f"functions.{name}"), content
        ).with_channel("commentary")
        return [msg]

    # Default: user/assistant/system messages with content
    content = chat_msg.get("content", "")
    if isinstance(content, str):
        contents = [TextContent(text=content)]
    else:
        # TODO: Support refusal.
        contents = [TextContent(text=c.get("text", "")) for c in content]
    msg = Message.from_role_and_contents(role, contents)
    return [msg]
|
||||
|
||||
|
||||
def construct_harmony_previous_input_messages(
    request: ResponsesRequest,
) -> list[OpenAIHarmonyMessage]:
    """Build the Harmony message list from request.previous_input_messages.

    System and developer messages are skipped: to match OpenAI, instructions,
    reasoning and tools are always taken from the most recent Responses API
    request, not carried over from previous requests.
    """
    messages: list[OpenAIHarmonyMessage] = []
    if not request.previous_input_messages:
        return messages

    for message in request.previous_input_messages:
        # Handle both OpenAIHarmonyMessage objects and dictionary inputs.
        if isinstance(message, OpenAIHarmonyMessage):
            candidates = [message]
        else:
            candidates = parse_input_to_harmony_message(message)
        for harmony_msg in candidates:
            message_role = harmony_msg.author.role
            if message_role in (
                OpenAIHarmonyRole.SYSTEM,
                OpenAIHarmonyRole.DEVELOPER,
            ):
                continue
            messages.append(harmony_msg)
    return messages
|
||||
|
||||
|
||||
def render_for_completion(messages: list[Message]) -> list[int]:
    """Render a Harmony conversation to prompt token ids for completion.

    The conversation is rendered so that the next turn belongs to the
    assistant.
    """
    conversation = Conversation.from_messages(messages)
    token_ids = get_encoding().render_conversation_for_completion(
        conversation, Role.ASSISTANT
    )
    return token_ids
|
||||
|
||||
|
||||
def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem:
    """Parse browser tool calls (search, open, find) into web search items.

    If the model emitted invalid JSON arguments, a descriptive placeholder is
    substituted into every argument field instead of failing the request.

    Raises:
        ValueError: If the message has more than one content part or the
            recipient is not a known browser action.
    """
    if len(message.content) != 1:
        raise ValueError("Invalid number of contents in browser message")
    content = message.content[0]

    # Parse JSON args (with retry detection)
    try:
        browser_call = json.loads(content.text)
    except json.JSONDecodeError:
        json_retry_output_message = (
            f"Invalid JSON args, caught and retried: {content.text}"
        )
        browser_call = {
            "query": json_retry_output_message,
            "url": json_retry_output_message,
            "pattern": json_retry_output_message,
        }

    # Create appropriate action based on recipient.
    # NOTE(review): the "cursor:" prefix on query/url appears intentional but
    # its consumer is not visible here — confirm before changing.
    if recipient == "browser.search":
        action = ActionSearch(
            query=f"cursor:{browser_call.get('query', '')}", type="search"
        )
    elif recipient == "browser.open":
        action = ActionOpenPage(
            url=f"cursor:{browser_call.get('url', '')}", type="open_page"
        )
    elif recipient == "browser.find":
        action = ActionFind(
            pattern=browser_call.get("pattern", ""),
            url=f"cursor:{browser_call.get('url', '')}",
            type="find",
        )
    else:
        raise ValueError(f"Unknown browser action: {recipient}")

    return ResponseFunctionWebSearch(
        id=f"ws_{random_uuid()}",
        action=action,
        status="completed",
        type="web_search_call",
    )
|
||||
|
||||
|
||||
def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
    """Parse function calls into function tool call items.

    One tool-call item is produced per content part; each item's ``call_id``
    and ``id`` share a single freshly generated uuid.
    """
    function_name = recipient.split(".")[-1]
    items: list[ResponseOutputItem] = []
    for part in message.content:
        uid = random_uuid()
        items.append(
            ResponseFunctionToolCall(
                arguments=part.text,
                call_id=f"call_{uid}",
                type="function_call",
                name=function_name,
                id=f"fc_{uid}",
            )
        )
    return items
|
||||
|
||||
|
||||
def _parse_reasoning_content(message: Message) -> list[ResponseOutputItem]:
    """Parse reasoning/analysis content into reasoning items, one per part."""
    return [
        ResponseReasoningItem(
            id=f"rs_{random_uuid()}",
            summary=[],
            type="reasoning",
            content=[
                ResponseReasoningTextContent(text=part.text, type="reasoning_text")
            ],
            status=None,
        )
        for part in message.content
    ]
|
||||
|
||||
|
||||
def _parse_final_message(message: Message) -> ResponseOutputItem:
    """Parse final channel messages into a single output message item."""
    texts = [
        ResponseOutputText(
            text=part.text,
            annotations=[],  # TODO
            type="output_text",
            logprobs=None,  # TODO
        )
        for part in message.content
    ]
    return ResponseOutputMessage(
        id=f"msg_{random_uuid()}",
        content=texts,
        role=message.author.role,
        status="completed",
        type="message",
    )
|
||||
|
||||
|
||||
def _parse_mcp_recipient(recipient: str) -> tuple[str, str]:
|
||||
"""
|
||||
Parse MCP recipient into (server_label, tool_name).
|
||||
|
||||
For dotted recipients like "repo_browser.list":
|
||||
- server_label: "repo_browser" (namespace/server)
|
||||
- tool_name: "list" (specific tool)
|
||||
|
||||
For simple recipients like "filesystem":
|
||||
- server_label: "filesystem"
|
||||
- tool_name: "filesystem"
|
||||
"""
|
||||
if "." in recipient:
|
||||
server_label = recipient.split(".")[0]
|
||||
tool_name = recipient.split(".")[-1]
|
||||
else:
|
||||
server_label = recipient
|
||||
tool_name = recipient
|
||||
return server_label, tool_name
|
||||
|
||||
|
||||
def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
    """Parse MCP calls into MCP call items, one per content part."""
    server_label, tool_name = _parse_mcp_recipient(recipient)
    return [
        McpCall(
            arguments=part.text,
            type="mcp_call",
            name=tool_name,
            server_label=server_label,
            id=f"mcp_{random_uuid()}",
            status="completed",
        )
        for part in message.content
    ]
|
||||
|
||||
|
||||
def parse_output_message(message: Message) -> list[ResponseOutputItem]:
    """
    Parse a Harmony message into a list of output response items.

    Dispatch is first on the recipient (tool calls), then on the channel
    (analysis/commentary/final) for plain messages.

    Raises:
        ValueError: If a recipient-less message is on an unknown channel.
    """
    if message.author.role != "assistant":
        # This is a message from a tool to the assistant (e.g., search result).
        # Don't include it in the final output for now. This aligns with
        # OpenAI's behavior on models like o4-mini.
        return []

    output_items: list[ResponseOutputItem] = []
    recipient = message.recipient

    if recipient is not None:
        # Browser tool calls
        if recipient.startswith("browser."):
            output_items.append(_parse_browser_tool_call(message, recipient))

        # Function calls (should only happen on commentary channel)
        elif message.channel == "commentary" and recipient.startswith("functions."):
            output_items.extend(_parse_function_call(message, recipient))

        # Built-in tools are treated as reasoning
        elif recipient.startswith(("python", "browser", "container")):
            # Built-in tool recipients (python/browser/container)
            # generate reasoning output
            output_items.extend(_parse_reasoning_content(message))

        # All other recipients are MCP calls
        else:
            output_items.extend(_parse_mcp_call(message, recipient))

    # No recipient - handle based on channel for non-tool messages
    elif message.channel == "analysis":
        output_items.extend(_parse_reasoning_content(message))

    elif message.channel == "commentary":
        # Per Harmony format, commentary channel can contain preambles to calling
        # multiple functions - explanatory text with no recipient
        output_items.extend(_parse_reasoning_content(message))

    elif message.channel == "final":
        output_items.append(_parse_final_message(message))

    else:
        raise ValueError(f"Unknown channel: {message.channel}")

    return output_items
|
||||
|
||||
|
||||
def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]:
    """Convert the parser's partial (unterminated) message into output items.

    Used when generation is cut off mid-message: returns at most one item,
    with an "in_progress"/"incomplete" status where the item type carries one.
    Partial non-assistant messages and partial browser tool calls are dropped.
    """
    if not parser.current_content:
        return []
    if parser.current_role != Role.ASSISTANT:
        return []
    current_recipient = parser.current_recipient
    # Partial browser calls cannot be represented as output items; skip them.
    if current_recipient is not None and current_recipient.startswith("browser."):
        return []

    if current_recipient and parser.current_channel in ("commentary", "analysis"):
        if current_recipient.startswith("functions."):
            rid = random_uuid()
            return [
                ResponseFunctionToolCall(
                    arguments=parser.current_content,
                    call_id=f"call_{rid}",
                    type="function_call",
                    name=current_recipient.split(".")[-1],
                    id=f"fc_{rid}",
                    status="in_progress",
                )
            ]
        # Built-in tools (python, browser, container) should be treated as reasoning
        elif not (
            current_recipient.startswith("python")
            or current_recipient.startswith("browser")
            or current_recipient.startswith("container")
        ):
            # All other recipients are MCP calls
            rid = random_uuid()
            server_label, tool_name = _parse_mcp_recipient(current_recipient)
            return [
                McpCall(
                    arguments=parser.current_content,
                    type="mcp_call",
                    name=tool_name,
                    server_label=server_label,
                    id=f"mcp_{rid}",
                    status="in_progress",
                )
            ]
        # NOTE: a built-in tool recipient falls through to the channel checks
        # below and is emitted as a reasoning item.

    if parser.current_channel == "commentary":
        return [
            ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(
                        text=parser.current_content, type="reasoning_text"
                    )
                ],
                status=None,
            )
        ]

    if parser.current_channel == "analysis":
        return [
            ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(
                        text=parser.current_content, type="reasoning_text"
                    )
                ],
                status=None,
            )
        ]

    if parser.current_channel == "final":
        output_text = ResponseOutputText(
            text=parser.current_content,
            annotations=[],  # TODO
            type="output_text",
            logprobs=None,  # TODO
        )
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=[output_text],
            role="assistant",
            # if the parser still has messages (ie if the generator got cut
            # abruptly), this should be incomplete
            status="incomplete",
            type="message",
        )
        return [text_item]

    return []
|
||||
|
||||
|
||||
def get_stop_tokens_for_assistant_actions() -> list[int]:
    """Return the Harmony token ids that terminate an assistant action."""
    return get_encoding().stop_tokens_for_assistant_actions()
|
||||
|
||||
|
||||
def get_streamable_parser_for_assistant() -> StreamableParser:
    """Create a fresh streaming parser configured for assistant output."""
    return StreamableParser(get_encoding(), role=Role.ASSISTANT)
|
||||
|
||||
|
||||
def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser:
    """Feed generated token ids through a Harmony streaming parser.

    Returns the parser itself so callers can read both the completed
    messages and any partial trailing state.
    """
    parser = get_streamable_parser_for_assistant()
    for token_id in token_ids:
        parser.process(token_id)
    return parser
|
||||
|
||||
|
||||
def parse_chat_output(
    token_ids: Sequence[int],
) -> tuple[str | None, str | None, bool]:
    """
    Parse the output of a Harmony chat completion into reasoning and final content.

    Note that when the `openai` tool parser is used, serving_chat only uses this
    for the reasoning content and gets the final content from the tool call parser.

    When the `openai` tool parser is not enabled, or when `GptOssReasoningParser` is
    in use, this needs to return the final content without any tool calls parsed.

    Empty reasoning or final content is returned as None instead of an empty string.

    Returns:
        A (reasoning, final_content, is_tool_call) tuple.
    """
    parser = parse_output_into_messages(token_ids)
    output_msgs = parser.messages
    is_tool_call = False  # TODO: update this when tool call is supported

    # Get completed messages from the parser
    reasoning_texts = [
        msg.content[0].text for msg in output_msgs if msg.channel == "analysis"
    ]
    final_texts = [
        msg.content[0].text for msg in output_msgs if msg.channel != "analysis"
    ]

    # Extract partial messages from the parser
    if parser.current_channel == "analysis" and parser.current_content:
        reasoning_texts.append(parser.current_content)
    elif parser.current_channel != "analysis" and parser.current_content:
        final_texts.append(parser.current_content)

    # Flatten multiple messages into a single string
    reasoning: str | None = "\n".join(reasoning_texts)
    final_content: str | None = "\n".join(final_texts)

    # Return None instead of empty string since existing callers check for None
    reasoning = reasoning or None
    final_content = final_content or None

    return reasoning, final_content, is_tool_call
|
||||
135
vllm/entrypoints/openai/parser/responses_parser.py
Normal file
135
vllm/entrypoints/openai/parser/responses_parser.py
Normal file
@@ -0,0 +1,135 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
|
||||
from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
|
||||
from openai.types.responses.response_output_message import ResponseOutputMessage
|
||||
from openai.types.responses.response_output_text import ResponseOutputText
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content,
|
||||
ResponseReasoningItem,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
|
||||
from vllm.outputs import CompletionOutput
|
||||
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
|
||||
from vllm.tokenizers.protocol import TokenizerLike
|
||||
from vllm.tool_parsers.abstract_tool_parser import ToolParser
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ResponsesParser:
    """Incremental parser over completion tokens with reasoning support.

    Accumulates Responses API output items (reasoning, assistant messages,
    function calls) into ``response_messages`` as completion outputs are
    processed.
    """

    def __init__(
        self,
        *,
        tokenizer: AnyTokenizer,
        reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
        response_messages: list[ResponseInputOutputItem],
        request: ResponsesRequest,
        tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
    ):
        """Create a parser.

        Args:
            tokenizer: Tokenizer handed to the reasoning/tool parsers.
            reasoning_parser_cls: Factory for the reasoning parser.
            response_messages: Initial message list; parsed items are
                appended to it in place.
            request: The originating Responses API request.
            tool_parser_cls: Optional factory for the tool-call parser; when
                None, tool calls are not extracted.
        """
        self.response_messages: list[ResponseInputOutputItem] = (
            # TODO: initial messages may not be properly typed
            response_messages
        )
        # Count of pre-existing messages so callers can separate the inputs
        # from newly parsed output items.
        self.num_init_messages = len(response_messages)
        self.tokenizer = tokenizer
        self.request = request

        self.reasoning_parser_instance = reasoning_parser_cls(tokenizer)
        self.tool_parser_instance = None
        if tool_parser_cls is not None:
            self.tool_parser_instance = tool_parser_cls(tokenizer)

    def process(self, output: CompletionOutput) -> "ResponsesParser":
        """Parse one completion output and append the resulting items.

        Splits the text into reasoning vs. content, extracts tool calls from
        the content when a tool parser is configured, and appends (in order)
        a reasoning item, an output message, and any function-call items.
        Returns self for chaining.
        """
        reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
            output.text, request=self.request
        )
        if reasoning_content:
            self.response_messages.append(
                ResponseReasoningItem(
                    type="reasoning",
                    id=f"rs_{random_uuid()}",
                    summary=[],
                    content=[
                        Content(
                            type="reasoning_text",
                            text=reasoning_content,
                        )
                    ],
                )
            )

        function_calls: list[ResponseFunctionToolCall] = []
        if self.tool_parser_instance is not None:
            tool_call_info = self.tool_parser_instance.extract_tool_calls(
                content if content is not None else "",
                request=self.request,  # type: ignore
            )
            if tool_call_info is not None and tool_call_info.tools_called:
                # extract_tool_calls() returns a list of tool calls.
                function_calls.extend(
                    ResponseFunctionToolCall(
                        id=f"fc_{random_uuid()}",
                        call_id=f"call_{random_uuid()}",
                        type="function_call",
                        status="completed",
                        name=tool_call.function.name,
                        arguments=tool_call.function.arguments,
                    )
                    for tool_call in tool_call_info.tool_calls
                )
                # The tool parser strips the tool calls out of the content;
                # whitespace-only leftovers are treated as no content.
                content = tool_call_info.content
                if content and content.strip() == "":
                    content = None

        if content:
            self.response_messages.append(
                ResponseOutputMessage(
                    type="message",
                    id=f"msg_{random_uuid()}",
                    status="completed",
                    role="assistant",
                    content=[
                        ResponseOutputText(
                            annotations=[],  # TODO
                            type="output_text",
                            text=content,
                            logprobs=None,  # TODO
                        )
                    ],
                )
            )
        if len(function_calls) > 0:
            self.response_messages.extend(function_calls)

        return self
|
||||
|
||||
|
||||
def get_responses_parser_for_simple_context(
    *,
    tokenizer: AnyTokenizer,
    reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
    response_messages: list[ResponseInputOutputItem],
    request: ResponsesRequest,
    tool_parser_cls,
) -> ResponsesParser:
    """Factory for a ``ResponsesParser`` in the simple (non-harmony) context.

    Returns:
        A ResponsesParser wired with the given tokenizer, parser classes,
        message accumulator, and request.
    """
    parser_kwargs = {
        "tokenizer": tokenizer,
        "reasoning_parser_cls": reasoning_parser_cls,
        "response_messages": response_messages,
        "request": request,
        "tool_parser_cls": tool_parser_cls,
    }
    return ResponsesParser(**parser_kwargs)
|
||||
File diff suppressed because it is too large
Load Diff
631
vllm/entrypoints/openai/run_batch.py
Normal file
631
vllm/entrypoints/openai/run_batch.py
Normal file
@@ -0,0 +1,631 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import tempfile
|
||||
from argparse import Namespace
|
||||
from collections.abc import Awaitable, Callable
|
||||
from http import HTTPStatus
|
||||
from io import StringIO
|
||||
from typing import Any, TypeAlias
|
||||
|
||||
import aiohttp
|
||||
import torch
|
||||
from prometheus_client import start_http_server
|
||||
from pydantic import TypeAdapter, field_validator
|
||||
from pydantic_core.core_schema import ValidationInfo
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ErrorResponse,
|
||||
OpenAIBaseModel,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse
|
||||
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
|
||||
from vllm.entrypoints.pooling.score.protocol import (
|
||||
RerankRequest,
|
||||
RerankResponse,
|
||||
ScoreRequest,
|
||||
ScoreResponse,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.serving import ServingScores
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
BatchRequestInputBody: TypeAlias = (
|
||||
ChatCompletionRequest | EmbeddingRequest | ScoreRequest | RerankRequest
|
||||
)
|
||||
|
||||
|
||||
class BatchRequestInput(OpenAIBaseModel):
    """
    The per-line object of the batch input file.

    Supported endpoints are /v1/chat/completions, /v1/embeddings, and any
    URL ending in /score or /rerank (see ``check_type_for_url``).
    """

    # A developer-provided per-request id that will be used to match outputs to
    # inputs. Must be unique for each request in a batch.
    custom_id: str

    # The HTTP method to be used for the request. Currently only POST is
    # supported.
    method: str

    # The OpenAI API relative URL to be used for the request; it selects
    # which concrete request model `body` is parsed as.
    url: str

    # The parameters of the request.
    body: BatchRequestInputBody

    @field_validator("body", mode="plain")
    @classmethod
    def check_type_for_url(cls, value: Any, info: ValidationInfo):
        """Resolve the concrete request model for ``body`` from ``url``.

        ``mode="plain"`` replaces pydantic's default union resolution, so the
        endpoint URL — not field-shape guessing — decides which model to use.
        """
        # Use url to disambiguate models
        url: str = info.data["url"]
        if url == "/v1/chat/completions":
            return ChatCompletionRequest.model_validate(value)
        if url == "/v1/embeddings":
            return TypeAdapter(EmbeddingRequest).validate_python(value)
        if url.endswith("/score"):
            return ScoreRequest.model_validate(value)
        if url.endswith("/rerank"):
            return RerankRequest.model_validate(value)
        # Unknown URL: fall back to trying every member of the union.
        return TypeAdapter(BatchRequestInputBody).validate_python(value)
|
||||
|
||||
|
||||
class BatchResponseData(OpenAIBaseModel):
    """HTTP-level response portion of one batch output line."""

    # HTTP status code of the response.
    status_code: int = 200

    # An unique identifier for the API request.
    request_id: str

    # The parsed response body; None when the request failed before a
    # response body could be produced.
    body: (
        ChatCompletionResponse
        | EmbeddingResponse
        | ScoreResponse
        | RerankResponse
        | None
    ) = None
|
||||
|
||||
|
||||
class BatchRequestOutput(OpenAIBaseModel):
    """
    The per-line object of the batch output and error files
    """

    # Unique identifier generated by vLLM for this output line.
    id: str

    # A developer-provided per-request id that will be used to match outputs to
    # inputs.
    custom_id: str

    # HTTP-level response data; may be None for non-HTTP failures.
    response: BatchResponseData | None

    # For requests that failed with a non-HTTP error, this will contain more
    # information on the cause of the failure.
    error: Any | None
|
||||
|
||||
|
||||
def make_arg_parser(parser: FlexibleArgumentParser):
    """Register all run_batch CLI flags on *parser* and return it.

    Adds the batch-runner I/O options, the async engine arguments, and the
    Prometheus metrics options.
    """
    parser.add_argument(
        "-i",
        "--input-file",
        required=True,
        type=str,
        help="The path or url to a single input file. Currently supports local file "
        "paths, or the http protocol (http or https). If a URL is specified, "
        "the file should be available via HTTP GET.",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        required=True,
        type=str,
        help="The path or url to a single output file. Currently supports "
        "local file paths, or web (http or https) urls. If a URL is specified,"
        " the file should be available via HTTP PUT.",
    )
    parser.add_argument(
        "--output-tmp-dir",
        type=str,
        default=None,
        help="The directory to store the output file before uploading it "
        "to the output URL.",
    )
    parser.add_argument(
        "--response-role",
        type=optional_type(str),
        default="assistant",
        help="The role name to return if `request.add_generation_prompt=True`.",
    )

    # Engine flags come from AsyncEngineArgs; note this rebinds `parser`.
    parser = AsyncEngineArgs.add_cli_args(parser)

    parser.add_argument(
        "--max-log-len",
        type=int,
        default=None,
        help="Max number of prompt characters or prompt "
        "ID numbers being printed in log."
        "\n\nDefault: Unlimited",
    )

    # Prometheus metrics options (see the __main__ guard for how they're used).
    parser.add_argument(
        "--enable-metrics", action="store_true", help="Enable Prometheus metrics"
    )
    parser.add_argument(
        "--url",
        type=str,
        default="0.0.0.0",
        help="URL to the Prometheus metrics server "
        "(only needed if enable-metrics is set).",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port number for the Prometheus metrics server "
        "(only needed if enable-metrics is set).",
    )
    parser.add_argument(
        "--enable-prompt-tokens-details",
        action="store_true",
        default=False,
        help="If set to True, enable prompt_tokens_details in usage.",
    )
    parser.add_argument(
        "--enable-force-include-usage",
        action="store_true",
        default=False,
        help="If set to True, include usage on every request "
        "(even when stream_options is not specified)",
    )

    return parser
|
||||
|
||||
|
||||
def parse_args():
    """Build the batch-runner CLI parser and parse ``sys.argv``."""
    base_parser = FlexibleArgumentParser(
        description="vLLM OpenAI-Compatible batch runner."
    )
    full_parser = make_arg_parser(base_parser)
    return full_parser.parse_args()
|
||||
|
||||
|
||||
# Explicitly use a pure-text bar format with a trailing newline. This makes
# the in-place progress-bar animation impossible, but avoids garbled output
# under ray or multiprocessing, which wrap each emitted line with a prefix.
_BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]\n"  # noqa: E501
|
||||
|
||||
|
||||
class BatchProgressTracker:
    """Counts submitted requests and drives a tqdm bar as they complete."""

    def __init__(self):
        self._total = 0
        self._pbar: tqdm | None = None

    def submitted(self):
        """Record that one more request has been scheduled."""
        self._total = self._total + 1

    def completed(self):
        """Advance the progress bar by one finished request, if it exists."""
        bar = self._pbar
        if bar:
            bar.update()

    def pbar(self) -> tqdm:
        """Create (and remember) a progress bar sized to the submitted total."""
        # Show progress only on rank 0 when torch.distributed is initialized.
        is_rank_zero = (
            not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
        )
        progress = tqdm(
            total=self._total,
            unit="req",
            desc="Running batch",
            mininterval=5,
            bar_format=_BAR_FORMAT,
            disable=not is_rank_zero,
        )
        self._pbar = progress
        return progress
|
||||
|
||||
|
||||
async def read_file(path_or_url: str) -> str:
    """Return the text content of a local file or an HTTP(S) resource."""
    is_remote = path_or_url.startswith(("http://", "https://"))
    if not is_remote:
        with open(path_or_url, encoding="utf-8") as handle:
            return handle.read()
    async with aiohttp.ClientSession() as session, session.get(path_or_url) as resp:
        return await resp.text()
|
||||
|
||||
|
||||
async def write_local_file(
    output_path: str, batch_outputs: list[BatchRequestOutput]
) -> None:
    """
    Write the responses to a local file, one JSON document per line.
    output_path: The path to write the responses to.
    batch_outputs: The list of batch outputs to write.
    """
    # Blocking I/O is acceptable here: run_batch runs as a standalone
    # program, so briefly holding the event loop affects nothing else.
    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(item.model_dump_json() + "\n" for item in batch_outputs)
|
||||
|
||||
|
||||
async def upload_data(output_url: str, data_or_file: str, from_file: bool) -> None:
    """
    Upload data (or a local file) to a URL via HTTP PUT, with retries.

    output_url: The URL to upload the file to.
    data_or_file: Either the data to upload or the path to the file to upload.
    from_file: If True, data_or_file is the path to the file to upload.

    Raises:
        Exception: if the upload still fails after all retries.
    """
    # Timeout is a common issue when uploading large files.
    # We retry max_retries times before giving up.
    max_retries = 5
    # Number of seconds to wait before retrying.
    delay = 5

    for attempt in range(1, max_retries + 1):
        try:
            # We increase the timeout to 1000 seconds to allow
            # for large files (default is 300).
            async with aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=1000)
            ) as session:
                if from_file:
                    with open(data_or_file, "rb") as file:
                        async with session.put(output_url, data=file) as response:
                            if response.status != 200:
                                # BUGFIX: response.text() is a coroutine and
                                # must be awaited, otherwise the message
                                # contains a coroutine repr, not the body.
                                raise Exception(
                                    f"Failed to upload file.\n"
                                    f"Status: {response.status}\n"
                                    f"Response: {await response.text()}"
                                )
                else:
                    async with session.put(output_url, data=data_or_file) as response:
                        if response.status != 200:
                            raise Exception(
                                f"Failed to upload data.\n"
                                f"Status: {response.status}\n"
                                f"Response: {await response.text()}"
                            )
            # BUGFIX: stop after a successful upload. Previously the loop
            # fell through and re-uploaded the payload on every retry slot.
            return
        except Exception as e:
            if attempt < max_retries:
                logger.error(
                    "Failed to upload data (attempt %d). Error message: %s.\nRetrying in %d seconds...",  # noqa: E501
                    attempt,
                    e,
                    delay,
                )
                await asyncio.sleep(delay)
            else:
                raise Exception(
                    f"Failed to upload data (attempt {attempt}). Error message: {str(e)}."  # noqa: E501
                ) from e
|
||||
|
||||
|
||||
async def write_file(
    path_or_url: str, batch_outputs: list[BatchRequestOutput], output_tmp_dir: str
) -> None:
    """
    Write batch_outputs to a file or upload to a URL.
    path_or_url: The path or URL to write batch_outputs to.
    batch_outputs: The list of batch outputs to write.
    output_tmp_dir: The directory to store the output file before uploading it
    to the output URL.
    """
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        if output_tmp_dir is None:
            # No scratch directory configured: serialize everything into an
            # in-memory buffer and upload the raw bytes directly.
            logger.info("Writing outputs to memory buffer")
            output_buffer = StringIO()
            for o in batch_outputs:
                print(o.model_dump_json(), file=output_buffer)
            output_buffer.seek(0)
            logger.info("Uploading outputs to %s", path_or_url)
            await upload_data(
                path_or_url,
                output_buffer.read().strip().encode("utf-8"),
                from_file=False,
            )
        else:
            # Write responses to a temporary file and then upload it to the URL.
            # The file is deleted automatically when the context exits.
            with tempfile.NamedTemporaryFile(
                mode="w",
                encoding="utf-8",
                dir=output_tmp_dir,
                prefix="tmp_batch_output_",
                suffix=".jsonl",
            ) as f:
                logger.info("Writing outputs to temporary local file %s", f.name)
                await write_local_file(f.name, batch_outputs)
                logger.info("Uploading outputs to %s", path_or_url)
                await upload_data(path_or_url, f.name, from_file=True)
    else:
        # Plain local path: write directly.
        logger.info("Writing outputs to local file %s", path_or_url)
        await write_local_file(path_or_url, batch_outputs)
|
||||
|
||||
|
||||
def make_error_request_output(
    request: BatchRequestInput, error_msg: str
) -> BatchRequestOutput:
    """Build a BAD_REQUEST batch output line carrying *error_msg*."""
    error_response = BatchResponseData(
        status_code=HTTPStatus.BAD_REQUEST,
        request_id=f"vllm-batch-{random_uuid()}",
    )
    return BatchRequestOutput(
        id=f"vllm-{random_uuid()}",
        custom_id=request.custom_id,
        response=error_response,
        error=error_msg,
    )
|
||||
|
||||
|
||||
async def make_async_error_request_output(
    request: BatchRequestInput, error_msg: str
) -> BatchRequestOutput:
    """Awaitable wrapper around make_error_request_output."""
    output = make_error_request_output(request, error_msg)
    return output
|
||||
|
||||
|
||||
async def run_request(
    serving_engine_func: Callable,
    request: BatchRequestInput,
    tracker: BatchProgressTracker,
) -> BatchRequestOutput:
    """Invoke one serving handler and wrap its result as a batch output line.

    Marks the request as completed on *tracker* before returning.
    """
    response = await serving_engine_func(request.body)

    success_types = (
        ChatCompletionResponse,
        EmbeddingResponse,
        ScoreResponse,
        RerankResponse,
    )
    if isinstance(response, success_types):
        result = BatchRequestOutput(
            id=f"vllm-{random_uuid()}",
            custom_id=request.custom_id,
            response=BatchResponseData(
                body=response, request_id=f"vllm-batch-{random_uuid()}"
            ),
            error=None,
        )
    elif isinstance(response, ErrorResponse):
        result = BatchRequestOutput(
            id=f"vllm-{random_uuid()}",
            custom_id=request.custom_id,
            response=BatchResponseData(
                status_code=response.error.code,
                request_id=f"vllm-batch-{random_uuid()}",
            ),
            error=response,
        )
    else:
        # Anything else (e.g. a streaming generator) cannot be represented
        # in a batch output line.
        result = make_error_request_output(
            request, error_msg="Request must not be sent in stream mode"
        )

    tracker.completed()
    return result
|
||||
|
||||
|
||||
def validate_run_batch_args(args):
    """Reject CLI args whose configured reasoning parser is not registered."""
    registered = ReasoningParserManager.list_registered()
    parser_name = args.structured_outputs_config.reasoning_parser
    if parser_name and parser_name not in registered:
        raise KeyError(
            f"invalid reasoning parser: {parser_name} "
            f"(chose from {{ {','.join(registered)} }})"
        )
|
||||
|
||||
|
||||
async def run_batch(
    engine_client: EngineClient,
    args: Namespace,
) -> None:
    """Run every request in the batch input file and write the output file.

    Builds the OpenAI-compatible serving handlers that the model supports,
    dispatches each input line to the matching endpoint handler concurrently,
    then writes one output line per request (in input order) to
    ``args.output_file``.
    """
    if args.served_model_name is not None:
        served_model_names = args.served_model_name
    else:
        served_model_names = [args.model]

    if args.enable_log_requests:
        request_logger = RequestLogger(max_log_len=args.max_log_len)
    else:
        request_logger = None

    base_model_paths = [
        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
    ]

    model_config = engine_client.model_config
    supported_tasks = await engine_client.get_supported_tasks()
    logger.info("Supported tasks: %s", supported_tasks)

    # Create the openai serving objects.
    openai_serving_models = OpenAIServingModels(
        engine_client=engine_client,
        base_model_paths=base_model_paths,
        lora_modules=None,
    )

    # Each handler is None when the model does not support the task; the
    # dispatch loop below turns that into a per-request error line.
    openai_serving_chat = (
        OpenAIServingChat(
            engine_client,
            openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "generate" in supported_tasks
        else None
    )

    openai_serving_embedding = (
        OpenAIServingEmbedding(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
        )
        if "embed" in supported_tasks
        else None
    )

    # Reranking is served via the scores handler when the classifier has a
    # single output label.
    enable_serving_reranking = (
        "classify" in supported_tasks
        and getattr(model_config.hf_config, "num_labels", 0) == 1
    )

    openai_serving_scores = (
        ServingScores(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
        )
        if ("embed" in supported_tasks or enable_serving_reranking)
        else None
    )

    tracker = BatchProgressTracker()
    logger.info("Reading batch from %s...", args.input_file)

    # Submit all requests in the file to the engine "concurrently".
    response_futures: list[Awaitable[BatchRequestOutput]] = []
    for request_json in (await read_file(args.input_file)).strip().split("\n"):
        # Skip empty lines.
        request_json = request_json.strip()
        if not request_json:
            continue

        request = BatchRequestInput.model_validate_json(request_json)

        # Determine the type of request and run it.
        if request.url == "/v1/chat/completions":
            chat_handler_fn = (
                openai_serving_chat.create_chat_completion
                if openai_serving_chat is not None
                else None
            )
            if chat_handler_fn is None:
                response_futures.append(
                    make_async_error_request_output(
                        request,
                        error_msg="The model does not support Chat Completions API",
                    )
                )
                continue

            response_futures.append(run_request(chat_handler_fn, request, tracker))
            tracker.submitted()
        elif request.url == "/v1/embeddings":
            embed_handler_fn = (
                openai_serving_embedding.create_embedding
                if openai_serving_embedding is not None
                else None
            )
            if embed_handler_fn is None:
                response_futures.append(
                    make_async_error_request_output(
                        request,
                        error_msg="The model does not support Embeddings API",
                    )
                )
                continue

            response_futures.append(run_request(embed_handler_fn, request, tracker))
            tracker.submitted()
        elif request.url.endswith("/score"):
            score_handler_fn = (
                openai_serving_scores.create_score
                if openai_serving_scores is not None
                else None
            )
            if score_handler_fn is None:
                response_futures.append(
                    make_async_error_request_output(
                        request,
                        error_msg="The model does not support Scores API",
                    )
                )
                continue

            response_futures.append(run_request(score_handler_fn, request, tracker))
            tracker.submitted()
        elif request.url.endswith("/rerank"):
            rerank_handler_fn = (
                openai_serving_scores.do_rerank
                if openai_serving_scores is not None
                else None
            )
            if rerank_handler_fn is None:
                response_futures.append(
                    make_async_error_request_output(
                        request,
                        error_msg="The model does not support Rerank API",
                    )
                )
                continue

            response_futures.append(run_request(rerank_handler_fn, request, tracker))
            tracker.submitted()
        else:
            response_futures.append(
                make_async_error_request_output(
                    request,
                    error_msg=f"URL {request.url} was used. "
                    "Supported endpoints: /v1/chat/completions, /v1/embeddings,"
                    " /score, /rerank ."
                    "See vllm/entrypoints/openai/api_server.py for supported "
                    "score/rerank versions.",
                )
            )

    # gather() preserves input order, so output lines match input lines.
    with tracker.pbar():
        responses = await asyncio.gather(*response_futures)

    await write_file(args.output_file, responses, args.output_tmp_dir)
|
||||
|
||||
|
||||
async def main(args: Namespace):
    """Entry coroutine: build an async engine client and run the batch."""
    # NOTE(review): imported here rather than at module level — presumably to
    # defer the heavy api_server import; confirm before moving to the top.
    from vllm.entrypoints.openai.api_server import build_async_engine_client
    from vllm.usage.usage_lib import UsageContext

    # Fail fast on invalid CLI arguments (e.g. unknown reasoning parser).
    validate_run_batch_args(args)

    async with build_async_engine_client(
        args,
        usage_context=UsageContext.OPENAI_BATCH_RUNNER,
        disable_frontend_multiprocessing=False,
    ) as engine_client:
        await run_batch(engine_client, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
logger.info("vLLM batch processing API version %s", VLLM_VERSION)
|
||||
logger.info("args: %s", args)
|
||||
|
||||
# Start the Prometheus metrics server. LLMEngine uses the Prometheus client
|
||||
# to publish metrics at the /metrics endpoint.
|
||||
if args.enable_metrics:
|
||||
logger.info("Prometheus metrics enabled")
|
||||
start_http_server(port=args.port, addr=args.url)
|
||||
else:
|
||||
logger.info("Prometheus metrics disabled")
|
||||
|
||||
asyncio.run(main(args))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,67 +1,90 @@
|
||||
import time
|
||||
from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
|
||||
Optional, Tuple)
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from collections.abc import Sequence as GenericSequence
|
||||
from typing import cast
|
||||
|
||||
import jinja2
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.openai.protocol import (CompletionRequest,
|
||||
CompletionResponse,
|
||||
CompletionResponseChoice,
|
||||
CompletionResponseStreamChoice,
|
||||
CompletionStreamResponse,
|
||||
LogProbs, UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
|
||||
OpenAIServing)
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
CompletionLogProbs,
|
||||
CompletionRequest,
|
||||
CompletionResponse,
|
||||
CompletionResponseChoice,
|
||||
CompletionResponseStreamChoice,
|
||||
CompletionStreamResponse,
|
||||
ErrorResponse,
|
||||
PromptTokenUsageInfo,
|
||||
RequestResponseMetadata,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import (
|
||||
GenerationError,
|
||||
OpenAIServing,
|
||||
clamp_prompt_logprobs,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
|
||||
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.guided_decoding import (
|
||||
get_guided_decoding_logits_processor)
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.utils import merge_async_iterators, random_uuid
|
||||
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.utils.async_utils import merge_async_iterators
|
||||
from vllm.utils.collection_utils import as_list
|
||||
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
TypeTokenIDs = List[int]
|
||||
TypeTopLogProbs = List[Optional[Dict[int, float]]]
|
||||
TypeCreateLogProbsFn = Callable[
|
||||
[TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs]
|
||||
|
||||
|
||||
def parse_prompt_format(prompt) -> Tuple[bool, list]:
    """Classify an OpenAI ``prompt`` payload and normalize it to a batch.

    OpenAI supports "a string, array of strings, array of tokens, or array
    of token arrays".

    Returns:
        ``(prompt_is_tokens, prompts)`` where ``prompts`` is always a list of
        prompts (each a string or a list of token ids) and
        ``prompt_is_tokens`` tells which of the two it is.

    Raises:
        ValueError: if ``prompt`` is an empty list or not one of the four
            accepted shapes.
    """
    prompt_is_tokens = False
    prompts = [prompt]  # case 1: a string
    if isinstance(prompt, list):
        if len(prompt) == 0:
            raise ValueError("please provide at least one prompt")
        elif isinstance(prompt[0], str):
            prompt_is_tokens = False
            prompts = prompt  # case 2: array of strings
        elif isinstance(prompt[0], int):
            prompt_is_tokens = True
            prompts = [prompt]  # case 3: array of tokens
        elif isinstance(prompt[0], list) and (
            # BUGFIX: an empty inner token list previously raised IndexError
            # via prompt[0][0]; accept it as a zero-length token array.
            len(prompt[0]) == 0 or isinstance(prompt[0][0], int)
        ):
            prompt_is_tokens = True
            prompts = prompt  # case 4: array of token arrays
        else:
            raise ValueError("prompt must be a string, array of strings, "
                             "array of tokens, or array of token arrays")
    return prompt_is_tokens, prompts
|
||||
|
||||
|
||||
class OpenAIServingCompletion(OpenAIServing):
|
||||
def __init__(
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
request_logger: RequestLogger | None,
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
enable_prompt_tokens_details: bool = False,
|
||||
enable_force_include_usage: bool = False,
|
||||
log_error_stack: bool = False,
|
||||
):
|
||||
super().__init__(
|
||||
engine_client=engine_client,
|
||||
models=models,
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=return_tokens_as_token_ids,
|
||||
log_error_stack=log_error_stack,
|
||||
)
|
||||
|
||||
def __init__(self,
|
||||
engine: AsyncLLMEngine,
|
||||
served_model_names: List[str],
|
||||
lora_modules: Optional[List[LoRAModulePath]] = None):
|
||||
super().__init__(engine=engine,
|
||||
served_model_names=served_model_names,
|
||||
lora_modules=lora_modules)
|
||||
# set up logits processors
|
||||
self.logits_processors = self.model_config.logits_processors
|
||||
|
||||
async def create_completion(self, request: CompletionRequest,
|
||||
raw_request: Request):
|
||||
self.enable_prompt_tokens_details = enable_prompt_tokens_details
|
||||
self.default_sampling_params = self.model_config.get_diff_sampling_param()
|
||||
self.enable_force_include_usage = enable_force_include_usage
|
||||
if self.default_sampling_params:
|
||||
source = self.model_config.generation_config
|
||||
source = "model" if source == "auto" else source
|
||||
logger.info(
|
||||
"Using default completion sampling params from %s: %s",
|
||||
source,
|
||||
self.default_sampling_params,
|
||||
)
|
||||
|
||||
async def create_completion(
|
||||
self,
|
||||
request: CompletionRequest,
|
||||
raw_request: Request | None = None,
|
||||
) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
|
||||
"""Completion API similar to OpenAI's API.
|
||||
|
||||
See https://platform.openai.com/docs/api-reference/completions/create
|
||||
@@ -75,90 +98,214 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
if error_check_ret is not None:
|
||||
return error_check_ret
|
||||
|
||||
# If the engine is dead, raise the engine's DEAD_ERROR.
|
||||
# This is required for the streaming case, where we return a
|
||||
# success status before we actually start generating text :).
|
||||
if self.engine_client.errored:
|
||||
raise self.engine_client.dead_error
|
||||
|
||||
# Return error for unsupported features.
|
||||
if request.suffix is not None:
|
||||
return self.create_error_response(
|
||||
"suffix is not currently supported")
|
||||
return self.create_error_response("suffix is not currently supported")
|
||||
|
||||
model_name = self.served_model_names[0]
|
||||
request_id = f"cmpl-{random_uuid()}"
|
||||
if request.echo and request.prompt_embeds is not None:
|
||||
return self.create_error_response("Echo is unsupported with prompt embeds.")
|
||||
|
||||
if request.prompt_logprobs is not None and request.prompt_embeds is not None:
|
||||
return self.create_error_response(
|
||||
"prompt_logprobs is not compatible with prompt embeds."
|
||||
)
|
||||
|
||||
request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
|
||||
created_time = int(time.time())
|
||||
|
||||
# Schedule the request and get the result generator.
|
||||
generators: List[AsyncIterator[RequestOutput]] = []
|
||||
request_metadata = RequestResponseMetadata(request_id=request_id)
|
||||
if raw_request:
|
||||
raw_request.state.request_metadata = request_metadata
|
||||
|
||||
try:
|
||||
sampling_params = request.to_sampling_params()
|
||||
lora_request = self._maybe_get_lora(request)
|
||||
decoding_config = await self.engine.get_decoding_config()
|
||||
guided_decoding_backend = request.guided_decoding_backend \
|
||||
or decoding_config.guided_decoding_backend
|
||||
guided_decode_logit_processor = (
|
||||
await get_guided_decoding_logits_processor(
|
||||
guided_decoding_backend, request, await
|
||||
self.engine.get_tokenizer()))
|
||||
if guided_decode_logit_processor is not None:
|
||||
if sampling_params.logits_processors is None:
|
||||
sampling_params.logits_processors = []
|
||||
sampling_params.logits_processors.append(
|
||||
guided_decode_logit_processor)
|
||||
prompt_is_tokens, prompts = parse_prompt_format(request.prompt)
|
||||
lora_request = self._maybe_get_adapters(request)
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
if prompt_is_tokens:
|
||||
prompt_formats = self._validate_prompt_and_tokenize(
|
||||
request,
|
||||
prompt_ids=prompt,
|
||||
truncate_prompt_tokens=sampling_params.
|
||||
truncate_prompt_tokens)
|
||||
if self.model_config.skip_tokenizer_init:
|
||||
tokenizer = None
|
||||
else:
|
||||
tokenizer = await self.engine_client.get_tokenizer()
|
||||
renderer = self._get_renderer(tokenizer)
|
||||
|
||||
engine_prompts = await renderer.render_prompt_and_embeds(
|
||||
prompt_or_prompts=request.prompt,
|
||||
prompt_embeds=request.prompt_embeds,
|
||||
config=self._build_render_config(request),
|
||||
)
|
||||
except ValueError as e:
|
||||
logger.exception("Error in preprocessing prompt inputs")
|
||||
return self.create_error_response(str(e))
|
||||
except TypeError as e:
|
||||
logger.exception("Error in preprocessing prompt inputs")
|
||||
return self.create_error_response(str(e))
|
||||
except RuntimeError as e:
|
||||
logger.exception("Error in preprocessing prompt inputs")
|
||||
return self.create_error_response(str(e))
|
||||
except jinja2.TemplateError as e:
|
||||
logger.exception("Error in preprocessing prompt inputs")
|
||||
return self.create_error_response(str(e))
|
||||
|
||||
# Extract data_parallel_rank from header (router can inject it)
|
||||
data_parallel_rank = self._get_data_parallel_rank(raw_request)
|
||||
|
||||
# Schedule the request and get the result generator.
|
||||
generators: list[AsyncGenerator[RequestOutput, None]] = []
|
||||
try:
|
||||
for i, engine_prompt in enumerate(engine_prompts):
|
||||
prompt_text, prompt_token_ids, prompt_embeds = (
|
||||
self._get_prompt_components(engine_prompt)
|
||||
)
|
||||
|
||||
input_length = None
|
||||
if prompt_token_ids is not None:
|
||||
input_length = len(prompt_token_ids)
|
||||
elif prompt_embeds is not None:
|
||||
input_length = len(prompt_embeds)
|
||||
else:
|
||||
prompt_formats = self._validate_prompt_and_tokenize(
|
||||
request,
|
||||
prompt=prompt,
|
||||
truncate_prompt_tokens=sampling_params.
|
||||
truncate_prompt_tokens)
|
||||
prompt_ids, prompt_text = prompt_formats
|
||||
raise NotImplementedError
|
||||
|
||||
generators.append(
|
||||
self.engine.generate(prompt_text,
|
||||
sampling_params,
|
||||
f"{request_id}-{i}",
|
||||
prompt_token_ids=prompt_ids,
|
||||
lora_request=lora_request))
|
||||
if self.default_sampling_params is None:
|
||||
self.default_sampling_params = {}
|
||||
|
||||
max_tokens = get_max_tokens(
|
||||
max_model_len=self.max_model_len,
|
||||
request=request,
|
||||
input_length=input_length,
|
||||
default_sampling_params=self.default_sampling_params,
|
||||
)
|
||||
|
||||
sampling_params: SamplingParams | BeamSearchParams
|
||||
if request.use_beam_search:
|
||||
sampling_params = request.to_beam_search_params(
|
||||
max_tokens, self.default_sampling_params
|
||||
)
|
||||
else:
|
||||
sampling_params = request.to_sampling_params(
|
||||
max_tokens,
|
||||
self.model_config.logits_processor_pattern,
|
||||
self.default_sampling_params,
|
||||
)
|
||||
validate_logits_processors_parameters(
|
||||
self.logits_processors,
|
||||
sampling_params,
|
||||
)
|
||||
|
||||
request_id_item = f"{request_id}-{i}"
|
||||
|
||||
self._log_inputs(
|
||||
request_id_item,
|
||||
engine_prompt,
|
||||
params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
trace_headers = (
|
||||
None
|
||||
if raw_request is None
|
||||
else await self._get_trace_headers(raw_request.headers)
|
||||
)
|
||||
|
||||
# Mypy inconsistently requires this second cast in different
|
||||
# environments. It shouldn't be necessary (redundant from above)
|
||||
# but pre-commit in CI fails without it.
|
||||
engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
|
||||
if isinstance(sampling_params, BeamSearchParams):
|
||||
generator = self.beam_search(
|
||||
prompt=engine_prompt,
|
||||
request_id=request_id,
|
||||
params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
else:
|
||||
engine_request, tokenization_kwargs = await self._process_inputs(
|
||||
request_id_item,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=request.priority,
|
||||
)
|
||||
|
||||
generator = self.engine_client.generate(
|
||||
engine_request,
|
||||
sampling_params,
|
||||
request_id_item,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=request.priority,
|
||||
prompt_text=prompt_text,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
data_parallel_rank=data_parallel_rank,
|
||||
)
|
||||
|
||||
generators.append(generator)
|
||||
except ValueError as e:
|
||||
# TODO: Use a vllm-specific Validation Error
|
||||
return self.create_error_response(str(e))
|
||||
|
||||
result_generator: AsyncIterator[Tuple[
|
||||
int, RequestOutput]] = merge_async_iterators(*generators)
|
||||
result_generator = merge_async_iterators(*generators)
|
||||
|
||||
# Similar to the OpenAI API, when n != best_of, we do not stream the
|
||||
# results. In addition, we do not stream the results when use
|
||||
# beam search.
|
||||
stream = (request.stream
|
||||
and (request.best_of is None or request.n == request.best_of)
|
||||
and not request.use_beam_search)
|
||||
model_name = self.models.model_name(lora_request)
|
||||
num_prompts = len(engine_prompts)
|
||||
|
||||
# We do not stream the results when using beam search.
|
||||
stream = request.stream and not request.use_beam_search
|
||||
|
||||
# Streaming response
|
||||
if stream:
|
||||
return self.completion_stream_generator(request,
|
||||
raw_request,
|
||||
result_generator,
|
||||
request_id,
|
||||
created_time,
|
||||
model_name,
|
||||
num_prompts=len(prompts))
|
||||
return self.completion_stream_generator(
|
||||
request,
|
||||
engine_prompts,
|
||||
result_generator,
|
||||
request_id,
|
||||
created_time,
|
||||
model_name,
|
||||
num_prompts=num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
request_metadata=request_metadata,
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts)
|
||||
final_res_batch: list[RequestOutput | None] = [None] * num_prompts
|
||||
try:
|
||||
async for i, res in result_generator:
|
||||
if await raw_request.is_disconnected():
|
||||
# Abort the request if the client disconnects.
|
||||
await self.engine.abort(f"{request_id}-{i}")
|
||||
return self.create_error_response("Client disconnected")
|
||||
final_res_batch[i] = res
|
||||
|
||||
for i, final_res in enumerate(final_res_batch):
|
||||
assert final_res is not None
|
||||
|
||||
# The output should contain the input text
|
||||
# We did not pass it into vLLM engine to avoid being redundant
|
||||
# with the inputs token IDs
|
||||
if final_res.prompt is None:
|
||||
engine_prompt = engine_prompts[i]
|
||||
final_res.prompt = (
|
||||
None
|
||||
if is_embeds_prompt(engine_prompt)
|
||||
else engine_prompt.get("prompt")
|
||||
)
|
||||
|
||||
final_res_batch_checked = cast(list[RequestOutput], final_res_batch)
|
||||
|
||||
response = self.request_output_to_completion_response(
|
||||
final_res_batch, request, request_id, created_time, model_name)
|
||||
final_res_batch_checked,
|
||||
request,
|
||||
request_id,
|
||||
created_time,
|
||||
model_name,
|
||||
tokenizer,
|
||||
request_metadata,
|
||||
)
|
||||
except asyncio.CancelledError:
|
||||
return self.create_error_response("Client disconnected")
|
||||
except GenerationError as e:
|
||||
return self._convert_generation_error_to_response(e)
|
||||
except ValueError as e:
|
||||
# TODO: Use a vllm-specific Validation Error
|
||||
return self.create_error_response(str(e))
|
||||
@@ -179,80 +326,126 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
async def completion_stream_generator(
|
||||
self,
|
||||
request: CompletionRequest,
|
||||
raw_request: Request,
|
||||
result_generator: AsyncIterator[Tuple[int, RequestOutput]],
|
||||
engine_prompts: list[TokensPrompt | EmbedsPrompt],
|
||||
result_generator: AsyncIterator[tuple[int, RequestOutput]],
|
||||
request_id: str,
|
||||
created_time: int,
|
||||
model_name: str,
|
||||
num_prompts: int,
|
||||
tokenizer: TokenizerLike | None,
|
||||
request_metadata: RequestResponseMetadata,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
assert request.n is not None
|
||||
previous_texts = [""] * request.n * num_prompts
|
||||
previous_num_tokens = [0] * request.n * num_prompts
|
||||
has_echoed = [False] * request.n * num_prompts
|
||||
num_choices = 1 if request.n is None else request.n
|
||||
previous_text_lens = [0] * num_choices * num_prompts
|
||||
previous_num_tokens = [0] * num_choices * num_prompts
|
||||
has_echoed = [False] * num_choices * num_prompts
|
||||
num_prompt_tokens = [0] * num_prompts
|
||||
num_cached_tokens = None
|
||||
first_iteration = True
|
||||
|
||||
stream_options = request.stream_options
|
||||
include_usage, include_continuous_usage = should_include_usage(
|
||||
stream_options, self.enable_force_include_usage
|
||||
)
|
||||
|
||||
try:
|
||||
async for prompt_idx, res in result_generator:
|
||||
prompt_token_ids = res.prompt_token_ids
|
||||
prompt_logprobs = res.prompt_logprobs
|
||||
|
||||
# Abort the request if the client disconnects.
|
||||
if await raw_request.is_disconnected():
|
||||
await self.engine.abort(f"{request_id}-{prompt_idx}")
|
||||
raise StopAsyncIteration()
|
||||
if first_iteration:
|
||||
num_cached_tokens = res.num_cached_tokens
|
||||
first_iteration = False
|
||||
|
||||
prompt_text = res.prompt
|
||||
if prompt_text is None:
|
||||
engine_prompt = engine_prompts[prompt_idx]
|
||||
prompt_text = (
|
||||
None
|
||||
if is_embeds_prompt(engine_prompt)
|
||||
else engine_prompt.get("prompt")
|
||||
)
|
||||
|
||||
# Prompt details are excluded from later streamed outputs
|
||||
if prompt_token_ids is not None:
|
||||
num_prompt_tokens[prompt_idx] = len(prompt_token_ids)
|
||||
|
||||
delta_token_ids: GenericSequence[int]
|
||||
out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
|
||||
|
||||
for output in res.outputs:
|
||||
i = output.index + prompt_idx * request.n
|
||||
# TODO(simon): optimize the performance by avoiding full
|
||||
# text O(n^2) sending.
|
||||
i = output.index + prompt_idx * num_choices
|
||||
|
||||
# Useful when request.return_token_ids is True
|
||||
# Returning prompt token IDs shares the same logic
|
||||
# with the echo implementation.
|
||||
prompt_token_ids_to_return: list[int] | None = None
|
||||
|
||||
assert request.max_tokens is not None
|
||||
if request.echo and request.max_tokens == 0:
|
||||
# only return the prompt
|
||||
delta_text = res.prompt
|
||||
delta_token_ids = res.prompt_token_ids
|
||||
top_logprobs = res.prompt_logprobs
|
||||
has_echoed[i] = True
|
||||
elif (request.echo and request.max_tokens > 0
|
||||
and not has_echoed[i]):
|
||||
# echo the prompt and first token
|
||||
delta_text = res.prompt + output.text
|
||||
delta_token_ids = (res.prompt_token_ids +
|
||||
output.token_ids)
|
||||
top_logprobs = res.prompt_logprobs + (output.logprobs
|
||||
or [])
|
||||
if request.echo and not has_echoed[i]:
|
||||
assert prompt_token_ids is not None
|
||||
if request.return_token_ids:
|
||||
prompt_text = ""
|
||||
assert prompt_text is not None
|
||||
if request.max_tokens == 0:
|
||||
# only return the prompt
|
||||
delta_text = prompt_text
|
||||
delta_token_ids = prompt_token_ids
|
||||
out_logprobs = prompt_logprobs
|
||||
else:
|
||||
# echo the prompt and first token
|
||||
delta_text = prompt_text + output.text
|
||||
delta_token_ids = [
|
||||
*prompt_token_ids,
|
||||
*output.token_ids,
|
||||
]
|
||||
out_logprobs = [
|
||||
*(prompt_logprobs or []),
|
||||
*(output.logprobs or []),
|
||||
]
|
||||
prompt_token_ids_to_return = prompt_token_ids
|
||||
has_echoed[i] = True
|
||||
else:
|
||||
# return just the delta
|
||||
delta_text = output.text[len(previous_texts[i]):]
|
||||
delta_token_ids = output.token_ids[
|
||||
previous_num_tokens[i]:]
|
||||
top_logprobs = output.logprobs[previous_num_tokens[
|
||||
i]:] if output.logprobs else None
|
||||
delta_text = output.text
|
||||
delta_token_ids = output.token_ids
|
||||
out_logprobs = output.logprobs
|
||||
|
||||
# has_echoed[i] is reused here to indicate whether
|
||||
# we have already returned the prompt token IDs.
|
||||
if not has_echoed[i] and request.return_token_ids:
|
||||
prompt_token_ids_to_return = prompt_token_ids
|
||||
has_echoed[i] = True
|
||||
|
||||
if (
|
||||
not delta_text
|
||||
and not delta_token_ids
|
||||
and not previous_num_tokens[i]
|
||||
):
|
||||
# Chunked prefill case, don't return empty chunks
|
||||
continue
|
||||
|
||||
if request.logprobs is not None:
|
||||
logprobs = self._create_logprobs(
|
||||
assert out_logprobs is not None, "Did not output logprobs"
|
||||
logprobs = self._create_completion_logprobs(
|
||||
token_ids=delta_token_ids,
|
||||
top_logprobs=top_logprobs,
|
||||
top_logprobs=out_logprobs,
|
||||
num_output_top_logprobs=request.logprobs,
|
||||
initial_text_offset=len(previous_texts[i]),
|
||||
tokenizer=tokenizer,
|
||||
initial_text_offset=previous_text_lens[i],
|
||||
return_as_token_id=request.return_tokens_as_token_ids,
|
||||
)
|
||||
else:
|
||||
logprobs = None
|
||||
|
||||
previous_texts[i] = output.text
|
||||
previous_num_tokens[i] = len(output.token_ids)
|
||||
previous_text_lens[i] += len(output.text)
|
||||
previous_num_tokens[i] += len(output.token_ids)
|
||||
finish_reason = output.finish_reason
|
||||
stop_reason = output.stop_reason
|
||||
if output.finish_reason is not None: # return final usage
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
completion_tokens = len(output.token_ids)
|
||||
final_usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens + completion_tokens,
|
||||
)
|
||||
else:
|
||||
final_usage = None
|
||||
response_json = CompletionStreamResponse(
|
||||
|
||||
self._raise_if_error(finish_reason, request_id)
|
||||
|
||||
chunk = CompletionStreamResponse(
|
||||
id=request_id,
|
||||
created=created_time,
|
||||
model=model_name,
|
||||
@@ -263,58 +456,129 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
logprobs=logprobs,
|
||||
finish_reason=finish_reason,
|
||||
stop_reason=stop_reason,
|
||||
prompt_token_ids=prompt_token_ids_to_return,
|
||||
token_ids=(
|
||||
as_list(output.token_ids)
|
||||
if request.return_token_ids
|
||||
else None
|
||||
),
|
||||
)
|
||||
],
|
||||
usage=final_usage,
|
||||
).model_dump_json(exclude_unset=True)
|
||||
)
|
||||
if include_continuous_usage:
|
||||
prompt_tokens = num_prompt_tokens[prompt_idx]
|
||||
completion_tokens = previous_num_tokens[i]
|
||||
chunk.usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens + completion_tokens,
|
||||
)
|
||||
|
||||
response_json = chunk.model_dump_json(exclude_unset=False)
|
||||
yield f"data: {response_json}\n\n"
|
||||
except ValueError as e:
|
||||
|
||||
total_prompt_tokens = sum(num_prompt_tokens)
|
||||
total_completion_tokens = sum(previous_num_tokens)
|
||||
final_usage_info = UsageInfo(
|
||||
prompt_tokens=total_prompt_tokens,
|
||||
completion_tokens=total_completion_tokens,
|
||||
total_tokens=total_prompt_tokens + total_completion_tokens,
|
||||
)
|
||||
|
||||
if self.enable_prompt_tokens_details and num_cached_tokens:
|
||||
final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
|
||||
cached_tokens=num_cached_tokens
|
||||
)
|
||||
|
||||
if include_usage:
|
||||
final_usage_chunk = CompletionStreamResponse(
|
||||
id=request_id,
|
||||
created=created_time,
|
||||
model=model_name,
|
||||
choices=[],
|
||||
usage=final_usage_info,
|
||||
)
|
||||
final_usage_data = final_usage_chunk.model_dump_json(
|
||||
exclude_unset=False, exclude_none=True
|
||||
)
|
||||
yield f"data: {final_usage_data}\n\n"
|
||||
|
||||
# report to FastAPI middleware aggregate usage across all choices
|
||||
request_metadata.final_usage_info = final_usage_info
|
||||
|
||||
except GenerationError as e:
|
||||
yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
|
||||
except Exception as e:
|
||||
# TODO: Use a vllm-specific Validation Error
|
||||
logger.exception("Error in completion stream generator.")
|
||||
data = self.create_streaming_error_response(str(e))
|
||||
yield f"data: {data}\n\n"
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
def request_output_to_completion_response(
|
||||
self,
|
||||
final_res_batch: List[RequestOutput],
|
||||
final_res_batch: list[RequestOutput],
|
||||
request: CompletionRequest,
|
||||
request_id: str,
|
||||
created_time: int,
|
||||
model_name: str,
|
||||
tokenizer: TokenizerLike | None,
|
||||
request_metadata: RequestResponseMetadata,
|
||||
) -> CompletionResponse:
|
||||
choices: List[CompletionResponseChoice] = []
|
||||
choices: list[CompletionResponseChoice] = []
|
||||
num_prompt_tokens = 0
|
||||
num_generated_tokens = 0
|
||||
kv_transfer_params = None
|
||||
last_final_res = None
|
||||
for final_res in final_res_batch:
|
||||
assert final_res is not None
|
||||
last_final_res = final_res
|
||||
prompt_token_ids = final_res.prompt_token_ids
|
||||
prompt_logprobs = final_res.prompt_logprobs
|
||||
assert prompt_token_ids is not None
|
||||
prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
|
||||
prompt_text = final_res.prompt
|
||||
|
||||
token_ids: GenericSequence[int]
|
||||
out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
|
||||
|
||||
for output in final_res.outputs:
|
||||
self._raise_if_error(output.finish_reason, request_id)
|
||||
|
||||
assert request.max_tokens is not None
|
||||
if request.echo and request.max_tokens == 0:
|
||||
token_ids = prompt_token_ids
|
||||
top_logprobs = prompt_logprobs
|
||||
output_text = prompt_text
|
||||
elif request.echo and request.max_tokens > 0:
|
||||
token_ids = prompt_token_ids + output.token_ids
|
||||
top_logprobs = (prompt_logprobs + output.logprobs
|
||||
if request.logprobs else None)
|
||||
output_text = prompt_text + output.text
|
||||
if request.echo:
|
||||
if request.return_token_ids:
|
||||
prompt_text = ""
|
||||
assert prompt_text is not None
|
||||
if request.max_tokens == 0:
|
||||
token_ids = prompt_token_ids
|
||||
out_logprobs = prompt_logprobs
|
||||
output_text = prompt_text
|
||||
else:
|
||||
token_ids = [*prompt_token_ids, *output.token_ids]
|
||||
|
||||
if request.logprobs is None:
|
||||
out_logprobs = None
|
||||
else:
|
||||
assert prompt_logprobs is not None
|
||||
assert output.logprobs is not None
|
||||
out_logprobs = [
|
||||
*prompt_logprobs,
|
||||
*output.logprobs,
|
||||
]
|
||||
|
||||
output_text = prompt_text + output.text
|
||||
else:
|
||||
token_ids = output.token_ids
|
||||
top_logprobs = output.logprobs
|
||||
out_logprobs = output.logprobs
|
||||
output_text = output.text
|
||||
|
||||
if request.logprobs is not None:
|
||||
assert top_logprobs is not None, (
|
||||
"top_logprobs must be provided when logprobs "
|
||||
"is requested")
|
||||
logprobs = self._create_logprobs(
|
||||
assert out_logprobs is not None, "Did not output logprobs"
|
||||
logprobs = self._create_completion_logprobs(
|
||||
token_ids=token_ids,
|
||||
top_logprobs=top_logprobs,
|
||||
top_logprobs=out_logprobs,
|
||||
tokenizer=tokenizer,
|
||||
num_output_top_logprobs=request.logprobs,
|
||||
return_as_token_id=request.return_tokens_as_token_ids,
|
||||
)
|
||||
else:
|
||||
logprobs = None
|
||||
@@ -325,12 +589,19 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
logprobs=logprobs,
|
||||
finish_reason=output.finish_reason,
|
||||
stop_reason=output.stop_reason,
|
||||
prompt_logprobs=final_res.prompt_logprobs,
|
||||
prompt_token_ids=(
|
||||
prompt_token_ids if request.return_token_ids else None
|
||||
),
|
||||
token_ids=(
|
||||
as_list(output.token_ids) if request.return_token_ids else None
|
||||
),
|
||||
)
|
||||
choices.append(choice_data)
|
||||
|
||||
num_generated_tokens += len(output.token_ids)
|
||||
|
||||
num_prompt_tokens += len(prompt_token_ids)
|
||||
num_generated_tokens += sum(
|
||||
len(output.token_ids) for output in final_res.outputs)
|
||||
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
@@ -338,10 +609,121 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
total_tokens=num_prompt_tokens + num_generated_tokens,
|
||||
)
|
||||
|
||||
if (
|
||||
self.enable_prompt_tokens_details
|
||||
and last_final_res
|
||||
and last_final_res.num_cached_tokens
|
||||
):
|
||||
usage.prompt_tokens_details = PromptTokenUsageInfo(
|
||||
cached_tokens=last_final_res.num_cached_tokens
|
||||
)
|
||||
|
||||
request_metadata.final_usage_info = usage
|
||||
if final_res_batch:
|
||||
kv_transfer_params = final_res_batch[0].kv_transfer_params
|
||||
return CompletionResponse(
|
||||
id=request_id,
|
||||
created=created_time,
|
||||
model=model_name,
|
||||
choices=choices,
|
||||
usage=usage,
|
||||
kv_transfer_params=kv_transfer_params,
|
||||
)
|
||||
|
||||
def _create_completion_logprobs(
|
||||
self,
|
||||
token_ids: GenericSequence[int],
|
||||
top_logprobs: GenericSequence[dict[int, Logprob] | None],
|
||||
num_output_top_logprobs: int,
|
||||
tokenizer: TokenizerLike | None,
|
||||
initial_text_offset: int = 0,
|
||||
return_as_token_id: bool | None = None,
|
||||
) -> CompletionLogProbs:
|
||||
"""Create logprobs for OpenAI Completion API."""
|
||||
out_text_offset: list[int] = []
|
||||
out_token_logprobs: list[float | None] = []
|
||||
out_tokens: list[str] = []
|
||||
out_top_logprobs: list[dict[str, float] | None] = []
|
||||
|
||||
last_token_len = 0
|
||||
|
||||
should_return_as_token_id = (
|
||||
return_as_token_id
|
||||
if return_as_token_id is not None
|
||||
else self.return_tokens_as_token_ids
|
||||
)
|
||||
for i, token_id in enumerate(token_ids):
|
||||
step_top_logprobs = top_logprobs[i]
|
||||
if step_top_logprobs is None:
|
||||
if should_return_as_token_id:
|
||||
token = f"token_id:{token_id}"
|
||||
else:
|
||||
if tokenizer is None:
|
||||
raise ValueError(
|
||||
"Unable to get tokenizer because `skip_tokenizer_init=True`"
|
||||
)
|
||||
|
||||
token = tokenizer.decode(token_id)
|
||||
|
||||
out_tokens.append(token)
|
||||
out_token_logprobs.append(None)
|
||||
out_top_logprobs.append(None)
|
||||
else:
|
||||
step_token = step_top_logprobs[token_id]
|
||||
|
||||
token = self._get_decoded_token(
|
||||
step_token,
|
||||
token_id,
|
||||
tokenizer,
|
||||
return_as_token_id=should_return_as_token_id,
|
||||
)
|
||||
token_logprob = max(step_token.logprob, -9999.0)
|
||||
|
||||
out_tokens.append(token)
|
||||
out_token_logprobs.append(token_logprob)
|
||||
|
||||
# makes sure to add the top num_output_top_logprobs + 1
|
||||
# logprobs, as defined in the openai API
|
||||
# (cf. https://github.com/openai/openai-openapi/blob/
|
||||
# 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
|
||||
out_top_logprobs.append(
|
||||
{
|
||||
# Convert float("-inf") to the
|
||||
# JSON-serializable float that OpenAI uses
|
||||
self._get_decoded_token(
|
||||
top_lp[1],
|
||||
top_lp[0],
|
||||
tokenizer,
|
||||
return_as_token_id=should_return_as_token_id,
|
||||
): max(top_lp[1].logprob, -9999.0)
|
||||
for i, top_lp in enumerate(step_top_logprobs.items())
|
||||
if num_output_top_logprobs >= i
|
||||
}
|
||||
)
|
||||
|
||||
if len(out_text_offset) == 0:
|
||||
out_text_offset.append(initial_text_offset)
|
||||
else:
|
||||
out_text_offset.append(out_text_offset[-1] + last_token_len)
|
||||
last_token_len = len(token)
|
||||
|
||||
return CompletionLogProbs(
|
||||
text_offset=out_text_offset,
|
||||
token_logprobs=out_token_logprobs,
|
||||
tokens=out_tokens,
|
||||
top_logprobs=out_top_logprobs,
|
||||
)
|
||||
|
||||
def _build_render_config(
|
||||
self,
|
||||
request: CompletionRequest,
|
||||
max_input_length: int | None = None,
|
||||
) -> RenderConfig:
|
||||
max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
|
||||
return RenderConfig(
|
||||
max_length=max_input_tokens_len,
|
||||
truncate_prompt_tokens=request.truncate_prompt_tokens,
|
||||
add_special_tokens=request.add_special_tokens,
|
||||
cache_salt=request.cache_salt,
|
||||
needs_detokenization=bool(request.echo and not request.return_token_ids),
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
304
vllm/entrypoints/openai/serving_models.py
Normal file
304
vllm/entrypoints/openai/serving_models.py
Normal file
@@ -0,0 +1,304 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from asyncio import Lock
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorInfo,
|
||||
ErrorResponse,
|
||||
LoadLoRAAdapterRequest,
|
||||
ModelCard,
|
||||
ModelList,
|
||||
ModelPermission,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
from vllm.utils.counter import AtomicCounter
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseModelPath:
|
||||
name: str
|
||||
model_path: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoRAModulePath:
|
||||
name: str
|
||||
path: str
|
||||
base_model_name: str | None = None
|
||||
|
||||
|
||||
class OpenAIServingModels:
|
||||
"""Shared instance to hold data about the loaded base model(s) and adapters.
|
||||
|
||||
Handles the routes:
|
||||
- /v1/models
|
||||
- /v1/load_lora_adapter
|
||||
- /v1/unload_lora_adapter
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
base_model_paths: list[BaseModelPath],
|
||||
*,
|
||||
lora_modules: list[LoRAModulePath] | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.engine_client = engine_client
|
||||
self.base_model_paths = base_model_paths
|
||||
|
||||
self.static_lora_modules = lora_modules
|
||||
self.lora_requests: dict[str, LoRARequest] = {}
|
||||
self.lora_id_counter = AtomicCounter(0)
|
||||
|
||||
self.lora_resolvers: list[LoRAResolver] = []
|
||||
for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers():
|
||||
self.lora_resolvers.append(
|
||||
LoRAResolverRegistry.get_resolver(lora_resolver_name)
|
||||
)
|
||||
self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)
|
||||
|
||||
self.input_processor = self.engine_client.input_processor
|
||||
self.io_processor = self.engine_client.io_processor
|
||||
self.model_config = self.engine_client.model_config
|
||||
self.max_model_len = self.model_config.max_model_len
|
||||
|
||||
async def init_static_loras(self):
|
||||
"""Loads all static LoRA modules.
|
||||
Raises if any fail to load"""
|
||||
if self.static_lora_modules is None:
|
||||
return
|
||||
for lora in self.static_lora_modules:
|
||||
load_request = LoadLoRAAdapterRequest(
|
||||
lora_path=lora.path, lora_name=lora.name
|
||||
)
|
||||
load_result = await self.load_lora_adapter(
|
||||
request=load_request, base_model_name=lora.base_model_name
|
||||
)
|
||||
if isinstance(load_result, ErrorResponse):
|
||||
raise ValueError(load_result.error.message)
|
||||
|
||||
def is_base_model(self, model_name) -> bool:
|
||||
return any(model.name == model_name for model in self.base_model_paths)
|
||||
|
||||
def model_name(self, lora_request: LoRARequest | None = None) -> str:
|
||||
"""Returns the appropriate model name depending on the availability
|
||||
and support of the LoRA or base model.
|
||||
Parameters:
|
||||
- lora: LoRARequest that contain a base_model_name.
|
||||
Returns:
|
||||
- str: The name of the base model or the first available model path.
|
||||
"""
|
||||
if lora_request is not None:
|
||||
return lora_request.lora_name
|
||||
return self.base_model_paths[0].name
|
||||
|
||||
async def show_available_models(self) -> ModelList:
|
||||
"""Show available models. This includes the base model and all
|
||||
adapters"""
|
||||
model_cards = [
|
||||
ModelCard(
|
||||
id=base_model.name,
|
||||
max_model_len=self.max_model_len,
|
||||
root=base_model.model_path,
|
||||
permission=[ModelPermission()],
|
||||
)
|
||||
for base_model in self.base_model_paths
|
||||
]
|
||||
lora_cards = [
|
||||
ModelCard(
|
||||
id=lora.lora_name,
|
||||
root=lora.local_path,
|
||||
parent=lora.base_model_name
|
||||
if lora.base_model_name
|
||||
else self.base_model_paths[0].name,
|
||||
permission=[ModelPermission()],
|
||||
)
|
||||
for lora in self.lora_requests.values()
|
||||
]
|
||||
model_cards.extend(lora_cards)
|
||||
return ModelList(data=model_cards)
|
||||
|
||||
async def load_lora_adapter(
    self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
) -> ErrorResponse | str:
    """Validate, register, and preload a LoRA adapter into the engine.

    Returns a success message string, or an ErrorResponse when validation
    fails or the engine rejects the adapter.
    """
    lora_name = request.lora_name

    # Ensure atomicity based on the lora name: concurrent load/unload/resolve
    # calls for the same adapter serialize on this per-name lock.
    async with self.lora_resolver_lock[lora_name]:
        # Rejects missing fields and duplicate adapter names.
        error_check_ret = await self._check_load_lora_adapter_request(request)
        if error_check_ret is not None:
            return error_check_ret

        lora_path = request.lora_path
        # Fresh integer id for this adapter instance.
        unique_id = self.lora_id_counter.inc(1)
        lora_request = LoRARequest(
            lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
        )
        # Only record the parent if it actually names a served base model.
        if base_model_name is not None and self.is_base_model(base_model_name):
            lora_request.base_model_name = base_model_name

        # Validate that the adapter can be loaded into the engine
        # This will also preload it for incoming requests
        try:
            await self.engine_client.add_lora(lora_request)
        except Exception as e:
            error_type = "BadRequestError"
            status_code = HTTPStatus.BAD_REQUEST
            # Map "adapter missing on disk" engine errors to a 404 by
            # matching the engine's error text.
            if "No adapter found" in str(e):
                error_type = "NotFoundError"
                status_code = HTTPStatus.NOT_FOUND

            return create_error_response(
                message=str(e), err_type=error_type, status_code=status_code
            )

        # Only register after the engine accepted the adapter, so a failed
        # load leaves no stale entry behind.
        self.lora_requests[lora_name] = lora_request
        logger.info(
            "Loaded new LoRA adapter: name '%s', path '%s'", lora_name, lora_path
        )
        return f"Success: LoRA adapter '{lora_name}' added successfully."
|
||||
async def unload_lora_adapter(
    self, request: UnloadLoRAAdapterRequest
) -> ErrorResponse | str:
    """Remove a previously loaded LoRA adapter from the registry."""
    lora_name = request.lora_name

    # Serialize with concurrent load/resolve calls for the same adapter name.
    async with self.lora_resolver_lock[lora_name]:
        validation_error = await self._check_unload_lora_adapter_request(request)
        if validation_error is not None:
            return validation_error

        # Holding the per-name lock makes this delete race-free.
        del self.lora_requests[lora_name]
        logger.info("Removed LoRA adapter: name '%s'", lora_name)
        return f"Success: LoRA adapter '{lora_name}' removed successfully."
||||
async def _check_load_lora_adapter_request(
    self, request: LoadLoRAAdapterRequest
) -> ErrorResponse | None:
    """Validate a load request; return an ErrorResponse on failure, else None."""
    name = request.lora_name

    # Both the adapter name and its path are mandatory.
    if not name or not request.lora_path:
        return create_error_response(
            message="Both 'lora_name' and 'lora_path' must be provided.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    # Reject a second load under an already-registered adapter name.
    if name in self.lora_requests:
        return create_error_response(
            message=f"The lora adapter '{name}' has already been loaded.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    return None
||||
async def _check_unload_lora_adapter_request(
    self, request: UnloadLoRAAdapterRequest
) -> ErrorResponse | None:
    """Validate an unload request; return an ErrorResponse on failure, else None."""
    name = request.lora_name

    # An adapter name is required to know what to unload.
    if not name:
        return create_error_response(
            message="'lora_name' needs to be provided to unload a LoRA adapter.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    # Can only unload adapters that are currently registered.
    if name not in self.lora_requests:
        return create_error_response(
            message=f"The lora adapter '{name}' cannot be found.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND,
        )

    return None
||||
async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
    """Attempt to resolve a LoRA adapter using available resolvers.

    Args:
        lora_name: Name/identifier of the LoRA adapter

    Returns:
        LoRARequest if found and loaded successfully.
        ErrorResponse (404) if no resolver finds the adapter.
        ErrorResponse (400) if adapter(s) are found but none load.
    """
    # Per-name lock: serializes with load/unload for the same adapter.
    async with self.lora_resolver_lock[lora_name]:
        # First check if this LoRA is already loaded
        if lora_name in self.lora_requests:
            return self.lora_requests[lora_name]

        base_model_name = self.model_config.model
        # Reserve one id up front; it is reused across resolver attempts.
        unique_id = self.lora_id_counter.inc(1)
        found_adapter = False

        # Try to resolve using available resolvers, in registration order.
        for resolver in self.lora_resolvers:
            lora_request = await resolver.resolve_lora(base_model_name, lora_name)

            if lora_request is not None:
                found_adapter = True
                lora_request.lora_int_id = unique_id

                try:
                    await self.engine_client.add_lora(lora_request)
                    # Register only after the engine accepted the adapter.
                    self.lora_requests[lora_name] = lora_request
                    logger.info(
                        "Resolved and loaded LoRA adapter '%s' using %s",
                        lora_name,
                        resolver.__class__.__name__,
                    )
                    return lora_request
                # NOTE(review): BaseException also swallows
                # asyncio.CancelledError / KeyboardInterrupt here and keeps
                # iterating — confirm this breadth is intentional.
                except BaseException as e:
                    logger.warning(
                        "Failed to load LoRA '%s' resolved by %s: %s. "
                        "Trying next resolver.",
                        lora_name,
                        resolver.__class__.__name__,
                        e,
                    )
                    continue

        if found_adapter:
            # An adapter was found, but all attempts to load it failed.
            return create_error_response(
                message=(
                    f"LoRA adapter '{lora_name}' was found but could not be loaded."
                ),
                err_type="BadRequestError",
                status_code=HTTPStatus.BAD_REQUEST,
            )
        else:
            # No adapter was found
            return create_error_response(
                message=f"LoRA adapter {lora_name} does not exist",
                err_type="NotFoundError",
                status_code=HTTPStatus.NOT_FOUND,
            )
||||
def create_error_response(
    message: str,
    err_type: str = "BadRequestError",
    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> ErrorResponse:
    """Wrap *message* in an ErrorResponse with the given error type and HTTP code."""
    info = ErrorInfo(message=message, type=err_type, code=status_code.value)
    return ErrorResponse(error=info)
||||
2080
vllm/entrypoints/openai/serving_responses.py
Normal file
2080
vllm/entrypoints/openai/serving_responses.py
Normal file
File diff suppressed because it is too large
Load Diff
168
vllm/entrypoints/openai/serving_transcription.py
Normal file
168
vllm/entrypoints/openai/serving_transcription.py
Normal file
@@ -0,0 +1,168 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
RequestResponseMetadata,
|
||||
TranscriptionRequest,
|
||||
TranscriptionResponse,
|
||||
TranscriptionResponseStreamChoice,
|
||||
TranscriptionResponseVerbose,
|
||||
TranscriptionStreamResponse,
|
||||
TranslationRequest,
|
||||
TranslationResponse,
|
||||
TranslationResponseStreamChoice,
|
||||
TranslationResponseVerbose,
|
||||
TranslationStreamResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.speech_to_text import OpenAISpeechToText
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class OpenAIServingTranscription(OpenAISpeechToText):
    """Serves OpenAI-compatible audio transcription requests."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        # All shared speech-to-text setup lives in the base class; this
        # subclass only pins the task type to transcription.
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            task_type="transcribe",
            log_error_stack=log_error_stack,
            enable_force_include_usage=enable_force_include_usage,
        )

    async def create_transcription(
        self, audio_data: bytes, request: TranscriptionRequest, raw_request: Request
    ) -> (
        TranscriptionResponse
        | TranscriptionResponseVerbose
        | AsyncGenerator[str, None]
        | ErrorResponse
    ):
        """Transcription API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/audio/createTranscription
        for the API specification. This API mimics the OpenAI transcription API.
        """
        # verbose_json gets the richer response model; every other format
        # uses the plain response.
        if request.response_format == "verbose_json":
            response_cls: type = TranscriptionResponseVerbose
        else:
            response_cls = TranscriptionResponse
        return await self._create_speech_to_text(
            audio_data=audio_data,
            request=request,
            raw_request=raw_request,
            response_class=response_cls,
            stream_generator_method=self.transcription_stream_generator,
        )

    async def transcription_stream_generator(
        self,
        request: TranscriptionRequest,
        result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
    ) -> AsyncGenerator[str, None]:
        """Yield SSE chunks for a streaming transcription request."""
        # Delegate to the shared generator, parameterized with the
        # transcription-specific chunk/choice/response models.
        shared_stream = self._speech_to_text_stream_generator(
            request=request,
            list_result_generator=result_generator,
            request_id=request_id,
            request_metadata=request_metadata,
            audio_duration_s=audio_duration_s,
            chunk_object_type="transcription.chunk",
            response_stream_choice_class=TranscriptionResponseStreamChoice,
            stream_response_class=TranscriptionStreamResponse,
        )
        async for piece in shared_stream:
            yield piece
|
||||
class OpenAIServingTranslation(OpenAISpeechToText):
    """Serves OpenAI-compatible audio translation requests."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        # All shared speech-to-text setup lives in the base class; this
        # subclass only pins the task type to translation.
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            task_type="translate",
            log_error_stack=log_error_stack,
            enable_force_include_usage=enable_force_include_usage,
        )

    async def create_translation(
        self, audio_data: bytes, request: TranslationRequest, raw_request: Request
    ) -> (
        TranslationResponse
        | TranslationResponseVerbose
        | AsyncGenerator[str, None]
        | ErrorResponse
    ):
        """Translation API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/audio/createTranslation
        for the API specification. This API mimics the OpenAI translation API.
        """
        # verbose_json gets the richer response model; every other format
        # uses the plain response.
        if request.response_format == "verbose_json":
            response_cls: type = TranslationResponseVerbose
        else:
            response_cls = TranslationResponse
        return await self._create_speech_to_text(
            audio_data=audio_data,
            request=request,
            raw_request=raw_request,
            response_class=response_cls,
            stream_generator_method=self.translation_stream_generator,
        )

    async def translation_stream_generator(
        self,
        request: TranslationRequest,
        result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
    ) -> AsyncGenerator[str, None]:
        """Yield SSE chunks for a streaming translation request."""
        # Delegate to the shared generator, parameterized with the
        # translation-specific chunk/choice/response models.
        shared_stream = self._speech_to_text_stream_generator(
            request=request,
            list_result_generator=result_generator,
            request_id=request_id,
            request_metadata=request_metadata,
            audio_duration_s=audio_duration_s,
            chunk_object_type="translation.chunk",
            response_stream_choice_class=TranslationResponseStreamChoice,
            stream_response_class=TranslationStreamResponse,
        )
        async for piece in shared_stream:
            yield piece
|
||||
559
vllm/entrypoints/openai/speech_to_text.py
Normal file
559
vllm/entrypoints/openai/speech_to_text.py
Normal file
@@ -0,0 +1,559 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import io
|
||||
import math
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, Callable
|
||||
from functools import cached_property
|
||||
from typing import Literal, TypeAlias, TypeVar, cast
|
||||
|
||||
import numpy as np
|
||||
from fastapi import Request
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
DeltaMessage,
|
||||
ErrorResponse,
|
||||
RequestResponseMetadata,
|
||||
TranscriptionResponse,
|
||||
TranscriptionResponseStreamChoice,
|
||||
TranscriptionResponseVerbose,
|
||||
TranscriptionSegment,
|
||||
TranscriptionStreamResponse,
|
||||
TranslationResponse,
|
||||
TranslationResponseStreamChoice,
|
||||
TranslationResponseVerbose,
|
||||
TranslationSegment,
|
||||
TranslationStreamResponse,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.models import SupportsTranscription
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
# Unions spanning the transcription and translation flavours of each
# response/segment model, so shared helpers can be written once.
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
    TranscriptionResponseVerbose | TranslationResponseVerbose
)
SpeechToTextSegment: TypeAlias = TranscriptionSegment | TranslationSegment
# Type variables bound to the unions above, used to keep the concrete
# response type flowing through the generic helpers below.
T = TypeVar("T", bound=SpeechToTextResponse)
V = TypeVar("V", bound=SpeechToTextResponseVerbose)
S = TypeVar("S", bound=SpeechToTextSegment)

# Any of the four concrete (non-streaming) response models.
ResponseType: TypeAlias = (
    TranscriptionResponse
    | TranslationResponse
    | TranscriptionResponseVerbose
    | TranslationResponseVerbose
)

logger = init_logger(__name__)
||||
|
||||
|
||||
class OpenAISpeechToText(OpenAIServing):
|
||||
"""Base class for speech-to-text operations like transcription and
|
||||
translation."""
|
||||
|
||||
def __init__(
    self,
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    task_type: Literal["transcribe", "translate"] = "transcribe",
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
):
    """Set up shared speech-to-text state for a given *task_type*.

    Subclasses pin ``task_type`` to either "transcribe" or "translate".
    """
    super().__init__(
        engine_client=engine_client,
        models=models,
        request_logger=request_logger,
        return_tokens_as_token_ids=return_tokens_as_token_ids,
        log_error_stack=log_error_stack,
    )

    # Sampling parameters the model config declares as differing from the
    # engine defaults; applied to every request.
    self.default_sampling_params = self.model_config.get_diff_sampling_param()
    self.task_type = task_type

    # Model-specific speech-to-text config (sample rate, chunking, ...).
    self.asr_config = self.model_cls.get_speech_to_text_config(
        self.model_config, task_type
    )

    self.enable_force_include_usage = enable_force_include_usage

    # Upload size cap, configured via environment variable.
    self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
    # A tokenizer is only needed for verbose_json segment extraction,
    # which requires segment-timestamp support from the model.
    if self.model_cls.supports_segment_timestamp:
        self.tokenizer = cast(
            PreTrainedTokenizerBase,
            get_tokenizer(
                tokenizer_name=self.model_config.tokenizer,
                tokenizer_mode=self.model_config.tokenizer_mode,
            ),
        )

    if self.default_sampling_params:
        logger.info(
            "Overwriting default completion sampling param with: %s",
            self.default_sampling_params,
        )
||||
@cached_property
def model_cls(self) -> type[SupportsTranscription]:
    """Model class of the served model, resolved once and then cached."""
    # Imported lazily to keep module import time down.
    from vllm.model_executor.model_loader import get_model_cls

    resolved = get_model_cls(self.model_config)
    return cast(type[SupportsTranscription], resolved)
|
||||
async def _preprocess_speech_to_text(
    self,
    request: SpeechToTextRequest,
    audio_data: bytes,
) -> tuple[list[PromptType], float]:
    """Decode the audio, optionally chunk it, and build generation prompts.

    Returns the list of prompts (one per audio chunk) and the audio
    duration in seconds. Raises ValueError on oversized or malformed input.
    """
    # Validate request languages up front.
    language = self.model_cls.validate_language(request.language)
    # Skip to_language validation to avoid extra logging for Whisper.
    to_language = (
        self.model_cls.validate_language(request.to_language)
        if request.to_language
        else None
    )

    if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
        raise ValueError("Maximum file size exceeded.")

    with io.BytesIO(audio_data) as bytes_:
        # NOTE resample to model SR here for efficiency. This is also a
        # pre-requisite for chunking, as it assumes Whisper SR.
        y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)

    duration = librosa.get_duration(y=y, sr=sr)
    # Chunk only when the model allows it and the clip is too long.
    do_split_audio = (
        self.asr_config.allow_audio_chunking
        and duration > self.asr_config.max_audio_clip_s
    )
    chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
    prompts = []
    for chunk in chunks:
        # The model has control over the construction, as long as it
        # returns a valid PromptType.
        prompt = self.model_cls.get_generation_prompt(
            audio=chunk,
            stt_config=self.asr_config,
            model_config=self.model_config,
            language=language,
            task_type=self.task_type,
            request_prompt=request.prompt,
            to_language=to_language,
        )
        if request.response_format == "verbose_json":
            # verbose_json needs timestamp tokens, so force them on by
            # replacing the no-timestamps marker in the decoder prompt.
            if not isinstance(prompt, dict):
                # FIX: error message previously read "a dict,got" (missing space).
                raise ValueError(f"Expected prompt to be a dict, got {type(prompt)}")
            prompt_dict = cast(dict, prompt)
            decoder_prompt = prompt.get("decoder_prompt")
            if not isinstance(decoder_prompt, str):
                # FIX: error message previously read "to bestr" (missing space).
                raise ValueError(
                    f"Expected decoder_prompt to be str, got {type(decoder_prompt)}"
                )
            prompt_dict["decoder_prompt"] = decoder_prompt.replace(
                "<|notimestamps|>", "<|0.00|>"
            )
        prompts.append(prompt)
    return prompts, duration
|
||||
def _get_verbose_segments(
    self,
    tokens: tuple,
    request: SpeechToTextRequest,
    segment_class: type[SpeechToTextSegment],
    start_time: float = 0,
) -> list[SpeechToTextSegment]:
    """
    Convert tokens to verbose segments.

    This method expects the model to produce
    timestamps as tokens (similar to Whisper).
    If the tokens do not include timestamp information,
    the segments may not be generated correctly.

    Note: Fields like avg_logprob, compression_ratio,
    and no_speech_prob are not supported
    in this implementation and will be None. See docs for details.
    """
    # Whisper timestamp tokens are spaced 0.02s apart.
    BASE_OFFSET = 0.02
    # Token id of the first timestamp token; any id >= this is a timestamp.
    init_token = self.tokenizer.encode("<|0.00|>", add_special_tokens=False)[0]
    # Drop a trailing EOS so it is never mistaken for content.
    if tokens[-1] == self.tokenizer.eos_token_id:
        tokens = tokens[:-1]

    # Prepend a synthetic <|0.00|> so the first segment has a start marker.
    tokens_with_start = (init_token,) + tokens
    segments: list[SpeechToTextSegment] = []
    last_timestamp_start = 0

    # If the sequence ends with text followed by a single timestamp,
    # duplicate that final timestamp so the closing pair exists.
    if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
        tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
    for idx, token in enumerate(tokens_with_start):
        # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
        # If the ordering is violated, this slicing may produce incorrect results.
        # A segment boundary is two adjacent timestamp tokens: the second
        # of the pair closes the previous segment.
        if (
            token >= init_token
            and idx != 0
            and tokens_with_start[idx - 1] >= init_token
        ):
            # Slice spans [start timestamp, text..., end timestamp].
            sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
            start_timestamp = sliced_timestamp_tokens[0] - init_token
            end_timestamp = sliced_timestamp_tokens[-1] - init_token

            casting_segment = cast(
                SpeechToTextSegment,
                segment_class(
                    id=len(segments),
                    seek=start_time,
                    # Absolute times: chunk offset + 0.02s per timestamp step.
                    start=start_time + BASE_OFFSET * start_timestamp,
                    end=start_time + BASE_OFFSET * end_timestamp,
                    temperature=request.temperature,
                    # Text tokens are everything between the two timestamps.
                    text=self.tokenizer.decode(sliced_timestamp_tokens[1:-1]),
                    tokens=sliced_timestamp_tokens[1:-1],
                ),
            )
            segments.append(casting_segment)
            last_timestamp_start = idx
    return segments
|
||||
async def _create_speech_to_text(
    self,
    audio_data: bytes,
    request: SpeechToTextRequest,
    raw_request: Request,
    response_class: type[T | V],
    stream_generator_method: Callable[..., AsyncGenerator[str, None]],
) -> T | V | AsyncGenerator[str, None] | ErrorResponse:
    """Base method for speech-to-text operations like transcription and
    translation."""
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    # If the engine is dead, raise the engine's DEAD_ERROR.
    # This is required for the streaming case, where we return a
    # success status before we actually start generating text :).
    if self.engine_client.errored:
        raise self.engine_client.dead_error

    if request.response_format not in ["text", "json", "verbose_json"]:
        # NOTE(review): these two concatenated literals produce
        # "...response_format`text`..." with no separator — confirm the
        # intended message wording.
        return self.create_error_response(
            ("Currently only support response_format")
            + ("`text`, `json` or `verbose_json`")
        )

    # verbose_json requires segment timestamps from the model.
    if (
        request.response_format == "verbose_json"
        and not self.model_cls.supports_segment_timestamp
    ):
        return self.create_error_response(
            f"Currently do not support verbose_json for {request.model}"
        )

    if request.response_format == "verbose_json" and request.stream:
        return self.create_error_response(
            "verbose_json format doesn't support streaming case"
        )
    request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"

    request_metadata = RequestResponseMetadata(request_id=request_id)
    if raw_request:
        raw_request.state.request_metadata = request_metadata

    try:
        lora_request = self._maybe_get_adapters(request)

        # Decode + (maybe) chunk the audio into one prompt per chunk.
        prompts, duration_s = await self._preprocess_speech_to_text(
            request=request,
            audio_data=audio_data,
        )

    except ValueError as e:
        logger.exception("Error in preprocessing prompt inputs")
        return self.create_error_response(str(e))

    list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
    try:
        # Unlike most decoder-only models, whisper generation length is not
        # constrained by the size of the input audio, which is mapped to a
        # fixed-size log-mel-spectogram.
        default_max_tokens = self.model_config.max_model_len
        sampling_params = request.to_sampling_params(
            default_max_tokens, self.default_sampling_params
        )

        self._log_inputs(
            request_id,
            # It will not display special tokens like <|startoftranscript|>
            request.prompt,
            params=sampling_params,
            lora_request=lora_request,
        )

        # One engine generation per audio chunk, each with a unique sub-id.
        list_result_generator = [
            self.engine_client.generate(
                prompt,
                sampling_params,
                f"{request_id}_{i}",
                lora_request=lora_request,
            )
            for i, prompt in enumerate(prompts)
        ]
    except ValueError as e:
        # TODO: Use a vllm-specific Validation Error
        return self.create_error_response(str(e))

    if request.stream:
        return stream_generator_method(
            request, list_result_generator, request_id, request_metadata, duration_s
        )
    # Non-streaming response.
    total_segments = []
    text_parts = []
    try:
        assert list_result_generator is not None
        segments_types: dict[str, type[SpeechToTextSegment]] = {
            "transcribe": TranscriptionSegment,
            "translate": TranslationSegment,
        }
        segment_class: type[SpeechToTextSegment] = segments_types[self.task_type]
        text = ""
        # Drain every chunk's generator; chunk index drives the segment
        # start-time offset for verbose_json.
        for idx, result_generator in enumerate(list_result_generator):
            async for op in result_generator:
                if request.response_format == "verbose_json":
                    segments: list[SpeechToTextSegment] = (
                        self._get_verbose_segments(
                            tokens=tuple(op.outputs[0].token_ids),
                            segment_class=segment_class,
                            request=request,
                            start_time=idx * self.asr_config.max_audio_clip_s,
                        )
                    )

                    total_segments.extend(segments)
                    text_parts.extend([seg.text for seg in segments])
                else:
                    text_parts.append(op.outputs[0].text)
        text = "".join(text_parts)
        if self.task_type == "transcribe":
            final_response: ResponseType
            # add usage in TranscriptionResponse.
            usage = {
                "type": "duration",
                # rounded up as per openAI specs
                "seconds": int(math.ceil(duration_s)),
            }
            if request.response_format != "verbose_json":
                final_response = cast(
                    T, TranscriptionResponse(text=text, usage=usage)
                )
            else:
                final_response = cast(
                    V,
                    TranscriptionResponseVerbose(
                        text=text,
                        language=request.language,
                        duration=str(duration_s),
                        segments=total_segments,
                    ),
                )
        else:
            # no usage in response for translation task
            if request.response_format != "verbose_json":
                final_response = cast(T, TranslationResponse(text=text))
            else:
                final_response = cast(
                    V,
                    TranslationResponseVerbose(
                        text=text,
                        language=request.language,
                        duration=str(duration_s),
                        segments=total_segments,
                    ),
                )
        return final_response
    except asyncio.CancelledError:
        return self.create_error_response("Client disconnected")
    except ValueError as e:
        # TODO: Use a vllm-specific Validation Error
        return self.create_error_response(str(e))
|
||||
async def _speech_to_text_stream_generator(
    self,
    request: SpeechToTextRequest,
    list_result_generator: list[AsyncGenerator[RequestOutput, None]],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
    chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
    response_stream_choice_class: type[TranscriptionResponseStreamChoice]
    | type[TranslationResponseStreamChoice],
    stream_response_class: type[TranscriptionStreamResponse]
    | type[TranslationStreamResponse],
) -> AsyncGenerator[str, None]:
    """Yield SSE "data: ..." lines for a streaming speech-to-text request,
    shared by the transcription and translation endpoints."""
    created_time = int(time.time())
    model_name = request.model

    completion_tokens = 0
    num_prompt_tokens = 0

    include_usage = self.enable_force_include_usage or request.stream_include_usage
    include_continuous_usage = (
        request.stream_continuous_usage_stats
        if include_usage and request.stream_continuous_usage_stats
        else False
    )

    try:
        # Audio chunks stream sequentially, in order.
        for result_generator in list_result_generator:
            async for res in result_generator:
                # On first result.
                if res.prompt_token_ids is not None:
                    num_prompt_tokens = len(res.prompt_token_ids)
                    # Add the model's estimate of audio tokens, if any.
                    if audio_tokens := self.model_cls.get_num_audio_tokens(
                        audio_duration_s, self.asr_config, self.model_config
                    ):
                        num_prompt_tokens += audio_tokens

                # We need to do it here, because if there are exceptions in
                # the result_generator, it needs to be sent as the FIRST
                # response (by the try...catch).

                # Just one output (n=1) supported.
                assert len(res.outputs) == 1
                output = res.outputs[0]

                delta_message = DeltaMessage(content=output.text)
                completion_tokens += len(output.token_ids)

                if output.finish_reason is None:
                    # Still generating, send delta update.
                    choice_data = response_stream_choice_class(delta=delta_message)
                else:
                    # Model is finished generating.
                    choice_data = response_stream_choice_class(
                        delta=delta_message,
                        finish_reason=output.finish_reason,
                        stop_reason=output.stop_reason,
                    )

                chunk = stream_response_class(
                    id=request_id,
                    object=chunk_object_type,
                    created=created_time,
                    choices=[choice_data],
                    model=model_name,
                )

                # handle usage stats if requested & if continuous
                if include_continuous_usage:
                    chunk.usage = UsageInfo(
                        prompt_tokens=num_prompt_tokens,
                        completion_tokens=completion_tokens,
                        total_tokens=num_prompt_tokens + completion_tokens,
                    )

                data = chunk.model_dump_json(exclude_unset=True)
                yield f"data: {data}\n\n"

        # Once the final token is handled, if stream_options.include_usage
        # is sent, send the usage.
        if include_usage:
            final_usage = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=num_prompt_tokens + completion_tokens,
            )

            # Usage-only chunk: empty choices list, usage field populated.
            final_usage_chunk = stream_response_class(
                id=request_id,
                object=chunk_object_type,
                created=created_time,
                choices=[],
                model=model_name,
                usage=final_usage,
            )
            final_usage_data = final_usage_chunk.model_dump_json(
                exclude_unset=True, exclude_none=True
            )
            yield f"data: {final_usage_data}\n\n"

        # report to FastAPI middleware aggregate usage across all choices
        request_metadata.final_usage_info = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=num_prompt_tokens + completion_tokens,
        )

    except Exception as e:
        # TODO: Use a vllm-specific Validation Error
        logger.exception("Error in %s stream generator.", self.task_type)
        data = self.create_streaming_error_response(str(e))
        yield f"data: {data}\n\n"
    # Send the final done message after all response.n are finished
    yield "data: [DONE]\n\n"
|
||||
def _split_audio(
|
||||
self, audio_data: np.ndarray, sample_rate: int
|
||||
) -> list[np.ndarray]:
|
||||
chunk_size = sample_rate * self.asr_config.max_audio_clip_s
|
||||
overlap_size = sample_rate * self.asr_config.overlap_chunk_second
|
||||
chunks = []
|
||||
i = 0
|
||||
while i < audio_data.shape[-1]:
|
||||
if i + chunk_size >= audio_data.shape[-1]:
|
||||
# handle last chunk
|
||||
chunks.append(audio_data[..., i:])
|
||||
break
|
||||
|
||||
# Find the best split point in the overlap region
|
||||
search_start = i + chunk_size - overlap_size
|
||||
search_end = min(i + chunk_size, audio_data.shape[-1])
|
||||
split_point = self._find_split_point(audio_data, search_start, search_end)
|
||||
|
||||
# Extract chunk up to the split point
|
||||
chunks.append(audio_data[..., i:split_point])
|
||||
i = split_point
|
||||
return chunks
|
||||
|
||||
def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
|
||||
"""Find the best point to split audio by
|
||||
looking for silence or low amplitude.
|
||||
Args:
|
||||
wav: Audio tensor [1, T]
|
||||
start_idx: Start index of search region
|
||||
end_idx: End index of search region
|
||||
Returns:
|
||||
Index of best splitting point
|
||||
"""
|
||||
segment = wav[start_idx:end_idx]
|
||||
|
||||
# Calculate RMS energy in small windows
|
||||
min_energy = math.inf
|
||||
quietest_idx = 0
|
||||
min_energy_window = self.asr_config.min_energy_split_window_size
|
||||
assert min_energy_window is not None
|
||||
for i in range(0, len(segment) - min_energy_window, min_energy_window):
|
||||
window = segment[i : i + min_energy_window]
|
||||
energy = (window**2).mean() ** 0.5
|
||||
if energy < min_energy:
|
||||
quietest_idx = i + start_idx
|
||||
min_energy = energy
|
||||
return quietest_idx
|
||||
33
vllm/entrypoints/openai/tool_parsers/__init__.py
Normal file
33
vllm/entrypoints/openai/tool_parsers/__init__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import warnings
|
||||
|
||||
|
||||
def __getattr__(name: str):
|
||||
if name == "ToolParser":
|
||||
from vllm.tool_parsers import ToolParser
|
||||
|
||||
warnings.warn(
|
||||
"`vllm.entrypoints.openai.tool_parsers.ToolParser` has been moved to "
|
||||
"`vllm.tool_parsers.ToolParser`. "
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return ToolParser
|
||||
if name == "ToolParserManager":
|
||||
from vllm.tool_parsers import ToolParserManager
|
||||
|
||||
warnings.warn(
|
||||
"`vllm.entrypoints.openai.tool_parsers.ToolParserManager` "
|
||||
"has been moved to `vllm.tool_parsers.ToolParserManager`. "
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return ToolParserManager
|
||||
|
||||
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||
49
vllm/entrypoints/openai/utils.py
Normal file
49
vllm/entrypoints/openai/utils.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import TypeVar
|
||||
|
||||
from fastapi import Request
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponseChoice,
|
||||
ChatCompletionResponseStreamChoice,
|
||||
)
|
||||
|
||||
# Used internally
|
||||
_ChatCompletionResponseChoiceT = TypeVar(
|
||||
"_ChatCompletionResponseChoiceT",
|
||||
ChatCompletionResponseChoice,
|
||||
ChatCompletionResponseStreamChoice,
|
||||
)
|
||||
|
||||
|
||||
def maybe_filter_parallel_tool_calls(
    choice: _ChatCompletionResponseChoiceT, request: ChatCompletionRequest
) -> _ChatCompletionResponseChoiceT:
    """Restrict ``choice`` to its first tool call when the request disables
    parallel tool calls.

    Mutates the choice in place and returns the same object.
    """
    if request.parallel_tool_calls:
        # Parallel calls allowed: nothing to trim.
        return choice

    if isinstance(choice, ChatCompletionResponseChoice) and choice.message.tool_calls:
        # Non-streaming choice: keep only the first complete tool call.
        choice.message.tool_calls = choice.message.tool_calls[:1]
        return choice

    is_stream_choice = isinstance(choice, ChatCompletionResponseStreamChoice)
    if is_stream_choice and choice.delta.tool_calls:
        # Streaming choice: drop deltas for any call other than index 0.
        kept = [tc for tc in choice.delta.tool_calls if tc.index == 0]
        choice.delta.tool_calls = kept

    return choice
|
||||
|
||||
|
||||
async def validate_json_request(raw_request: Request):
    """FastAPI dependency that rejects non-JSON request bodies.

    Raises:
        RequestValidationError: if the Content-Type media type is not
            ``application/json`` (parameters such as ``;charset=utf-8``
            are ignored when comparing).
    """
    header = raw_request.headers.get("content-type", "")
    # Strip any "; parameter" suffix before comparing the media type.
    media_type, _, _ = header.lower().partition(";")
    if media_type != "application/json":
        raise RequestValidationError(
            errors=["Unsupported Media Type: Only 'application/json' is allowed"]
        )
|
||||
16
vllm/entrypoints/pooling/__init__.py
Normal file
16
vllm/entrypoints/pooling/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
|
||||
def register_pooling_api_routers(app: FastAPI):
    """Attach every pooling-task router (classify/embed/score/pooling) to *app*.

    Imports are local so merely importing this package does not pull in
    all pooling serving modules.
    """
    from vllm.entrypoints.pooling.classify.api_router import router as classify_router
    from vllm.entrypoints.pooling.embed.api_router import router as embed_router
    from vllm.entrypoints.pooling.pooling.api_router import router as pooling_router
    from vllm.entrypoints.pooling.score.api_router import router as score_router

    # Registration order preserved: classify, embed, score, pooling.
    for sub_router in (classify_router, embed_router, score_router, pooling_router):
        app.include_router(sub_router)
|
||||
0
vllm/entrypoints/pooling/classify/__init__.py
Normal file
0
vllm/entrypoints/pooling/classify/__init__.py
Normal file
50
vllm/entrypoints/pooling/classify/api_router.py
Normal file
50
vllm/entrypoints/pooling/classify/api_router.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from starlette.responses import JSONResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling.classify.protocol import (
|
||||
ClassificationRequest,
|
||||
ClassificationResponse,
|
||||
)
|
||||
from vllm.entrypoints.pooling.classify.serving import ServingClassification
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def classify(request: Request) -> ServingClassification | None:
    """Return the app-level classification handler, or None if the served
    model does not support classification."""
    app_state = request.app.state
    return app_state.openai_serving_classification
|
||||
|
||||
|
||||
@router.post("/classify", dependencies=[Depends(validate_json_request)])
|
||||
@with_cancellation
|
||||
@load_aware_call
|
||||
async def create_classify(request: ClassificationRequest, raw_request: Request):
|
||||
handler = classify(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Classification API"
|
||||
)
|
||||
|
||||
try:
|
||||
generator = await handler.create_classify(request, raw_request)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
content=generator.model_dump(), status_code=generator.error.code
|
||||
)
|
||||
|
||||
elif isinstance(generator, ClassificationResponse):
|
||||
return JSONResponse(content=generator.model_dump())
|
||||
|
||||
assert_never(generator)
|
||||
181
vllm/entrypoints/pooling/classify/protocol.py
Normal file
181
vllm/entrypoints/pooling/classify/protocol.py
Normal file
@@ -0,0 +1,181 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import time
|
||||
from typing import Annotated, Any, TypeAlias
|
||||
|
||||
from pydantic import (
|
||||
Field,
|
||||
)
|
||||
|
||||
from vllm import PoolingParams
|
||||
from vllm.config.pooler import get_use_activation
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
|
||||
class ClassificationCompletionRequest(OpenAIBaseModel):
    """Raw-text classification request (one string or a batch of strings)."""

    model: str | None = None
    # One prompt or a batch of prompts to classify.
    input: list[str] | str
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
    user: str | None = None

    # --8<-- [start:classification-extra-params]
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    # Deprecated alias; resolved together with `activation` by
    # get_use_activation() in to_pooling_params().
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )

    # Deprecated alias for use_activation.
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )

    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "Default is True.",
    )
    # --8<-- [end:classification-extra-params]

    def to_pooling_params(self):
        """Translate request fields into engine-level PoolingParams."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=get_use_activation(self),
        )
|
||||
|
||||
|
||||
class ClassificationChatRequest(OpenAIBaseModel):
    """Chat-style classification request: messages are rendered through the
    model's chat template before classification."""

    model: str | None = None
    messages: list[ChatCompletionMessageParam]
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
    user: str | None = None

    # --8<-- [start:chat-classification-extra-params]
    add_generation_prompt: bool = Field(
        default=False,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )

    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )

    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )

    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )

    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )

    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )

    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    # Deprecated alias; resolved together with `activation` by
    # get_use_activation() in to_pooling_params().
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )

    # Deprecated alias for use_activation.
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )

    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "Default is True.",
    )
    # --8<-- [end:chat-classification-extra-params]

    def to_pooling_params(self):
        """Translate request fields into engine-level PoolingParams."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=get_use_activation(self),
        )
|
||||
|
||||
|
||||
# A classification request is either raw-text ("input") or chat-style
# ("messages"); serving code dispatches on the concrete type.
ClassificationRequest: TypeAlias = (
    ClassificationCompletionRequest | ClassificationChatRequest
)
|
||||
|
||||
|
||||
class ClassificationData(OpenAIBaseModel):
    """Per-input classification result."""

    index: int  # position of this result within the request batch
    label: str | None  # id2label name of the argmax class; None if unmapped
    probs: list[float]  # per-class probabilities
    num_classes: int  # == len(probs)
||||
|
||||
|
||||
class ClassificationResponse(OpenAIBaseModel):
    """Top-level response for the /classify endpoint."""

    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))  # unix seconds
    model: str
    data: list[ClassificationData]  # one entry per input, in request order
    usage: UsageInfo
|
||||
233
vllm/entrypoints/pooling/classify/serving.py
Normal file
233
vllm/entrypoints/pooling/classify/serving.py
Normal file
@@ -0,0 +1,233 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from http import HTTPStatus
|
||||
from typing import cast
|
||||
|
||||
import jinja2
|
||||
import numpy as np
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ErrorResponse,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import (
|
||||
ClassificationServeContext,
|
||||
OpenAIServing,
|
||||
ServeContext,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.pooling.classify.protocol import (
|
||||
ClassificationChatRequest,
|
||||
ClassificationCompletionRequest,
|
||||
ClassificationData,
|
||||
ClassificationRequest,
|
||||
ClassificationResponse,
|
||||
)
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import ClassificationOutput, PoolingRequestOutput
|
||||
from vllm.pooling_params import PoolingParams
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ClassificationMixin(OpenAIServing):
    """Shared preprocessing/response logic for the classification API.

    Subclasses provide the chat-template configuration attributes below.
    """

    chat_template: str | None
    chat_template_content_format: ChatTemplateContentFormatOption
    trust_request_chat_template: bool

    async def _preprocess(
        self,
        ctx: ServeContext,
    ) -> ErrorResponse | None:
        """
        Process classification inputs: tokenize text, resolve adapters,
        and prepare model-specific inputs.

        Populates ``ctx.engine_prompts`` on success and returns None;
        returns an ErrorResponse on any validation/rendering failure.
        """
        ctx = cast(ClassificationServeContext, ctx)
        try:
            ctx.tokenizer = await self.engine_client.get_tokenizer()

            request_obj = ctx.request

            if isinstance(request_obj, ClassificationChatRequest):
                # Chat-style request: render messages through the chat
                # template before tokenization.
                chat_request = request_obj
                messages = chat_request.messages
                trust_request_chat_template = getattr(
                    self,
                    "trust_request_chat_template",
                    False,
                )
                ret = self._validate_chat_template(
                    request_chat_template=chat_request.chat_template,
                    chat_template_kwargs=chat_request.chat_template_kwargs,
                    trust_request_chat_template=trust_request_chat_template,
                )
                if ret:
                    # Template validation failed; propagate the error.
                    return ret

                _, engine_prompts = await self._preprocess_chat(
                    cast(ChatCompletionRequest, chat_request),
                    ctx.tokenizer,
                    messages,
                    chat_template=(
                        chat_request.chat_template
                        or getattr(self, "chat_template", None)
                    ),
                    chat_template_content_format=cast(
                        ChatTemplateContentFormatOption,
                        getattr(self, "chat_template_content_format", "auto"),
                    ),
                    add_generation_prompt=False,
                    continue_final_message=False,
                    add_special_tokens=chat_request.add_special_tokens,
                )
                ctx.engine_prompts = engine_prompts

            elif isinstance(request_obj, ClassificationCompletionRequest):
                # Raw-text request: render the string(s) directly.
                completion_request = request_obj
                input_data = completion_request.input
                if input_data in (None, ""):
                    return self.create_error_response(
                        "Input or messages must be provided",
                        status_code=HTTPStatus.BAD_REQUEST,
                    )
                if isinstance(input_data, list) and not input_data:
                    # Empty batch is valid and yields an empty response.
                    ctx.engine_prompts = []
                    return None

                renderer = self._get_renderer(ctx.tokenizer)
                prompt_input = cast(str | list[str], input_data)
                ctx.engine_prompts = await renderer.render_prompt(
                    prompt_or_prompts=prompt_input,
                    config=self._build_render_config(completion_request),
                )
            else:
                return self.create_error_response(
                    "Invalid classification request type",
                    status_code=HTTPStatus.BAD_REQUEST,
                )

            return None

        except (ValueError, TypeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

    def _build_response(
        self,
        ctx: ServeContext,
    ) -> ClassificationResponse | ErrorResponse:
        """
        Convert model outputs to a formatted classification response
        with probabilities and labels.
        """
        ctx = cast(ClassificationServeContext, ctx)
        items: list[ClassificationData] = []
        num_prompt_tokens = 0

        final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch)

        for idx, final_res in enumerate(final_res_batch_checked):
            classify_res = ClassificationOutput.from_base(final_res.outputs)

            probs = classify_res.probs
            predicted_index = int(np.argmax(probs))
            # Map the argmax class index to a human-readable label when the
            # HF config provides an id2label mapping; otherwise None.
            label = getattr(self.model_config.hf_config, "id2label", {}).get(
                predicted_index
            )

            item = ClassificationData(
                index=idx,
                label=label,
                probs=probs,
                num_classes=len(probs),
            )

            items.append(item)
            prompt_token_ids = final_res.prompt_token_ids
            num_prompt_tokens += len(prompt_token_ids)

        # Classification produces no completion tokens, so total == prompt.
        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            total_tokens=num_prompt_tokens,
        )

        return ClassificationResponse(
            id=ctx.request_id,
            created=ctx.created_time,
            model=ctx.model_name,
            data=items,
            usage=usage,
        )

    def _build_render_config(self, request: ClassificationRequest) -> RenderConfig:
        """Build the prompt-rendering config from per-request options."""
        return RenderConfig(
            max_length=self.max_model_len,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
        )
|
||||
|
||||
|
||||
class ServingClassification(ClassificationMixin):
    """Concrete serving handler for the /classify endpoint."""

    # Prefix used when deriving per-request IDs.
    request_id_prefix = "classify"

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        chat_template: str | None = None,
        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
        trust_request_chat_template: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            log_error_stack=log_error_stack,
        )

        # Chat-template settings consumed by ClassificationMixin._preprocess.
        self.chat_template = chat_template
        self.chat_template_content_format = chat_template_content_format
        self.trust_request_chat_template = trust_request_chat_template

    async def create_classify(
        self,
        request: ClassificationRequest,
        raw_request: Request,
    ) -> ClassificationResponse | ErrorResponse:
        """Entry point: wrap the request in a serve context and delegate to
        the generic OpenAIServing.handle pipeline."""
        model_name = self.models.model_name()
        request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"

        ctx = ClassificationServeContext(
            request=request,
            raw_request=raw_request,
            model_name=model_name,
            request_id=request_id,
        )

        return await super().handle(ctx)  # type: ignore

    def _create_pooling_params(
        self,
        ctx: ServeContext[ClassificationRequest],
    ) -> PoolingParams | ErrorResponse:
        """Create pooling params and verify they are valid for the
        "classify" task on this model."""
        pooling_params = super()._create_pooling_params(ctx)
        if isinstance(pooling_params, ErrorResponse):
            return pooling_params

        try:
            pooling_params.verify("classify", self.model_config)
        except ValueError as e:
            return self.create_error_response(str(e))

        return pooling_params
|
||||
0
vllm/entrypoints/pooling/embed/__init__.py
Normal file
0
vllm/entrypoints/pooling/embed/__init__.py
Normal file
67
vllm/entrypoints/pooling/embed/api_router.py
Normal file
67
vllm/entrypoints/pooling/embed/api_router.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling.embed.protocol import (
|
||||
EmbeddingBytesResponse,
|
||||
EmbeddingRequest,
|
||||
EmbeddingResponse,
|
||||
)
|
||||
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def embedding(request: Request) -> OpenAIServingEmbedding | None:
    """Return the app-level embedding handler, or None if the served model
    does not support embeddings."""
    app_state = request.app.state
    return app_state.openai_serving_embedding
|
||||
|
||||
|
||||
@router.post(
    "/v1/embeddings",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_embedding(
    request: EmbeddingRequest,
    raw_request: Request,
):
    """Handle POST /v1/embeddings: JSON response, binary stream, or error."""
    handler = embedding(raw_request)
    if handler is None:
        # No embedding handler was configured for this model; borrow another
        # serving object purely to format a standard error response.
        base_server = raw_request.app.state.openai_serving_tokenization
        return base_server.create_error_response(
            message="The model does not support Embeddings API"
        )

    try:
        generator = await handler.create_embedding(request, raw_request)
    except Exception as e:
        # Unexpected failures become a 500 with the original cause chained.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e

    if isinstance(generator, ErrorResponse):
        # Handler-produced errors carry their own HTTP status code.
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, EmbeddingResponse):
        return JSONResponse(content=generator.model_dump())
    elif isinstance(generator, EmbeddingBytesResponse):
        # Binary encodings are streamed rather than JSON-serialized.
        return StreamingResponse(
            content=generator.content,
            headers=generator.headers,
            media_type=generator.media_type,
        )

    # Exhaustiveness check: any other type is a programming error.
    assert_never(generator)
|
||||
208
vllm/entrypoints/pooling/embed/protocol.py
Normal file
208
vllm/entrypoints/pooling/embed/protocol.py
Normal file
@@ -0,0 +1,208 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import time
|
||||
from typing import Annotated, Any, TypeAlias
|
||||
|
||||
from pydantic import (
|
||||
Field,
|
||||
model_validator,
|
||||
)
|
||||
|
||||
from vllm import PoolingParams
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness
|
||||
|
||||
|
||||
class EmbeddingCompletionRequest(OpenAIBaseModel):
    """Raw-input embedding request (strings or pre-tokenized token IDs)."""

    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/embeddings
    model: str | None = None
    # Text(s) or token-ID list(s) to embed.
    input: list[int] | list[list[int]] | str | list[str]
    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None
    user: str | None = None
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:embedding-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior."
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:embedding-extra-params]

    def to_pooling_params(self):
        """Translate request fields into engine-level PoolingParams."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
        )
|
||||
|
||||
|
||||
class EmbeddingChatRequest(OpenAIBaseModel):
    """Chat-style embedding request: messages are rendered through the
    model's chat template before embedding."""

    model: str | None = None
    messages: list[ChatCompletionMessageParam]

    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None
    user: str | None = None
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:chat-embedding-extra-params]
    add_generation_prompt: bool = Field(
        default=False,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )

    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior."
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:chat-embedding-extra-params]

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        # Runs before field validation, so `data` is the raw input mapping;
        # rejects the mutually exclusive template flags up front.
        if data.get("continue_final_message") and data.get("add_generation_prompt"):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data

    def to_pooling_params(self):
        """Translate request fields into engine-level PoolingParams."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
        )
|
||||
|
||||
|
||||
# An embedding request is either raw-input ("input") or chat-style
# ("messages"); serving code dispatches on the concrete type.
EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest
|
||||
|
||||
|
||||
class EmbeddingResponseData(OpenAIBaseModel):
    """Single embedding entry in an EmbeddingResponse."""

    index: int  # position of this result within the request batch
    object: str = "embedding"
    embedding: list[float] | str  # float list, or base64 string encoding
|
||||
|
||||
|
||||
class EmbeddingResponse(OpenAIBaseModel):
    """Top-level JSON response for the /v1/embeddings endpoint."""

    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))  # unix seconds
    model: str
    data: list[EmbeddingResponseData]  # one entry per input, in request order
    usage: UsageInfo
|
||||
|
||||
|
||||
class EmbeddingBytesResponse(OpenAIBaseModel):
    """Raw binary embedding payload, returned via StreamingResponse."""

    content: list[bytes]  # pre-encoded byte chunks to stream
    headers: dict[str, str] | None = None
    media_type: str = "application/octet-stream"
||||
684
vllm/entrypoints/pooling/embed/serving.py
Normal file
684
vllm/entrypoints/pooling/embed/serving.py
Normal file
@@ -0,0 +1,684 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from collections.abc import AsyncGenerator, Mapping
|
||||
from typing import Any, Final, cast
|
||||
|
||||
import torch
|
||||
from fastapi import Request
|
||||
from fastapi.responses import Response
|
||||
from typing_extensions import assert_never, override
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import (
|
||||
EmbeddingServeContext,
|
||||
OpenAIServing,
|
||||
ServeContext,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.pooling.embed.protocol import (
|
||||
EmbeddingBytesResponse,
|
||||
EmbeddingChatRequest,
|
||||
EmbeddingCompletionRequest,
|
||||
EmbeddingRequest,
|
||||
EmbeddingResponse,
|
||||
EmbeddingResponseData,
|
||||
)
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import (
|
||||
EmbeddingRequestOutput,
|
||||
PoolingOutput,
|
||||
PoolingRequestOutput,
|
||||
RequestOutput,
|
||||
)
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.utils.async_utils import merge_async_iterators
|
||||
from vllm.utils.collection_utils import chunk_list
|
||||
from vllm.utils.serial_utils import (
|
||||
EmbedDType,
|
||||
EncodingFormat,
|
||||
Endianness,
|
||||
encode_pooling_bytes,
|
||||
encode_pooling_output,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class EmbeddingMixin(OpenAIServing):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
pooler_config = self.model_config.pooler_config
|
||||
|
||||
# Avoid repeated attribute lookups
|
||||
self.supports_chunked_processing = bool(
|
||||
pooler_config and pooler_config.enable_chunked_processing
|
||||
)
|
||||
self.max_embed_len = (
|
||||
pooler_config.max_embed_len
|
||||
if pooler_config and pooler_config.max_embed_len
|
||||
else None
|
||||
)
|
||||
|
||||
    @override
    async def _preprocess(
        self,
        ctx: ServeContext,
    ) -> ErrorResponse | None:
        """Render the request's input into engine prompts on ``ctx``.

        Returns None on success, or an ErrorResponse if rendering the
        prompts fails with a ValueError/TypeError.
        """
        ctx = cast(EmbeddingServeContext, ctx)
        try:
            ctx.lora_request = self._maybe_get_adapters(ctx.request)

            tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            if isinstance(ctx.request, EmbeddingChatRequest):
                # Chat-style input: apply the chat template; only the rendered
                # engine prompts are kept (the conversation is unused here).
                _, ctx.engine_prompts = await self._preprocess_chat(
                    ctx.request,
                    tokenizer,
                    ctx.request.messages,
                    chat_template=ctx.request.chat_template or ctx.chat_template,
                    chat_template_content_format=ctx.chat_template_content_format,
                    add_generation_prompt=ctx.request.add_generation_prompt,
                    continue_final_message=False,
                    add_special_tokens=ctx.request.add_special_tokens,
                )
            else:
                # Completion-style input: render raw text/token prompts.
                ctx.engine_prompts = await renderer.render_prompt(
                    prompt_or_prompts=ctx.request.input,
                    config=self._build_render_config(ctx.request),
                )
            return None
        except (ValueError, TypeError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))
|
||||
|
||||
def _build_render_config(self, request: EmbeddingCompletionRequest) -> RenderConfig:
|
||||
# Set max_length based on chunked processing capability
|
||||
if self._should_use_chunked_processing(request):
|
||||
max_length = None
|
||||
else:
|
||||
max_length = self.max_embed_len or self.max_model_len
|
||||
|
||||
return RenderConfig(
|
||||
max_length=max_length,
|
||||
truncate_prompt_tokens=request.truncate_prompt_tokens,
|
||||
add_special_tokens=request.add_special_tokens,
|
||||
)
|
||||
|
||||
    @override
    def _build_response(
        self,
        ctx: ServeContext,
    ) -> EmbeddingResponse | Response | ErrorResponse:
        """Serialize the pooled batch according to the request's encoding_format.

        "float"/"base64" produce a JSON ``EmbeddingResponse``; "bytes"/
        "bytes_only" produce a binary ``EmbeddingBytesResponse``.
        """
        final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch)

        encoding_format: EncodingFormat = ctx.request.encoding_format
        embed_dtype: EmbedDType = ctx.request.embed_dtype
        endianness: Endianness = ctx.request.endianness

        def encode_float_base64():
            # JSON path: one EmbeddingResponseData per prompt, indexed by
            # batch position.
            items: list[EmbeddingResponseData] = []
            num_prompt_tokens = 0

            for idx, final_res in enumerate(final_res_batch_checked):
                item = EmbeddingResponseData(
                    index=idx,
                    embedding=encode_pooling_output(
                        final_res,
                        encoding_format=encoding_format,
                        embed_dtype=embed_dtype,
                        endianness=endianness,
                    ),
                )
                prompt_token_ids = final_res.prompt_token_ids

                items.append(item)
                num_prompt_tokens += len(prompt_token_ids)

            # Embedding requests consume no completion tokens, so total
            # equals prompt tokens.
            usage = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                total_tokens=num_prompt_tokens,
            )

            return EmbeddingResponse(
                id=ctx.request_id,
                created=ctx.created_time,
                model=ctx.model_name,
                data=items,
                usage=usage,
            )

        def encode_bytes(bytes_only: bool) -> EmbeddingBytesResponse:
            # Binary path: pack the embeddings; response metadata rides in a
            # JSON header unless the caller asked for bytes only.
            content, items, usage = encode_pooling_bytes(
                pooling_outputs=final_res_batch_checked,
                embed_dtype=embed_dtype,
                endianness=endianness,
            )

            headers = (
                None
                if bytes_only
                else {
                    "metadata": json.dumps(
                        {
                            "id": ctx.request_id,
                            "created": ctx.created_time,
                            "model": ctx.model_name,
                            "data": items,
                            "usage": usage,
                        }
                    )
                }
            )

            return EmbeddingBytesResponse(content=content, headers=headers)

        if encoding_format == "float" or encoding_format == "base64":
            return encode_float_base64()
        elif encoding_format == "bytes" or encoding_format == "bytes_only":
            return encode_bytes(bytes_only=encoding_format == "bytes_only")
        else:
            assert_never(encoding_format)
|
||||
|
||||
    def _get_max_position_embeddings(self) -> int:
        """Get the model's effective maximum sequence length for chunking.

        NOTE(review): despite the name, this returns ``max_model_len``, so
        chunk boundaries follow the serving-time context limit.
        """
        return self.model_config.max_model_len
|
||||
|
||||
def _should_use_chunked_processing(self, request) -> bool:
|
||||
"""Check if chunked processing should be used for this request."""
|
||||
return (
|
||||
isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest))
|
||||
and self.supports_chunked_processing
|
||||
)
|
||||
|
||||
    async def _process_chunked_request(
        self,
        ctx: EmbeddingServeContext,
        token_ids: list[int],
        pooling_params,
        trace_headers,
        prompt_idx: int,
    ) -> list[AsyncGenerator[PoolingRequestOutput, None]]:
        """Process a single prompt using chunked processing.

        Splits ``token_ids`` into max_position_embeddings-sized chunks and
        returns one engine generator per chunk; per-chunk results are later
        MEAN-aggregated in ``_collect_batch``.
        """
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

        # Split into chunks using max_position_embeddings
        max_pos_embeddings = self._get_max_position_embeddings()
        # Process all chunks for MEAN aggregation
        for chunk_idx, chunk_tokens in enumerate(
            chunk_list(token_ids, max_pos_embeddings)
        ):
            # Create a request ID for this chunk; _collect_batch parses the
            # "-prompt-<i>-chunk-<j>" suffix to regroup results per prompt.
            chunk_request_id = f"{ctx.request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"

            # Create engine prompt for this chunk
            chunk_engine_prompt = TokensPrompt(prompt_token_ids=chunk_tokens)

            # Log the chunk
            self._log_inputs(
                chunk_request_id,
                chunk_engine_prompt,
                params=pooling_params,
                lora_request=ctx.lora_request,
            )

            # Create generator for this chunk (returned unwrapped; indices
            # are recovered from the request ID downstream)
            original_generator = self.engine_client.encode(
                chunk_engine_prompt,
                pooling_params,
                chunk_request_id,
                lora_request=ctx.lora_request,
                trace_headers=trace_headers,
                priority=getattr(ctx.request, "priority", 0),
            )

            generators.append(original_generator)

        return generators
|
||||
|
||||
    def _validate_input(
        self,
        request,
        input_ids: list[int],
        input_text: str,
    ) -> TokensPrompt:
        """Override to support chunked processing for embedding requests.

        Validates the tokenized length against the effective limit
        (max_embed_len when configured, else max_model_len) and, beyond
        max_position_embeddings, either permits chunked processing or raises.

        Raises:
            ValueError: if the input exceeds the applicable length limit.
        """
        token_num = len(input_ids)

        # Note: EmbeddingRequest doesn't have max_tokens
        if isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest)):
            # Check if chunked processing is enabled for pooling models
            enable_chunked = self._should_use_chunked_processing(request)

            # Use max_position_embeddings for chunked processing decisions
            max_pos_embeddings = self._get_max_position_embeddings()

            # Determine the effective max length for validation
            if self.max_embed_len is not None:
                # Use max_embed_len for validation instead of max_model_len
                length_type = "maximum embedding input length"
                max_length_value = self.max_embed_len
            else:
                # Fall back to max_model_len validation (original behavior)
                length_type = "maximum context length"
                max_length_value = self.max_model_len

            validation_error_msg = (
                "This model's {length_type} is {max_length_value} tokens. "
                "However, you requested {token_num} tokens in the input for "
                "embedding generation. Please reduce the length of the input."
            )

            chunked_processing_error_msg = (
                "This model's {length_type} is {max_length_value} tokens. "
                "However, you requested {token_num} tokens in the input for "
                "embedding generation. Please reduce the length of the input "
                "or enable chunked processing."
            )

            # Check if input exceeds max length
            if token_num > max_length_value:
                raise ValueError(
                    validation_error_msg.format(
                        length_type=length_type,
                        max_length_value=max_length_value,
                        token_num=token_num,
                    )
                )

            # Check for chunked processing
            # when exceeding max_position_embeddings
            if token_num > max_pos_embeddings:
                if enable_chunked:
                    # Allow long inputs when chunked processing is enabled
                    logger.info(
                        "Input length %s exceeds max_position_embeddings "
                        "%s, will use chunked processing",
                        token_num,
                        max_pos_embeddings,
                    )
                else:
                    raise ValueError(
                        chunked_processing_error_msg.format(
                            length_type="maximum position embeddings length",
                            max_length_value=max_pos_embeddings,
                            token_num=token_num,
                        )
                    )

            return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)

        # For other request types, use the parent's implementation
        return super()._validate_input(request, input_ids, input_text)
|
||||
|
||||
async def _create_single_prompt_generator(
|
||||
self,
|
||||
ctx: EmbeddingServeContext,
|
||||
engine_prompt: TokensPrompt,
|
||||
pooling_params: PoolingParams,
|
||||
trace_headers: Mapping[str, str] | None,
|
||||
prompt_index: int,
|
||||
) -> AsyncGenerator[RequestOutput | PoolingRequestOutput, None]:
|
||||
"""Create a generator for a single prompt using standard processing."""
|
||||
request_id_item = f"{ctx.request_id}-{prompt_index}"
|
||||
|
||||
self._log_inputs(
|
||||
request_id_item,
|
||||
engine_prompt,
|
||||
params=pooling_params,
|
||||
lora_request=ctx.lora_request,
|
||||
)
|
||||
|
||||
# Return the original generator without wrapping
|
||||
return self.engine_client.encode(
|
||||
engine_prompt,
|
||||
pooling_params,
|
||||
request_id_item,
|
||||
lora_request=ctx.lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=getattr(ctx.request, "priority", 0),
|
||||
)
|
||||
|
||||
    @override
    async def _prepare_generators(
        self,
        ctx: ServeContext,
    ) -> ErrorResponse | None:
        """Override to support chunked processing.

        Prompts longer than max_position_embeddings are fanned out into
        per-chunk engine calls; all resulting streams are merged into
        ``ctx.result_generator``.
        """
        ctx = cast(EmbeddingServeContext, ctx)

        # Check if we should use chunked processing
        use_chunked = self._should_use_chunked_processing(ctx.request)

        # If no chunked processing needed, delegate to parent class
        if not use_chunked:
            return await super()._prepare_generators(ctx)

        # Custom logic for chunked processing
        generators: list[
            AsyncGenerator[RequestOutput | PoolingRequestOutput, None]
        ] = []

        try:
            trace_headers = (
                None
                if ctx.raw_request is None
                else await self._get_trace_headers(ctx.raw_request.headers)
            )

            pooling_params = self._create_pooling_params(ctx)
            if isinstance(pooling_params, ErrorResponse):
                return pooling_params

            # Verify and set the task for pooling params
            try:
                pooling_params.verify("embed", self.model_config)
            except ValueError as e:
                return self.create_error_response(str(e))

            if ctx.engine_prompts is None:
                return self.create_error_response("Engine prompts not available")

            max_pos_embeddings = self._get_max_position_embeddings()

            for i, engine_prompt in enumerate(ctx.engine_prompts):
                # Check if this specific prompt needs chunked processing
                if "prompt_token_ids" in engine_prompt:
                    prompt_token_ids = engine_prompt["prompt_token_ids"]
                    if len(prompt_token_ids) > max_pos_embeddings:
                        # Use chunked processing for this prompt
                        chunk_generators = await self._process_chunked_request(
                            ctx,
                            prompt_token_ids,
                            pooling_params,
                            trace_headers,
                            i,
                        )
                        generators.extend(chunk_generators)
                        continue

                # Normal processing for short prompts or non-token prompts
                generator = await self._create_single_prompt_generator(
                    ctx, engine_prompt, pooling_params, trace_headers, i
                )
                generators.append(generator)

            ctx.result_generator = merge_async_iterators(*generators)

            return None

        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))
|
||||
|
||||
    @override
    async def _collect_batch(
        self,
        ctx: ServeContext,
    ) -> ErrorResponse | None:
        """Collect and aggregate batch results
        with support for chunked processing.

        For chunked requests, performs online aggregation to
        minimize memory usage.
        For regular requests, collects results normally.
        """
        ctx = cast(EmbeddingServeContext, ctx)
        try:
            if ctx.engine_prompts is None:
                return self.create_error_response("Engine prompts not available")

            # Check if we used chunked processing
            use_chunked = self._should_use_chunked_processing(ctx.request)

            if not use_chunked:
                return await super()._collect_batch(ctx=ctx)

            if ctx.result_generator is None:
                return self.create_error_response("Result generator not available")

            # Online aggregation for chunked requests to
            # minimize memory usage
            # Track aggregation state for each prompt
            prompt_aggregators: dict[int, dict[str, Any]] = {}
            short_prompts_results: dict[int, PoolingRequestOutput] = {}

            async for result_idx, result in ctx.result_generator:
                if "-chunk-" in result.request_id:
                    # Extract prompt_idx from chunked request_id
                    # (format: "<id>-prompt-<i>-chunk-<j>", see
                    # _process_chunked_request)
                    parts = result.request_id.split("-")
                    try:
                        prompt_idx = int(parts[parts.index("prompt") + 1])
                    except (ValueError, IndexError):
                        # Fallback: extract from result_idx if parsing fails
                        prompt_idx = result_idx

                    # Initialize aggregator for this prompt if needed
                    if prompt_idx not in prompt_aggregators:
                        prompt_aggregators[prompt_idx] = {
                            "weighted_sum": None,
                            "total_weight": 0,
                            "chunk_count": 0,
                            "request_id": result.request_id.split("-chunk-")[0],
                        }

                    aggregator = prompt_aggregators[prompt_idx]

                    # MEAN pooling with online weighted averaging
                    # Ensure result is PoolingRequestOutput
                    # for embedding processing
                    if not isinstance(result, PoolingRequestOutput):
                        return self.create_error_response(
                            f"Expected PoolingRequestOutput for "
                            f"chunked embedding, got "
                            f"{type(result).__name__}"
                        )

                    # Handle both PoolingOutput and
                    # EmbeddingOutput types
                    if hasattr(result.outputs, "data"):
                        # PoolingOutput case
                        embedding_data = result.outputs.data
                    elif hasattr(result.outputs, "embedding"):
                        # EmbeddingOutput case -
                        # convert embedding list to tensor
                        embedding_data = result.outputs.embedding
                    else:
                        return self.create_error_response(
                            f"Unsupported output type: {type(result.outputs).__name__}"
                        )

                    if not isinstance(embedding_data, torch.Tensor):
                        embedding_data = torch.tensor(
                            embedding_data, dtype=torch.float32
                        )

                    if result.prompt_token_ids is None:
                        return self.create_error_response(
                            "prompt_token_ids cannot be None for chunked processing"
                        )
                    # Each chunk is weighted by its token count so the final
                    # mean matches a whole-prompt mean.
                    weight = len(result.prompt_token_ids)

                    weighted_embedding = embedding_data.to(dtype=torch.float32) * weight

                    if aggregator["weighted_sum"] is None:
                        # First chunk
                        aggregator["weighted_sum"] = weighted_embedding
                    else:
                        # Accumulate
                        aggregator["weighted_sum"] += weighted_embedding

                    aggregator["total_weight"] += weight
                    aggregator["chunk_count"] += 1
                else:
                    # Non-chunked result - extract prompt_idx from request_id
                    parts = result.request_id.split("-")
                    try:
                        # Last part should be prompt index
                        prompt_idx = int(parts[-1])
                    except (ValueError, IndexError):
                        prompt_idx = result_idx  # Fallback to result_idx

                    short_prompts_results[prompt_idx] = cast(
                        PoolingRequestOutput, result
                    )

            # Finalize aggregated results
            final_res_batch: list[PoolingRequestOutput | EmbeddingRequestOutput] = []
            num_prompts = len(ctx.engine_prompts)

            for prompt_idx in range(num_prompts):
                if prompt_idx in prompt_aggregators:
                    # Finalize MEAN aggregation for this chunked prompt
                    aggregator = prompt_aggregators[prompt_idx]

                    weighted_sum = aggregator["weighted_sum"]
                    total_weight = aggregator["total_weight"]

                    if (
                        weighted_sum is not None
                        and isinstance(weighted_sum, torch.Tensor)
                        and isinstance(total_weight, (int, float))
                        and total_weight > 0
                    ):
                        # Compute final mean embedding
                        final_embedding = weighted_sum / total_weight

                        # Create a PoolingRequestOutput
                        # for the aggregated result
                        pooling_output_data = PoolingOutput(data=final_embedding)

                        # Get original prompt token IDs for this prompt
                        original_prompt = ctx.engine_prompts[prompt_idx]
                        if "prompt_token_ids" not in original_prompt:
                            return self.create_error_response(
                                f"Chunked prompt {prompt_idx} does not contain "
                                "token IDs"
                            )

                        original_token_ids = original_prompt["prompt_token_ids"]

                        pooling_request_output = PoolingRequestOutput(
                            request_id=aggregator["request_id"],
                            prompt_token_ids=original_token_ids,
                            outputs=pooling_output_data,
                            num_cached_tokens=0,
                            finished=True,
                        )

                        final_res_batch.append(pooling_request_output)
                    else:
                        return self.create_error_response(
                            f"Failed to aggregate chunks for prompt {prompt_idx}"
                        )
                elif prompt_idx in short_prompts_results:
                    final_res_batch.append(
                        cast(PoolingRequestOutput, short_prompts_results[prompt_idx])
                    )
                else:
                    return self.create_error_response(
                        f"Result not found for prompt {prompt_idx}"
                    )

            ctx.final_res_batch = cast(
                list[RequestOutput | PoolingRequestOutput], final_res_batch
            )

            return None

        except Exception as e:
            return self.create_error_response(str(e))
|
||||
|
||||
|
||||
class OpenAIServingEmbedding(EmbeddingMixin):
    """OpenAI-compatible embeddings front-end built on ``EmbeddingMixin``."""

    # Prefix used when deriving per-request IDs (e.g. "embd-<uuid>").
    request_id_prefix = "embd"

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
        trust_request_chat_template: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            log_error_stack=log_error_stack,
        )

        # Server-side chat template settings; request-supplied templates are
        # only honored when trust_request_chat_template is True (checked in
        # _preprocess).
        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format
        self.trust_request_chat_template = trust_request_chat_template

    async def create_embedding(
        self,
        request: EmbeddingRequest,
        raw_request: Request | None = None,
    ) -> EmbeddingResponse | ErrorResponse:
        """
        Embedding API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/embeddings/create
        for the API specification. This API mimics the OpenAI Embedding API.
        """
        model_name = self.models.model_name()
        request_id = (
            f"{self.request_id_prefix}-"
            f"{self._base_request_id(raw_request, request.request_id)}"
        )

        ctx = EmbeddingServeContext(
            request=request,
            raw_request=raw_request,
            model_name=model_name,
            request_id=request_id,
            chat_template=self.chat_template,
            chat_template_content_format=self.chat_template_content_format,
        )

        return await super().handle(ctx)  # type: ignore

    @override
    def _create_pooling_params(
        self,
        ctx: ServeContext[EmbeddingRequest],
    ) -> PoolingParams | ErrorResponse:
        """Build pooling params and verify they are valid for the embed task."""
        pooling_params = super()._create_pooling_params(ctx)
        if isinstance(pooling_params, ErrorResponse):
            return pooling_params

        try:
            pooling_params.verify("embed", self.model_config)
        except ValueError as e:
            return self.create_error_response(str(e))

        return pooling_params

    async def _preprocess(
        self,
        ctx: ServeContext,
    ) -> ErrorResponse | None:
        """Validate a request-supplied chat template, then delegate to the
        mixin's preprocessing."""
        if isinstance(ctx.request, EmbeddingChatRequest):
            error_check_ret = self._validate_chat_template(
                request_chat_template=ctx.request.chat_template,
                chat_template_kwargs=ctx.request.chat_template_kwargs,
                trust_request_chat_template=self.trust_request_chat_template,
            )
            if error_check_ret is not None:
                return error_check_ret
        return await super()._preprocess(ctx)
|
||||
0
vllm/entrypoints/pooling/pooling/__init__.py
Normal file
0
vllm/entrypoints/pooling/pooling/__init__.py
Normal file
63
vllm/entrypoints/pooling/pooling/api_router.py
Normal file
63
vllm/entrypoints/pooling/pooling/api_router.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling.pooling.protocol import (
|
||||
IOProcessorResponse,
|
||||
PoolingBytesResponse,
|
||||
PoolingRequest,
|
||||
PoolingResponse,
|
||||
)
|
||||
from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def pooling(request: Request) -> OpenAIServingPooling | None:
    """Return the Pooling API handler registered on the app, or None."""
    app_state = request.app.state
    return app_state.openai_serving_pooling
|
||||
|
||||
|
||||
@router.post(
    "/pooling",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_pooling(request: PoolingRequest, raw_request: Request):
    """Handle POST /pooling: dispatch to the pooling handler and map each
    result type onto the matching FastAPI response."""
    handler = pooling(raw_request)
    if handler is None:
        # No pooling handler registered; borrow the tokenization server just
        # to produce a well-formed error response.
        base_server = raw_request.app.state.openai_serving_tokenization
        return base_server.create_error_response(
            message="The model does not support Pooling API"
        )
    try:
        generator = await handler.create_pooling(request, raw_request)
    except Exception as e:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e
    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, (PoolingResponse, IOProcessorResponse)):
        return JSONResponse(content=generator.model_dump())
    elif isinstance(generator, PoolingBytesResponse):
        # Raw packed embeddings; metadata (if any) travels in the headers.
        return StreamingResponse(
            content=generator.content,
            headers=generator.headers,
            media_type=generator.media_type,
        )

    assert_never(generator)
|
||||
148
vllm/entrypoints/pooling/pooling/protocol.py
Normal file
148
vllm/entrypoints/pooling/pooling/protocol.py
Normal file
@@ -0,0 +1,148 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import time
|
||||
from typing import Generic, TypeAlias, TypeVar
|
||||
|
||||
from pydantic import (
|
||||
Field,
|
||||
)
|
||||
|
||||
from vllm import PoolingParams
|
||||
from vllm.config.pooler import get_use_activation
|
||||
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
|
||||
from vllm.entrypoints.pooling.embed.protocol import (
|
||||
EmbeddingChatRequest,
|
||||
EmbeddingCompletionRequest,
|
||||
)
|
||||
from vllm.tasks import PoolingTask
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness
|
||||
|
||||
|
||||
class PoolingCompletionRequest(EmbeddingCompletionRequest):
    """Completion-style /pooling request: embedding params plus an explicit
    pooling task and activation controls."""

    # Optional explicit pooling task; None lets the server choose.
    task: PoolingTask | None = None
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )
    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "If it is a classify or token_classify task, the default is True; "
        "for other tasks, this value should be None.",
    )

    def to_pooling_params(self):
        """Build PoolingParams, resolving the deprecated activation aliases
        via get_use_activation."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
            use_activation=get_use_activation(self),
        )
|
||||
|
||||
|
||||
class PoolingChatRequest(EmbeddingChatRequest):
    """Chat-style /pooling request: embedding params plus an explicit
    pooling task and activation controls."""

    # Optional explicit pooling task; None lets the server choose.
    task: PoolingTask | None = None
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )
    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "If it is a classify or token_classify task, the default is True; "
        "for other tasks, this value should be None.",
    )

    def to_pooling_params(self):
        """Build PoolingParams, resolving the deprecated activation aliases
        via get_use_activation."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
            use_activation=get_use_activation(self),
        )
|
||||
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
    """Request for pooling through an IO Processor plugin.

    The ``data`` payload is interpreted by the plugin itself, hence the
    generic type parameter.
    """

    model: str | None = None

    priority: int = Field(default=0)
    """
    The priority of the request (lower means earlier handling;
    default: 0). Any priority other than 0 will raise an error
    if the served model does not use priority scheduling.
    """
    # Plugin-defined input payload.
    data: T

    # IO-processor requests always run the "plugin" pooling task.
    task: PoolingTask = "plugin"
    encoding_format: EncodingFormat = "float"
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            # Fixed: a space was missing here, producing "behavior.This" in
            # the rendered API docs (the embed_dtype description has it).
            "base64 encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )

    def to_pooling_params(self):
        """Plugins manage their own pooling behavior; return default params."""
        return PoolingParams()
|
||||
|
||||
|
||||
class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
    """Response wrapper for IO Processor plugin pooling requests."""

    request_id: str | None = None
    """
    The request_id associated with this response
    """
    # Unix timestamp (seconds) at which the response was created.
    created_at: int = Field(default_factory=lambda: int(time.time()))

    data: T
    """
    When using plugins IOProcessor plugins, the actual output is generated
    by the plugin itself. Hence, we use a generic type for the response data
    """
|
||||
|
||||
|
||||
# Union accepted by the /pooling endpoint: completion- or chat-style pooling
# requests, plus IO-processor plugin requests.
PoolingRequest: TypeAlias = (
    PoolingCompletionRequest | PoolingChatRequest | IOProcessorRequest
)
|
||||
|
||||
|
||||
class PoolingResponseData(OpenAIBaseModel):
    """A single pooling result within a ``PoolingResponse`` batch."""

    # Position of the originating prompt in the request batch.
    index: int
    # Object discriminator for this entry type.
    object: str = "pooling"
    # Pooled output: per-token vectors, a single vector, or an encoded string.
    data: list[list[float]] | list[float] | str
|
||||
|
||||
|
||||
class PoolingResponse(OpenAIBaseModel):
    """Top-level JSON response for the /pooling endpoint."""

    # Request identifier; auto-generated unless supplied by the server.
    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
    # Object discriminator, fixed to "list".
    object: str = "list"
    # Unix timestamp (seconds) of response creation.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    # One entry per input prompt, in request order.
    data: list[PoolingResponseData]
    usage: UsageInfo
|
||||
|
||||
|
||||
class PoolingBytesResponse(OpenAIBaseModel):
    """Binary pooling payload streamed back for bytes encoding formats."""

    # Raw packed output chunks to stream to the client.
    content: list[bytes]
    # Optional response headers (e.g. a JSON "metadata" header); None for
    # bytes-only responses.
    headers: dict[str, str] | None = None
    media_type: str = "application/octet-stream"
|
||||
354
vllm/entrypoints/pooling/pooling/serving.py
Normal file
354
vllm/entrypoints/pooling/pooling/serving.py
Normal file
@@ -0,0 +1,354 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, Sequence
|
||||
from typing import Final, cast
|
||||
|
||||
import jinja2
|
||||
from fastapi import Request
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.pooling.pooling.protocol import (
|
||||
IOProcessorRequest,
|
||||
IOProcessorResponse,
|
||||
PoolingBytesResponse,
|
||||
PoolingChatRequest,
|
||||
PoolingCompletionRequest,
|
||||
PoolingRequest,
|
||||
PoolingResponse,
|
||||
PoolingResponseData,
|
||||
)
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import PoolingRequestOutput
|
||||
from vllm.tasks import PoolingTask, SupportedTask
|
||||
from vllm.utils.async_utils import merge_async_iterators
|
||||
from vllm.utils.serial_utils import (
|
||||
EmbedDType,
|
||||
EncodingFormat,
|
||||
Endianness,
|
||||
encode_pooling_bytes,
|
||||
encode_pooling_output,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class OpenAIServingPooling(OpenAIServing):
    """Serving handler for the pooling endpoint (embedding-style outputs)."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        supported_tasks: tuple[SupportedTask, ...],
        request_logger: RequestLogger | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
        trust_request_chat_template: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            log_error_stack=log_error_stack,
        )

        # Tasks the loaded model can serve; used to resolve/validate the
        # pooling task per request in create_pooling().
        self.supported_tasks = supported_tasks
        # Server-side default chat template; a request may override it only
        # when trust_request_chat_template is enabled.
        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format
        self.trust_request_chat_template = trust_request_chat_template
|
||||
    async def create_pooling(
        self,
        request: PoolingRequest,
        raw_request: Request | None = None,
    ) -> PoolingResponse | IOProcessorResponse | PoolingBytesResponse | ErrorResponse:
        """
        See https://platform.openai.com/docs/api-reference/embeddings/create
        for the API specification. This API mimics the OpenAI Embedding API.

        Handles three request shapes: IOProcessor-plugin requests (plugin does
        pre/post-processing), chat-style, and completion-style pooling
        requests. Returns an ErrorResponse instead of raising for expected
        validation failures.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        model_name = self.models.model_name()

        request_id = f"pool-{self._base_request_id(raw_request)}"
        created_time = int(time.time())

        is_io_processor_request = isinstance(request, IOProcessorRequest)
        # --- Phase 1: preprocessing (tokenization / template rendering) ---
        try:
            lora_request = self._maybe_get_adapters(request)

            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            if getattr(request, "dimensions", None) is not None:
                return self.create_error_response(
                    "dimensions is currently not supported"
                )

            # NOTE(review): the validated value is not forwarded anywhere
            # below — this call is used for its validation side effect only
            # (it raises/returns on an out-of-range size). Confirm intended.
            truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
            truncate_prompt_tokens = _validate_truncation_size(
                self.max_model_len, truncate_prompt_tokens
            )

            if is_io_processor_request:
                if self.io_processor is None:
                    raise ValueError(
                        "No IOProcessor plugin installed. Please refer "
                        "to the documentation and to the "
                        "'prithvi_geospatial_mae_io_processor' "
                        "offline inference example for more details."
                    )

                validated_prompt = self.io_processor.parse_request(request)

                engine_prompts = await self.io_processor.pre_process_async(
                    prompt=validated_prompt, request_id=request_id
                )
                # Normalize a single prompt (or string-like) into a list so
                # the scheduling loop below can treat all cases uniformly.
                if not isinstance(engine_prompts, Sequence) or isinstance(
                    engine_prompts, (str, bytes, bytearray)
                ):
                    engine_prompts = [engine_prompts]

            elif isinstance(request, PoolingChatRequest):
                error_check_ret = self._validate_chat_template(
                    request_chat_template=request.chat_template,
                    chat_template_kwargs=request.chat_template_kwargs,
                    trust_request_chat_template=self.trust_request_chat_template,
                )
                if error_check_ret is not None:
                    return error_check_ret

                _, engine_prompts = await self._preprocess_chat(
                    request,
                    tokenizer,
                    request.messages,
                    chat_template=request.chat_template or self.chat_template,
                    chat_template_content_format=self.chat_template_content_format,
                    # In pooling requests, we are not generating tokens,
                    # so there is no need to append extra tokens to the input
                    add_generation_prompt=False,
                    continue_final_message=False,
                    add_special_tokens=request.add_special_tokens,
                )
            elif isinstance(request, PoolingCompletionRequest):
                engine_prompts = await renderer.render_prompt(
                    prompt_or_prompts=request.input,
                    config=self._build_render_config(request),
                )
            else:
                raise ValueError(f"Unsupported request of type {type(request)}")
        except (ValueError, TypeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

        # --- Phase 2: schedule the request and get the result generator ---
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
        try:
            if is_io_processor_request:
                assert self.io_processor is not None and isinstance(
                    request, IOProcessorRequest
                )
                pooling_params = self.io_processor.validate_or_generate_params()
            else:
                pooling_params = request.to_pooling_params()

            # Resolve the pooling task: explicit from the request, otherwise
            # the first supported default in priority order.
            # NOTE(review): this also reads request.task on the
            # IOProcessorRequest path — confirm IOProcessorRequest defines it.
            pooling_task: PoolingTask
            if request.task is None:
                if "token_embed" in self.supported_tasks:
                    pooling_task = "token_embed"
                elif "token_classify" in self.supported_tasks:
                    pooling_task = "token_classify"
                elif "plugin" in self.supported_tasks:
                    pooling_task = "plugin"
                else:
                    return self.create_error_response(
                        f"pooling_task must be one of {self.supported_tasks}."
                    )
            else:
                pooling_task = request.task

            if pooling_task not in self.supported_tasks:
                return self.create_error_response(
                    f"Task {pooling_task} is not supported, it"
                    f" must be one of {self.supported_tasks}."
                )

            try:
                pooling_params.verify(pooling_task, self.model_config)
            except ValueError as e:
                return self.create_error_response(str(e))

            for i, engine_prompt in enumerate(engine_prompts):
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=pooling_params,
                    lora_request=lora_request,
                )

                # NOTE(review): loop-invariant; could be hoisted above the
                # loop — it re-reads the same raw_request.headers each pass.
                trace_headers = (
                    None
                    if raw_request is None
                    else await self._get_trace_headers(raw_request.headers)
                )

                generator = self.engine_client.encode(
                    engine_prompt,
                    pooling_params,
                    request_id_item,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=request.priority,
                )

                generators.append(generator)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        result_generator = merge_async_iterators(*generators)

        # --- Phase 3: collect results and build the response ---
        if is_io_processor_request:
            # Plugin consumes the raw output stream and builds its own
            # response object.
            assert self.io_processor is not None
            output = await self.io_processor.post_process_async(
                model_output=result_generator,
                request_id=request_id,
            )
            return self.io_processor.output_to_response(output)

        assert isinstance(request, (PoolingCompletionRequest, PoolingChatRequest))
        num_prompts = len(engine_prompts)

        # Non-streaming response
        final_res_batch: list[PoolingRequestOutput | None]
        final_res_batch = [None] * num_prompts
        try:
            # Results arrive in arbitrary order; slot them by prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            assert all(final_res is not None for final_res in final_res_batch)

            final_res_batch_checked = cast(list[PoolingRequestOutput], final_res_batch)

            response = self.request_output_to_pooling_response(
                final_res_batch_checked,
                request_id,
                created_time,
                model_name,
                request.encoding_format,
                request.embed_dtype,
                request.endianness,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        return response
|
||||
    def request_output_to_pooling_response(
        self,
        final_res_batch: list[PoolingRequestOutput],
        request_id: str,
        created_time: int,
        model_name: str,
        encoding_format: EncodingFormat,
        embed_dtype: EmbedDType,
        endianness: Endianness,
    ) -> PoolingResponse | PoolingBytesResponse:
        """Convert raw pooling outputs into the wire-format response.

        Dispatches on ``encoding_format``: "float"/"base64" produce a JSON
        :class:`PoolingResponse`; "bytes"/"bytes_only" produce a binary
        :class:`PoolingBytesResponse` (the latter without the JSON metadata
        header).
        """

        def encode_float_base64() -> PoolingResponse:
            # Build the JSON response, accumulating prompt-token usage as
            # we encode each output.
            items: list[PoolingResponseData] = []
            num_prompt_tokens = 0

            for idx, final_res in enumerate(final_res_batch):
                item = PoolingResponseData(
                    index=idx,
                    data=encode_pooling_output(
                        final_res,
                        encoding_format=encoding_format,
                        embed_dtype=embed_dtype,
                        endianness=endianness,
                    ),
                )
                prompt_token_ids = final_res.prompt_token_ids

                items.append(item)
                num_prompt_tokens += len(prompt_token_ids)

            # Pooling generates no completion tokens, so total == prompt.
            usage = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                total_tokens=num_prompt_tokens,
            )

            return PoolingResponse(
                id=request_id,
                created=created_time,
                model=model_name,
                data=items,
                usage=usage,
            )

        def encode_bytes(bytes_only: bool) -> PoolingBytesResponse:
            content, items, usage = encode_pooling_bytes(
                pooling_outputs=final_res_batch,
                embed_dtype=embed_dtype,
                endianness=endianness,
            )

            # "bytes_only" clients get the raw payload with no metadata
            # header; otherwise the response metadata travels as JSON in a
            # single "metadata" header.
            headers = (
                None
                if bytes_only
                else {
                    "metadata": json.dumps(
                        {
                            "id": request_id,
                            "created": created_time,
                            "model": model_name,
                            "data": items,
                            "usage": usage,
                        }
                    )
                }
            )

            return PoolingBytesResponse(
                content=content,
                headers=headers,
            )

        if encoding_format == "float" or encoding_format == "base64":
            return encode_float_base64()
        elif encoding_format == "bytes" or encoding_format == "bytes_only":
            return encode_bytes(bytes_only=encoding_format == "bytes_only")
        else:
            # Exhaustiveness check: fails type-checking if a new
            # EncodingFormat member is added without handling it here.
            assert_never(encoding_format)
|
||||
def _build_render_config(self, request: PoolingCompletionRequest) -> RenderConfig:
|
||||
return RenderConfig(
|
||||
max_length=self.max_model_len,
|
||||
truncate_prompt_tokens=request.truncate_prompt_tokens,
|
||||
add_special_tokens=request.add_special_tokens,
|
||||
)
|
||||
0
vllm/entrypoints/pooling/score/__init__.py
Normal file
0
vllm/entrypoints/pooling/score/__init__.py
Normal file
149
vllm/entrypoints/pooling/score/api_router.py
Normal file
149
vllm/entrypoints/pooling/score/api_router.py
Normal file
@@ -0,0 +1,149 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling.score.protocol import (
|
||||
RerankRequest,
|
||||
RerankResponse,
|
||||
ScoreRequest,
|
||||
ScoreResponse,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.serving import ServingScores
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
from vllm.logger import init_logger
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def score(request: Request) -> ServingScores | None:
    """Fetch the score handler from app state (None if the model lacks it)."""
    app_state = request.app.state
    return app_state.openai_serving_scores
|
||||
|
||||
def rerank(request: Request) -> ServingScores | None:
    """Fetch the rerank handler (shared with /score) from app state."""
    app_state = request.app.state
    return app_state.openai_serving_scores
|
||||
|
||||
@router.post(
    "/score",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_score(request: ScoreRequest, raw_request: Request):
    """Score a pair (or batch of pairs) of texts for similarity/relevance."""
    handler = score(raw_request)
    if handler is None:
        # No score handler registered for this model; borrow the
        # tokenization server only to format a proper error response.
        base_server = raw_request.app.state.openai_serving_tokenization
        return base_server.create_error_response(
            message="The model does not support Score API"
        )

    try:
        generator = await handler.create_score(request, raw_request)
    except Exception as e:
        # Unexpected failures surface as HTTP 500 with the original cause.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e
    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, ScoreResponse):
        return JSONResponse(content=generator.model_dump())

    # Exhaustiveness check over the handler's possible return types.
    assert_never(generator)
|
||||
|
||||
@router.post(
    "/v1/score",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_score_v1(request: ScoreRequest, raw_request: Request):
    """Deprecated alias for /score kept for backward compatibility."""
    # NOTE(review): this warns on *every* call; do_rerank_v1 uses
    # logger.warning_once for the same purpose — consider aligning.
    logger.warning(
        "To indicate that Score API is not part of standard OpenAI API, we "
        "have moved it to `/score`. Please update your client accordingly."
    )

    return await create_score(request, raw_request)
|
||||
|
||||
@router.post(
    "/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def do_rerank(request: RerankRequest, raw_request: Request):
    """Rerank documents against a query (JinaAI-style rerank API)."""
    handler = rerank(raw_request)
    if handler is None:
        # No score handler registered for this model; borrow the
        # tokenization server only to format a proper error response.
        base_server = raw_request.app.state.openai_serving_tokenization
        return base_server.create_error_response(
            message="The model does not support Rerank (Score) API"
        )
    try:
        generator = await handler.do_rerank(request, raw_request)
    except Exception as e:
        # Unexpected failures surface as HTTP 500 with the original cause.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e
    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, RerankResponse):
        return JSONResponse(content=generator.model_dump())

    # Exhaustiveness check over the handler's possible return types.
    assert_never(generator)
|
||||
|
||||
@router.post(
    "/v1/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def do_rerank_v1(request: RerankRequest, raw_request: Request):
    """Deprecated alias for /rerank kept for backward compatibility."""
    # NOTE(review): unlike /rerank and /score this route lacks
    # @load_aware_call — confirm whether that omission is intentional.
    logger.warning_once(
        "To indicate that the rerank API is not part of the standard OpenAI"
        " API, we have located it at `/rerank`. Please update your client "
        "accordingly. (Note: Conforms to JinaAI rerank API)"
    )

    return await do_rerank(request, raw_request)
|
||||
|
||||
@router.post(
    "/v2/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def do_rerank_v2(request: RerankRequest, raw_request: Request):
    """Cohere-style v2 alias; delegates directly to /rerank."""
    return await do_rerank(request, raw_request)
||||
146
vllm/entrypoints/pooling/score/protocol.py
Normal file
146
vllm/entrypoints/pooling/score/protocol.py
Normal file
@@ -0,0 +1,146 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import time
|
||||
from typing import Annotated, Any
|
||||
|
||||
from pydantic import (
|
||||
BaseModel,
|
||||
Field,
|
||||
)
|
||||
|
||||
from vllm import PoolingParams
|
||||
from vllm.config.pooler import get_use_activation
|
||||
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
|
||||
from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
|
||||
class ScoreRequest(OpenAIBaseModel):
    """Request body for the /score endpoint.

    Scores each pair formed from ``text_1`` and ``text_2``; either side may
    be a single item, a list, or a multi-modal param.
    """

    model: str | None = None
    text_1: list[str] | str | ScoreMultiModalParam
    text_2: list[str] | str | ScoreMultiModalParam
    # -1 disables truncation; None defers to the server default.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:score-extra-params]

    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )

    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )

    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )

    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )

    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "Default is True.",
    )
    # --8<-- [end:score-extra-params]

    def to_pooling_params(self):
        """Map request fields onto PoolingParams (resolving the deprecated
        softmax/activation aliases via get_use_activation)."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=get_use_activation(self),
        )
|
||||
|
||||
class RerankRequest(OpenAIBaseModel):
    """Request body for the /rerank endpoints (JinaAI-compatible)."""

    model: str | None = None
    query: str | ScoreMultiModalParam
    documents: list[str] | ScoreMultiModalParam
    # Number of top documents to return; 0 presumably means "all" — the
    # consuming serving code is not visible here, confirm there.
    top_n: int = Field(default_factory=lambda: 0)
    # -1 disables truncation; None defers to the server default.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:rerank-extra-params]

    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )

    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )

    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )

    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )

    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "Default is True.",
    )
    # --8<-- [end:rerank-extra-params]

    def to_pooling_params(self):
        """Map request fields onto PoolingParams (resolving the deprecated
        softmax/activation aliases via get_use_activation)."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=get_use_activation(self),
        )
|
||||
|
||||
class RerankDocument(BaseModel):
    """Echo of an input document inside a rerank result (text or multi-modal)."""

    text: str | None = None
    multi_modal: ScoreContentPartParam | None = None
|
||||
|
||||
class RerankResult(BaseModel):
    """One scored document in a rerank response."""

    # Position of the document in the original request list.
    index: int
    document: RerankDocument
    relevance_score: float
|
||||
|
||||
class RerankUsage(BaseModel):
    """Token accounting for a rerank request (no completion tokens)."""

    prompt_tokens: int
    total_tokens: int
|
||||
|
||||
class RerankResponse(OpenAIBaseModel):
    """Top-level response for the /rerank endpoints."""

    id: str
    model: str
    usage: RerankUsage
    results: list[RerankResult]
|
||||
|
||||
class ScoreResponseData(OpenAIBaseModel):
    """One similarity score within a :class:`ScoreResponse`."""

    # Position of the corresponding text pair in the request batch.
    index: int
    object: str = "score"
    score: float
|
||||
|
||||
class ScoreResponse(OpenAIBaseModel):
    """OpenAI-style list response returned by the /score endpoint."""

    # NOTE(review): the "embd-" prefix looks inherited from the embeddings
    # response; a "score-" prefix may have been intended — confirm before
    # changing, as clients may match on it.
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    # Unix timestamp (seconds) of response creation.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ScoreResponseData]
    usage: UsageInfo
||||
508
vllm/entrypoints/pooling/score/serving.py
Normal file
508
vllm/entrypoints/pooling/score/serving.py
Normal file
@@ -0,0 +1,508 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, Mapping
|
||||
from typing import Any
|
||||
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.pooling.score.protocol import (
|
||||
RerankDocument,
|
||||
RerankRequest,
|
||||
RerankResponse,
|
||||
RerankResult,
|
||||
RerankUsage,
|
||||
ScoreRequest,
|
||||
ScoreResponse,
|
||||
ScoreResponseData,
|
||||
)
|
||||
from vllm.entrypoints.score_utils import (
|
||||
ScoreContentPartParam,
|
||||
ScoreMultiModalParam,
|
||||
_cosine_similarity,
|
||||
_validate_score_input_lens,
|
||||
compress_token_type_ids,
|
||||
get_score_prompt,
|
||||
)
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.utils.async_utils import make_async, merge_async_iterators
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ServingScores(OpenAIServing):
    """Serving handler for the /score and /rerank endpoints.

    Uses a cross-encoder model when available, otherwise falls back to
    embedding both sides and computing cosine similarity.
    """

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            log_error_stack=log_error_stack,
        )
|
||||
    async def _embedding_score(
        self,
        tokenizer: TokenizerLike,
        texts_1: list[str],
        texts_2: list[str],
        request: RerankRequest | ScoreRequest,
        request_id: str,
        tokenization_kwargs: dict[str, Any] | None = None,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        """Score text pairs via the embedding path (non-cross-encoder models).

        Embeds all of ``texts_1 + texts_2`` in one batch, then pairs the two
        halves and returns their cosine similarities. A single item in
        ``texts_1`` is broadcast against every item of ``texts_2``.
        """
        input_texts = texts_1 + texts_2

        engine_prompts: list[TokensPrompt] = []
        # Tokenization runs in the tokenizer executor to avoid blocking the
        # event loop; all texts are tokenized concurrently.
        tokenize_async = make_async(
            tokenizer.__call__, executor=self._tokenizer_executor
        )

        tokenization_kwargs = tokenization_kwargs or {}
        tokenized_prompts = await asyncio.gather(
            *(tokenize_async(t, **tokenization_kwargs) for t in input_texts)
        )

        for tok_result, input_text in zip(tokenized_prompts, input_texts):
            text_token_prompt = self._validate_input(
                request, tok_result["input_ids"], input_text
            )

            engine_prompts.append(
                TokensPrompt(prompt_token_ids=text_token_prompt["prompt_token_ids"])
            )

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
        pooling_params = request.to_pooling_params()

        try:
            pooling_params.verify("embed", self.model_config)
        except ValueError as e:
            return self.create_error_response(str(e))

        for i, engine_prompt in enumerate(engine_prompts):
            request_id_item = f"{request_id}-{i}"

            self._log_inputs(
                request_id_item,
                input_texts[i],
                params=pooling_params,
                lora_request=lora_request,
            )

            generators.append(
                self.engine_client.encode(
                    engine_prompt,
                    pooling_params,
                    request_id_item,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=request.priority,
                )
            )

        result_generator = merge_async_iterators(*generators)

        # Non-streaming response
        final_res_batch: list[PoolingRequestOutput] = []

        # Results arrive in arbitrary order; slot them by prompt index.
        embeddings: list[PoolingRequestOutput | None] = [None] * len(engine_prompts)

        async for i, res in result_generator:
            embeddings[i] = res

        # Split the flat embedding batch back into the two input halves.
        emb_texts_1: list[PoolingRequestOutput] = []
        emb_texts_2: list[PoolingRequestOutput] = []

        for i in range(0, len(texts_1)):
            assert (emb := embeddings[i]) is not None
            emb_texts_1.append(emb)

        for i in range(len(texts_1), len(embeddings)):
            assert (emb := embeddings[i]) is not None
            emb_texts_2.append(emb)

        # 1-to-N case: reuse the single left-hand embedding for every
        # right-hand item.
        if len(emb_texts_1) == 1:
            emb_texts_1 = emb_texts_1 * len(emb_texts_2)

        final_res_batch = _cosine_similarity(
            tokenizer=tokenizer, embed_1=emb_texts_1, embed_2=emb_texts_2
        )

        return final_res_batch
|
||||
    def _preprocess_score(
        self,
        request: RerankRequest | ScoreRequest,
        tokenizer: TokenizerLike,
        tokenization_kwargs: dict[str, Any],
        data_1: str | ScoreContentPartParam,
        data_2: str | ScoreContentPartParam,
    ) -> tuple[str, TokensPrompt]:
        """Build the cross-encoder prompt for one (query, document) pair.

        Returns the rendered full prompt string (for logging) and the
        tokenized engine prompt. Raises via _validate_input if the prompt
        exceeds length limits.
        """
        model_config = self.model_config

        full_prompt, engine_prompt = get_score_prompt(
            model_config=model_config,
            data_1=data_1,
            data_2=data_2,
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
        )
        self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
        # Forward per-request HF processor kwargs (multi-modal inputs only).
        if request.mm_processor_kwargs is not None:
            engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs

        return full_prompt, engine_prompt
|
||||
    async def _cross_encoding_score(
        self,
        tokenizer: TokenizerLike,
        data_1: list[str] | list[ScoreContentPartParam],
        data_2: list[str] | list[ScoreContentPartParam],
        request: RerankRequest | ScoreRequest,
        request_id: str,
        tokenization_kwargs: dict[str, Any] | None = None,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        """Score pairs with a cross-encoder: each (data_1[i], data_2[i]) pair
        is rendered into a single joint prompt and scored by the model.

        A single item in ``data_1`` is broadcast against every item of
        ``data_2``. Raises ValueError for MistralTokenizer (unsupported).
        """
        request_prompts: list[str] = []
        engine_prompts: list[TokensPrompt] = []

        # 1-to-N case: pair the single left-hand item with every right-hand
        # item.
        if len(data_1) == 1:
            data_1 = data_1 * len(data_2)

        if isinstance(tokenizer, MistralTokenizer):
            raise ValueError("MistralTokenizer not supported for cross-encoding")

        tokenization_kwargs = tokenization_kwargs or {}

        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

        # Prompt construction runs in the tokenizer executor; all pairs are
        # preprocessed concurrently.
        preprocess_async = make_async(
            self._preprocess_score, executor=self._tokenizer_executor
        )

        preprocessed_prompts = await asyncio.gather(
            *(
                preprocess_async(
                    request=request,
                    tokenizer=tokenizer,
                    tokenization_kwargs=tokenization_kwargs,
                    data_1=t1,
                    data_2=t2,
                )
                for t1, t2 in input_pairs
            )
        )

        for full_prompt, engine_prompt in preprocessed_prompts:
            request_prompts.append(full_prompt)
            engine_prompts.append(engine_prompt)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

        default_pooling_params = request.to_pooling_params()

        try:
            default_pooling_params.verify("score", self.model_config)
        except ValueError as e:
            return self.create_error_response(str(e))

        for i, engine_prompt in enumerate(engine_prompts):
            request_id_item = f"{request_id}-{i}"

            self._log_inputs(
                request_id_item,
                request_prompts[i],
                params=default_pooling_params,
                lora_request=lora_request,
            )

            # Prompts carrying token_type_ids need per-prompt params: the
            # ids are compressed and attached via extra_kwargs on a clone so
            # the shared default params stay unmodified.
            if token_type_ids := engine_prompt.pop("token_type_ids", None):
                pooling_params = default_pooling_params.clone()
                compressed = compress_token_type_ids(token_type_ids)
                pooling_params.extra_kwargs = {"compressed_token_type_ids": compressed}
            else:
                pooling_params = default_pooling_params

            generator = self.engine_client.encode(
                engine_prompt,
                pooling_params,
                request_id_item,
                lora_request=lora_request,
                trace_headers=trace_headers,
                priority=request.priority,
            )

            generators.append(generator)

        result_generator = merge_async_iterators(*generators)

        # Non-streaming response
        final_res_batch: list[PoolingRequestOutput | None] = [None] * len(
            engine_prompts
        )

        # Results arrive in arbitrary order; slot them by prompt index.
        async for i, res in result_generator:
            final_res_batch[i] = res

        return [out for out in final_res_batch if out is not None]
|
||||
async def _run_scoring(
    self,
    data_1: list[str] | str | ScoreMultiModalParam,
    data_2: list[str] | str | ScoreMultiModalParam,
    request: ScoreRequest | RerankRequest,
    request_id: str,
    raw_request: Request | None = None,
) -> list[PoolingRequestOutput] | ErrorResponse:
    """Normalize the two score inputs and dispatch to the appropriate
    scoring backend (cross-encoder or embedding-based).

    Args:
        data_1: First side of each pair (query side for rerank).
        data_2: Second side of each pair (documents for rerank).
        request: Originating score/rerank request.
        request_id: Base ID used for per-item engine request IDs.
        raw_request: Optional raw HTTP request (for trace headers).

    Returns:
        Pooling outputs, one per input pair, or an ErrorResponse.
    """
    adapters = self._maybe_get_adapters(request)

    tok = await self.engine_client.get_tokenizer()

    tok_kwargs: dict[str, Any] = {}
    _validate_truncation_size(
        self.max_model_len,
        getattr(request, "truncate_prompt_tokens", None),
        tok_kwargs,
    )

    headers = (
        await self._get_trace_headers(raw_request.headers)
        if raw_request is not None
        else None
    )

    # Multimodal score inputs are only valid for multimodal models.
    if (isinstance(data_1, dict) or isinstance(data_2, dict)) and (
        not self.model_config.is_multimodal_model
    ):
        raise ValueError(
            f"MultiModalParam is not supported for {self.model_config.architecture}"  # noqa: E501
        )

    def _normalize(data):
        # str -> single-element list; multimodal dict -> its "content" list.
        if isinstance(data, str):
            return [data]
        if isinstance(data, dict):
            return data.get("content")
        return data

    data_1 = _normalize(data_1)  # type: ignore[assignment]
    data_2 = _normalize(data_2)  # type: ignore[assignment]

    _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]

    if self.model_config.is_cross_encoder:
        return await self._cross_encoding_score(
            tokenizer=tok,
            data_1=data_1,  # type: ignore[arg-type]
            data_2=data_2,  # type: ignore[arg-type]
            request=request,
            request_id=request_id,
            tokenization_kwargs=tok_kwargs,
            lora_request=adapters,
            trace_headers=headers,
        )

    return await self._embedding_score(
        tokenizer=tok,
        texts_1=data_1,  # type: ignore[arg-type]
        texts_2=data_2,  # type: ignore[arg-type]
        request=request,
        request_id=request_id,
        tokenization_kwargs=tok_kwargs,
        lora_request=adapters,
        trace_headers=headers,
    )
|
||||
|
||||
async def create_score(
    self,
    request: ScoreRequest,
    raw_request: Request | None = None,
) -> ScoreResponse | ErrorResponse:
    """
    Score API similar to Sentence Transformers cross encoder

    See https://sbert.net/docs/package_reference/cross_encoder
    """
    # Reject requests for unknown/unavailable models up front.
    if (model_error := await self._check_model(request)) is not None:
        return model_error

    request_id = f"score-{self._base_request_id(raw_request)}"
    created_time = int(time.time())

    try:
        batch = await self._run_scoring(
            request.text_1,
            request.text_2,
            request,
            request_id,
            raw_request,
        )
        if isinstance(batch, ErrorResponse):
            return batch

        return self.request_output_to_score_response(
            batch,
            request_id,
            created_time,
            self.models.model_name(),
        )
    except asyncio.CancelledError:
        return self.create_error_response("Client disconnected")
    except ValueError as e:
        # TODO: Use a vllm-specific Validation Error
        return self.create_error_response(str(e))
|
||||
|
||||
async def do_rerank(
    self, request: RerankRequest, raw_request: Request | None = None
) -> RerankResponse | ErrorResponse:
    """
    Rerank API based on JinaAI's rerank API; implements the same
    API interface. Designed for compatibility with off-the-shelf
    tooling, since this is a common standard for reranking APIs

    See example client implementations at
    https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py
    numerous clients use this standard.
    """
    # Reject requests for unknown/unavailable models up front.
    if (model_error := await self._check_model(request)) is not None:
        return model_error

    request_id = f"rerank-{self._base_request_id(raw_request)}"
    documents = request.documents

    # A non-positive top_n means "return everything".
    if request.top_n > 0:
        top_n = request.top_n
    elif isinstance(documents, list):
        top_n = len(documents)
    else:
        top_n = len(documents["content"])

    try:
        batch = await self._run_scoring(
            request.query,
            documents,
            request,
            request_id,
            raw_request,
        )
        if isinstance(batch, ErrorResponse):
            return batch

        return self.request_output_to_rerank_response(
            batch,
            request_id,
            self.models.model_name(),
            documents,
            top_n,
        )
    except asyncio.CancelledError:
        return self.create_error_response("Client disconnected")
    except ValueError as e:
        # TODO: Use a vllm-specific Validation Error
        return self.create_error_response(str(e))
|
||||
|
||||
def request_output_to_score_response(
    self,
    final_res_batch: list[PoolingRequestOutput],
    request_id: str,
    created_time: int,
    model_name: str,
) -> ScoreResponse:
    """Convert a batch of pooling outputs into a ScoreResponse.

    Each output contributes one ScoreResponseData (in input order) and
    its prompt token count to the usage totals.
    """
    data: list[ScoreResponseData] = []
    total_prompt_tokens = 0

    for index, output in enumerate(final_res_batch):
        scoring = ScoringRequestOutput.from_base(output)
        data.append(
            ScoreResponseData(
                index=index,
                score=scoring.outputs.score,
            )
        )
        total_prompt_tokens += len(output.prompt_token_ids)

    return ScoreResponse(
        id=request_id,
        created=created_time,
        model=model_name,
        data=data,
        usage=UsageInfo(
            prompt_tokens=total_prompt_tokens,
            # Scoring produces no completion tokens, so the total equals
            # the prompt token count.
            total_tokens=total_prompt_tokens,
        ),
    )
|
||||
|
||||
def request_output_to_rerank_response(
    self,
    final_res_batch: list[PoolingRequestOutput],
    request_id: str,
    model_name: str,
    documents: list[str] | ScoreMultiModalParam,
    top_n: int,
) -> RerankResponse:
    """
    Convert the output of do_rank to a RerankResponse.

    Args:
        final_res_batch: One pooling output per (query, document) pair.
        request_id: ID echoed back in the response.
        model_name: Served model name echoed back in the response.
        documents: Original documents (list of strings, or a multimodal
            param whose "content" list holds the documents).
        top_n: Maximum number of results to return.
    """
    results: list[RerankResult] = []
    num_prompt_tokens = 0
    for idx, final_res in enumerate(final_res_batch):
        classify_res = ScoringRequestOutput.from_base(final_res)

        result = RerankResult(
            index=idx,
            document=RerankDocument(text=documents[idx])
            if isinstance(documents, list)
            else RerankDocument(multi_modal=documents["content"][idx]),
            relevance_score=classify_res.outputs.score,
        )
        results.append(result)
        prompt_token_ids = final_res.prompt_token_ids
        num_prompt_tokens += len(prompt_token_ids)

    # sort by relevance, then return the top n if set
    results.sort(key=lambda x: x.relevance_score, reverse=True)
    # Compare against the number of results rather than `len(documents)`:
    # for multimodal input `documents` is a dict, so `len(documents)`
    # counts its keys (not the documents), which previously made the
    # top_n cutoff ineffective in the multimodal path.
    if top_n < len(results):
        results = results[:top_n]

    return RerankResponse(
        id=request_id,
        model=model_name,
        results=results,
        usage=RerankUsage(
            total_tokens=num_prompt_tokens, prompt_tokens=num_prompt_tokens
        ),
    )
|
||||
410
vllm/entrypoints/renderer.py
Normal file
410
vllm/entrypoints/renderer.py
Normal file
@@ -0,0 +1,410 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Annotated
|
||||
|
||||
import pybase64
|
||||
import torch
|
||||
from pydantic import Field
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt
|
||||
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.utils.async_utils import AsyncMicrobatchTokenizer
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RenderConfig:
    """Configuration to control how prompts are prepared."""

    max_length: int | None = None
    """Maximum allowable total input token length. If provided,
    token inputs longer than this raise `ValueError`."""

    truncate_prompt_tokens: int | None = None
    """Number of tokens to keep. `None` means no truncation.
    `0` yields an empty list (and skips embeds).
    `-1` maps to `model_config.max_model_len`."""

    add_special_tokens: bool = True
    """Whether to add model-specific special tokens during tokenization."""

    cache_salt: str | None = None
    """String to disambiguate prefix cache entries."""

    needs_detokenization: bool | None = False
    """If True, detokenize IDs back to text for inclusion in outputs."""

    def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> int | None:
        """Validate and normalize `truncate_prompt_tokens` parameter."""
        truncate_prompt_tokens = self.truncate_prompt_tokens

        # None (no truncation) and 0 (empty prompt) pass through unchanged.
        if truncate_prompt_tokens is None or truncate_prompt_tokens == 0:
            return truncate_prompt_tokens

        # Negative values are shorthand for "use the full context window".
        if truncate_prompt_tokens < 0:
            truncate_prompt_tokens = model_config.max_model_len

        max_length = self.max_length
        if max_length is not None and truncate_prompt_tokens > max_length:  # type: ignore[operator]
            raise ValueError(
                f"{truncate_prompt_tokens=} cannot be greater than "
                f"{max_length=}. Please select a smaller truncation size."
            )

        return truncate_prompt_tokens
|
||||
|
||||
|
||||
class BaseRenderer(ABC):
    """
    Base class for unified input processing and rendering.

    The Renderer serves as a unified input processor that consolidates
    tokenization, chat template formatting, and multimodal input handling
    into a single component.
    It converts high-level API requests (OpenAI-style JSON) into token IDs and
    multimodal features ready for engine consumption.

    Key responsibilities:
    - Convert text prompts to token sequences with proper special tokens
    - Apply chat templates and format conversations
    - Handle multimodal inputs (images, audio, etc.) when applicable
    - Manage prompt truncation and length validation
    - Provide clean separation between API layer and engine core
    """

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: TokenizerLike | None = None,
    ):
        super().__init__()
        # tokenizer may be None; subclasses that tokenize text must check
        # for it before use (see CompletionRenderer._get_async_tokenizer).
        self.model_config = model_config
        self.tokenizer = tokenizer

    @abstractmethod
    async def render_prompt(
        self,
        *,
        prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
        config: RenderConfig,
    ) -> list[TokensPrompt]:
        """
        Convert text or token inputs into engine-ready TokensPrompt objects.

        This method accepts text or token inputs and produces a
        list of [`TokensPrompt`][vllm.inputs.data.TokensPrompt] objects
        for the engine.

        Args:
            prompt_or_prompts: One of:
                - `str`: Single text prompt.
                - `list[str]`: Batch of text prompts.
                - `list[int]`: Single pre-tokenized sequence.
                - `list[list[int]]`: Batch of pre-tokenized sequences.
            config: Render configuration controlling how prompts are prepared
                (e.g., tokenization and length handling).

        Returns:
            list[TokensPrompt]: Engine-ready token prompts.

        Raises:
            ValueError: If input formats are invalid or length limits exceeded.
        """
        raise NotImplementedError

    @abstractmethod
    async def render_prompt_and_embeds(
        self,
        *,
        prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
        prompt_embeds: bytes | list[bytes] | None = None,
        config: RenderConfig,
    ) -> list[TokensPrompt | EmbedsPrompt]:
        """
        Convert text/token and/or base64-encoded embeddings inputs into
        engine-ready prompt objects using a unified RenderConfig.

        At least one of `prompt_or_prompts` or `prompt_embeds` must be
        provided and non-empty. If both are omitted or empty (e.g., empty
        string and empty list), a `ValueError` is raised.

        Args:
            prompt_or_prompts: Text or token inputs to include.
            prompt_embeds: Base64-encoded bytes (or list thereof) containing a
                torch-saved tensor to be used as prompt embeddings.
            config: Render configuration controlling how prompts are prepared
                (e.g., tokenization and length handling).

        Returns:
            list[Union[TokensPrompt, EmbedsPrompt]]:
                Engine-ready prompt objects.

        Raises:
            ValueError: If both `prompt_or_prompts` and `prompt_embeds`
                are omitted or empty (decoder prompt cannot be empty), or if
                length limits are exceeded.
        """
        raise NotImplementedError

    def load_prompt_embeds(
        self,
        prompt_embeds: bytes | list[bytes],
        truncate_prompt_tokens: Annotated[int, Field(ge=0)] | None = None,
        cache_salt: str | None = None,
    ) -> list[EmbedsPrompt]:
        """Load and validate base64-encoded embeddings into prompt objects.

        Each entry is a base64-encoded, torch-saved 2D tensor of dtype
        float32/bfloat16/float16. Returns one EmbedsPrompt per input blob,
        optionally truncated and tagged with `cache_salt`.
        """
        # Prompt embeddings are opt-in; refuse early when disabled.
        if not self.model_config.enable_prompt_embeds:
            raise ValueError(
                "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
            )

        def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
            # Enable sparse tensor integrity checks to prevent out-of-bounds
            # writes from maliciously crafted tensors
            with torch.sparse.check_sparse_tensor_invariants():
                # weights_only + CPU map_location: deserialize untrusted
                # client data without executing arbitrary pickles.
                tensor = torch.load(
                    io.BytesIO(pybase64.b64decode(embed, validate=True)),
                    weights_only=True,
                    map_location=torch.device("cpu"),
                )
            # NOTE(review): these asserts validate untrusted input but are
            # stripped under `python -O`; confirm whether they should be
            # explicit ValueError raises instead.
            assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
                torch.float32,
                torch.bfloat16,
                torch.float16,
            )
            tensor = tensor.to_dense()
            if tensor.dim() > 2:
                # Drop a leading batch dimension of size 1.
                tensor = tensor.squeeze(0)
            assert tensor.dim() == 2
            if truncate_prompt_tokens is not None:
                # Keep the trailing rows, matching token-ID truncation.
                tensor = tensor[-truncate_prompt_tokens:]
            embeds_prompt = EmbedsPrompt(prompt_embeds=tensor)
            if cache_salt is not None:
                embeds_prompt["cache_salt"] = cache_salt
            return embeds_prompt

        if isinstance(prompt_embeds, list):
            return [_load_and_validate_embed(embed) for embed in prompt_embeds]

        return [_load_and_validate_embed(prompt_embeds)]
|
||||
|
||||
|
||||
class CompletionRenderer(BaseRenderer):
    """Renderer for completion-style requests backed by async tokenization."""

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: TokenizerLike | None = None,
        async_tokenizer_pool: dict[TokenizerLike, AsyncMicrobatchTokenizer]
        | None = None,
    ):
        super().__init__(model_config, tokenizer)
        # Optional shared pool so multiple renderers reuse one async
        # tokenizer per underlying tokenizer instance.
        self.async_tokenizer_pool = async_tokenizer_pool
        # Lazily-created cached handle; see _get_async_tokenizer().
        self.async_tokenizer: AsyncMicrobatchTokenizer | None = None

    async def render_prompt(
        self,
        *,
        prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
        config: RenderConfig,
    ) -> list[TokensPrompt]:
        """Implementation of prompt rendering for completion-style requests.

        Uses async tokenizer pooling for improved performance. See base class
        for detailed parameter documentation.
        """
        truncate_prompt_tokens = config.verify_truncate_prompt_tokens(self.model_config)
        # truncate_prompt_tokens == 0 means "keep zero tokens": return an
        # empty batch without tokenizing anything.
        if truncate_prompt_tokens == 0:
            return []

        # Render all prompts concurrently; parse_raw_prompts normalizes the
        # str / list[str] / list[int] / list[list[int]] input forms.
        tasks = (
            self._create_prompt(
                prompt_input,
                config=config,
                truncate_prompt_tokens=truncate_prompt_tokens,
            )
            for prompt_input in parse_raw_prompts(prompt_or_prompts)
        )

        return await asyncio.gather(*tasks)

    async def render_prompt_and_embeds(
        self,
        *,
        prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
        prompt_embeds: bytes | list[bytes] | None = None,
        config: RenderConfig,
    ) -> list[TokensPrompt | EmbedsPrompt]:
        """
        Render text/token prompts and/or precomputed embedding prompts. At
        least one of `prompt_or_prompts` or `prompt_embeds` must be provided.
        """
        truncate_prompt_tokens = config.verify_truncate_prompt_tokens(self.model_config)
        # Zero truncation yields an empty result and skips embeds too
        # (see RenderConfig.truncate_prompt_tokens).
        if truncate_prompt_tokens == 0:
            return []

        rendered: list[TokensPrompt | EmbedsPrompt] = []

        # Embeds (if any) come first, then tokenized text prompts.
        if prompt_embeds is not None:
            rendered.extend(
                self.load_prompt_embeds(
                    prompt_embeds, truncate_prompt_tokens, config.cache_salt
                )
            )
        if prompt_or_prompts is None or prompt_or_prompts == "":
            return rendered

        token_prompts = await self.render_prompt(
            prompt_or_prompts=prompt_or_prompts,
            config=config,
        )
        rendered.extend(token_prompts)

        return rendered

    def _maybe_apply_truncation(
        self, token_ids: list[int], truncate_prompt_tokens: int | None
    ) -> list[int]:
        """Apply truncation to token sequence."""
        if truncate_prompt_tokens is None:
            return token_ids
        if truncate_prompt_tokens >= len(token_ids):
            return token_ids

        # Keep the trailing tokens (suffix), consistent with tokenizer-side
        # and embeds truncation.
        return token_ids[-truncate_prompt_tokens:]

    async def _create_prompt(
        self,
        prompt_input: TextPrompt | TokensPrompt,
        config: RenderConfig,
        truncate_prompt_tokens: int | None,
    ) -> TokensPrompt:
        """Render a single parsed prompt (text or pre-tokenized) into a
        TokensPrompt."""
        prompt, prompt_token_ids, _ = get_prompt_components(prompt_input)

        if prompt_token_ids is not None:
            # NOTE: detokenization is needed when echo is enabled,
            # where the input token IDs are decoded back to text.
            return await self._create_prompt_from_token_ids(
                prompt_token_ids,
                config.max_length,
                truncate_prompt_tokens,
                config.cache_salt,
                config.needs_detokenization,
            )

        if prompt is not None:
            return await self._create_prompt_from_text(
                prompt,
                config.max_length,
                truncate_prompt_tokens,
                config.add_special_tokens,
                config.cache_salt,
            )

        # TODO: Also handle embeds prompt using this method
        raise NotImplementedError

    async def _create_prompt_from_text(
        self,
        text: str,
        max_length: int | None,
        truncate_prompt_tokens: int | None,
        add_special_tokens: bool,
        cache_salt: str | None,
    ) -> TokensPrompt:
        """Tokenize text input asynchronously."""
        async_tokenizer = self._get_async_tokenizer()

        # Handle encoder-specific preprocessing
        if (
            self.model_config.encoder_config is not None
            and self.model_config.encoder_config.get("do_lower_case", False)
        ):
            text = text.lower()

        # Tokenize texts
        if truncate_prompt_tokens is None:
            encoded = await async_tokenizer(text, add_special_tokens=add_special_tokens)
        else:
            # Let the tokenizer truncate directly to the requested length.
            encoded = await async_tokenizer(
                text,
                add_special_tokens=add_special_tokens,
                truncation=True,
                max_length=truncate_prompt_tokens,
            )

        return self._create_tokens_prompt(
            encoded.input_ids, max_length, cache_salt, text
        )

    async def _create_prompt_from_token_ids(
        self,
        token_ids: list[int],
        max_length: int | None,
        truncate_prompt_tokens: int | None,
        cache_salt: str | None,
        needs_detokenization: bool | None = False,
    ) -> TokensPrompt:
        """Optionally detokenize token IDs and build a tokens prompt."""
        token_ids = self._maybe_apply_truncation(token_ids, truncate_prompt_tokens)

        # Decode back to text only when requested (e.g. echo), since it
        # costs a tokenizer round-trip.
        prompt = None
        if needs_detokenization:
            async_tokenizer = self._get_async_tokenizer()
            prompt = await async_tokenizer.decode(token_ids)

        return self._create_tokens_prompt(
            token_ids=token_ids,
            max_length=max_length,
            cache_salt=cache_salt,
            prompt=prompt,
        )

    def _get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
        """Get or create async tokenizer using shared pool."""
        # Fast path: this renderer already resolved its async tokenizer.
        async_tokenizer = self.async_tokenizer
        if async_tokenizer is not None:
            return async_tokenizer

        tokenizer = self.tokenizer
        if tokenizer is None:
            raise ValueError("No tokenizer available for text input processing")

        if self.async_tokenizer_pool is None:
            async_tokenizer = AsyncMicrobatchTokenizer(tokenizer)
        else:
            # Share one AsyncMicrobatchTokenizer per tokenizer across
            # renderers; create-and-register on first use.
            async_tokenizer = self.async_tokenizer_pool.get(tokenizer)
            if async_tokenizer is None:
                async_tokenizer = AsyncMicrobatchTokenizer(tokenizer)
                self.async_tokenizer_pool[tokenizer] = async_tokenizer
        self.async_tokenizer = async_tokenizer
        return async_tokenizer

    def _create_tokens_prompt(
        self,
        token_ids: list[int],
        max_length: int | None = None,
        cache_salt: str | None = None,
        prompt: str | None = None,
    ) -> TokensPrompt:
        """Create validated TokensPrompt."""
        # Enforce the post-truncation length cap, if any.
        if max_length is not None and len(token_ids) > max_length:
            raise ValueError(
                f"This model's maximum context length is {max_length} tokens. "
                f"However, your request has {len(token_ids)} input tokens. "
                "Please reduce the length of the input messages."
            )

        tokens_prompt = TokensPrompt(prompt_token_ids=token_ids)
        if cache_salt is not None:
            tokens_prompt["cache_salt"] = cache_salt
        if prompt is not None:
            tokens_prompt["prompt"] = prompt
        return tokens_prompt
|
||||
249
vllm/entrypoints/responses_utils.py
Normal file
249
vllm/entrypoints/responses_utils.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionAssistantMessageParam,
|
||||
ChatCompletionMessageToolCallParam,
|
||||
ChatCompletionToolMessageParam,
|
||||
)
|
||||
from openai.types.chat.chat_completion_message_tool_call_param import (
|
||||
Function as FunctionCallTool,
|
||||
)
|
||||
from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
|
||||
from openai.types.responses.response import ToolChoice
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
from openai.types.responses.response_output_item import McpCall
|
||||
from openai.types.responses.response_output_message import ResponseOutputMessage
|
||||
from openai.types.responses.response_reasoning_item import ResponseReasoningItem
|
||||
from openai.types.responses.tool import Tool
|
||||
|
||||
from vllm import envs
|
||||
from vllm.entrypoints.constants import MCP_PREFIX
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionMessageParam,
|
||||
ResponseInputOutputItem,
|
||||
)
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
|
||||
def make_response_output_items_from_parsable_context(
    response_messages: list[ResponseInputOutputItem],
) -> list[ResponseOutputItem]:
    """Given a list of sentences, construct ResponseOutput Items."""
    output_messages: list[ResponseOutputItem] = []
    for message in response_messages:
        # Non-tool-output items pass through unchanged.
        if not isinstance(message, ResponseFunctionToolCallOutputItem):
            output_messages.append(message)
            continue

        if not output_messages:
            raise ValueError(
                "Cannot have a FunctionToolCallOutput before FunctionToolCall."
            )

        # Fold a tool-call output into the preceding tool call, turning the
        # pair into a single completed MCP call item.
        previous = output_messages[-1]
        if isinstance(previous, ResponseFunctionToolCall):
            output_messages[-1] = McpCall(
                id=f"{MCP_PREFIX}{random_uuid()}",
                arguments=previous.arguments,
                name=previous.name,
                server_label=previous.name,  # TODO: store the server label
                type=f"{MCP_PREFIX}call",
                status="completed",
                output=message.output,
                # TODO: support error output
            )

    return output_messages
|
||||
|
||||
|
||||
def construct_input_messages(
    *,
    request_instructions: str | None = None,
    request_input: str | list[ResponseInputOutputItem],
    prev_msg: list[ChatCompletionMessageParam] | None = None,
    prev_response_output: list[ResponseOutputItem] | None = None,
):
    """Assemble the chat-format message list for a Responses request:
    optional system instructions, prior conversation, prior assistant
    output, then the new input."""
    messages: list[ChatCompletionMessageParam] = []
    if request_instructions:
        messages.append(
            {
                "role": "system",
                "content": request_instructions,
            }
        )

    # Prepend the conversation history.
    if prev_msg is not None:
        # Add the previous messages.
        messages.extend(prev_msg)
    if prev_response_output is not None:
        # Add the previous output.
        # NOTE: We skip the reasoning output.
        for output_item in prev_response_output:
            if not isinstance(output_item, ResponseOutputMessage):
                continue
            messages.extend(
                {
                    "role": "assistant",
                    "content": part.text,
                }
                for part in output_item.content
            )

    # Append the new input.
    # Responses API supports simple text inputs without chat format.
    if isinstance(request_input, str):
        messages.append({"role": "user", "content": request_input})
    else:
        messages.extend(construct_chat_messages_with_tool_call(request_input))
    return messages
|
||||
|
||||
|
||||
def _maybe_combine_reasoning_and_tool_call(
    item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam]
) -> ChatCompletionMessageParam | None:
    """Many models treat MCP calls and reasoning as a single message.
    This function checks if the last message is a reasoning message and
    the current message is a tool call; if so, it attaches the tool call
    to that message and returns it, otherwise it returns None."""
    # Only MCP-prefixed function tool calls are candidates for merging.
    is_mcp_call = isinstance(item, ResponseFunctionToolCall) and item.id.startswith(
        MCP_PREFIX
    )
    if not is_mcp_call:
        return None
    if not messages:
        return None

    last_message = messages[-1]
    if last_message.get("role") != "assistant":
        return None
    if last_message.get("reasoning") is None:
        return None

    # Attach the tool call to the trailing reasoning message in place.
    last_message["tool_calls"] = [
        ChatCompletionMessageToolCallParam(
            id=item.call_id,
            function=FunctionCallTool(
                name=item.name,
                arguments=item.arguments,
            ),
            type="function",
        )
    ]
    return last_message
|
||||
|
||||
|
||||
def construct_chat_messages_with_tool_call(
    input_messages: list[ResponseInputOutputItem],
) -> list[ChatCompletionMessageParam]:
    """This function wraps _construct_single_message_from_response_item
    Because some chatMessages come from multiple response items
    for example a reasoning item and a MCP tool call are two response items
    but are one chat message
    """
    messages: list[ChatCompletionMessageParam] = []
    for item in input_messages:
        combined = _maybe_combine_reasoning_and_tool_call(item, messages)
        if combined is None:
            # Stand-alone item: translate it into its own chat message.
            messages.append(_construct_single_message_from_response_item(item))
        else:
            # Item merged into the preceding reasoning message.
            messages[-1] = combined

    return messages
|
||||
|
||||
|
||||
def _construct_single_message_from_response_item(
    item: ResponseInputOutputItem,
) -> ChatCompletionMessageParam:
    """Translate one Responses-API item into its chat-completions message
    equivalent; unknown items are passed through unchanged."""
    if isinstance(item, ResponseFunctionToolCall):
        # Append the function call as a tool call.
        tool_call = ChatCompletionMessageToolCallParam(
            id=item.call_id,
            function=FunctionCallTool(
                name=item.name,
                arguments=item.arguments,
            ),
            type="function",
        )
        return ChatCompletionAssistantMessageParam(
            role="assistant",
            tool_calls=[tool_call],
        )

    if isinstance(item, ResponseReasoningItem):
        if item.encrypted_content:
            raise ValueError("Encrypted content is not supported.")
        # Prefer the single summary text; fall back to a single content part.
        reasoning_content = ""
        if len(item.summary) == 1:
            reasoning_content = item.summary[0].text
        elif item.content and len(item.content) == 1:
            reasoning_content = item.content[0].text
        return {
            "role": "assistant",
            "reasoning": reasoning_content,
        }

    if isinstance(item, ResponseOutputMessage):
        return {
            "role": "assistant",
            "content": item.content[0].text,
        }

    if isinstance(item, ResponseFunctionToolCallOutputItem):
        return ChatCompletionToolMessageParam(
            role="tool",
            content=item.output,
            tool_call_id=item.call_id,
        )

    if isinstance(item, dict) and item.get("type") == "function_call_output":
        # Append the function call output as a tool message.
        return ChatCompletionToolMessageParam(
            role="tool",
            content=item.get("output"),
            tool_call_id=item.get("call_id"),
        )

    return item  # type: ignore
|
||||
|
||||
|
||||
def extract_tool_types(tools: list[Tool]) -> set[str]:
    """
    Extracts the tool types from the given tools.
    """
    tool_types: set[str] = set()
    for tool in tools:
        if tool.type != "mcp":
            tool_types.add(tool.type)
        elif tool.server_label in envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS:
            # Allow the MCP Tool type to enable built in tools if the
            # server_label is allowlisted in
            # envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS; other MCP tools are
            # not recorded.
            tool_types.add(tool.server_label)
    return tool_types
|
||||
|
||||
|
||||
def convert_tool_responses_to_completions_format(tool: dict) -> dict:
    """
    Convert a flat tool schema:
        {"type": "function", "name": "...", "description": "...", "parameters": {...}}
    into the nested Chat Completions form:
        {"type": "function", "function": {...}}
    """
    # The original flat schema is embedded whole under the "function" key.
    return {"type": "function", "function": tool}
|
||||
|
||||
|
||||
def construct_tool_dicts(
    tools: list[Tool] | None, tool_choice: ToolChoice
) -> list[dict[str, Any]] | None:
    """Convert Responses-API tools into chat-completions tool dicts.

    Args:
        tools: Tools from the request, or None. (The annotation now admits
            None, which the body always handled.)
        tool_choice: The request's tool-choice setting.

    Returns:
        Chat-completions-format tool dicts, or None when no tools are given
        or tool selection is disabled (`tool_choice == "none"`).
    """
    if tools is None or (tool_choice == "none"):
        tool_dicts = None
    else:
        tool_dicts = [
            convert_tool_responses_to_completions_format(tool.model_dump())
            for tool in tools
        ]
    return tool_dicts
|
||||
4
vllm/entrypoints/sagemaker/__init__.py
Normal file
4
vllm/entrypoints/sagemaker/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""SageMaker-specific integration for vLLM."""
|
||||
118
vllm/entrypoints/sagemaker/routes.py
Normal file
118
vllm/entrypoints/sagemaker/routes.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from collections.abc import Awaitable, Callable
|
||||
from http import HTTPStatus
|
||||
from typing import Any
|
||||
|
||||
import model_hosting_container_standards.sagemaker as sagemaker_standards
|
||||
import pydantic
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
base,
|
||||
chat,
|
||||
completion,
|
||||
create_chat_completion,
|
||||
create_completion,
|
||||
validate_json_request,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
CompletionRequest,
|
||||
ErrorResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.pooling.classify.api_router import classify, create_classify
|
||||
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
|
||||
from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
|
||||
from vllm.entrypoints.pooling.pooling.api_router import create_pooling, pooling
|
||||
from vllm.entrypoints.pooling.pooling.protocol import PoolingRequest
|
||||
from vllm.entrypoints.pooling.score.api_router import (
|
||||
create_score,
|
||||
do_rerank,
|
||||
rerank,
|
||||
score,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.protocol import RerankRequest, ScoreRequest
|
||||
from vllm.entrypoints.serve.instrumentator.health import health
|
||||
|
||||
# TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
# (requires typing_extensions >= 4.13)
RequestType = Any
# Resolves the serving handler for a request, or None when the model does not
# support that endpoint (used below to filter candidate invocation types).
GetHandlerFn = Callable[[Request], OpenAIServing | None]
EndpointFn = Callable[[RequestType, Request], Awaitable[Any]]

# NOTE: Items defined earlier take higher priority
INVOCATION_TYPES: list[tuple[RequestType, tuple[GetHandlerFn, EndpointFn]]] = [
    (ChatCompletionRequest, (chat, create_chat_completion)),
    (CompletionRequest, (completion, create_completion)),
    (EmbeddingRequest, (embedding, create_embedding)),
    (ClassificationRequest, (classify, create_classify)),
    (ScoreRequest, (score, create_score)),
    (RerankRequest, (rerank, do_rerank)),
    (PoolingRequest, (pooling, create_pooling)),
]

# NOTE: Construct the TypeAdapters only once
INVOCATION_VALIDATORS = [
    (pydantic.TypeAdapter(request_type), (get_handler, endpoint))
    for request_type, (get_handler, endpoint) in INVOCATION_TYPES
]
|
||||
|
||||
|
||||
def register_sagemaker_routes(router: APIRouter):
    """Register the SageMaker-required endpoints (/ping, /invocations) on *router*.

    Returns the same router for chaining.
    """

    @router.post("/ping", response_class=Response)
    @router.get("/ping", response_class=Response)
    @sagemaker_standards.register_ping_handler
    async def ping(raw_request: Request) -> Response:
        """Ping check. Endpoint required for SageMaker"""
        return await health(raw_request)

    @router.post(
        "/invocations",
        dependencies=[Depends(validate_json_request)],
        responses={
            HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
            HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: {"model": ErrorResponse},
            HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
        },
    )
    @sagemaker_standards.register_invocation_handler
    @sagemaker_standards.stateful_session_manager()
    @sagemaker_standards.inject_adapter_id(adapter_path="model")
    async def invocations(raw_request: Request):
        """For SageMaker, routes requests based on the request type."""
        try:
            body = await raw_request.json()
        except json.JSONDecodeError as e:
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST.value,
                detail=f"JSON decode error: {e}",
            ) from e

        # Only consider request types whose serving handler is actually
        # enabled for this deployment.
        valid_endpoints = [
            (validator, endpoint)
            for validator, (get_handler, endpoint) in INVOCATION_VALIDATORS
            if get_handler(raw_request) is not None
        ]

        # First validator (in priority order) that accepts the body wins.
        for request_validator, endpoint in valid_endpoints:
            try:
                request = request_validator.validate_python(body)
            except pydantic.ValidationError:
                continue

            return await endpoint(request, raw_request)

        # NOTE(review): `validator._type` is a private pydantic TypeAdapter
        # attribute — confirm it remains available across pydantic upgrades.
        type_names = [
            t.__name__ if isinstance(t := validator._type, type) else str(t)
            for validator, _ in valid_endpoints
        ]
        msg = f"Cannot find suitable handler for request. Expected one of: {type_names}"
        res = base(raw_request).create_error_response(message=msg)
        return JSONResponse(content=res.model_dump(), status_code=res.error.code)

    return router
|
||||
237
vllm/entrypoints/score_utils.py
Normal file
237
vllm/entrypoints/score_utils.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any, TypeAlias, cast
|
||||
|
||||
from torch.nn import CosineSimilarity
|
||||
from typing_extensions import Required, TypedDict
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
BaseMultiModalItemTracker,
|
||||
ChatCompletionContentPartImageEmbedsParam,
|
||||
ChatCompletionContentPartImageParam,
|
||||
ChatCompletionContentPartTextParam,
|
||||
MultiModalItemTracker,
|
||||
_ContentPart,
|
||||
_parse_chat_message_content_part,
|
||||
)
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.model_executor.models.interfaces import supports_score_template
|
||||
from vllm.multimodal.inputs import MultiModalDataDict
|
||||
from vllm.outputs import PoolingRequestOutput
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
ScoreContentPartParam: TypeAlias = (
|
||||
ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam
|
||||
)
|
||||
|
||||
|
||||
class ScoreMultiModalParam(TypedDict, total=False):
    """
    A specialized parameter type for scoring multimodal content

    The reasons why don't reuse `CustomChatCompletionMessageParam` directly:
    1. Score tasks don't need the 'role' field (user/assistant/system) that's required in chat completions
    2. Including chat-specific fields would confuse users about their purpose in scoring
    3. This is a more focused interface that only exposes what's needed for scoring
    """  # noqa: E501

    content: Required[list[ScoreContentPartParam]]
    """The multimodal contents"""
|
||||
|
||||
|
||||
def _cosine_similarity(
    tokenizer: TokenizerLike,
    embed_1: list[PoolingRequestOutput],
    embed_2: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]:
    """
    Score each pair of embedding outputs (zipped element-wise) with cosine
    similarity, returning one PoolingRequestOutput per pair whose prompt
    token ids are the two prompts joined by the pad token (when one exists).
    """
    similarity = CosineSimilarity(0)
    results: list[PoolingRequestOutput] = []

    for left, right in zip(embed_1, embed_2):
        pair_score = similarity(left.outputs.data, right.outputs.data)

        # Separate the two prompts with the pad token, if the tokenizer has one.
        separator: list[int] = []
        pad_id = tokenizer.pad_token_id
        if pad_id is not None:
            separator = [pad_id]

        joined_tokens = left.prompt_token_ids + separator + right.prompt_token_ids

        combined = PoolingRequestOutput(
            request_id=f"{left.request_id}_{right.request_id}",
            outputs=pair_score,
            prompt_token_ids=joined_tokens,
            num_cached_tokens=left.num_cached_tokens + right.num_cached_tokens,
            finished=True,
        )
        results.append(combined)

    return results
|
||||
|
||||
|
||||
def _validate_score_input_lens(
    data_1: list[str] | list[ScoreContentPartParam],
    data_2: list[str] | list[ScoreContentPartParam],
):
    """
    Validate that the two score input batches can be paired.

    Accepted shapes are 1:1, 1:N and N:N; both sides must be non-empty.
    Raises ValueError otherwise.
    """
    n_first = len(data_1)
    n_second = len(data_2)

    if n_first > 1 and n_first != n_second:
        raise ValueError("Input lengths must be either 1:1, 1:N or N:N")
    if not n_first:
        raise ValueError("At least one text element must be given")
    if not n_second:
        raise ValueError("At least one text_pair element must be given")
|
||||
|
||||
|
||||
def parse_score_data(
    data_1: str | ScoreContentPartParam,
    data_2: str | ScoreContentPartParam,
    model_config: ModelConfig,
) -> tuple[str, str, MultiModalDataDict | None]:
    """Parse a pair of score inputs into two prompt strings plus any
    multi-modal data collected by the shared tracker.

    Raises ValueError if either input does not resolve to a string prompt.
    """
    # Both inputs share one tracker so their multi-modal items end up in a
    # single MultiModalDataDict.
    mm_tracker = MultiModalItemTracker(model_config)

    content_1 = _parse_score_content(data_1, mm_tracker)
    content_2 = _parse_score_content(data_2, mm_tracker)

    def ensure_str(content: _ContentPart | None) -> str:
        # Only plain-string prompts are supported downstream.
        if content is not None and isinstance(content, str):
            return cast(str, content)
        else:
            raise ValueError(f"Only string content is supported, but got {content}.")

    prompt_1 = ensure_str(content_1)
    prompt_2 = ensure_str(content_2)

    return prompt_1, prompt_2, mm_tracker.all_mm_data()
|
||||
|
||||
|
||||
def _parse_score_content(
    data: str | ScoreContentPartParam,
    mm_tracker: BaseMultiModalItemTracker,
) -> _ContentPart | None:
    """Parse one score input (plain string or multi-modal part) into a
    content part, registering any multi-modal item with *mm_tracker*.

    Raises ValueError when the part yields more than one multi-modal item.
    """
    # Normalize a bare string into a chat-style text part so the shared
    # chat parsing helper can be reused.
    if isinstance(data, str):
        part = ChatCompletionContentPartTextParam(type="text", text=data)
    else:
        part = data

    mm_parser = mm_tracker.create_parser()

    parse_res = _parse_chat_message_content_part(
        part,
        mm_parser,
        wrap_dicts=False,
        interleave_strings=False,
    )

    # Text parts resolve directly; multi-modal parts fall through and are
    # recovered from the parser's placeholder storage below.
    if parse_res:
        return parse_res

    mm_placeholder_storage = mm_parser.mm_placeholder_storage()

    if (
        len(mm_placeholder_storage) != 1
        or len(next(iter(mm_placeholder_storage.values()))) != 1
    ):
        raise ValueError("Only one multi-modal item is supported")

    return next(iter(mm_placeholder_storage.values()))[0]
|
||||
|
||||
|
||||
def apply_score_template(
    model_config: ModelConfig,
    prompt_1: str,
    prompt_2: str,
) -> str:
    """Combine the two prompts using the model's own score template.

    Raises ValueError when the model does not support score templates or
    returns an empty template.
    """
    # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. gguf)
    from vllm.model_executor.model_loader import get_model_cls

    model = get_model_cls(model_config)
    if supports_score_template(model):
        full_prompt = model.get_score_template(prompt_1, prompt_2)
        if full_prompt is None:
            raise ValueError("Get empty score template from model")
        return full_prompt

    raise ValueError(f"Unsupported model architecture: {model_config.architecture}")
|
||||
|
||||
|
||||
def post_process_tokens(
    model_config: ModelConfig,
    prompt: TokensPrompt,
) -> None:
    """
    Perform architecture-specific manipulations on the input tokens.

    No-op for models that do not support score templates.

    Note:
        This is an in-place operation.
    """
    # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. gguf)
    from vllm.model_executor.model_loader import get_model_cls

    model = get_model_cls(model_config)
    if supports_score_template(model):
        model.post_process_tokens(prompt)
|
||||
|
||||
|
||||
def get_score_prompt(
    model_config: ModelConfig,
    tokenizer: TokenizerLike,
    tokenization_kwargs: dict[str, Any],
    data_1: str | ScoreContentPartParam,
    data_2: str | ScoreContentPartParam,
) -> tuple[str, TokensPrompt]:
    """Build the full text prompt and tokenized engine prompt for a score
    request, choosing the combination strategy from the model's capabilities.

    Returns (full_prompt, engine_prompt); the engine prompt may also carry
    token_type_ids and multi-modal data when produced by the steps below.
    """
    prompt_1, prompt_2, mm_data = parse_score_data(
        data_1,
        data_2,
        model_config,
    )
    # Lazy import: avoids pulling in heavy model-loader dependencies at
    # module import time.
    from vllm.model_executor.model_loader import get_model_cls

    model = get_model_cls(model_config)
    if supports_score_template(model):
        full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
        prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
    elif model_config.use_pad_token:
        # cross_encoder models defaults to using pad_token.
        prompt_inputs = tokenizer(
            text=prompt_1, text_pair=prompt_2, **tokenization_kwargs
        )
        full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
    else:
        # `llm as reranker` models defaults to not using pad_token.
        full_prompt = prompt_1 + prompt_2
        prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)

    engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])

    if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None:
        engine_prompt["token_type_ids"] = token_type_ids

    # In-place, architecture-specific token fix-ups.
    post_process_tokens(model_config, engine_prompt)

    if mm_data is not None:
        engine_prompt["multi_modal_data"] = mm_data
    return full_prompt, engine_prompt
|
||||
|
||||
|
||||
def compress_token_type_ids(token_type_ids: list[int]) -> int:
    """
    Return position of the first 1 or the length of the list
    if not found.

    Raises ValueError unless the input is a run of zeros followed by a run
    of ones (values above 1, or a 0 appearing after a 1, are rejected).
    """
    # NOTE(review): negative type ids fall through every branch and are
    # silently accepted — confirm whether they should be rejected too.
    boundary = len(token_type_ids)
    err_msg = (
        "Token type ids are expected to be a sequence"
        " of zeros followed by a sequence of ones"
    )
    for pos, type_id in enumerate(token_type_ids):
        if type_id == 1:
            if pos < boundary:
                boundary = pos
        elif type_id == 0:
            if boundary < pos:
                raise ValueError(err_msg)
        elif type_id > 1:
            raise ValueError(err_msg)

    return boundary
|
||||
60
vllm/entrypoints/serve/__init__.py
Normal file
60
vllm/entrypoints/serve/__init__.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
|
||||
def register_vllm_serve_api_routers(app: FastAPI):
    """Attach all vLLM-serve sub-routers (LoRA, elastic EP, profiling, sleep,
    tokenize, disagg, RLHF, metrics, health) to *app*.

    Imports are deferred to call time so merely importing this module does not
    pull in every sub-router's dependencies.
    """
    from vllm.entrypoints.serve.lora.api_router import (
        attach_router as attach_lora_router,
    )

    attach_lora_router(app)
    from vllm.entrypoints.serve.elastic_ep.api_router import (
        attach_router as attach_elastic_ep_router,
    )

    attach_elastic_ep_router(app)

    from vllm.entrypoints.serve.profile.api_router import (
        attach_router as attach_profile_router,
    )

    attach_profile_router(app)

    from vllm.entrypoints.serve.sleep.api_router import (
        attach_router as attach_sleep_router,
    )

    attach_sleep_router(app)

    from vllm.entrypoints.serve.tokenize.api_router import (
        attach_router as attach_tokenize_router,
    )

    attach_tokenize_router(app)

    from vllm.entrypoints.serve.disagg.api_router import (
        attach_router as attach_disagg_router,
    )

    attach_disagg_router(app)

    from vllm.entrypoints.serve.rlhf.api_router import (
        attach_router as attach_rlhf_router,
    )

    attach_rlhf_router(app)

    from vllm.entrypoints.serve.instrumentator.metrics import (
        attach_router as attach_metrics_router,
    )

    attach_metrics_router(app)

    from vllm.entrypoints.serve.instrumentator.health import (
        attach_router as attach_health_router,
    )

    attach_health_router(app)
|
||||
0
vllm/entrypoints/serve/disagg/__init__.py
Normal file
0
vllm/entrypoints/serve/disagg/__init__.py
Normal file
110
vllm/entrypoints/serve/disagg/api_router.py
Normal file
110
vllm/entrypoints/serve/disagg/api_router.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, Response
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.api_server import validate_json_request
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.disagg.protocol import (
|
||||
GenerateRequest,
|
||||
GenerateResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.disagg.serving import (
|
||||
ServingTokens,
|
||||
)
|
||||
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
||||
from vllm.entrypoints.utils import (
|
||||
load_aware_call,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def tokenization(request: Request) -> OpenAIServingTokenization:
    """Resolve the tokenization serving handler stored on the app state."""
    return request.app.state.openai_serving_tokenization
|
||||
|
||||
|
||||
def generate_tokens(request: Request) -> ServingTokens | None:
    """Resolve the tokens-in/tokens-out handler, or None if unsupported."""
    return request.app.state.serving_tokens
|
||||
|
||||
|
||||
def engine_client(request: Request) -> EngineClient:
    """Resolve the engine client stored on the app state."""
    return request.app.state.engine_client
|
||||
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/inference/v1/generate",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def generate(request: GenerateRequest, raw_request: Request):
    """Tokens-in/tokens-out generate endpoint for disaggregated serving."""
    handler = generate_tokens(raw_request)
    if handler is None:
        # The serving_tokens handler is only installed in tokens-only mode.
        return tokenization(raw_request).create_error_response(
            message="The model does not support generate tokens API"
        )
    try:
        generator = await handler.serve_tokens(request, raw_request)
    except Exception as e:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e
    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )

    elif isinstance(generator, GenerateResponse):
        return JSONResponse(content=generator.model_dump())

    # Anything else is treated as an async generator of SSE events.
    return StreamingResponse(content=generator, media_type="text/event-stream")
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Attach the disagg router to *app*; the /abort_requests endpoint is
    only registered when the server runs in tokens-only mode."""
    if getattr(app.state.args, "tokens_only", False):

        @router.post("/abort_requests")
        async def abort_requests(raw_request: Request):
            """
            Abort one or more requests. To be used in a
            Disaggregated Everything setup.
            """
            try:
                body = await raw_request.json()
            except json.JSONDecodeError as e:
                raise HTTPException(
                    status_code=HTTPStatus.BAD_REQUEST.value,
                    detail=f"JSON decode error: {e}",
                ) from e
            request_ids = body.get("request_ids")
            if request_ids is None:
                raise HTTPException(
                    status_code=HTTPStatus.BAD_REQUEST.value,
                    detail="Missing 'request_ids' in request body",
                )
            # Abort requests in background
            # NOTE(review): the task reference is not retained, so it may be
            # garbage-collected before completion — confirm this is acceptable.
            asyncio.create_task(engine_client(raw_request).abort(request_ids))
            return Response(status_code=200)

    app.include_router(router)
|
||||
90
vllm/entrypoints/serve/disagg/protocol.py
Normal file
90
vllm/entrypoints/serve/disagg/protocol.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionLogProbs,
|
||||
Logprob,
|
||||
SamplingParams,
|
||||
StreamOptions,
|
||||
)
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
|
||||
####### Tokens IN <> Tokens OUT #######
|
||||
class GenerateRequest(BaseModel):
    """Tokens-in request body for the disaggregated generate endpoint."""

    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    token_ids: list[int]
    """The token ids to generate text from."""

    # features: MultiModalFeatureSpec
    # TODO (NickLucche): implement once Renderer work is completed
    features: str | None = None
    """The processed MM inputs for the model."""

    sampling_params: SamplingParams
    """The sampling parameters for the model."""

    model: str | None = None

    stream: bool | None = False
    stream_options: StreamOptions | None = None
    cache_salt: str | None = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit)."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )
|
||||
|
||||
|
||||
class GenerateResponseChoice(BaseModel):
    """One generated sequence (token ids + logprobs) in a GenerateResponse."""

    index: int
    logprobs: ChatCompletionLogProbs | None = None
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    token_ids: list[int] | None = None
|
||||
|
||||
|
||||
class GenerateResponse(BaseModel):
    """Tokens-out response body for the disaggregated generate endpoint."""

    # NOTE(review): ServingTokens.serve_tokens_full_generator constructs this
    # model with extra kwargs (id, created, model, usage) that are not declared
    # here; under pydantic's default handling they are dropped, so in particular
    # the caller's request_id is not echoed back (a fresh uuid is generated
    # instead). Confirm whether those fields should be declared on this model.
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    choices: list[GenerateResponseChoice]

    prompt_logprobs: list[dict[int, Logprob] | None] | None = None

    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )
|
||||
285
vllm/entrypoints/serve/disagg/serving.py
Normal file
285
vllm/entrypoints/serve/disagg/serving.py
Normal file
@@ -0,0 +1,285 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from collections.abc import AsyncGenerator
|
||||
from collections.abc import Sequence as GenericSequence
|
||||
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionLogProb,
|
||||
ChatCompletionLogProbs,
|
||||
ChatCompletionLogProbsContent,
|
||||
ErrorResponse,
|
||||
PromptTokenUsageInfo,
|
||||
RequestResponseMetadata,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.disagg.protocol import (
|
||||
GenerateRequest,
|
||||
GenerateResponse,
|
||||
GenerateResponseChoice,
|
||||
)
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.collection_utils import as_list
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ServingTokens(OpenAIServing):
    """Provides Tokens IN <> Tokens OUT functionality to vLLM API."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        force_no_detokenize: bool = False,
        return_tokens_as_token_ids: bool = False,
        log_error_stack: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_log_outputs: bool = False,
    ):
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )
        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.enable_log_outputs = enable_log_outputs
        # When set, sampling_params.detokenize is forced off for every request.
        self.force_no_detokenize = force_no_detokenize
        if force_no_detokenize:
            logger.info(
                "Tokens-only mode is enabled, skipping detokenization "
                "step for incoming requests."
            )

    async def serve_tokens(
        self,
        request: GenerateRequest,
        raw_request: Request | None = None,
    ) -> GenerateResponse | ErrorResponse:
        """Validate the request, schedule generation, and return the full
        (non-streaming) response or an ErrorResponse."""
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            logger.error("Error with model %s", error_check_ret)
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        lora_request = None
        lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)

        model_name = self.models.model_name(lora_request)

        request_id = (
            f"generate-tokens-{self._base_request_id(raw_request, request.request_id)}"
        )

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        # TODO(NickLucche): Change to EngineCoreRequest once Renderer work is
        # completed
        engine_prompt = TokensPrompt(prompt_token_ids=request.token_ids)
        if request.features is not None:
            # NOTE(review): multi-modal features are intentionally not wired
            # up yet (placeholder None) — see the TODO above.
            engine_prompt["multi_modal_data"] = None

        if hasattr(request, "cache_salt") and request.cache_salt is not None:
            engine_prompt["cache_salt"] = request.cache_salt

        # Schedule the request and get the result generator.
        result_generator: AsyncGenerator[RequestOutput, None] | None = None
        try:
            sampling_params = request.sampling_params
            if self.force_no_detokenize:
                sampling_params.detokenize = False

            self._log_inputs(
                request_id,
                TokensPrompt(prompt_token_ids=request.token_ids),
                params=sampling_params,
                lora_request=lora_request,
            )

            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            result_generator = self.engine_client.generate(
                engine_prompt,
                sampling_params,
                request_id,
                lora_request=lora_request,
                trace_headers=trace_headers,
                priority=request.priority,
            )

        except ValueError as e:
            return self.create_error_response(str(e))

        # TODO(NickLucche): Implement streaming response

        try:
            assert result_generator is not None
            return await self.serve_tokens_full_generator(
                request, result_generator, request_id, model_name, request_metadata
            )
        except ValueError as e:
            return self.create_error_response(str(e))

    async def serve_tokens_full_generator(
        self,
        request: GenerateRequest,
        result_generator: AsyncGenerator[RequestOutput, None],
        request_id: str,
        model_name: str,
        request_metadata: RequestResponseMetadata,
    ) -> ErrorResponse | GenerateResponse:
        """Drain *result_generator* to completion and assemble the final
        GenerateResponse (choices, usage, prompt logprobs)."""
        created_time = int(time.time())
        final_res: RequestOutput | None = None
        sampling_params: SamplingParams = request.sampling_params

        try:
            # Only the last RequestOutput matters for the full response.
            async for res in result_generator:
                final_res = res
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            return self.create_error_response(str(e))

        assert final_res is not None

        choices: list[GenerateResponseChoice] = []
        num_generated_tokens = 0
        for output in final_res.outputs:
            token_ids = output.token_ids
            out_logprobs = output.logprobs

            # This is top_logprobs in completions API
            if sampling_params.logprobs:
                assert out_logprobs is not None, "Did not output logprobs"
                logprobs = self._create_tokens_logprobs(
                    token_ids=token_ids,
                    top_logprobs=out_logprobs,
                    num_output_top_logprobs=sampling_params.logprobs,
                )
            else:
                logprobs = None

            choice_data = GenerateResponseChoice(
                index=output.index,
                logprobs=logprobs,
                finish_reason=output.finish_reason if output.finish_reason else "stop",
                token_ids=as_list(output.token_ids),
            )

            choices.append(choice_data)
            num_generated_tokens += len(output.token_ids)

        assert final_res.prompt_token_ids is not None
        num_prompt_tokens = len(final_res.prompt_token_ids)
        if final_res.encoder_prompt_token_ids is not None:
            num_prompt_tokens += len(final_res.encoder_prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )
        if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
            # This info is not available at the /coordinator level
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # NOTE(review): GenerateResponse (see protocol.py) declares only
        # request_id/choices/prompt_logprobs/kv_transfer_params — the
        # id/created/model/usage kwargs below are not declared fields, so the
        # response's request_id falls back to its random default instead of
        # echoing *request_id*. Confirm intended behavior.
        response = GenerateResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),
            kv_transfer_params=final_res.kv_transfer_params,
        )

        # Log complete response if output logging is enabled
        if self.enable_log_outputs and self.request_logger:
            for choice in choices:
                # Get the corresponding output token IDs
                output_token_ids = None
                if choice.index < len(final_res.outputs):
                    output_token_ids = final_res.outputs[choice.index].token_ids

                if output_token_ids:
                    # Log token_ids only.
                    self.request_logger.log_outputs(
                        request_id=request_id,
                        outputs="",
                        output_token_ids=output_token_ids,
                        finish_reason=choice.finish_reason,
                        is_streaming=False,
                        delta=False,
                    )

        return response

    def _create_tokens_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int | None = None,
    ) -> ChatCompletionLogProbs:
        """Create OpenAI-style logprobs."""
        logprobs_content: list[ChatCompletionLogProbsContent] = []

        for i, token_id in enumerate(token_ids):
            # Tokens are reported by id, not text (no detokenization here).
            token = f"token_id:{token_id}"
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is None or step_top_logprobs.get(token_id) is None:
                logprobs_content.append(
                    ChatCompletionLogProbsContent(
                        token=token,
                    )
                )
            else:
                step_token = step_top_logprobs[token_id]

                # NOTE(review): each top_logprobs entry reuses the chosen
                # token's label rather than the alternative's id (p[0]) —
                # confirm whether that is intended.
                logprobs_content.append(
                    ChatCompletionLogProbsContent(
                        token=token,
                        logprob=max(step_token.logprob, -9999.0),
                        top_logprobs=[
                            ChatCompletionLogProb(
                                token=token,
                                logprob=max(p[1].logprob, -9999.0),
                            )
                            for i, p in enumerate(step_top_logprobs.items())
                            if num_output_top_logprobs and i < num_output_top_logprobs
                        ],
                    )
                )

        return ChatCompletionLogProbs(content=logprobs_content)
|
||||
0
vllm/entrypoints/serve/elastic_ep/__init__.py
Normal file
0
vllm/entrypoints/serve/elastic_ep/__init__.py
Normal file
96
vllm/entrypoints/serve/elastic_ep/api_router.py
Normal file
96
vllm/entrypoints/serve/elastic_ep/api_router.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import json
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.api_server import validate_json_request
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.elastic_ep.middleware import (
|
||||
get_scaling_elastic_ep,
|
||||
set_scaling_elastic_ep,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def engine_client(request: Request) -> EngineClient:
    """Return the EngineClient stored on the FastAPI app state."""
    return request.app.state.engine_client
|
||||
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/scale_elastic_ep",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"model": dict},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.REQUEST_TIMEOUT.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
async def scale_elastic_ep(raw_request: Request):
    """Scale the engine to a new data-parallel size.

    Expects a JSON body with ``new_data_parallel_size`` (positive int,
    required) and ``drain_timeout`` (positive int seconds, default 120).
    New requests are rejected (via the scaling flag) while the scale is
    in progress.
    """
    try:
        body = await raw_request.json()
    except json.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail="Invalid JSON format") from e  # noqa: B904

    new_data_parallel_size = body.get("new_data_parallel_size")
    drain_timeout = body.get("drain_timeout", 120)  # Default 2 minutes

    if new_data_parallel_size is None:
        raise HTTPException(
            status_code=400, detail="new_data_parallel_size is required"
        )

    # NOTE: bool is a subclass of int, so reject it explicitly — otherwise
    # a JSON `true` would silently be treated as 1.
    if (
        isinstance(new_data_parallel_size, bool)
        or not isinstance(new_data_parallel_size, int)
        or new_data_parallel_size <= 0
    ):
        raise HTTPException(
            status_code=400,
            detail="new_data_parallel_size must be a positive integer",
        )

    if (
        isinstance(drain_timeout, bool)
        or not isinstance(drain_timeout, int)
        or drain_timeout <= 0
    ):
        raise HTTPException(
            status_code=400, detail="drain_timeout must be a positive integer"
        )

    # Set scaling flag to prevent new requests
    set_scaling_elastic_ep(True)
    client = engine_client(raw_request)
    try:
        await client.scale_elastic_ep(new_data_parallel_size, drain_timeout)
        return JSONResponse(
            {
                "message": f"Scaled to {new_data_parallel_size} data parallel engines",
            }
        )
    except TimeoutError as e:
        # In-flight requests did not drain within the allotted window.
        raise HTTPException(
            status_code=408,
            detail="Scale failed due to request drain timeout "
            f"after {drain_timeout} seconds",
        ) from e
    except Exception as e:
        logger.error("Scale failed: %s", e)
        raise HTTPException(status_code=500, detail="Scale failed") from e
    finally:
        # Always clear the flag so traffic resumes even on failure.
        set_scaling_elastic_ep(False)
|
||||
|
||||
|
||||
@router.post("/is_scaling_elastic_ep")
async def is_scaling_elastic_ep(raw_request: Request):
    """Report whether an elastic-EP scaling operation is in progress."""
    return JSONResponse({"is_scaling_elastic_ep": get_scaling_elastic_ep()})
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Register the elastic-EP scaling endpoints on the app."""
    app.include_router(router)
|
||||
49
vllm/entrypoints/serve/elastic_ep/middleware.py
Normal file
49
vllm/entrypoints/serve/elastic_ep/middleware.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Awaitable
|
||||
|
||||
from fastapi.responses import JSONResponse
|
||||
from starlette.types import ASGIApp, Receive, Scope, Send
|
||||
|
||||
# Global variable to track scaling state
|
||||
_scaling_elastic_ep = False
|
||||
|
||||
|
||||
def get_scaling_elastic_ep() -> bool:
    """Return True while an elastic-EP scaling operation is in progress."""
    return _scaling_elastic_ep
|
||||
|
||||
|
||||
def set_scaling_elastic_ep(value: bool) -> None:
    """Set the process-wide scaling flag checked by ScalingMiddleware."""
    global _scaling_elastic_ep
    _scaling_elastic_ep = value
|
||||
|
||||
|
||||
class ScalingMiddleware:
    """ASGI middleware that rejects HTTP traffic during scaling.

    While the process-wide scaling flag is set, every HTTP request is
    answered with 503 Service Unavailable; non-HTTP traffic (lifespan,
    websocket scopes) always passes through untouched.
    """

    def __init__(self, app: ASGIApp) -> None:
        self.app = app

    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
        # Only HTTP requests are gated; anything else goes straight to
        # the wrapped app.
        if scope["type"] == "http" and get_scaling_elastic_ep():
            rejection = JSONResponse(
                content={
                    "error": "The model is currently scaling. Please try again later."
                },
                status_code=503,
            )
            return rejection(scope, receive, send)

        return self.app(scope, receive, send)
|
||||
0
vllm/entrypoints/serve/instrumentator/__init__.py
Normal file
0
vllm/entrypoints/serve/instrumentator/__init__.py
Normal file
33
vllm/entrypoints/serve/instrumentator/health.py
Normal file
33
vllm/entrypoints/serve/instrumentator/health.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from fastapi import APIRouter, Request
|
||||
from fastapi.responses import Response
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.engine.exceptions import EngineDeadError
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def engine_client(request: Request) -> EngineClient:
    """Return the EngineClient stored on the FastAPI app state."""
    return request.app.state.engine_client
|
||||
|
||||
|
||||
@router.get("/health", response_class=Response)
async def health(raw_request: Request) -> Response:
    """Health check."""
    try:
        await engine_client(raw_request).check_health()
        return Response(status_code=200)
    except EngineDeadError:
        # Engine process has died: report Service Unavailable rather
        # than propagating the exception as a 500.
        return Response(status_code=503)
|
||||
|
||||
|
||||
def attach_router(app):
    """Register the /health endpoint on the app."""
    app.include_router(router)
|
||||
45
vllm/entrypoints/serve/instrumentator/metrics.py
Normal file
45
vllm/entrypoints/serve/instrumentator/metrics.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import prometheus_client
|
||||
import regex as re
|
||||
from fastapi import FastAPI, Response
|
||||
from prometheus_client import make_asgi_app
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
from starlette.routing import Mount
|
||||
|
||||
from vllm.v1.metrics.prometheus import get_prometheus_registry
|
||||
|
||||
|
||||
class PrometheusResponse(Response):
    """Response whose Content-Type is the Prometheus exposition format."""

    media_type = prometheus_client.CONTENT_TYPE_LATEST
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Mount prometheus metrics to a FastAPI app."""

    registry = get_prometheus_registry()

    # `response_class=PrometheusResponse` is needed to return an HTTP response
    # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
    # instead of the default "application/json" which is incorrect.
    # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364
    Instrumentator(
        excluded_handlers=[
            "/metrics",
            "/health",
            "/load",
            "/ping",
            "/version",
            "/server_info",
        ],
        registry=registry,
    ).add().instrument(app).expose(app, response_class=PrometheusResponse)

    # Add prometheus asgi middleware to route /metrics requests
    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))

    # Workaround for 307 Redirect for /metrics
    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
    app.routes.append(metrics_route)
|
||||
0
vllm/entrypoints/serve/lora/__init__.py
Normal file
0
vllm/entrypoints/serve/lora/__init__.py
Normal file
70
vllm/entrypoints/serve/lora/api_router.py
Normal file
70
vllm/entrypoints/serve/lora/api_router.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import model_hosting_container_standards.sagemaker as sagemaker_standards
|
||||
from fastapi import APIRouter, Depends, FastAPI, Request
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
from vllm import envs
|
||||
from vllm.entrypoints.openai.api_server import models, validate_json_request
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
LoadLoRAAdapterRequest,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Register dynamic LoRA load/unload endpoints when enabled via env."""
    if not envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
        """If LoRA dynamic loading & unloading is not enabled, do nothing."""
        return
    logger.warning(
        "LoRA dynamic loading & unloading is enabled in the API server. "
        "This should ONLY be used for local development!"
    )

    # SageMaker-standard adapter hooks map the incoming request shape onto
    # the vLLM request models used below.
    @sagemaker_standards.register_load_adapter_handler(
        request_shape={
            "lora_name": "body.name",
            "lora_path": "body.src",
        },
    )
    @router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
    async def load_lora_adapter(request: LoadLoRAAdapterRequest, raw_request: Request):
        """Load a LoRA adapter into the running server."""
        handler: OpenAIServingModels = models(raw_request)
        response = await handler.load_lora_adapter(request)
        if isinstance(response, ErrorResponse):
            return JSONResponse(
                content=response.model_dump(), status_code=response.error.code
            )

        # NOTE(review): on success `response` appears to be a plain message
        # payload usable as the response body — confirm against the handler.
        return Response(status_code=200, content=response)

    @sagemaker_standards.register_unload_adapter_handler(
        request_shape={
            "lora_name": "path_params.adapter_name",
        }
    )
    @router.post(
        "/v1/unload_lora_adapter", dependencies=[Depends(validate_json_request)]
    )
    async def unload_lora_adapter(
        request: UnloadLoRAAdapterRequest, raw_request: Request
    ):
        """Unload a previously loaded LoRA adapter."""
        handler: OpenAIServingModels = models(raw_request)
        response = await handler.unload_lora_adapter(request)
        if isinstance(response, ErrorResponse):
            return JSONResponse(
                content=response.model_dump(), status_code=response.error.code
            )

        return Response(status_code=200, content=response)

    # register the router
    app.include_router(router)
|
||||
0
vllm/entrypoints/serve/profile/__init__.py
Normal file
0
vllm/entrypoints/serve/profile/__init__.py
Normal file
46
vllm/entrypoints/serve/profile/api_router.py
Normal file
46
vllm/entrypoints/serve/profile/api_router.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from fastapi import APIRouter, FastAPI, Request
|
||||
from fastapi.responses import Response
|
||||
|
||||
from vllm.config import ProfilerConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def engine_client(request: Request) -> EngineClient:
    """Return the EngineClient stored on the FastAPI app state."""
    return request.app.state.engine_client
|
||||
|
||||
|
||||
@router.post("/start_profile")
async def start_profile(raw_request: Request):
    """Start the engine profiler."""
    logger.info("Starting profiler...")
    await engine_client(raw_request).start_profile()
    logger.info("Profiler started.")
    return Response(status_code=200)
|
||||
|
||||
|
||||
@router.post("/stop_profile")
async def stop_profile(raw_request: Request):
    """Stop the engine profiler."""
    logger.info("Stopping profiler...")
    await engine_client(raw_request).stop_profile()
    logger.info("Profiler stopped.")
    return Response(status_code=200)
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Register profiler endpoints, warning if profiling is enabled."""
    profiler_config = getattr(app.state.args, "profiler_config", None)
    assert profiler_config is None or isinstance(profiler_config, ProfilerConfig)
    if profiler_config is not None and profiler_config.profiler is not None:
        logger.warning_once(
            "Profiler with mode '%s' is enabled in the "
            "API server. This should ONLY be used for local development!",
            profiler_config.profiler,
        )
    # The router is attached unconditionally; only the warning is gated.
    app.include_router(router)
|
||||
0
vllm/entrypoints/serve/rlhf/__init__.py
Normal file
0
vllm/entrypoints/serve/rlhf/__init__.py
Normal file
102
vllm/entrypoints/serve/rlhf/api_router.py
Normal file
102
vllm/entrypoints/serve/rlhf/api_router.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, FastAPI, Query, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def engine_client(request: Request) -> EngineClient:
    """Return the EngineClient stored on the FastAPI app state."""
    return request.app.state.engine_client
|
||||
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/pause")
async def pause_generation(
    raw_request: Request,
    wait_for_inflight_requests: bool = Query(False),
    clear_cache: bool = Query(True),
) -> JSONResponse:
    """Pause generation requests to allow weight updates.

    Args:
        wait_for_inflight_requests: When ``True`` waits for in-flight
            requests to finish before pausing. When ``False`` (default),
            aborts any in-flight requests immediately.
        clear_cache: Whether to clear KV/prefix caches after draining.

    Returns:
        ``{"status": "paused"}`` with 200 on success; 400 for invalid
        arguments (ValueError); 500 on any other engine failure.
    """

    engine = engine_client(raw_request)

    try:
        await engine.pause_generation(
            wait_for_inflight_requests=wait_for_inflight_requests,
            clear_cache=clear_cache,
        )
        return JSONResponse(
            content={"status": "paused"},
            status_code=HTTPStatus.OK.value,
        )

    except ValueError as err:
        # Invalid pause arguments surface as a client error.
        return JSONResponse(
            content={"error": str(err)},
            status_code=HTTPStatus.BAD_REQUEST.value,
        )
    except Exception as err:  # pragma: no cover - defensive
        logger.exception("Failed to pause generation")
        return JSONResponse(
            content={"error": f"Failed to pause generation: {err}"},
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
        )
|
||||
|
||||
|
||||
@router.post("/resume")
async def resume_generation(raw_request: Request) -> JSONResponse:
    """Resume generation after a pause."""

    engine = engine_client(raw_request)

    try:
        await engine.resume_generation()
        return JSONResponse(
            content={"status": "resumed"},
            status_code=HTTPStatus.OK.value,
        )
    except Exception as err:  # pragma: no cover - defensive
        logger.exception("Failed to resume generation")
        return JSONResponse(
            content={"error": f"Failed to resume generation: {err}"},
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
        )
|
||||
|
||||
|
||||
@router.get("/is_paused")
async def is_paused(raw_request: Request) -> JSONResponse:
    """Return the current pause status."""

    engine = engine_client(raw_request)

    try:
        paused = await engine.is_paused()
    except Exception as err:  # pragma: no cover - defensive
        logger.exception("Failed to fetch pause status")
        return JSONResponse(
            content={"error": f"Failed to fetch pause status: {err}"},
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
        )

    return JSONResponse(content={"is_paused": paused})
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Register the pause/resume (RLHF weight-update) endpoints."""
    app.include_router(router)
|
||||
0
vllm/entrypoints/serve/sleep/__init__.py
Normal file
0
vllm/entrypoints/serve/sleep/__init__.py
Normal file
60
vllm/entrypoints/serve/sleep/api_router.py
Normal file
60
vllm/entrypoints/serve/sleep/api_router.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from fastapi import APIRouter, FastAPI, Request
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def engine_client(request: Request) -> EngineClient:
    """Return the EngineClient stored on the FastAPI app state."""
    return request.app.state.engine_client
|
||||
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/sleep")
async def sleep(raw_request: Request):
    """Put the engine to sleep at the level given by the ``level`` query param."""
    # get POST params
    level = raw_request.query_params.get("level", "1")
    # NOTE(review): a non-numeric `level` makes int() raise ValueError and
    # surfaces as a 500 — consider validating and returning 400 instead.
    await engine_client(raw_request).sleep(int(level))
    # FIXME: in v0 with frontend multiprocessing, the sleep command
    # is sent but does not finish yet when we return a response.
    return Response(status_code=200)
|
||||
|
||||
|
||||
@router.post("/wake_up")
async def wake_up(raw_request: Request):
    """Wake the engine, optionally restricted to the given ``tags``."""
    tags = raw_request.query_params.getlist("tags")
    if tags == []:
        # set to None to wake up all tags if no tags are provided
        tags = None
    logger.info("wake up the engine with tags: %s", tags)
    await engine_client(raw_request).wake_up(tags)
    # FIXME: in v0 with frontend multiprocessing, the wake-up command
    # is sent but does not finish yet when we return a response.
    return Response(status_code=200)
|
||||
|
||||
|
||||
@router.get("/is_sleeping")
async def is_sleeping(raw_request: Request):
    """Report whether the engine is currently asleep."""
    logger.info("check whether the engine is sleeping")
    is_sleeping = await engine_client(raw_request).is_sleeping()
    return JSONResponse(content={"is_sleeping": is_sleeping})
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Register sleep/wake endpoints, only in dev mode."""
    if not envs.VLLM_SERVER_DEV_MODE:
        # Sleep endpoints are development-only; skip registration entirely.
        return
    logger.warning(
        "SECURITY WARNING: Development endpoints are enabled! "
        "This should NOT be used in production!"
    )

    app.include_router(router)
|
||||
0
vllm/entrypoints/serve/tokenize/__init__.py
Normal file
0
vllm/entrypoints/serve/tokenize/__init__.py
Normal file
118
vllm/entrypoints/serve/tokenize/api_router.py
Normal file
118
vllm/entrypoints/serve/tokenize/api_router.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.responses import JSONResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.entrypoints.openai.api_server import validate_json_request
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
DetokenizeRequest,
|
||||
DetokenizeResponse,
|
||||
ErrorResponse,
|
||||
TokenizeRequest,
|
||||
TokenizeResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
||||
from vllm.entrypoints.utils import (
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def tokenization(request: Request) -> OpenAIServingTokenization:
    """Return the tokenization serving handler stored on the app state."""
    return request.app.state.openai_serving_tokenization
|
||||
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/tokenize",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def tokenize(request: TokenizeRequest, raw_request: Request):
    """Tokenize a prompt (or chat messages) and return the token ids."""
    handler = tokenization(raw_request)

    try:
        generator = await handler.create_tokenize(request, raw_request)
    except NotImplementedError as e:
        # Feature not supported by this tokenizer/model combination.
        raise HTTPException(
            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e

    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, TokenizeResponse):
        return JSONResponse(content=generator.model_dump())

    # Exhaustiveness check: the handler returns exactly the two types above.
    assert_never(generator)
|
||||
|
||||
|
||||
@router.post(
    "/detokenize",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def detokenize(request: DetokenizeRequest, raw_request: Request):
    """Convert a list of token ids back into the prompt string."""
    handler = tokenization(raw_request)

    try:
        generator = await handler.create_detokenize(request, raw_request)
    except OverflowError as e:
        # Token ids out of range are a request-validation problem (422),
        # not a server error.
        raise RequestValidationError(errors=[str(e)]) from e
    except Exception as e:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e

    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, DetokenizeResponse):
        return JSONResponse(content=generator.model_dump())

    # Exhaustiveness check: the handler returns exactly the two types above.
    assert_never(generator)
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Register tokenize/detokenize endpoints (and tokenizer_info if enabled)."""
    if getattr(app.state.args, "enable_tokenizer_info_endpoint", False):
        """Conditionally register the tokenizer info endpoint if enabled."""

        @router.get("/tokenizer_info")
        async def get_tokenizer_info(raw_request: Request):
            """Get comprehensive tokenizer information."""
            result = await tokenization(raw_request).get_tokenizer_info()
            return JSONResponse(
                content=result.model_dump(),
                status_code=result.error.code
                if isinstance(result, ErrorResponse)
                else 200,
            )

    app.include_router(router)
|
||||
204
vllm/entrypoints/serve/tokenize/serving.py
Normal file
204
vllm/entrypoints/serve/tokenize/serving.py
Normal file
@@ -0,0 +1,204 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Final
|
||||
|
||||
import jinja2
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
DetokenizeRequest,
|
||||
DetokenizeResponse,
|
||||
ErrorResponse,
|
||||
TokenizeChatRequest,
|
||||
TokenizeRequest,
|
||||
TokenizeResponse,
|
||||
TokenizerInfoResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class OpenAIServingTokenization(OpenAIServing):
    """Serving layer backing the /tokenize, /detokenize and /tokenizer_info endpoints."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
        trust_request_chat_template: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            log_error_stack=log_error_stack,
        )

        # Server-side default chat template; a per-request template may
        # override it in create_tokenize (subject to trust flag below).
        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format
        # Whether a chat template supplied in the request may be used.
        self.trust_request_chat_template = trust_request_chat_template

    async def create_tokenize(
        self,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> TokenizeResponse | ErrorResponse:
        """Tokenize a completion prompt or chat messages into token ids."""
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"tokn-{self._base_request_id(raw_request)}"

        try:
            lora_request = self._maybe_get_adapters(request)

            tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            if isinstance(request, TokenizeChatRequest):
                # Chat path: apply the chat template to the messages.
                tool_dicts = (
                    None
                    if request.tools is None
                    else [tool.model_dump() for tool in request.tools]
                )
                error_check_ret = self._validate_chat_template(
                    request_chat_template=request.chat_template,
                    chat_template_kwargs=request.chat_template_kwargs,
                    trust_request_chat_template=self.trust_request_chat_template,
                )
                if error_check_ret is not None:
                    return error_check_ret

                _, engine_prompts = await self._preprocess_chat(
                    request,
                    tokenizer,
                    request.messages,
                    tool_dicts=tool_dicts,
                    chat_template=request.chat_template or self.chat_template,
                    chat_template_content_format=self.chat_template_content_format,
                    add_generation_prompt=request.add_generation_prompt,
                    continue_final_message=request.continue_final_message,
                    chat_template_kwargs=request.chat_template_kwargs,
                    add_special_tokens=request.add_special_tokens,
                )
            else:
                # Plain-prompt path.
                engine_prompts = await renderer.render_prompt(
                    prompt_or_prompts=request.prompt,
                    config=self._build_render_config(request),
                )
        except (ValueError, TypeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(f"{e} {e.__cause__}")

        # Concatenate token ids across all rendered prompts.
        input_ids: list[int] = []
        for engine_prompt in engine_prompts:
            self._log_inputs(
                request_id, engine_prompt, params=None, lora_request=lora_request
            )

            if isinstance(engine_prompt, dict) and "prompt_token_ids" in engine_prompt:
                input_ids.extend(engine_prompt["prompt_token_ids"])

        token_strs = None
        if request.return_token_strs:
            token_strs = tokenizer.convert_ids_to_tokens(input_ids)

        return TokenizeResponse(
            tokens=input_ids,
            token_strs=token_strs,
            count=len(input_ids),
            max_model_len=self.max_model_len,
        )

    async def create_detokenize(
        self,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> DetokenizeResponse | ErrorResponse:
        """Convert the request's token ids back into the prompt string."""
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"tokn-{self._base_request_id(raw_request)}"

        lora_request = self._maybe_get_adapters(request)

        tokenizer = await self.engine_client.get_tokenizer()

        self._log_inputs(
            request_id,
            TokensPrompt(prompt_token_ids=request.tokens),
            params=None,
            lora_request=lora_request,
        )

        prompt_input = await self._tokenize_prompt_input_async(
            request,
            tokenizer,
            request.tokens,
        )
        input_text = prompt_input["prompt"]

        return DetokenizeResponse(prompt=input_text)

    async def get_tokenizer_info(
        self,
    ) -> TokenizerInfoResponse | ErrorResponse:
        """Get comprehensive tokenizer information."""
        try:
            tokenizer = await self.engine_client.get_tokenizer()
            info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
            return TokenizerInfoResponse(**info)
        except Exception as e:
            return self.create_error_response(f"Failed to get tokenizer info: {str(e)}")

    def _build_render_config(self, request: TokenizeRequest) -> RenderConfig:
        """Build the render config for the plain-prompt tokenize path."""
        return RenderConfig(add_special_tokens=request.add_special_tokens)
|
||||
|
||||
|
||||
@dataclass
class TokenizerInfo:
    """Snapshot of a tokenizer's configuration for the /tokenizer_info endpoint."""

    tokenizer: TokenizerLike
    chat_template: str | None

    def to_dict(self) -> dict[str, Any]:
        """Return the tokenizer configuration."""
        return self._get_tokenizer_config()

    def _get_tokenizer_config(self) -> dict[str, Any]:
        """Build the config dict directly from the tokenizer object."""
        raw = getattr(self.tokenizer, "init_kwargs", None) or {}
        # Drop file path fields; keep everything else in original order.
        config = {
            key: value
            for key, value in raw.items()
            if key not in ("vocab_file", "merges_file")
        }

        config = self._make_json_serializable(config)
        config["tokenizer_class"] = type(self.tokenizer).__name__
        if self.chat_template:
            config["chat_template"] = self.chat_template
        return config

    def _make_json_serializable(self, obj):
        """Recursively convert non-JSON-serializable objects to plain values."""
        if hasattr(obj, "content"):
            return obj.content
        if isinstance(obj, dict):
            return {key: self._make_json_serializable(val) for key, val in obj.items()}
        if isinstance(obj, list):
            return [self._make_json_serializable(item) for item in obj]
        return obj
|
||||
78
vllm/entrypoints/ssl.py
Normal file
78
vllm/entrypoints/ssl.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
from collections.abc import Callable
|
||||
from ssl import SSLContext
|
||||
|
||||
from watchfiles import Change, awatch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class SSLCertRefresher:
    """A class that monitors SSL certificate files and
    reloads them when they change.
    """

    def __init__(
        self,
        ssl_context: SSLContext,
        key_path: str | None = None,
        cert_path: str | None = None,
        ca_path: str | None = None,
    ) -> None:
        self.ssl = ssl_context
        self.key_path = key_path
        self.cert_path = cert_path
        self.ca_path = ca_path

        # Setup certification chain watcher
        def update_ssl_cert_chain(change: Change, file_path: str) -> None:
            logger.info("Reloading SSL certificate chain")
            assert self.key_path and self.cert_path
            # Mutates the shared SSLContext in place; new handshakes pick
            # up the reloaded chain.
            self.ssl.load_cert_chain(self.cert_path, self.key_path)

        # Watch key+cert only when both are configured.
        self.watch_ssl_cert_task = None
        if self.key_path and self.cert_path:
            self.watch_ssl_cert_task = asyncio.create_task(
                self._watch_files(
                    [self.key_path, self.cert_path], update_ssl_cert_chain
                )
            )

        # Setup CA files watcher
        def update_ssl_ca(change: Change, file_path: str) -> None:
            logger.info("Reloading SSL CA certificates")
            assert self.ca_path
            self.ssl.load_verify_locations(self.ca_path)

        self.watch_ssl_ca_task = None
        if self.ca_path:
            self.watch_ssl_ca_task = asyncio.create_task(
                self._watch_files([self.ca_path], update_ssl_ca)
            )

    async def _watch_files(self, paths, fun: Callable[[Change, str], None]) -> None:
        """Watch multiple file paths asynchronously."""
        logger.info("SSLCertRefresher monitors files: %s", paths)
        async for changes in awatch(*paths):
            try:
                for change, file_path in changes:
                    logger.info("File change detected: %s - %s", change.name, file_path)
                    fun(change, file_path)
            except Exception as e:
                # Keep watching even if a single reload attempt fails.
                logger.error(
                    "SSLCertRefresher failed taking action on file change. Error: %s", e
                )

    def stop(self) -> None:
        """Stop watching files."""
        if self.watch_ssl_cert_task:
            self.watch_ssl_cert_task.cancel()
            self.watch_ssl_cert_task = None
        if self.watch_ssl_ca_task:
            self.watch_ssl_ca_task.cancel()
            self.watch_ssl_ca_task = None
|
||||
187
vllm/entrypoints/tool.py
Normal file
187
vllm/entrypoints/tool.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
from openai_harmony import Author, Message, Role, TextContent
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# Avoid circular import.
|
||||
from vllm.entrypoints.context import ConversationContext
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Minimum supported version of the optional `gpt_oss` package.
MIN_GPT_OSS_VERSION = "0.0.7"


def validate_gpt_oss_install():
    """
    Check if the gpt-oss is installed and its version is at least 0.0.7.
    If not, raise an ImportError.
    """
    from importlib.metadata import PackageNotFoundError, version

    from packaging.version import InvalidVersion, Version

    try:
        installed = Version(version("gpt_oss"))
    except PackageNotFoundError:
        raise ImportError("Package 'gpt_oss' is not installed.") from None
    except InvalidVersion as e:
        raise ImportError(f"Invalid version string for 'gpt_oss': {e}") from None

    if installed < Version(MIN_GPT_OSS_VERSION):
        raise ImportError(
            f"gpt_oss >= {MIN_GPT_OSS_VERSION} is required, "
            f"but {installed} is installed."
        ) from None
|
||||
|
||||
|
||||
class Tool(ABC):
    """Abstract interface for built-in tools invoked during a conversation."""

    @abstractmethod
    async def get_result(self, context: "ConversationContext") -> Any:
        """Run the tool against the given Harmony conversation context."""
        ...

    @abstractmethod
    async def get_result_parsable_context(self, context: "ConversationContext") -> Any:
        """Run the tool against the given parsable conversation context."""
        ...
|
||||
|
||||
|
||||
class HarmonyBrowserTool(Tool):
    """Web-browsing tool backed by gpt_oss's SimpleBrowserTool with an Exa
    search backend.

    Sets ``self.enabled = False`` when EXA_API_KEY is unset or gpt_oss is
    missing/too old; callers must check ``enabled`` before use.
    """

    def __init__(self):
        self.enabled = True
        # Exa provides the search backend; without an API key there is
        # nothing to browse with, so bail out early.
        exa_api_key = os.getenv("EXA_API_KEY")
        if not exa_api_key:
            self.enabled = False
            logger.warning_once("EXA_API_KEY is not set, browsing is disabled")
            return

        # gpt_oss is an optional dependency; import lazily and degrade to
        # "disabled" rather than failing server startup.
        try:
            validate_gpt_oss_install()
            from gpt_oss.tools.simple_browser import SimpleBrowserTool
            from gpt_oss.tools.simple_browser.backend import ExaBackend
        except ImportError as e:
            self.enabled = False
            logger.warning_once(
                "gpt_oss is not installed properly (%s), browsing is disabled", e
            )
            return

        browser_backend = ExaBackend(source="web", api_key=exa_api_key)
        self.browser_tool = SimpleBrowserTool(backend=browser_backend)
        logger.info_once("Browser tool initialized")

    async def get_result(self, context: "ConversationContext") -> Any:
        """Feed the context's last message to the browser tool and collect
        every message it produces."""
        # Imported here to avoid a circular import (see module header).
        from vllm.entrypoints.context import HarmonyContext

        assert isinstance(context, HarmonyContext)
        last_msg = context.messages[-1]
        tool_output_msgs = []
        async for msg in self.browser_tool.process(last_msg):
            tool_output_msgs.append(msg)
        return tool_output_msgs

    async def get_result_parsable_context(self, context: "ConversationContext") -> Any:
        # The browser tool currently only supports the Harmony context path.
        raise NotImplementedError("Not implemented yet")

    @property
    def tool_config(self) -> Any:
        # Tool namespace config consumed when rendering the system prompt.
        return self.browser_tool.tool_config
|
||||
|
||||
|
||||
class HarmonyPythonTool(Tool):
    """Code-interpreter tool backed by gpt_oss's Docker-based PythonTool.

    Sets ``self.enabled = False`` when gpt_oss is missing/too old, or when
    the sandbox fails the smoke test in :meth:`validate`; callers must check
    ``enabled`` before use.
    """

    def __init__(self):
        self.enabled = True

        # gpt_oss is an optional dependency; import lazily and degrade to
        # "disabled" rather than failing server startup.
        try:
            validate_gpt_oss_install()
            from gpt_oss.tools.python_docker.docker_tool import PythonTool
        except ImportError as e:
            self.enabled = False
            logger.warning_once(
                "gpt_oss is not installed properly (%s), code interpreter is disabled",
                e,
            )
            return

        self.python_tool = PythonTool()

    async def validate(self):
        """Smoke-test the sandbox with a hello-world program and disable the
        tool if execution fails or produces unexpected output."""
        if not self.enabled:
            return
        try:
            message = Message(
                author=Author(role=Role.ASSISTANT),
                content=[TextContent(text="print('Hello, world!')")],
                channel="analysis",
                recipient="python",
                content_type="code",
            )
            msgs = []
            async for msg in self.python_tool.process(message):
                msgs.append(msg)
            assert msgs[0].content[0].text == "Hello, world!\n"
        except Exception as e:
            self.enabled = False
            logger.warning_once(
                "Code interpreter tool failed to initialize (%s), code "
                "interpreter is disabled",
                e,
            )
            return
        logger.info_once("Code interpreter tool initialized")

    async def get_result(self, context: "ConversationContext") -> Any:
        """Feed the context's last message to the Python tool and collect
        every message it produces."""
        # Imported here to avoid a circular import (see module header).
        from vllm.entrypoints.context import HarmonyContext

        assert isinstance(context, HarmonyContext)
        last_msg = context.messages[-1]
        tool_output_msgs = []
        async for msg in self.python_tool.process(last_msg):
            tool_output_msgs.append(msg)
        return tool_output_msgs

    async def get_result_parsable_context(self, context: "ConversationContext") -> Any:
        """
        This function converts parsable context types to harmony and
        back so we can use GPTOSS demo python tool
        """
        from vllm.entrypoints.context import ParsableContext

        assert isinstance(context, ParsableContext)

        # The last response message is a function call whose JSON `arguments`
        # payload carries the code to execute.
        last_msg = context.parser.response_messages[-1]
        args = json.loads(last_msg.arguments)

        # Re-wrap the code as a Harmony message addressed to the python tool.
        last_msg_harmony = Message(
            author=Author(role="assistant", name=None),
            content=[TextContent(text=args["code"])],
            channel="analysis",
            recipient="python",
            content_type="code",
        )

        tool_output_msgs = []
        async for msg in self.python_tool.process(last_msg_harmony):
            # Convert each Harmony tool message back into a Responses-API
            # function-call-output item with fresh ids.
            processed = ResponseFunctionToolCallOutputItem(
                id=f"fco_{random_uuid()}",
                type="function_call_output",
                call_id=f"call_{random_uuid()}",
                output=msg.content[0].text,
                status="completed",
            )
            tool_output_msgs.append(processed)
        return tool_output_msgs

    @property
    def tool_config(self) -> Any:
        # Tool namespace config consumed when rendering the system prompt.
        return self.python_tool.tool_config
|
||||
234
vllm/entrypoints/tool_server.py
Normal file
234
vllm/entrypoints/tool_server.py
Normal file
@@ -0,0 +1,234 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from contextlib import AbstractAsyncContextManager, asynccontextmanager
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from openai_harmony import ToolDescription, ToolNamespaceConfig
|
||||
|
||||
from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from mcp.types import ListToolsResult
|
||||
|
||||
|
||||
async def list_server_and_tools(server_url: str):
    """Connect to an MCP server over SSE and fetch its initialize response
    and tool listing.

    Returns:
        A ``(initialize_response, list_tools_response)`` tuple.
    """
    # mcp is an optional dependency; import lazily.
    from mcp import ClientSession
    from mcp.client.sse import sse_client

    async with (
        sse_client(url=server_url) as streams,
        ClientSession(*streams) as session,
    ):
        initialize_response = await session.initialize()
        list_tools_response = await session.list_tools()
        return initialize_response, list_tools_response
|
||||
|
||||
|
||||
def trim_schema(schema: dict) -> dict:
    """Turn a JSON Schema generated by MCP into Harmony's variant (in place).

    - drops ``title``
    - drops ``default`` when it is None
    - flattens ``anyOf`` into a ``type`` list, removing "null" entries
      (Harmony just ignores them)
    - recurses into ``properties``
    """
    schema.pop("title", None)

    if "default" in schema and schema["default"] is None:
        del schema["default"]

    if "anyOf" in schema:
        # "anyOf": [{"type": "a"}, {"type": "b"}]  ->  "type": ["a", "b"]
        schema["type"] = [
            option["type"] for option in schema["anyOf"] if option["type"] != "null"
        ]
        del schema["anyOf"]

    if "properties" in schema:
        schema["properties"] = {
            name: trim_schema(sub) for name, sub in schema["properties"].items()
        }

    return schema
|
||||
|
||||
|
||||
def post_process_tools_description(
    list_tools_result: "ListToolsResult",
) -> "ListToolsResult":
    """Adapt an MCP tool listing for Harmony (modifies the result in place)."""
    # Normalize each tool's JSON Schema to Harmony's expectations.
    for tool in list_tools_result.tools:
        tool.inputSchema = trim_schema(tool.inputSchema)

    # Drop tools whose annotations opt out of prompt inclusion (e.g. simple
    # text-in/text-out tools like Python); absence of the flag means include.
    list_tools_result.tools = [
        t
        for t in list_tools_result.tools
        if getattr(t.annotations, "include_in_prompt", True)
    ]

    return list_tools_result
|
||||
|
||||
|
||||
class ToolServer(ABC):
    """Abstract registry of tools available to the model server."""

    @abstractmethod
    def has_tool(self, tool_name: str) -> bool:
        """
        Return True if the tool is supported, False otherwise.
        """
        ...

    @abstractmethod
    def get_tool_description(
        self, tool_name: str, allowed_tools: list[str] | None = None
    ) -> ToolNamespaceConfig | None:
        """
        Return the Harmony namespace config for the given tool name,
        or None when the tool is not supported.
        """
        ...

    @abstractmethod
    def new_session(
        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
    ) -> AbstractAsyncContextManager[Any]:
        """
        Return an async context manager yielding a session for the tool.
        """
        ...
|
||||
|
||||
|
||||
class MCPToolServer(ToolServer):
    """ToolServer that proxies one or more remote MCP servers over SSE."""

    def __init__(self):
        # Fail fast with an actionable message if the optional dep is absent.
        try:
            import mcp  # noqa: F401
        except ImportError:
            raise ImportError(
                "mcp is not installed. Please run `pip install mcp` to use "
                "MCPToolServer."
            ) from None
        # Maps tool namespace name -> ToolNamespaceConfig.
        self.harmony_tool_descriptions = {}

    async def add_tool_server(self, server_url: str):
        """Register tools from a comma-separated list of ``host:port``
        entries; the first server to claim a tool name wins."""
        tool_urls = server_url.split(",")
        self.harmony_tool_descriptions = {}
        # Maps tool namespace name -> SSE endpoint URL serving it.
        self.urls: dict[str, str] = {}
        for url in tool_urls:
            url = f"http://{url}/sse"
            initialize_response, list_tools_response = await list_server_and_tools(url)

            # Adapt MCP schemas to Harmony's variant and drop tools that opt
            # out of prompt inclusion.
            list_tools_response = post_process_tools_description(list_tools_response)

            tool_from_mcp = ToolNamespaceConfig(
                name=initialize_response.serverInfo.name,
                description=initialize_response.instructions,
                tools=[
                    ToolDescription.new(
                        name=tool.name,
                        description=tool.description,
                        parameters=tool.inputSchema,
                    )
                    for tool in list_tools_response.tools
                ],
            )
            self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp
            if tool_from_mcp.name not in self.urls:
                self.urls[tool_from_mcp.name] = url
            else:
                logger.warning(
                    "Tool %s already exists. Ignoring duplicate tool server %s",
                    tool_from_mcp.name,
                    url,
                )
        logger.info(
            "MCPToolServer initialized with tools: %s",
            list(self.harmony_tool_descriptions.keys()),
        )

    def has_tool(self, tool_name: str):
        return tool_name in self.harmony_tool_descriptions

    def get_tool_description(
        self,
        server_label: str,
        allowed_tools: list[str] | None = None,
    ) -> ToolNamespaceConfig | None:
        """Return the namespace config for `server_label`, optionally
        filtered to `allowed_tools`; None when nothing matches."""
        cfg = self.harmony_tool_descriptions.get(server_label)
        if cfg is None:
            return None

        # No restrictions: all tools from this MCP server
        if allowed_tools is None:
            return cfg

        filtered = [t for t in cfg.tools if t.name in allowed_tools]

        # An allow-list that matches nothing is treated as "not available".
        if not filtered:
            return None

        return ToolNamespaceConfig(
            name=cfg.name,
            description=cfg.description,
            tools=filtered,
        )

    @asynccontextmanager
    async def new_session(
        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
    ):
        """Open an SSE-backed MCP session to the server hosting `tool_name`.

        Raises:
            KeyError: if the tool is not registered.
        """
        from mcp import ClientSession
        from mcp.client.sse import sse_client

        url = self.urls.get(tool_name)
        # Propagate the session id so the MCP server can correlate requests;
        # caller-supplied headers may override it.
        request_headers = {"x-session-id": session_id}
        if headers is not None:
            request_headers.update(headers)
        if not url:
            raise KeyError(f"Tool '{tool_name}' is not supported")
        async with (
            sse_client(url=url, headers=request_headers) as streams,
            ClientSession(*streams) as session,
        ):
            await session.initialize()
            yield session
|
||||
|
||||
|
||||
class DemoToolServer(ToolServer):
    """In-process ToolServer exposing the demo browser and python tools."""

    def __init__(self):
        # Populated by init_and_validate(); empty until then.
        self.tools: dict[str, Tool] = {}

    async def init_and_validate(self):
        """Instantiate the demo tools and register only those that work."""
        browser_tool = HarmonyBrowserTool()
        python_tool = HarmonyPythonTool()
        # The python tool needs an async smoke test against its sandbox.
        await python_tool.validate()
        if browser_tool.enabled:
            self.tools["browser"] = browser_tool
        if python_tool.enabled:
            self.tools["python"] = python_tool
        logger.info(
            "DemoToolServer initialized with tools: %s", list(self.tools.keys())
        )

    def has_tool(self, tool_name: str) -> bool:
        return tool_name in self.tools

    def get_tool_description(
        self, tool_name: str, allowed_tools: list[str] | None = None
    ) -> ToolNamespaceConfig | None:
        # NOTE: allowed_tools is ignored here; the demo tools always expose
        # their full built-in namespace config.
        if tool_name not in self.tools:
            return None
        if tool_name == "browser":
            return ToolNamespaceConfig.browser()
        elif tool_name == "python":
            return ToolNamespaceConfig.python()
        else:
            # Unreachable unless self.tools gains an unexpected key.
            raise ValueError(f"Unknown tool {tool_name}")

    @asynccontextmanager
    async def new_session(
        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
    ):
        """Yield the in-process tool instance (no real session is needed;
        session_id/headers are accepted for interface compatibility)."""
        if tool_name not in self.tools:
            raise KeyError(f"Tool '{tool_name}' is not supported")
        yield self.tools[tool_name]
|
||||
319
vllm/entrypoints/utils.py
Normal file
319
vllm/entrypoints/utils.py
Normal file
@@ -0,0 +1,319 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import functools
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from fastapi import Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from starlette.background import BackgroundTask, BackgroundTasks
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
load_chat_template,
|
||||
resolve_hf_chat_template,
|
||||
resolve_mistral_chat_template,
|
||||
)
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
CompletionRequest,
|
||||
StreamOptions,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Epilog appended to `vllm <subcmd> --help` output; each subcommand parser
# fills in the `{subcmd}` placeholder.
VLLM_SUBCMD_PARSER_EPILOG = (
    "For full list: vllm {subcmd} --help=all\n"
    "For a section: vllm {subcmd} --help=ModelConfig (case-insensitive)\n"  # noqa: E501
    "For a flag: vllm {subcmd} --help=max-model-len (_ or - accepted)\n"  # noqa: E501
    "Documentation: https://docs.vllm.ai\n"
)
|
||||
|
||||
|
||||
async def listen_for_disconnect(request: Request) -> None:
    """Returns if a disconnect message is received"""
    while True:
        message = await request.receive()
        if message["type"] == "http.disconnect":
            # On disconnect, undo this request's contribution to the server
            # load gauge — but only when load tracking is enabled AND the
            # counter has actually been initialized.
            if getattr(
                request.app.state, "enable_server_load_tracking", False
            ) and hasattr(request.app.state, "server_load_metrics"):
                request.app.state.server_load_metrics -= 1
            break
|
||||
|
||||
|
||||
def with_cancellation(handler_func):
    """Decorator that allows a route handler to be cancelled by client
    disconnections.

    This does _not_ use request.is_disconnected, which does not work with
    middleware. Instead this follows the pattern from
    starlette.StreamingResponse, which simultaneously awaits on two tasks- one
    to wait for an http disconnect message, and the other to do the work that we
    want done. When the first task finishes, the other is cancelled.

    A core assumption of this method is that the body of the request has already
    been read. This is a safe assumption to make for fastapi handlers that have
    already parsed the body of the request into a pydantic model for us.
    This decorator is unsafe to use elsewhere, as it will consume and throw away
    all incoming messages for the request while it looks for a disconnect
    message.

    In the case where a `StreamingResponse` is returned by the handler, this
    wrapper will stop listening for disconnects and instead the response object
    will start listening for disconnects.
    """

    # Functools.wraps is required for this wrapper to appear to fastapi as a
    # normal route handler, with the correct request type hinting.
    @functools.wraps(handler_func)
    async def wrapper(*args, **kwargs):
        # The request is either the second positional arg or `raw_request`
        request = args[1] if len(args) > 1 else kwargs["raw_request"]

        # Race the real handler against the disconnect listener; whichever
        # finishes first wins and the loser is cancelled.
        handler_task = asyncio.create_task(handler_func(*args, **kwargs))
        cancellation_task = asyncio.create_task(listen_for_disconnect(request))

        done, pending = await asyncio.wait(
            [handler_task, cancellation_task], return_when=asyncio.FIRST_COMPLETED
        )
        for task in pending:
            task.cancel()

        if handler_task in done:
            return handler_task.result()
        # Client disconnected before the handler finished: no response.
        return None

    return wrapper
|
||||
|
||||
|
||||
def decrement_server_load(request: Request):
    """Decrement the in-flight request counter.

    Used as a response background task by `load_aware_call` so the counter
    drops only after the response body has been fully sent.
    """
    request.app.state.server_load_metrics -= 1
|
||||
|
||||
|
||||
def load_aware_call(func):
    """Decorator that tracks the number of in-flight requests in
    ``app.state.server_load_metrics`` when load tracking is enabled.

    The counter is incremented before the handler runs and decremented when
    the request ends: immediately on exception or non-response return, or via
    a background task once a JSON/streaming response has been fully sent.
    """

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        # The request is either the second positional arg or `raw_request`.
        raw_request = kwargs.get("raw_request", args[1] if len(args) > 1 else None)

        if raw_request is None:
            raise ValueError(
                "raw_request required when server load tracking is enabled"
            )

        # Tracking disabled: call straight through with no bookkeeping.
        if not getattr(raw_request.app.state, "enable_server_load_tracking", False):
            return await func(*args, **kwargs)

        # ensure the counter exists
        if not hasattr(raw_request.app.state, "server_load_metrics"):
            raw_request.app.state.server_load_metrics = 0

        raw_request.app.state.server_load_metrics += 1
        try:
            response = await func(*args, **kwargs)
        except Exception:
            # Handler failed: the request is over, undo the increment now.
            raw_request.app.state.server_load_metrics -= 1
            raise

        if isinstance(response, (JSONResponse, StreamingResponse)):
            # Defer the decrement until the response body has been sent,
            # preserving any background work the handler already attached.
            if response.background is None:
                response.background = BackgroundTask(decrement_server_load, raw_request)
            elif isinstance(response.background, BackgroundTasks):
                response.background.add_task(decrement_server_load, raw_request)
            elif isinstance(response.background, BackgroundTask):
                # Convert the single BackgroundTask to BackgroundTasks
                # and chain the decrement_server_load task to it
                tasks = BackgroundTasks()
                tasks.add_task(
                    response.background.func,
                    *response.background.args,
                    **response.background.kwargs,
                )
                tasks.add_task(decrement_server_load, raw_request)
                response.background = tasks
        else:
            # Plain return value: the response is complete, decrement now.
            raw_request.app.state.server_load_metrics -= 1

        return response

    return wrapper
|
||||
|
||||
|
||||
def cli_env_setup():
    """One-time environment setup for the vLLM CLI entrypoint."""
    # The safest multiprocessing method is `spawn`, as the default `fork`
    # method is not compatible with some accelerators. The default method
    # will be changing in future versions of Python, so we should use it
    # explicitly when possible.
    #
    # We only set it here in the CLI entrypoint, because changing to `spawn`
    # could break some existing code using vLLM as a library. `spawn` will
    # cause unexpected behavior if the code is not protected by
    # `if __name__ == "__main__":`.
    #
    # References:
    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
    env_var = "VLLM_WORKER_MULTIPROC_METHOD"
    if env_var not in os.environ:
        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
        os.environ[env_var] = "spawn"
|
||||
|
||||
|
||||
def _validate_truncation_size(
|
||||
max_model_len: int,
|
||||
truncate_prompt_tokens: int | None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
) -> int | None:
|
||||
if truncate_prompt_tokens is not None:
|
||||
if truncate_prompt_tokens <= -1:
|
||||
truncate_prompt_tokens = max_model_len
|
||||
|
||||
if truncate_prompt_tokens > max_model_len:
|
||||
raise ValueError(
|
||||
f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
|
||||
f"is greater than max_model_len ({max_model_len})."
|
||||
f" Please, select a smaller truncation size."
|
||||
)
|
||||
|
||||
if tokenization_kwargs is not None:
|
||||
tokenization_kwargs["truncation"] = True
|
||||
tokenization_kwargs["max_length"] = truncate_prompt_tokens
|
||||
|
||||
else:
|
||||
if tokenization_kwargs is not None:
|
||||
tokenization_kwargs["truncation"] = False
|
||||
|
||||
return truncate_prompt_tokens
|
||||
|
||||
|
||||
def get_max_tokens(
    max_model_len: int,
    request: ChatCompletionRequest | CompletionRequest,
    input_length: int,
    default_sampling_params: dict,
) -> int:
    """Resolve the effective output-token budget for a request.

    Takes the smallest of: the remaining context window
    (``max_model_len - input_length``), the request's explicit limit, the
    platform cap, and the server-wide default — skipping any that are unset.
    """
    # Chat requests use `max_completion_tokens`; completions use `max_tokens`.
    requested = getattr(request, "max_completion_tokens", None) or request.max_tokens
    candidates = (
        max_model_len - input_length,
        requested,
        current_platform.get_max_output_tokens(input_length),
        default_sampling_params.get("max_tokens"),
    )
    return min(limit for limit in candidates if limit is not None)
|
||||
|
||||
|
||||
def log_non_default_args(args: Namespace | EngineArgs):
    """Log only the arguments whose values differ from their defaults.

    Accepts either a parsed CLI Namespace or an EngineArgs instance.

    Raises:
        TypeError: for any other argument type.
    """
    non_default_args = {}

    # Handle Namespace
    if isinstance(args, Namespace):
        # Parse an empty argv to recover each option's default value.
        parser = make_arg_parser(FlexibleArgumentParser())
        for arg, default in vars(parser.parse_args([])).items():
            if default != getattr(args, arg):
                non_default_args[arg] = getattr(args, arg)

    # Handle EngineArgs instance
    elif isinstance(args, EngineArgs):
        default_args = EngineArgs(model=args.model)  # Create default instance
        for field in dataclasses.fields(args):
            current_val = getattr(args, field.name)
            default_val = getattr(default_args, field.name)
            if current_val != default_val:
                non_default_args[field.name] = current_val
        # `model` was copied into the reference instance above, so the loop
        # can never flag it; record it separately when it differs from the
        # class-level default.
        if default_args.model != EngineArgs.model:
            non_default_args["model"] = default_args.model
    else:
        raise TypeError(
            "Unsupported argument type. Must be Namespace or EngineArgs instance."
        )

    logger.info("non-default args: %s", non_default_args)
|
||||
|
||||
|
||||
def should_include_usage(
    stream_options: StreamOptions | None, enable_force_include_usage: bool
) -> tuple[bool, bool]:
    """Decide whether streamed responses should report token usage.

    Returns a ``(include_usage, include_continuous_usage)`` pair; the second
    flag requests usage on every chunk and is only honored when usage is
    reported at all.
    """
    if not stream_options:
        # No client preference: fall back to the server-side force flag.
        return enable_force_include_usage, False

    include_usage = stream_options.include_usage or enable_force_include_usage
    include_continuous = include_usage and bool(stream_options.continuous_usage_stats)
    return include_usage, include_continuous
|
||||
|
||||
|
||||
def process_lora_modules(
    args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
) -> list[LoRAModulePath]:
    """Merge CLI-provided LoRA modules with per-modality default LoRAs.

    Each ``modality -> path`` entry of `default_mm_loras` becomes a
    LoRAModulePath named after the modality and is appended (in place) to the
    CLI-provided list, or used alone when no CLI modules were given.
    """
    if not default_mm_loras:
        return args_lora_modules

    default_mm_lora_paths = [
        LoRAModulePath(name=modality, path=lora_path)
        for modality, lora_path in default_mm_loras.items()
    ]
    if args_lora_modules is None:
        return default_mm_lora_paths

    # Extend the caller's list in place, matching the original `+=` behavior.
    args_lora_modules += default_mm_lora_paths
    return args_lora_modules
|
||||
|
||||
|
||||
async def process_chat_template(
    args_chat_template: Path | str | None,
    engine_client: EngineClient,
    model_config: ModelConfig,
) -> str | None:
    """Load a user-supplied chat template and warn when it differs from the
    model's official one.

    Returns:
        The resolved template string, or None when none was supplied.
    """
    resolved_chat_template = load_chat_template(args_chat_template)
    if resolved_chat_template is not None:
        # Get the tokenizer to check official template
        tokenizer = await engine_client.get_tokenizer()

        if isinstance(tokenizer, MistralTokenizer):
            # The warning is logged in resolve_mistral_chat_template.
            resolved_chat_template = resolve_mistral_chat_template(
                chat_template=resolved_chat_template
            )
        else:
            # Resolve the model's own template for comparison (chat_template
            # and tools deliberately None so we get the official default).
            hf_chat_template = resolve_hf_chat_template(
                tokenizer=tokenizer,
                chat_template=None,
                tools=None,
                model_config=model_config,
            )

            if hf_chat_template != resolved_chat_template:
                logger.warning(
                    "Using supplied chat template: %s\n"
                    "It is different from official chat template '%s'. "
                    "This discrepancy may lead to performance degradation.",
                    resolved_chat_template,
                    model_config.model,
                )
    return resolved_chat_template
|
||||
Reference in New Issue
Block a user