Add minimal vLLM 0.16.1 build repo for BI-V150
 vllm/entrypoints/openai/generate/__init__.py   |   0
 vllm/entrypoints/openai/generate/api_router.py | 166
 2 files changed, 166 insertions(+)
--- /dev/null
+++ b/vllm/entrypoints/openai/generate/api_router.py
@@ -0,0 +1,166 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING

from fastapi import FastAPI

if TYPE_CHECKING:
    from argparse import Namespace

    from starlette.datastructures import State

    from vllm.engine.protocol import EngineClient
    from vllm.entrypoints.logger import RequestLogger
    from vllm.tasks import SupportedTask
else:
    RequestLogger = object

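# Attach the chat, responses, completion, and Anthropic messages routers to
# the given FastAPI app.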
def register_generate_api_routers(app: FastAPI):
    from vllm.entrypoints.openai.chat_completion.api_router import (
        attach_router as register_chat_api_router,
    )

    register_chat_api_router(app)

    from vllm.entrypoints.openai.responses.api_router import (
        attach_router as register_responses_api_router,
    )

    register_responses_api_router(app)

    from vllm.entrypoints.openai.completion.api_router import (
        attach_router as register_completion_api_router,
    )

    register_completion_api_router(app)

    from vllm.entrypoints.anthropic.api_router import (
        attach_router as register_anthropic_api_router,
    )

    register_anthropic_api_router(app)

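# Build the serving handlers for the "generate" task and store them on the app
# state; each handler is created only when "generate" is among the supported
# tasks, otherwise it is left as None.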
async def init_generate_state(
    engine_client: "EngineClient",
    state: "State",
    args: "Namespace",
    request_logger: RequestLogger | None,
    supported_tasks: tuple["SupportedTask", ...],
):
    from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
    from vllm.entrypoints.chat_utils import load_chat_template
    from vllm.entrypoints.mcp.tool_server import (
        DemoToolServer,
        MCPToolServer,
        ToolServer,
    )
    from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
    from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
    from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
    from vllm.entrypoints.serve.disagg.serving import ServingTokens

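    # Pick the tool server: the built-in demo server, an external MCP tool
    # server registered from args.tool_server, or none at all.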
    if args.tool_server == "demo":
        tool_server: ToolServer | None = DemoToolServer()
        assert isinstance(tool_server, DemoToolServer)
        await tool_server.init_and_validate()
    elif args.tool_server:
        tool_server = MCPToolServer()
        await tool_server.add_tool_server(args.tool_server)
    else:
        tool_server = None
    resolved_chat_template = load_chat_template(args.chat_template)

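    # Handler backing the OpenAI Responses API.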
    state.openai_serving_responses = (
        OpenAIServingResponses(
            engine_client,
            state.openai_serving_models,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            tool_parser=args.tool_call_parser,
            tool_server=tool_server,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            enable_log_outputs=args.enable_log_outputs,
            log_error_stack=args.log_error_stack,
        )
        if "generate" in supported_tasks
        else None
    )
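    # Handler backing the OpenAI Chat Completions API.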
    state.openai_serving_chat = (
        OpenAIServingChat(
            engine_client,
            state.openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            default_chat_template_kwargs=args.default_chat_template_kwargs,
            trust_request_chat_template=args.trust_request_chat_template,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
            tool_parser=args.tool_call_parser,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            enable_log_outputs=args.enable_log_outputs,
            enable_log_deltas=args.enable_log_deltas,
            log_error_stack=args.log_error_stack,
        )
        if "generate" in supported_tasks
        else None
    )
    # Warm up chat template processing to avoid first-request latency
    if state.openai_serving_chat is not None:
        await state.openai_serving_chat.warmup()
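    # Handler backing the OpenAI Completions API.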
    state.openai_serving_completion = (
        OpenAIServingCompletion(
            engine_client,
            state.openai_serving_models,
            request_logger=request_logger,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            log_error_stack=args.log_error_stack,
        )
        if "generate" in supported_tasks
        else None
    )
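    # Handler backing the Anthropic Messages API.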
    state.anthropic_serving_messages = (
        AnthropicServingMessages(
            engine_client,
            state.openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            tool_parser=args.tool_call_parser,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "generate" in supported_tasks
        else None
    )
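    # Token-level handler from the disaggregated serving path; skips
    # detokenization when args.tokens_only is set.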
    state.serving_tokens = (
        ServingTokens(
            engine_client,
            state.openai_serving_models,
            request_logger=request_logger,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            log_error_stack=args.log_error_stack,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_log_outputs=args.enable_log_outputs,
            force_no_detokenize=args.tokens_only,
        )
        if "generate" in supported_tasks
        else None
    )