# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import TYPE_CHECKING

from fastapi import FastAPI

if TYPE_CHECKING:
    from argparse import Namespace

    from starlette.datastructures import State

    from vllm.engine.protocol import EngineClient
    from vllm.entrypoints.logger import RequestLogger
    from vllm.tasks import SupportedTask
else:
    RequestLogger = object


def register_generate_api_routers(app: FastAPI):
    """Attach the OpenAI- and Anthropic-compatible generation routers to the app."""
    from vllm.entrypoints.openai.chat_completion.api_router import (
        attach_router as register_chat_api_router,
    )

    register_chat_api_router(app)

    from vllm.entrypoints.openai.responses.api_router import (
        attach_router as register_responses_api_router,
    )

    register_responses_api_router(app)

    from vllm.entrypoints.openai.completion.api_router import (
        attach_router as register_completion_api_router,
    )

    register_completion_api_router(app)

    from vllm.entrypoints.anthropic.api_router import (
        attach_router as register_anthropic_api_router,
    )

    register_anthropic_api_router(app)


async def init_generate_state(
    engine_client: "EngineClient",
    state: "State",
    args: "Namespace",
    request_logger: RequestLogger | None,
    supported_tasks: tuple["SupportedTask", ...],
):
    """Populate per-app serving state for the generation endpoints.

    Handlers are only created when the engine supports the "generate" task;
    otherwise the corresponding state attributes are set to None.
    """
    from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
    from vllm.entrypoints.chat_utils import load_chat_template
    from vllm.entrypoints.mcp.tool_server import (
        DemoToolServer,
        MCPToolServer,
        ToolServer,
    )
    from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
    from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
    from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
    from vllm.entrypoints.serve.disagg.serving import ServingTokens

    if args.tool_server == "demo":
        tool_server: ToolServer | None = DemoToolServer()
        assert isinstance(tool_server, DemoToolServer)
        await tool_server.init_and_validate()
    elif args.tool_server:
        tool_server = MCPToolServer()
        await tool_server.add_tool_server(args.tool_server)
    else:
        tool_server = None

    resolved_chat_template = load_chat_template(args.chat_template)

    state.openai_serving_responses = (
        OpenAIServingResponses(
            engine_client,
            state.openai_serving_models,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            tool_parser=args.tool_call_parser,
            tool_server=tool_server,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            enable_log_outputs=args.enable_log_outputs,
            log_error_stack=args.log_error_stack,
        )
        if "generate" in supported_tasks
        else None
    )
    state.openai_serving_chat = (
        OpenAIServingChat(
            engine_client,
            state.openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            default_chat_template_kwargs=args.default_chat_template_kwargs,
            trust_request_chat_template=args.trust_request_chat_template,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
            tool_parser=args.tool_call_parser,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            enable_log_outputs=args.enable_log_outputs,
            enable_log_deltas=args.enable_log_deltas,
            log_error_stack=args.log_error_stack,
        )
        if "generate" in supported_tasks
        else None
    )
    # Warm up chat template processing to avoid first-request latency
    if state.openai_serving_chat is not None:
        await state.openai_serving_chat.warmup()
    state.openai_serving_completion = (
        OpenAIServingCompletion(
            engine_client,
            state.openai_serving_models,
            request_logger=request_logger,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            log_error_stack=args.log_error_stack,
        )
        if "generate" in supported_tasks
        else None
    )
    state.anthropic_serving_messages = (
        AnthropicServingMessages(
            engine_client,
            state.openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            tool_parser=args.tool_call_parser,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "generate" in supported_tasks
        else None
    )
    state.serving_tokens = (
        ServingTokens(
            engine_client,
            state.openai_serving_models,
            request_logger=request_logger,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            log_error_stack=args.log_error_stack,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_log_outputs=args.enable_log_outputs,
            force_no_detokenize=args.tokens_only,
        )
        if "generate" in supported_tasks
        else None
    )
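

# ---------------------------------------------------------------------------
# Illustrative sketch only: one way the two helpers above could be wired
# together when assembling a "generate" API server app. The helper name
# `build_generate_app`, the `get_supported_tasks()` call on the engine
# client, and the assumption that `app.state.openai_serving_models` is
# populated elsewhere before this runs are illustrative assumptions, not the
# canonical vLLM server wiring.
async def build_generate_app(
    engine_client: "EngineClient",
    args: "Namespace",
    request_logger: RequestLogger | None,
) -> FastAPI:
    app = FastAPI()
    # Attach the OpenAI chat/completions/responses and Anthropic messages routes.
    register_generate_api_routers(app)
    # Assumed: the engine client reports the tasks it can serve; only the
    # "generate" task causes the serving handlers to be constructed.
    supported_tasks = await engine_client.get_supported_tasks()
    # Assumed: app.state.openai_serving_models has already been initialized
    # before the per-task serving state is built here.
    await init_generate_state(
        engine_client, app.state, args, request_logger, supported_tasks
    )
    return app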