update
This commit is contained in:
0
vllm/entrypoints/serve/tokenize/__init__.py
Normal file
0
vllm/entrypoints/serve/tokenize/__init__.py
Normal file
114
vllm/entrypoints/serve/tokenize/api_router.py
Normal file
114
vllm/entrypoints/serve/tokenize/api_router.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.responses import JSONResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.serve.tokenize.protocol import (
|
||||
DetokenizeRequest,
|
||||
DetokenizeResponse,
|
||||
TokenizeRequest,
|
||||
TokenizeResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
||||
from vllm.entrypoints.utils import (
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# Module-level logger, created via vLLM's standard init_logger helper.
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def tokenization(request: Request) -> OpenAIServingTokenization:
    """Return the tokenization serving handler stored on the app state."""
    app_state = request.app.state
    return app_state.openai_serving_tokenization
|
||||
|
||||
|
||||
# Shared router for the tokenize/detokenize endpoints; mounted by attach_router().
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/tokenize",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def tokenize(request: TokenizeRequest, raw_request: Request):
    """Tokenize a prompt or chat conversation and return the token ids."""
    handler = tokenization(raw_request)

    try:
        result = await handler.create_tokenize(request, raw_request)
    except Exception as e:
        # Any failure is converted into an ErrorResponse by the handler.
        result = handler.create_error_response(e)

    if isinstance(result, TokenizeResponse):
        return JSONResponse(content=result.model_dump())
    if isinstance(result, ErrorResponse):
        return JSONResponse(
            content=result.model_dump(), status_code=result.error.code
        )

    assert_never(result)
|
||||
|
||||
|
||||
@router.post(
    "/detokenize",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def detokenize(request: DetokenizeRequest, raw_request: Request):
    """Convert a list of token ids back into the prompt string."""
    handler = tokenization(raw_request)

    try:
        result = await handler.create_detokenize(request, raw_request)
    except OverflowError as e:
        # Out-of-range token ids surface as a 422 validation error.
        raise RequestValidationError(errors=[str(e)]) from e
    except Exception as e:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
        ) from e

    if isinstance(result, DetokenizeResponse):
        return JSONResponse(content=result.model_dump())
    if isinstance(result, ErrorResponse):
        return JSONResponse(
            content=result.model_dump(), status_code=result.error.code
        )

    assert_never(result)
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
    """Attach the tokenize/detokenize router to *app*.

    The `/tokenizer_info` endpoint is registered only when the server args
    enable `enable_tokenizer_info_endpoint` (defaults to disabled when the
    attribute is absent).
    """
    # Fix: the original placed a bare string literal ("""Conditionally
    # register...""") inside the `if` body, where it is a no-op expression,
    # not a docstring. That documentation now lives in the docstrings above
    # and below.
    if getattr(app.state.args, "enable_tokenizer_info_endpoint", False):

        @router.get("/tokenizer_info")
        async def get_tokenizer_info(raw_request: Request):
            """Get comprehensive tokenizer information."""
            result = await tokenization(raw_request).get_tokenizer_info()
            # Error results carry their own HTTP status; success is 200.
            status_code = (
                result.error.code if isinstance(result, ErrorResponse) else 200
            )
            return JSONResponse(
                content=result.model_dump(), status_code=status_code
            )

    app.include_router(router)
|
||||
183
vllm/entrypoints/serve/tokenize/protocol.py
Normal file
183
vllm/entrypoints/serve/tokenize/protocol.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from typing import Annotated, Any, TypeAlias
|
||||
|
||||
from pydantic import ConfigDict, Field, model_validator
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionMessageParam,
|
||||
ChatTemplateContentFormatOption,
|
||||
)
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionToolsParam,
|
||||
)
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
OpenAIBaseModel,
|
||||
)
|
||||
from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs
|
||||
|
||||
|
||||
class TokenizeCompletionRequest(OpenAIBaseModel):
    """Request body for `/tokenize` carrying a plain text prompt."""

    model: str | None = None
    prompt: str

    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    return_token_strs: bool | None = Field(
        default=False,
        description=(
            "If true, also return the token strings corresponding to the token ids."
        ),
    )

    def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
        """Build the TokenizeParams for this request.

        `model_config` is unused here — presumably kept so all request types
        share the same signature; TODO confirm against callers.
        """
        return TokenizeParams(
            max_total_tokens=None,
            max_output_tokens=0,
            add_special_tokens=self.add_special_tokens,
        )
|
||||
|
||||
|
||||
class TokenizeChatRequest(OpenAIBaseModel):
    """Request body for `/tokenize` carrying a chat conversation.

    The messages are rendered through the chat template before tokenization;
    `continue_final_message` and `add_generation_prompt` are mutually
    exclusive (enforced by `check_generation_prompt`).
    """

    model: str | None = None
    messages: list[ChatCompletionMessageParam]

    add_generation_prompt: bool = Field(
        default=True,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )
    return_token_strs: bool | None = Field(
        default=False,
        description=(
            "If true, also return the token strings corresponding to the token ids."
        ),
    )
    continue_final_message: bool = Field(
        default=False,
        description=(
            "If this is set, the chat will be formatted so that the final "
            "message in the chat is open-ended, without any EOS tokens. The "
            "model will continue this message rather than starting a new one. "
            'This allows you to "prefill" part of the model\'s response for it. '
            "Cannot be used at the same time as `add_generation_prompt`."
        ),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description="Additional kwargs to pass to the HF processor.",
    )
    tools: list[ChatCompletionToolsParam] | None = Field(
        default=None,
        description="A list of tools the model may call.",
    )

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        """Reject requests that enable both mutually exclusive flags."""
        if data.get("continue_final_message") and data.get("add_generation_prompt"):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data

    def build_chat_params(
        self,
        default_template: str | None,
        default_template_content_format: ChatTemplateContentFormatOption,
    ) -> ChatParams:
        """Build ChatParams, preferring the request's template over the default.

        The two formatting flags are merged into `chat_template_kwargs` so the
        template renderer sees them alongside any user-supplied kwargs.
        """
        return ChatParams(
            chat_template=self.chat_template or default_template,
            chat_template_content_format=default_template_content_format,
            chat_template_kwargs=merge_kwargs(
                self.chat_template_kwargs,
                dict(
                    add_generation_prompt=self.add_generation_prompt,
                    continue_final_message=self.continue_final_message,
                ),
            ),
        )

    def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
        """Build the TokenizeParams for this request.

        `model_config` is unused here — presumably kept so all request types
        share the same signature; TODO confirm against callers.
        """
        return TokenizeParams(
            max_total_tokens=None,
            max_output_tokens=0,
            add_special_tokens=self.add_special_tokens,
        )
|
||||
|
||||
|
||||
# Either request shape (plain prompt or chat messages) is accepted by /tokenize.
TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
|
||||
|
||||
|
||||
class TokenizeResponse(OpenAIBaseModel):
    """Response for `/tokenize`."""

    # Number of entries in `tokens`.
    count: int
    # The model's maximum context length, returned for client-side budgeting.
    max_model_len: int
    tokens: list[int]
    # Only populated when the request set `return_token_strs`.
    token_strs: list[str] | None = None
|
||||
|
||||
|
||||
class DetokenizeRequest(OpenAIBaseModel):
    """Request body for `/detokenize`: token ids to decode back to text."""

    model: str | None = None
    # TODO: Factor `torch.iinfo` out. `torch.iinfo` pulls torch into a
    # Pydantic protocol file that currently has no torch dependency.
    # See: https://github.com/vllm-project/vllm/pull/34468#discussion_r2801173630
    # Bounds accept any non-negative signed-64-bit token id.
    tokens: list[Annotated[int, Field(ge=0, le=2**63 - 1)]]

    def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
        """Build TokenizeParams that request detokenization of the ids.

        `model_config` is unused here — presumably kept so all request types
        share the same signature; TODO confirm against callers.
        """
        return TokenizeParams(
            max_total_tokens=None,
            max_output_tokens=0,
            needs_detokenization=True,
        )
|
||||
|
||||
|
||||
class DetokenizeResponse(OpenAIBaseModel):
    """Response for `/detokenize`: the decoded prompt text."""

    prompt: str
|
||||
|
||||
|
||||
class TokenizerInfoResponse(OpenAIBaseModel):
    """
    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """

    # extra="allow": arbitrary keys from the tokenizer config pass through as-is.
    model_config = ConfigDict(extra="allow")
    tokenizer_class: str
|
||||
195
vllm/entrypoints/serve/tokenize/serving.py
Normal file
195
vllm/entrypoints/serve/tokenize/serving.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Final
|
||||
|
||||
import jinja2
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.tokenize.protocol import (
|
||||
DetokenizeRequest,
|
||||
DetokenizeResponse,
|
||||
TokenizeChatRequest,
|
||||
TokenizeRequest,
|
||||
TokenizeResponse,
|
||||
TokenizerInfoResponse,
|
||||
)
|
||||
from vllm.inputs import TokensPrompt, token_inputs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class OpenAIServingTokenization(OpenAIServing):
    """Serving layer backing the /tokenize, /detokenize and /tokenizer_info
    endpoints."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
        trust_request_chat_template: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            log_error_stack=log_error_stack,
        )

        # Server-side defaults, used when a request does not supply its own
        # chat template / content format.
        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format
        # Whether request-supplied chat templates are trusted; consulted by
        # self._validate_chat_template() in create_tokenize.
        self.trust_request_chat_template = trust_request_chat_template

    async def create_tokenize(
        self,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> TokenizeResponse | ErrorResponse:
        """Tokenize a plain prompt or a chat conversation.

        Returns a TokenizeResponse with the collected token ids (and token
        strings if requested), or an ErrorResponse when model lookup,
        template validation, or preprocessing fails.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"tokenize-{self._base_request_id(raw_request)}"

        try:
            lora_request = self._maybe_get_adapters(request)

            if isinstance(request, TokenizeChatRequest):
                tool_dicts = (
                    None
                    if request.tools is None
                    else [tool.model_dump() for tool in request.tools]
                )
                # Reject request-supplied templates unless the server was
                # configured to trust them.
                error_check_ret = self._validate_chat_template(
                    request_chat_template=request.chat_template,
                    chat_template_kwargs=request.chat_template_kwargs,
                    trust_request_chat_template=self.trust_request_chat_template,
                )
                if error_check_ret is not None:
                    return error_check_ret

                _, engine_prompts = await self._preprocess_chat(
                    request,
                    request.messages,
                    default_template=self.chat_template,
                    default_template_content_format=self.chat_template_content_format,
                    default_template_kwargs=None,
                    tool_dicts=tool_dicts,
                )
            else:
                engine_prompts = await self._preprocess_completion(
                    request,
                    prompt_input=request.prompt,
                    prompt_embeds=None,
                )
        except (ValueError, TypeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            # NOTE(review): when the exception has no cause this renders a
            # trailing "None" in the message — confirm whether intended.
            return self.create_error_response(f"{e} {e.__cause__}")

        input_ids: list[int] = []
        for engine_prompt in engine_prompts:
            self._log_inputs(
                request_id,
                engine_prompt,
                params=None,
                lora_request=lora_request,
            )

            # Some engine prompts may not carry token ids; guard before
            # extending the result.
            if "prompt_token_ids" in engine_prompt:
                input_ids.extend(engine_prompt["prompt_token_ids"])  # type: ignore[typeddict-item]

        token_strs = None
        if request.return_token_strs:
            tokenizer = self.renderer.get_tokenizer()
            token_strs = tokenizer.convert_ids_to_tokens(input_ids)

        return TokenizeResponse(
            tokens=input_ids,
            token_strs=token_strs,
            count=len(input_ids),
            max_model_len=self.model_config.max_model_len,
        )

    async def create_detokenize(
        self,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> DetokenizeResponse | ErrorResponse:
        """Decode the request's token ids back into a prompt string.

        Exceptions raised here (e.g. OverflowError) are handled by the API
        router layer rather than converted to ErrorResponse.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"tokenize-{self._base_request_id(raw_request)}"

        lora_request = self._maybe_get_adapters(request)

        self._log_inputs(
            request_id,
            token_inputs(request.tokens),
            params=None,
            lora_request=lora_request,
        )

        # The renderer performs the detokenization; build_tok_params sets
        # needs_detokenization=True so the prompt text is produced.
        engine_prompt = await self.renderer.tokenize_prompt_async(
            TokensPrompt(prompt_token_ids=request.tokens),
            request.build_tok_params(self.model_config),
        )
        prompt_text = engine_prompt["prompt"]  # type: ignore[typeddict-item]

        return DetokenizeResponse(prompt=prompt_text)

    async def get_tokenizer_info(
        self,
    ) -> TokenizerInfoResponse | ErrorResponse:
        """Get comprehensive tokenizer information."""
        try:
            tokenizer = self.renderer.get_tokenizer()
            # TokenizerInfo flattens the tokenizer's init kwargs into a
            # JSON-serializable dict (see the dataclass below in this file).
            info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
            return TokenizerInfoResponse(**info)
        except Exception as e:
            return self.create_error_response(f"Failed to get tokenizer info: {str(e)}")
|
||||
|
||||
|
||||
@dataclass
class TokenizerInfo:
    """Snapshot of a tokenizer's configuration, rendered as a JSON-ready dict
    (roughly equivalent to tokenizer_config.json)."""

    tokenizer: TokenizerLike
    chat_template: str | None

    def to_dict(self) -> dict[str, Any]:
        """Return the tokenizer configuration."""
        return self._get_tokenizer_config()

    def _get_tokenizer_config(self) -> dict[str, Any]:
        """Get tokenizer configuration directly from the tokenizer object."""
        # HF tokenizers expose their constructor kwargs via `init_kwargs`;
        # fall back to an empty dict when the attribute is missing/None.
        config = dict(getattr(self.tokenizer, "init_kwargs", None) or {})

        # Remove file path fields
        config.pop("vocab_file", None)
        config.pop("merges_file", None)

        config = self._make_json_serializable(config)
        config["tokenizer_class"] = type(self.tokenizer).__name__
        if self.chat_template:
            config["chat_template"] = self.chat_template
        return config

    def _make_json_serializable(self, obj):
        """Convert any non-JSON-serializable objects to serializable format."""
        # Token-like objects (e.g. AddedToken) expose their text via `.content`.
        if hasattr(obj, "content"):
            return obj.content
        elif isinstance(obj, dict):
            return {k: self._make_json_serializable(v) for k, v in obj.items()}
        elif isinstance(obj, (list, tuple)):
            # Fix: also recurse into tuples — the original only handled lists,
            # so non-serializable objects nested inside a tuple escaped
            # conversion. Tuples are emitted as JSON arrays (lists).
            return [self._make_json_serializable(item) for item in obj]
        else:
            return obj
|
||||
Reference in New Issue
Block a user