init
This commit is contained in:
234
vllm/entrypoints/openai/serving_engine.py
Normal file
234
vllm/entrypoints/openai/serving_engine.py
Normal file
@@ -0,0 +1,234 @@
|
||||
import asyncio
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from pydantic import Field
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
CompletionRequest, ErrorResponse,
|
||||
LogProbs, ModelCard, ModelList,
|
||||
ModelPermission)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import Logprob
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoRAModulePath:
|
||||
name: str
|
||||
local_path: str
|
||||
|
||||
|
||||
class OpenAIServing:
|
||||
|
||||
def __init__(self,
|
||||
engine: AsyncLLMEngine,
|
||||
served_model_names: List[str],
|
||||
lora_modules: Optional[List[LoRAModulePath]],
|
||||
await_post_init: Optional[Awaitable[Any]] = None):
|
||||
self.engine = engine
|
||||
self.served_model_names = served_model_names
|
||||
if lora_modules is None:
|
||||
self.lora_requests = []
|
||||
else:
|
||||
self.lora_requests = [
|
||||
LoRARequest(
|
||||
lora_name=lora.name,
|
||||
lora_int_id=i,
|
||||
lora_local_path=lora.local_path,
|
||||
) for i, lora in enumerate(lora_modules, start=1)
|
||||
]
|
||||
|
||||
self.max_model_len = 0
|
||||
# Lazy initialized
|
||||
self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
|
||||
|
||||
try:
|
||||
event_loop = asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
event_loop = None
|
||||
|
||||
if event_loop is not None and event_loop.is_running():
|
||||
# If the current is instanced by Ray Serve,
|
||||
# there is already a running event loop
|
||||
event_loop.create_task(self._post_init(await_post_init))
|
||||
else:
|
||||
# When using single vLLM without engine_use_ray
|
||||
asyncio.run(self._post_init(await_post_init))
|
||||
|
||||
async def _post_init(self, await_post_init):
|
||||
engine_model_config = await self.engine.get_model_config()
|
||||
self.max_model_len = engine_model_config.max_model_len
|
||||
|
||||
# A separate tokenizer to map token IDs to strings.
|
||||
self.tokenizer = get_tokenizer(
|
||||
engine_model_config.tokenizer,
|
||||
tokenizer_mode=engine_model_config.tokenizer_mode,
|
||||
tokenizer_revision=engine_model_config.tokenizer_revision,
|
||||
trust_remote_code=engine_model_config.trust_remote_code,
|
||||
truncation_side="left")
|
||||
|
||||
if await_post_init is not None:
|
||||
await await_post_init
|
||||
|
||||
async def show_available_models(self) -> ModelList:
|
||||
"""Show available models. Right now we only have one model."""
|
||||
model_cards = [
|
||||
ModelCard(id=served_model_name,
|
||||
root=self.served_model_names[0],
|
||||
permission=[ModelPermission()])
|
||||
for served_model_name in self.served_model_names
|
||||
]
|
||||
lora_cards = [
|
||||
ModelCard(id=lora.lora_name,
|
||||
root=self.served_model_names[0],
|
||||
permission=[ModelPermission()])
|
||||
for lora in self.lora_requests
|
||||
]
|
||||
model_cards.extend(lora_cards)
|
||||
return ModelList(data=model_cards)
|
||||
|
||||
def _create_logprobs(
|
||||
self,
|
||||
token_ids: List[int],
|
||||
top_logprobs: List[Optional[Dict[int, Logprob]]],
|
||||
num_output_top_logprobs: Optional[int] = None,
|
||||
initial_text_offset: int = 0,
|
||||
) -> LogProbs:
|
||||
"""Create OpenAI-style logprobs."""
|
||||
logprobs = LogProbs()
|
||||
last_token_len = 0
|
||||
if num_output_top_logprobs:
|
||||
logprobs.top_logprobs = []
|
||||
|
||||
for i, token_id in enumerate(token_ids):
|
||||
step_top_logprobs = top_logprobs[i]
|
||||
if step_top_logprobs is None:
|
||||
token = self.tokenizer.decode(token_id)
|
||||
logprobs.tokens.append(token)
|
||||
logprobs.token_logprobs.append(None)
|
||||
assert logprobs.top_logprobs is not None
|
||||
logprobs.top_logprobs.append(None)
|
||||
else:
|
||||
token_logprob = step_top_logprobs[token_id].logprob
|
||||
token = step_top_logprobs[token_id].decoded_token
|
||||
logprobs.tokens.append(token)
|
||||
logprobs.token_logprobs.append(token_logprob)
|
||||
|
||||
if num_output_top_logprobs:
|
||||
assert logprobs.top_logprobs is not None
|
||||
logprobs.top_logprobs.append({
|
||||
# Convert float("-inf") to the
|
||||
# JSON-serializable float that OpenAI uses
|
||||
p.decoded_token: max(p.logprob, -9999.0)
|
||||
for i, p in step_top_logprobs.items()
|
||||
} if step_top_logprobs else None)
|
||||
|
||||
if len(logprobs.text_offset) == 0:
|
||||
logprobs.text_offset.append(initial_text_offset)
|
||||
else:
|
||||
logprobs.text_offset.append(logprobs.text_offset[-1] +
|
||||
last_token_len)
|
||||
last_token_len = len(token)
|
||||
return logprobs
|
||||
|
||||
def create_error_response(
|
||||
self,
|
||||
message: str,
|
||||
err_type: str = "BadRequestError",
|
||||
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
|
||||
return ErrorResponse(message=message,
|
||||
type=err_type,
|
||||
code=status_code.value)
|
||||
|
||||
def create_streaming_error_response(
|
||||
self,
|
||||
message: str,
|
||||
err_type: str = "BadRequestError",
|
||||
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str:
|
||||
json_str = json.dumps({
|
||||
"error":
|
||||
self.create_error_response(message=message,
|
||||
err_type=err_type,
|
||||
status_code=status_code).model_dump()
|
||||
})
|
||||
return json_str
|
||||
|
||||
async def _check_model(
|
||||
self, request: Union[CompletionRequest, ChatCompletionRequest]
|
||||
) -> Optional[ErrorResponse]:
|
||||
if request.model in self.served_model_names:
|
||||
return None
|
||||
if request.model in [lora.lora_name for lora in self.lora_requests]:
|
||||
return None
|
||||
return self.create_error_response(
|
||||
message=f"The model `{request.model}` does not exist.",
|
||||
err_type="NotFoundError",
|
||||
status_code=HTTPStatus.NOT_FOUND)
|
||||
|
||||
def _maybe_get_lora(
|
||||
self, request: Union[CompletionRequest, ChatCompletionRequest]
|
||||
) -> Optional[LoRARequest]:
|
||||
if request.model in self.served_model_names:
|
||||
return None
|
||||
for lora in self.lora_requests:
|
||||
if request.model == lora.lora_name:
|
||||
return lora
|
||||
# if _check_model has been called earlier, this will be unreachable
|
||||
raise ValueError(f"The model `{request.model}` does not exist.")
|
||||
|
||||
def _validate_prompt_and_tokenize(
|
||||
self,
|
||||
request: Union[ChatCompletionRequest, CompletionRequest],
|
||||
prompt: Optional[str] = None,
|
||||
prompt_ids: Optional[List[int]] = None,
|
||||
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
|
||||
) -> Tuple[List[int], str]:
|
||||
if not (prompt or prompt_ids):
|
||||
raise ValueError("Either prompt or prompt_ids should be provided.")
|
||||
if (prompt and prompt_ids):
|
||||
raise ValueError(
|
||||
"Only one of prompt or prompt_ids should be provided.")
|
||||
|
||||
if prompt_ids is None:
|
||||
tokenizer_kwargs = {} if truncate_prompt_tokens is None else {
|
||||
"truncation": True,
|
||||
"max_length": truncate_prompt_tokens,
|
||||
}
|
||||
input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
|
||||
elif truncate_prompt_tokens is not None:
|
||||
input_ids = prompt_ids[-truncate_prompt_tokens:]
|
||||
else:
|
||||
input_ids = prompt_ids
|
||||
|
||||
input_text = prompt if prompt is not None else self.tokenizer.decode(
|
||||
prompt_ids)
|
||||
token_num = len(input_ids)
|
||||
|
||||
if request.max_tokens is None:
|
||||
if token_num >= self.max_model_len:
|
||||
raise ValueError(
|
||||
f"This model's maximum context length is "
|
||||
f"{self.max_model_len} tokens. However, you requested "
|
||||
f"{token_num} tokens in the messages, "
|
||||
f"Please reduce the length of the messages.", )
|
||||
request.max_tokens = self.max_model_len - token_num
|
||||
|
||||
if token_num + request.max_tokens > self.max_model_len:
|
||||
raise ValueError(
|
||||
f"This model's maximum context length is "
|
||||
f"{self.max_model_len} tokens. However, you requested "
|
||||
f"{request.max_tokens + token_num} tokens "
|
||||
f"({token_num} in the messages, "
|
||||
f"{request.max_tokens} in the completion). "
|
||||
f"Please reduce the length of the messages or completion.", )
|
||||
else:
|
||||
return input_ids, input_text
|
||||
Reference in New Issue
Block a user