update

0	vllm/entrypoints/pooling/score/__init__.py	Normal file

147	vllm/entrypoints/pooling/score/api_router.py	Normal file
@@ -0,0 +1,147 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus

from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse
from typing_extensions import assert_never

from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.score.protocol import (
    RerankRequest,
    RerankResponse,
    ScoreRequest,
    ScoreResponse,
)
from vllm.entrypoints.pooling.score.serving import ServingScores
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.logger import init_logger

router = APIRouter()

logger = init_logger(__name__)


def score(request: Request) -> ServingScores | None:
    return request.app.state.openai_serving_scores


def rerank(request: Request) -> ServingScores | None:
    return request.app.state.openai_serving_scores


@router.post(
    "/score",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_score(request: ScoreRequest, raw_request: Request):
    handler = score(raw_request)
    if handler is None:
        base_server = raw_request.app.state.openai_serving_tokenization
        return base_server.create_error_response(
            message="The model does not support Score API"
        )

    try:
        generator = await handler.create_score(request, raw_request)
    except Exception as e:
        generator = handler.create_error_response(e)

    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, ScoreResponse):
        return JSONResponse(content=generator.model_dump())

    assert_never(generator)


@router.post(
    "/v1/score",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_score_v1(request: ScoreRequest, raw_request: Request):
    logger.warning(
        "To indicate that Score API is not part of standard OpenAI API, we "
        "have moved it to `/score`. Please update your client accordingly."
    )

    return await create_score(request, raw_request)


@router.post(
    "/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def do_rerank(request: RerankRequest, raw_request: Request):
    handler = rerank(raw_request)
    if handler is None:
        base_server = raw_request.app.state.openai_serving_tokenization
        return base_server.create_error_response(
            message="The model does not support Rerank (Score) API"
        )
    try:
        generator = await handler.do_rerank(request, raw_request)
    except Exception as e:
        generator = handler.create_error_response(e)

    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, RerankResponse):
        return JSONResponse(content=generator.model_dump())

    assert_never(generator)


@router.post(
    "/v1/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def do_rerank_v1(request: RerankRequest, raw_request: Request):
    logger.warning_once(
        "To indicate that the rerank API is not part of the standard OpenAI"
        " API, we have located it at `/rerank`. Please update your client "
        "accordingly. (Note: Conforms to JinaAI rerank API)"
    )

    return await do_rerank(request, raw_request)


@router.post(
    "/v2/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def do_rerank_v2(request: RerankRequest, raw_request: Request):
    return await do_rerank(request, raw_request)
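
For orientation, a client call against the routes registered above might look like the following sketch. The host, port, and model name are placeholders, and the `model` field is assumed to be inherited from the pooling request mixin; the other field names follow the request models in protocol.py below.

    # Hypothetical client sketch, not part of this commit.
    import requests

    base = "http://localhost:8000"  # assumed local vLLM server

    # /score: score one query text against two candidate texts.
    score = requests.post(
        f"{base}/score",
        json={
            "model": "BAAI/bge-reranker-base",  # placeholder model name
            "text_1": "What is vLLM?",
            "text_2": ["vLLM is an inference engine.", "Bananas are yellow."],
        },
    ).json()

    # /rerank: JinaAI-style reranking of documents against a query.
    rerank = requests.post(
        f"{base}/rerank",
        json={
            "model": "BAAI/bge-reranker-base",
            "query": "What is vLLM?",
            "documents": ["vLLM is an inference engine.", "Bananas are yellow."],
            "top_n": 1,
        },
    ).json()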
156	vllm/entrypoints/pooling/score/protocol.py	Normal file
@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import TypeAlias

from pydantic import BaseModel, Field

from vllm import PoolingParams
from vllm.config import ModelConfig
from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
from vllm.entrypoints.pooling.base.protocol import (
    ClassifyRequestMixin,
    PoolingBasicRequestMixin,
)
from vllm.entrypoints.pooling.score.utils import (
    ScoreContentPartParam,
    ScoreInput,
    ScoreInputs,
)
from vllm.renderers import TokenizeParams
from vllm.tasks import PoolingTask
from vllm.utils import random_uuid


class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
    def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            max_output_tokens=0,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            do_lower_case=encoder_config.get("do_lower_case", False),
            max_total_tokens_param="max_model_len",
        )

    def to_pooling_params(self, task: PoolingTask = "score"):
        return PoolingParams(
            task=task,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
        )


class ScoreDataRequest(ScoreRequestMixin):
    data_1: ScoreInputs
    data_2: ScoreInputs


class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
    queries: ScoreInputs
    documents: ScoreInputs

    @property
    def data_1(self):
        return self.queries

    @property
    def data_2(self):
        return self.documents


class ScoreQueriesItemsRequest(ScoreRequestMixin):
    queries: ScoreInputs
    items: ScoreInputs

    @property
    def data_1(self):
        return self.queries

    @property
    def data_2(self):
        return self.items


class ScoreTextRequest(ScoreRequestMixin):
    text_1: ScoreInputs
    text_2: ScoreInputs

    @property
    def data_1(self):
        return self.text_1

    @property
    def data_2(self):
        return self.text_2


ScoreRequest: TypeAlias = (
    ScoreQueriesDocumentsRequest
    | ScoreQueriesItemsRequest
    | ScoreDataRequest
    | ScoreTextRequest
)


class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
    query: ScoreInput
    documents: ScoreInputs
    top_n: int = Field(default_factory=lambda: 0)

    def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            max_output_tokens=0,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            do_lower_case=encoder_config.get("do_lower_case", False),
            max_total_tokens_param="max_model_len",
        )

    def to_pooling_params(self, task: PoolingTask = "score"):
        return PoolingParams(
            task=task,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
        )


class RerankDocument(BaseModel):
    text: str | None = None
    multi_modal: list[ScoreContentPartParam] | None = None


class RerankResult(BaseModel):
    index: int
    document: RerankDocument
    relevance_score: float


class RerankUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class RerankResponse(OpenAIBaseModel):
    id: str
    model: str
    usage: RerankUsage
    results: list[RerankResult]


class ScoreResponseData(OpenAIBaseModel):
    index: int
    object: str = "score"
    score: float


class ScoreResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ScoreResponseData]
    usage: UsageInfo
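
The four ScoreRequest variants above are interchangeable at the serving layer: each exposes its payload through data_1/data_2, either as plain fields (ScoreDataRequest) or as properties (the queries/documents, queries/items, and text_1/text_2 forms). A minimal sketch of that aliasing, assuming the mixin's remaining fields all have defaults:

    # Illustration only; exercises the data_1/data_2 aliasing, not the HTTP layer.
    from vllm.entrypoints.pooling.score.protocol import (
        ScoreQueriesDocumentsRequest,
        ScoreTextRequest,
    )

    req_a = ScoreTextRequest(text_1="What is vLLM?", text_2=["doc A", "doc B"])
    req_b = ScoreQueriesDocumentsRequest(
        queries="What is vLLM?", documents=["doc A", "doc B"]
    )

    # Both variants feed the same (data_1, data_2) pair into ServingScores._run_scoring.
    assert req_a.data_1 == req_b.data_1 == "What is vLLM?"
    assert req_a.data_2 == req_b.data_2 == ["doc A", "doc B"]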
651	vllm/entrypoints/pooling/score/serving.py	Normal file
@@ -0,0 +1,651 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
from collections.abc import AsyncGenerator, Mapping
from concurrent.futures import ThreadPoolExecutor
from typing import Any

from fastapi import Request

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
    UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.score.protocol import (
    RerankDocument,
    RerankRequest,
    RerankResponse,
    RerankResult,
    RerankUsage,
    ScoreRequest,
    ScoreResponse,
    ScoreResponseData,
)
from vllm.entrypoints.pooling.score.utils import (
    ScoreData,
    ScoreInputs,
    _cosine_similarity,
    compress_token_type_ids,
    compute_maxsim_score,
    get_score_prompt,
    parse_score_data_single,
    validate_score_input,
)
from vllm.inputs.data import ProcessorInputs, TokensPrompt, token_inputs
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import make_async, merge_async_iterators
from vllm.utils.mistral import is_mistral_tokenizer

logger = init_logger(__name__)


class ServingScores(OpenAIServing):
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        score_template: str | None = None,
        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            log_error_stack=log_error_stack,
        )
        self.score_template = score_template

        self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)

        self.is_cross_encoder = self.model_config.is_cross_encoder
        self.is_multimodal_model = self.model_config.is_multimodal_model
        self.architecture = self.model_config.architecture
        self.is_late_interaction = self.model_config.is_late_interaction

        if self.is_cross_encoder:
            self._score_func = self._cross_encoding_score
        elif self.is_late_interaction:
            self._score_func = self._late_interaction_score
        else:
            self._score_func = self._embedding_score

    async def _embedding_score(
        self,
        data_1: list[ScoreData],
        data_2: list[ScoreData],
        request: RerankRequest | ScoreRequest,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        input_texts: list[str] = []
        for text in data_1 + data_2:
            if not isinstance(text, str):
                raise NotImplementedError(
                    "Embedding scores currently do not support multimodal input."
                )
            input_texts.append(text)

        model_config = self.model_config
        tokenizer = self.renderer.get_tokenizer()

        encode_async = make_async(
            tokenizer.encode,
            executor=self._tokenizer_executor,
        )

        tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs()
        tokenized_prompts = await asyncio.gather(
            *(encode_async(t, **tokenization_kwargs) for t in input_texts)
        )

        engine_prompts: list[ProcessorInputs] = []
        for tok_result, input_text in zip(tokenized_prompts, input_texts):
            text_token_prompt = self._validate_input(request, tok_result, input_text)

            engine_prompts.append(
                token_inputs(
                    text_token_prompt["prompt_token_ids"],
                    prompt=input_text,
                )
            )

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
        pooling_params = request.to_pooling_params("embed")

        for i, engine_prompt in enumerate(engine_prompts):
            request_id_item = f"{request_id}-{i}"

            self._log_inputs(
                request_id_item,
                engine_prompt,
                params=pooling_params,
                lora_request=lora_request,
            )

            generators.append(
                self.engine_client.encode(
                    engine_prompt,
                    pooling_params,
                    request_id_item,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=request.priority,
                )
            )

        result_generator = merge_async_iterators(*generators)

        # Non-streaming response
        final_res_batch: list[PoolingRequestOutput] = []

        embeddings: list[PoolingRequestOutput | None] = [None] * len(engine_prompts)

        async for i, res in result_generator:
            embeddings[i] = res

        emb_data_1: list[PoolingRequestOutput] = []
        emb_data_2: list[PoolingRequestOutput] = []

        for i in range(0, len(data_1)):
            assert (emb := embeddings[i]) is not None
            emb_data_1.append(emb)

        for i in range(len(data_1), len(embeddings)):
            assert (emb := embeddings[i]) is not None
            emb_data_2.append(emb)

        if len(emb_data_1) == 1:
            emb_data_1 = emb_data_1 * len(emb_data_2)

        final_res_batch = _cosine_similarity(
            tokenizer=tokenizer, embed_1=emb_data_1, embed_2=emb_data_2
        )

        return final_res_batch

    def _preprocess_late_interaction_item(
        self,
        data: ScoreData,
        role: str,
        request: RerankRequest | ScoreRequest,
        tokenizer: TokenizerLike,
        tokenization_kwargs: dict[str, Any],
    ) -> tuple[str, TokensPrompt]:
        """Parse a single ScoreData into a text + optional multimodal
        TokensPrompt for late-interaction encoding.

        For plain strings, tokenises directly.
        For multimodal content parts, extracts text and multi_modal_data.
        """
        model_config = self.model_config

        if isinstance(data, str):
            text, mm_data, mm_uuids = data, None, None
        else:
            text, mm_data, mm_uuids = parse_score_data_single(data, role, model_config)

        prompt_inputs = tokenizer(text, **tokenization_kwargs)
        self._validate_input(request, prompt_inputs["input_ids"], text)

        engine_prompt = TokensPrompt(
            prompt_token_ids=prompt_inputs["input_ids"],
        )

        if mm_data is not None:
            engine_prompt["multi_modal_data"] = mm_data
        if mm_uuids is not None:
            engine_prompt["multi_modal_uuids"] = mm_uuids
        if request.mm_processor_kwargs is not None:
            engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs

        return text, engine_prompt

    async def _late_interaction_score(
        self,
        data_1: list[ScoreData],
        data_2: list[ScoreData],
        request: RerankRequest | ScoreRequest,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        """
        Late interaction scoring (ColBERT MaxSim).

        Encodes queries and documents into per-token embeddings, then computes
        MaxSim: sum over query tokens of max similarity to any document token.
        """
        model_config = self.model_config
        tokenizer = self.renderer.get_tokenizer()
        tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs()

        all_data = data_1 + data_2
        roles = ["query"] * len(data_1) + ["document"] * len(data_2)

        preprocess_async = make_async(
            self._preprocess_late_interaction_item,
            executor=self._tokenizer_executor,
        )

        preprocessed = await asyncio.gather(
            *(
                preprocess_async(
                    data=d,
                    role=r,
                    request=request,
                    tokenizer=tokenizer,
                    tokenization_kwargs=tokenization_kwargs,
                )
                for d, r in zip(all_data, roles)
            )
        )

        input_texts: list[str] = []
        engine_prompts: list[TokensPrompt] = []
        for text, engine_prompt in preprocessed:
            input_texts.append(text)
            engine_prompts.append(engine_prompt)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

        pooling_params = request.to_pooling_params("token_embed")

        for i, engine_prompt in enumerate(engine_prompts):
            request_id_item = f"{request_id}-{i}"

            self._log_inputs(
                request_id_item,
                engine_prompt,
                params=pooling_params,
                lora_request=lora_request,
            )

            generators.append(
                self.engine_client.encode(
                    engine_prompt,
                    pooling_params,
                    request_id_item,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=request.priority,
                )
            )

        result_generator = merge_async_iterators(*generators)

        # Collect token embeddings
        embeddings: list[PoolingRequestOutput | None] = [None] * len(engine_prompts)

        async for i, res in result_generator:
            embeddings[i] = res

        # Split into query and document embeddings
        emb_data_1: list[PoolingRequestOutput] = []
        emb_data_2: list[PoolingRequestOutput] = []

        for i in range(0, len(data_1)):
            assert (emb := embeddings[i]) is not None
            emb_data_1.append(emb)

        for i in range(len(data_1), len(embeddings)):
            assert (emb := embeddings[i]) is not None
            emb_data_2.append(emb)

        # Expand queries if 1:N scoring
        if len(emb_data_1) == 1:
            emb_data_1 = emb_data_1 * len(emb_data_2)

        # Compute MaxSim scores
        from vllm.outputs import PoolingOutput

        scores: list[PoolingRequestOutput] = []
        padding: list[int] = []
        if (pad_token_id := tokenizer.pad_token_id) is not None:
            padding = [pad_token_id]

        for emb_1, emb_2 in zip(emb_data_1, emb_data_2):
            # emb_1.outputs.data: [query_len, dim]
            # emb_2.outputs.data: [doc_len, dim]
            q_emb = emb_1.outputs.data
            d_emb = emb_2.outputs.data

            maxsim_score = compute_maxsim_score(q_emb, d_emb)

            tokens = emb_1.prompt_token_ids + padding + emb_2.prompt_token_ids

            scores.append(
                PoolingRequestOutput(
                    request_id=f"{emb_1.request_id}_{emb_2.request_id}",
                    outputs=PoolingOutput(data=maxsim_score),
                    prompt_token_ids=tokens,
                    num_cached_tokens=emb_1.num_cached_tokens + emb_2.num_cached_tokens,
                    finished=True,
                )
            )

        return scores

    async def _cross_encoding_score(
        self,
        data_1: list[ScoreData],
        data_2: list[ScoreData],
        request: RerankRequest | ScoreRequest,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        tokenizer = self.renderer.get_tokenizer()
        if is_mistral_tokenizer(tokenizer):
            raise ValueError("MistralTokenizer not supported for cross-encoding")

        model_config = self.model_config

        if len(data_1) == 1:
            data_1 = data_1 * len(data_2)

        tok_kwargs = request.build_tok_params(model_config).get_encode_kwargs()
        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
        preprocess_async = make_async(
            self._preprocess_score,
            executor=self._tokenizer_executor,
        )
        preprocessed_prompts = await asyncio.gather(
            *(
                preprocess_async(
                    request=request,
                    tokenizer=tokenizer,
                    tokenization_kwargs=tok_kwargs,
                    data_1=t1,
                    data_2=t2,
                )
                for t1, t2 in input_pairs
            )
        )

        request_prompts: list[str] = []
        engine_prompts: list[TokensPrompt] = []
        for full_prompt, engine_prompt in preprocessed_prompts:
            request_prompts.append(full_prompt)
            engine_prompts.append(engine_prompt)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

        default_pooling_params = request.to_pooling_params("score")

        for i, engine_prompt in enumerate(engine_prompts):
            request_id_item = f"{request_id}-{i}"

            self._log_inputs(
                request_id_item,
                request_prompts[i],
                params=default_pooling_params,
                lora_request=lora_request,
            )

            if token_type_ids := engine_prompt.pop("token_type_ids", None):
                pooling_params = default_pooling_params.clone()
                compressed = compress_token_type_ids(token_type_ids)
                pooling_params.extra_kwargs = {"compressed_token_type_ids": compressed}
            else:
                pooling_params = default_pooling_params

            generator = self.engine_client.encode(
                engine_prompt,
                pooling_params,
                request_id_item,
                lora_request=lora_request,
                trace_headers=trace_headers,
                priority=request.priority,
            )

            generators.append(generator)

        result_generator = merge_async_iterators(*generators)

        # Non-streaming response
        final_res_batch: list[PoolingRequestOutput | None] = [None] * len(
            engine_prompts
        )

        async for i, res in result_generator:
            final_res_batch[i] = res

        return [out for out in final_res_batch if out is not None]

    def _preprocess_score(
        self,
        request: RerankRequest | ScoreRequest,
        tokenizer: TokenizerLike,
        tokenization_kwargs: dict[str, Any],
        data_1: ScoreData,
        data_2: ScoreData,
    ) -> tuple[str, TokensPrompt]:
        model_config = self.model_config
        full_prompt, engine_prompt = get_score_prompt(
            model_config=model_config,
            data_1=data_1,
            data_2=data_2,
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
            score_template=self.score_template,
        )
        self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
        if request.mm_processor_kwargs is not None:
            engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs

        return full_prompt, engine_prompt

    async def _run_scoring(
        self,
        data_1: ScoreInputs,
        data_2: ScoreInputs,
        request: ScoreRequest | RerankRequest,
        request_id: str,
        raw_request: Request | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        lora_request = self._maybe_get_adapters(request)

        trace_headers = (
            None
            if raw_request is None
            else await self._get_trace_headers(raw_request.headers)
        )

        score_data_1, score_data_2 = validate_score_input(
            data_1,
            data_2,
            is_multimodal_model=self.is_multimodal_model,
            architecture=self.architecture,
        )

        return await self._score_func(
            data_1=score_data_1,
            data_2=score_data_2,
            request=request,
            request_id=request_id,
            lora_request=lora_request,
            trace_headers=trace_headers,
        )

    async def create_score(
        self,
        request: ScoreRequest,
        raw_request: Request | None = None,
    ) -> ScoreResponse | ErrorResponse:
        """
        Score API similar to Sentence Transformers cross encoder.

        See https://sbert.net/docs/package_reference/cross_encoder
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"score-{self._base_request_id(raw_request)}"
        created_time = int(time.time())

        try:
            final_res_batch = await self._run_scoring(
                request.data_1,
                request.data_2,
                request,
                request_id,
                raw_request,
            )
            if isinstance(final_res_batch, ErrorResponse):
                return final_res_batch

            return self.request_output_to_score_response(
                final_res_batch,
                request_id,
                created_time,
                self.models.model_name(),
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            return self.create_error_response(e)

    async def do_rerank(
        self, request: RerankRequest, raw_request: Request | None = None
    ) -> RerankResponse | ErrorResponse:
        """
        Rerank API based on JinaAI's rerank API; implements the same
        API interface. Designed for compatibility with off-the-shelf
        tooling, since this is a common standard for reranking APIs.

        See example client implementations at
        https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py;
        numerous clients use this standard.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"rerank-{self._base_request_id(raw_request)}"
        documents = request.documents

        try:
            final_res_batch = await self._run_scoring(
                request.query,
                documents,
                request,
                request_id,
                raw_request,
            )
            if isinstance(final_res_batch, ErrorResponse):
                return final_res_batch

            top_n = request.top_n if request.top_n > 0 else len(final_res_batch)

            return self.request_output_to_rerank_response(
                final_res_batch,
                request_id,
                self.models.model_name(),
                documents,
                top_n,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            return self.create_error_response(e)

    def request_output_to_score_response(
        self,
        final_res_batch: list[PoolingRequestOutput],
        request_id: str,
        created_time: int,
        model_name: str,
    ) -> ScoreResponse:
        items: list[ScoreResponseData] = []
        num_prompt_tokens = 0

        for idx, final_res in enumerate(final_res_batch):
            classify_res = ScoringRequestOutput.from_base(final_res)

            item = ScoreResponseData(
                index=idx,
                score=classify_res.outputs.score,
            )
            prompt_token_ids = final_res.prompt_token_ids

            items.append(item)
            num_prompt_tokens += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            total_tokens=num_prompt_tokens,
        )

        return ScoreResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            data=items,
            usage=usage,
        )

    def request_output_to_rerank_response(
        self,
        final_res_batch: list[PoolingRequestOutput],
        request_id: str,
        model_name: str,
        documents: ScoreInputs,
        top_n: int,
    ) -> RerankResponse:
        """
        Convert the output of do_rerank to a RerankResponse
        """

        if not isinstance(documents, list):
            documents = [documents]

        results: list[RerankResult] = []
        num_prompt_tokens = 0
        for idx, final_res in enumerate(final_res_batch):
            classify_res = ScoringRequestOutput.from_base(final_res)

            document = documents[idx]
            if isinstance(document, str):
                rerank_document = RerankDocument(text=document)
            else:
                rerank_document = RerankDocument(
                    multi_modal=document.get("content", [])
                )

            result = RerankResult(
                index=idx,
                document=rerank_document,
                relevance_score=classify_res.outputs.score,
            )
            results.append(result)
            prompt_token_ids = final_res.prompt_token_ids
            num_prompt_tokens += len(prompt_token_ids)

        # sort by relevance, then return the top n if set
        results.sort(key=lambda x: x.relevance_score, reverse=True)
        if top_n < len(documents):
            results = results[:top_n]

        return RerankResponse(
            id=request_id,
            model=model_name,
            results=results,
            usage=RerankUsage(
                total_tokens=num_prompt_tokens, prompt_tokens=num_prompt_tokens
            ),
        )
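
All three scoring paths above (_embedding_score, _late_interaction_score, _cross_encoding_score) share the same pairing rule: inputs must line up 1:1, 1:N, or N:N, and a single query is broadcast across every document before pairwise scores are computed. A standalone sketch of that rule (plain Python, not the vLLM API):

    # Sketch of the 1:1 / 1:N / N:N pairing used by the scoring paths above.
    def pair_inputs(data_1: list[str], data_2: list[str]) -> list[tuple[str, str]]:
        if len(data_1) == 1:
            data_1 = data_1 * len(data_2)  # broadcast the single query over all documents
        if len(data_1) != len(data_2):
            raise ValueError("Input lengths must be either 1:1, 1:N or N:N")
        return list(zip(data_1, data_2))

    # pair_inputs(["q"], ["d1", "d2"]) -> [("q", "d1"), ("q", "d2")]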
404	vllm/entrypoints/pooling/score/utils.py	Normal file
@@ -0,0 +1,404 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Any, TypeAlias, cast

import torch
from torch.nn import CosineSimilarity
from typing_extensions import Required, TypedDict

from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
    BaseMultiModalItemTracker,
    ChatCompletionContentPartImageEmbedsParam,
    ChatCompletionContentPartImageParam,
    ChatCompletionContentPartParam,
    ChatCompletionContentPartTextParam,
    ChatCompletionContentPartVideoParam,
    ChatTemplateResolutionError,
    ConversationMessage,
    MultiModalItemTracker,
    _parse_chat_message_content_parts,
)
from vllm.inputs import TokensPrompt
from vllm.inputs.data import PromptType, TextPrompt
from vllm.model_executor.models.interfaces import supports_score_template
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalUUIDDict
from vllm.outputs import PoolingRequestOutput
from vllm.renderers.hf import safe_apply_chat_template
from vllm.tokenizers import TokenizerLike

ScoreContentPartParam: TypeAlias = (
    ChatCompletionContentPartImageParam
    | ChatCompletionContentPartImageEmbedsParam
    | ChatCompletionContentPartTextParam
    | ChatCompletionContentPartVideoParam
)


def compute_maxsim_score(q_emb: torch.Tensor, d_emb: torch.Tensor) -> torch.Tensor:
    """
    Compute ColBERT MaxSim score.

    Args:
        q_emb: Query token embeddings [query_len, dim]
        d_emb: Document token embeddings [doc_len, dim]

    Returns:
        MaxSim score (sum over query tokens of max similarity to any doc token)
    """
    # [query_len, doc_len]
    token_scores = torch.matmul(q_emb, d_emb.T)
    # Max over document tokens, sum over query tokens
    return token_scores.amax(dim=-1).sum()


class ScoreMultiModalParam(TypedDict, total=False):
    """
    A specialized parameter type for scoring multimodal content

    The reasons why we don't reuse `CustomChatCompletionMessageParam` directly:
    1. Score tasks don't need the 'role' field (user/assistant/system) that's required in chat completions
    2. Including chat-specific fields would confuse users about their purpose in scoring
    3. This is a more focused interface that only exposes what's needed for scoring
    """  # noqa: E501

    content: Required[list[ScoreContentPartParam]]
    """The multimodal contents"""


# Raw input data with content key in ScoreMultiModalParam.
ScoreInput = str | ScoreMultiModalParam
ScoreInputs = ScoreInput | list[ScoreInput]
# Score data without content key.
ScoreData = str | list[ScoreContentPartParam]


def _cosine_similarity(
    tokenizer: TokenizerLike,
    embed_1: list[PoolingRequestOutput],
    embed_2: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]:
    scorer = CosineSimilarity(0)
    scores: list[PoolingRequestOutput] = []

    for emb_1, emb_2 in zip(embed_1, embed_2):
        pair_score = scorer(emb_1.outputs.data, emb_2.outputs.data)

        padding: list[int] = []
        if (pad_token_id := tokenizer.pad_token_id) is not None:
            padding = [pad_token_id]

        tokens = emb_1.prompt_token_ids + padding + emb_2.prompt_token_ids

        scores.append(
            PoolingRequestOutput(
                request_id=f"{emb_1.request_id}_{emb_2.request_id}",
                outputs=pair_score,
                prompt_token_ids=tokens,
                num_cached_tokens=emb_1.num_cached_tokens + emb_2.num_cached_tokens,
                finished=True,
            )
        )

    return scores


def _validate_score_input_lens(
    data_1: list[ScoreData],
    data_2: list[ScoreData],
):
    len_1 = len(data_1)
    len_2 = len(data_2)

    if len_1 > 1 and len_1 != len_2:
        raise ValueError("Input lengths must be either 1:1, 1:N or N:N")
    if len_1 == 0:
        raise ValueError("At least one text element must be given")
    if len_2 == 0:
        raise ValueError("At least one text_pair element must be given")


def _validate_mm_score_input(
    data: list[ScoreInput],
    is_multimodal_model: bool,
    architecture: str,
) -> list[ScoreData]:
    out: list[ScoreData] = []
    for d in data:
        if isinstance(d, str):
            out.append(d)
        else:
            if not is_multimodal_model:
                raise ValueError(f"MultiModalParam is not supported for {architecture}")
            content = cast(list[ScoreContentPartParam], d.get("content", []))
            out.append(content)
    return out


def validate_score_input(
    data_1: ScoreInputs,
    data_2: ScoreInputs,
    is_multimodal_model: bool,
    architecture: str,
) -> tuple[list[ScoreData], list[ScoreData]]:
    if not isinstance(data_1, list):
        data_1 = [data_1]

    if not isinstance(data_2, list):
        data_2 = [data_2]

    score_input_1 = _validate_mm_score_input(data_1, is_multimodal_model, architecture)
    score_input_2 = _validate_mm_score_input(data_2, is_multimodal_model, architecture)
    _validate_score_input_lens(score_input_1, score_input_2)
    return score_input_1, score_input_2


def _ensure_str(content: list[ConversationMessage]) -> str:
    """Extract a single string prompt from parsed conversation content."""
    assert len(content) == 1
    prompt = content[0]["content"]
    if prompt is not None and isinstance(prompt, str):
        return cast(str, prompt)
    raise ValueError(f"Only string content is supported, but got {content}.")


def parse_score_data(
    data_1: ScoreData,
    data_2: ScoreData,
    model_config: ModelConfig,
) -> tuple[str, str, MultiModalDataDict | None, MultiModalUUIDDict | None]:
    """Parse a query-document pair into text prompts and shared multi-modal
    data.

    Uses a **single** :class:`MultiModalItemTracker` so that multi-modal
    items from both inputs are merged into one ``mm_data`` dict. This is
    the correct behaviour for cross-encoder scoring, where query and
    document are concatenated into a single model prompt.
    """
    mm_tracker = MultiModalItemTracker(model_config)

    content_1 = _parse_score_content("query", data_1, mm_tracker)
    content_2 = _parse_score_content("document", data_2, mm_tracker)

    prompt_1 = _ensure_str(content_1)
    prompt_2 = _ensure_str(content_2)
    mm_items, mm_uuids = mm_tracker.resolve_items()

    return prompt_1, prompt_2, mm_items, mm_uuids


def parse_score_data_single(
    data: ScoreData,
    role: str,
    model_config: ModelConfig,
) -> tuple[str, MultiModalDataDict | None, MultiModalUUIDDict | None]:
    """Parse **one** ScoreData into a text prompt and its own multi-modal
    data.

    Unlike :func:`parse_score_data`, each call creates an **independent**
    :class:`MultiModalItemTracker` so multi-modal items are kept separate.
    This is the correct behaviour for late-interaction scoring, where
    query and document are encoded independently.
    """
    mm_tracker = MultiModalItemTracker(model_config)
    content = _parse_score_content(role, data, mm_tracker)

    prompt = _ensure_str(content)
    mm_items, mm_uuids = mm_tracker.resolve_items()
    return prompt, mm_items, mm_uuids


def score_data_to_prompts(
    data_list: list[ScoreData],
    role: str,
    model_config: ModelConfig,
) -> list[PromptType]:
    """Convert a list of ScoreData into PromptType objects.

    For plain text inputs, returns the string directly.
    For multimodal inputs (list of content parts), parses them into
    a :class:`TextPrompt` with attached ``multi_modal_data`` /
    ``multi_modal_uuids``.

    This is used by late-interaction scoring where each query/document
    is encoded independently.
    """
    prompts: list[PromptType] = []
    for data in data_list:
        if isinstance(data, str):
            prompts.append(data)
        else:
            text, mm_data, mm_uuids = parse_score_data_single(data, role, model_config)
            prompt: TextPrompt = TextPrompt(prompt=text)
            if mm_data is not None:
                prompt["multi_modal_data"] = mm_data
            if mm_uuids is not None:
                prompt["multi_modal_uuids"] = mm_uuids
            prompts.append(prompt)
    return prompts


def _parse_score_content(
    role: str,
    data: ScoreData,
    mm_tracker: BaseMultiModalItemTracker,
) -> list[ConversationMessage]:
    parts: Iterable[ChatCompletionContentPartParam]
    if isinstance(data, str):
        parts = [ChatCompletionContentPartTextParam(type="text", text=data)]
    else:
        parts = cast(Iterable[ChatCompletionContentPartParam], data)

    mm_parser = mm_tracker.create_parser()

    parse_res = _parse_chat_message_content_parts(
        role=role,
        parts=parts,
        mm_tracker=mm_tracker,
        wrap_dicts=False,
        interleave_strings=False,
    )

    if parse_res:
        return parse_res

    mm_placeholder_storage = mm_parser.mm_placeholder_storage()

    if (
        len(mm_placeholder_storage) != 1
        or len(next(iter(mm_placeholder_storage.values()))) != 1
    ):
        raise ValueError("Only one multi-modal item is supported")

    return next(iter(mm_placeholder_storage.values()))[0]


def _apply_model_score_template(
    model_config: ModelConfig, prompt_1: str, prompt_2: str
) -> str:
    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
    from vllm.model_executor.model_loader import get_model_cls

    model = get_model_cls(model_config)
    if supports_score_template(model):
        full_prompt = model.get_score_template(prompt_1, prompt_2)
        if full_prompt is None:
            raise ValueError("Get empty score template from model")
        return full_prompt

    raise ValueError(f"Unsupported model architecture: {model_config.architecture}")


def post_process_tokens(
    model_config: ModelConfig,
    prompt: TokensPrompt,
) -> None:
    """
    Perform architecture-specific manipulations on the input tokens.

    Note:
        This is an in-place operation.
    """
    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
    from vllm.model_executor.model_loader import get_model_cls

    model = get_model_cls(model_config)
    if supports_score_template(model):
        model.post_process_tokens(prompt)


def get_score_prompt(
    model_config: ModelConfig,
    tokenizer: TokenizerLike,
    tokenization_kwargs: dict[str, Any],
    data_1: ScoreData,
    data_2: ScoreData,
    score_template: str | None = None,
) -> tuple[str, TokensPrompt]:
    prompt_1, prompt_2, mm_data, mm_uuids = parse_score_data(
        data_1,
        data_2,
        model_config,
    )
    from vllm.model_executor.model_loader import get_model_cls

    model = get_model_cls(model_config)

    def default_tokenizer_encode():
        if supports_score_template(model):
            full_prompt = _apply_model_score_template(model_config, prompt_1, prompt_2)
            prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
        else:
            if model_config.use_sep_token:
                # cross_encoder models default to using a separating token.
                prompt_inputs = tokenizer(
                    text=prompt_1, text_pair=prompt_2, **tokenization_kwargs
                )
                full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
            else:
                # `llm as reranker` defaults to not using a separating token.
                full_prompt = prompt_1 + prompt_2
                prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)
        return full_prompt, prompt_inputs

    # FIXME: For now, we only apply a template when one is explicitly provided.
    # We cannot rely on the tokenizer's chat template because many models
    # inherit junk templates from their base LLM, which breaks both the models
    # and the tests that use them.
    if score_template is None:
        full_prompt, prompt_inputs = default_tokenizer_encode()
    else:
        # FIXME: Try applying a score template from the CLI arg or tokenizer_config.json
        # If that fails because there is no such template,
        # fall back to the default implementation.
        try:
            full_prompt = safe_apply_chat_template(
                model_config,
                tokenizer,
                [
                    {"role": "query", "content": prompt_1},
                    {"role": "document", "content": prompt_2},
                ],
                chat_template=score_template,
                tools=None,
                tokenize=False,
            )
            prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
        except ChatTemplateResolutionError:
            full_prompt, prompt_inputs = default_tokenizer_encode()

    engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])

    if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None:
        engine_prompt["token_type_ids"] = token_type_ids

    post_process_tokens(model_config, engine_prompt)

    if mm_data is not None:
        engine_prompt["multi_modal_data"] = mm_data
    if mm_uuids is not None:
        engine_prompt["multi_modal_uuids"] = mm_uuids

    return full_prompt, engine_prompt


def compress_token_type_ids(token_type_ids: list[int]) -> int:
    """
    Return the position of the first 1, or the length of the list
    if no 1 is found.
    """
    first_one = len(token_type_ids)
    err_msg = (
        "Token type ids are expected to be a sequence"
        " of zeros followed by a sequence of ones"
    )
    for i, type_id in enumerate(token_type_ids):
        if type_id == 0 and first_one < i:
            raise ValueError(err_msg)
        elif type_id == 1 and first_one > i:
            first_one = i
        elif type_id > 1:
            raise ValueError(err_msg)

    return first_one
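
To make the MaxSim definition above concrete, a small self-contained example with toy values (illustrative only):

    # Standalone check of compute_maxsim_score's behaviour on toy embeddings.
    import torch

    q_emb = torch.tensor([[1.0, 0.0], [0.0, 1.0]])               # [query_len=2, dim=2]
    d_emb = torch.tensor([[0.8, 0.6], [0.0, 1.0], [1.0, 0.0]])   # [doc_len=3, dim=2]

    token_scores = q_emb @ d_emb.T            # [2, 3] pairwise dot products
    score = token_scores.amax(dim=-1).sum()   # best doc token per query token, then summed
    print(score)  # tensor(2.) -- each query token finds a perfectly matching doc token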