Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/vllm/entrypoints/pooling/init.py
+++ b/vllm/entrypoints/pooling/init.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from fastapi import FastAPI
+
+
+def register_pooling_api_routers(app: FastAPI):
+    from vllm.entrypoints.pooling.classify.api_router import router as classify_router
+    from vllm.entrypoints.pooling.embed.api_router import router as embed_router
+    from vllm.entrypoints.pooling.pooling.api_router import router as pooling_router
+    from vllm.entrypoints.pooling.score.api_router import router as score_router
+
+    app.include_router(classify_router)
+    app.include_router(embed_router)
+    app.include_router(score_router)
+    app.include_router(pooling_router)
--- a/vllm/entrypoints/pooling/classify/init.py
+++ b/vllm/entrypoints/pooling/classify/init.py
--- a/vllm/entrypoints/pooling/classify/api_router.py
+++ b/vllm/entrypoints/pooling/classify/api_router.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, HTTPException, Request
+from starlette.responses import JSONResponse
+from typing_extensions import assert_never
+
+from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.pooling.classify.protocol import (
+    ClassificationRequest,
+    ClassificationResponse,
+)
+from vllm.entrypoints.pooling.classify.serving import ServingClassification
+from vllm.entrypoints.utils import load_aware_call, with_cancellation
+
+router = APIRouter()
+
+
+def classify(request: Request) -> ServingClassification | None:
+    return request.app.state.openai_serving_classification
+
+
+@router.post("/classify", dependencies=[Depends(validate_json_request)])
+@with_cancellation
+@load_aware_call
+async def create_classify(request: ClassificationRequest, raw_request: Request):
+    handler = classify(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Classification API"
+        )
+
+    try:
+        generator = await handler.create_classify(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+
+    elif isinstance(generator, ClassificationResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
--- a/vllm/entrypoints/pooling/classify/protocol.py
+++ b/vllm/entrypoints/pooling/classify/protocol.py
@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import time
+from typing import Annotated, Any, TypeAlias
+
+from pydantic import (
+    Field,
+)
+
+from vllm import PoolingParams
+from vllm.config.pooler import get_use_activation
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.utils import random_uuid
+
+
+class ClassificationCompletionRequest(OpenAIBaseModel):
+    model: str | None = None
+    input: list[str] | str
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    user: str | None = None
+
+    # --8<-- [start:classification-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
+    # --8<-- [end:classification-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            use_activation=get_use_activation(self),
+        )
+
+
+class ClassificationChatRequest(OpenAIBaseModel):
+    model: str | None = None
+    messages: list[ChatCompletionMessageParam]
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    user: str | None = None
+
+    # --8<-- [start:chat-classification-extra-params]
+    add_generation_prompt: bool = Field(
+        default=False,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
+    # --8<-- [end:chat-classification-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            use_activation=get_use_activation(self),
+        )
+
+
+ClassificationRequest: TypeAlias = (
+    ClassificationCompletionRequest | ClassificationChatRequest
+)
+
+
+class ClassificationData(OpenAIBaseModel):
+    index: int
+    label: str | None
+    probs: list[float]
+    num_classes: int
+
+
+class ClassificationResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ClassificationData]
+    usage: UsageInfo
--- a/vllm/entrypoints/pooling/classify/serving.py
+++ b/vllm/entrypoints/pooling/classify/serving.py
@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from http import HTTPStatus
+from typing import cast
+
+import jinja2
+import numpy as np
+from fastapi import Request
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ErrorResponse,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import (
+    ClassificationServeContext,
+    OpenAIServing,
+    ServeContext,
+)
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.pooling.classify.protocol import (
+    ClassificationChatRequest,
+    ClassificationCompletionRequest,
+    ClassificationData,
+    ClassificationRequest,
+    ClassificationResponse,
+)
+from vllm.entrypoints.renderer import RenderConfig
+from vllm.logger import init_logger
+from vllm.outputs import ClassificationOutput, PoolingRequestOutput
+from vllm.pooling_params import PoolingParams
+
+logger = init_logger(__name__)
+
+
+class ClassificationMixin(OpenAIServing):
+    chat_template: str | None
+    chat_template_content_format: ChatTemplateContentFormatOption
+    trust_request_chat_template: bool
+
+    async def _preprocess(
+        self,
+        ctx: ServeContext,
+    ) -> ErrorResponse | None:
+        """
+        Process classification inputs: tokenize text, resolve adapters,
+        and prepare model-specific inputs.
+        """
+        ctx = cast(ClassificationServeContext, ctx)
+        try:
+            ctx.tokenizer = await self.engine_client.get_tokenizer()
+
+            request_obj = ctx.request
+
+            if isinstance(request_obj, ClassificationChatRequest):
+                chat_request = request_obj
+                messages = chat_request.messages
+                trust_request_chat_template = getattr(
+                    self,
+                    "trust_request_chat_template",
+                    False,
+                )
+                ret = self._validate_chat_template(
+                    request_chat_template=chat_request.chat_template,
+                    chat_template_kwargs=chat_request.chat_template_kwargs,
+                    trust_request_chat_template=trust_request_chat_template,
+                )
+                if ret:
+                    return ret
+
+                _, engine_prompts = await self._preprocess_chat(
+                    cast(ChatCompletionRequest, chat_request),
+                    ctx.tokenizer,
+                    messages,
+                    chat_template=(
+                        chat_request.chat_template
+                        or getattr(self, "chat_template", None)
+                    ),
+                    chat_template_content_format=cast(
+                        ChatTemplateContentFormatOption,
+                        getattr(self, "chat_template_content_format", "auto"),
+                    ),
+                    add_generation_prompt=False,
+                    continue_final_message=False,
+                    add_special_tokens=chat_request.add_special_tokens,
+                )
+                ctx.engine_prompts = engine_prompts
+
+            elif isinstance(request_obj, ClassificationCompletionRequest):
+                completion_request = request_obj
+                input_data = completion_request.input
+                if input_data in (None, ""):
+                    return self.create_error_response(
+                        "Input or messages must be provided",
+                        status_code=HTTPStatus.BAD_REQUEST,
+                    )
+                if isinstance(input_data, list) and not input_data:
+                    ctx.engine_prompts = []
+                    return None
+
+                renderer = self._get_renderer(ctx.tokenizer)
+                prompt_input = cast(str | list[str], input_data)
+                ctx.engine_prompts = await renderer.render_prompt(
+                    prompt_or_prompts=prompt_input,
+                    config=self._build_render_config(completion_request),
+                )
+            else:
+                return self.create_error_response(
+                    "Invalid classification request type",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+
+            return None
+
+        except (ValueError, TypeError, jinja2.TemplateError) as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
+
+    def _build_response(
+        self,
+        ctx: ServeContext,
+    ) -> ClassificationResponse | ErrorResponse:
+        """
+        Convert model outputs to a formatted classification response
+        with probabilities and labels.
+        """
+        ctx = cast(ClassificationServeContext, ctx)
+        items: list[ClassificationData] = []
+        num_prompt_tokens = 0
+
+        final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch)
+
+        for idx, final_res in enumerate(final_res_batch_checked):
+            classify_res = ClassificationOutput.from_base(final_res.outputs)
+
+            probs = classify_res.probs
+            predicted_index = int(np.argmax(probs))
+            label = getattr(self.model_config.hf_config, "id2label", {}).get(
+                predicted_index
+            )
+
+            item = ClassificationData(
+                index=idx,
+                label=label,
+                probs=probs,
+                num_classes=len(probs),
+            )
+
+            items.append(item)
+            prompt_token_ids = final_res.prompt_token_ids
+            num_prompt_tokens += len(prompt_token_ids)
+
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            total_tokens=num_prompt_tokens,
+        )
+
+        return ClassificationResponse(
+            id=ctx.request_id,
+            created=ctx.created_time,
+            model=ctx.model_name,
+            data=items,
+            usage=usage,
+        )
+
+    def _build_render_config(self, request: ClassificationRequest) -> RenderConfig:
+        return RenderConfig(
+            max_length=self.max_model_len,
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
+            add_special_tokens=request.add_special_tokens,
+        )
+
+
+class ServingClassification(ClassificationMixin):
+    request_id_prefix = "classify"
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        request_logger: RequestLogger | None,
+        chat_template: str | None = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
+        trust_request_chat_template: bool = False,
+        log_error_stack: bool = False,
+    ) -> None:
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
+
+        self.chat_template = chat_template
+        self.chat_template_content_format = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
+
+    async def create_classify(
+        self,
+        request: ClassificationRequest,
+        raw_request: Request,
+    ) -> ClassificationResponse | ErrorResponse:
+        model_name = self.models.model_name()
+        request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
+
+        ctx = ClassificationServeContext(
+            request=request,
+            raw_request=raw_request,
+            model_name=model_name,
+            request_id=request_id,
+        )
+
+        return await super().handle(ctx)  # type: ignore
+
+    def _create_pooling_params(
+        self,
+        ctx: ServeContext[ClassificationRequest],
+    ) -> PoolingParams | ErrorResponse:
+        pooling_params = super()._create_pooling_params(ctx)
+        if isinstance(pooling_params, ErrorResponse):
+            return pooling_params
+
+        try:
+            pooling_params.verify("classify", self.model_config)
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        return pooling_params
--- a/vllm/entrypoints/pooling/embed/init.py
+++ b/vllm/entrypoints/pooling/embed/init.py
--- a/vllm/entrypoints/pooling/embed/api_router.py
+++ b/vllm/entrypoints/pooling/embed/api_router.py
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+from typing_extensions import assert_never
+
+from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.pooling.embed.protocol import (
+    EmbeddingBytesResponse,
+    EmbeddingRequest,
+    EmbeddingResponse,
+)
+from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
+from vllm.entrypoints.utils import load_aware_call, with_cancellation
+
+router = APIRouter()
+
+
+def embedding(request: Request) -> OpenAIServingEmbedding | None:
+    return request.app.state.openai_serving_embedding
+
+
+@router.post(
+    "/v1/embeddings",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_embedding(
+    request: EmbeddingRequest,
+    raw_request: Request,
+):
+    handler = embedding(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Embeddings API"
+        )
+
+    try:
+        generator = await handler.create_embedding(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+    elif isinstance(generator, EmbeddingResponse):
+        return JSONResponse(content=generator.model_dump())
+    elif isinstance(generator, EmbeddingBytesResponse):
+        return StreamingResponse(
+            content=generator.content,
+            headers=generator.headers,
+            media_type=generator.media_type,
+        )
+
+    assert_never(generator)
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -0,0 +1,208 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
+from typing import Annotated, Any, TypeAlias
+
+from pydantic import (
+    Field,
+    model_validator,
+)
+
+from vllm import PoolingParams
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.utils import random_uuid
+from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness
+
+
+class EmbeddingCompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/embeddings
+    model: str | None = None
+    input: list[int] | list[list[int]] | str | list[str]
+    encoding_format: EncodingFormat = "float"
+    dimensions: int | None = None
+    user: str | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+
+    # --8<-- [start:embedding-extra-params]
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    normalize: bool | None = Field(
+        default=None,
+        description="Whether to normalize the embeddings outputs. Default is True.",
+    )
+    embed_dtype: EmbedDType = Field(
+        default="float32",
+        description=(
+            "What dtype to use for encoding. Default to using float32 for base64 "
+            "encoding to match the OpenAI python client behavior. "
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    endianness: Endianness = Field(
+        default="native",
+        description=(
+            "What endianness to use for encoding. Default to using native for "
+            "base64 encoding to match the OpenAI python client behavior."
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    # --8<-- [end:embedding-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            dimensions=self.dimensions,
+            normalize=self.normalize,
+        )
+
+
+class EmbeddingChatRequest(OpenAIBaseModel):
+    model: str | None = None
+    messages: list[ChatCompletionMessageParam]
+
+    encoding_format: EncodingFormat = "float"
+    dimensions: int | None = None
+    user: str | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+
+    # --8<-- [start:chat-embedding-extra-params]
+    add_generation_prompt: bool = Field(
+        default=False,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    normalize: bool | None = Field(
+        default=None,
+        description="Whether to normalize the embeddings outputs. Default is True.",
+    )
+    embed_dtype: EmbedDType = Field(
+        default="float32",
+        description=(
+            "What dtype to use for encoding. Default to using float32 for base64 "
+            "encoding to match the OpenAI python client behavior. "
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    endianness: Endianness = Field(
+        default="native",
+        description=(
+            "What endianness to use for encoding. Default to using native for "
+            "base64 encoding to match the OpenAI python client behavior."
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    # --8<-- [end:chat-embedding-extra-params]
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
+        return data
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            dimensions=self.dimensions,
+            normalize=self.normalize,
+        )
+
+
+EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest
+
+
+class EmbeddingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "embedding"
+    embedding: list[float] | str
+
+
+class EmbeddingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[EmbeddingResponseData]
+    usage: UsageInfo
+
+
+class EmbeddingBytesResponse(OpenAIBaseModel):
+    content: list[bytes]
+    headers: dict[str, str] | None = None
+    media_type: str = "application/octet-stream"
--- a/vllm/entrypoints/pooling/embed/serving.py
+++ b/vllm/entrypoints/pooling/embed/serving.py
@@ -0,0 +1,684 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from collections.abc import AsyncGenerator, Mapping
+from typing import Any, Final, cast
+
+import torch
+from fastapi import Request
+from fastapi.responses import Response
+from typing_extensions import assert_never, override
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import (
+    EmbeddingServeContext,
+    OpenAIServing,
+    ServeContext,
+)
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.pooling.embed.protocol import (
+    EmbeddingBytesResponse,
+    EmbeddingChatRequest,
+    EmbeddingCompletionRequest,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    EmbeddingResponseData,
+)
+from vllm.entrypoints.renderer import RenderConfig
+from vllm.inputs.data import TokensPrompt
+from vllm.logger import init_logger
+from vllm.outputs import (
+    EmbeddingRequestOutput,
+    PoolingOutput,
+    PoolingRequestOutput,
+    RequestOutput,
+)
+from vllm.pooling_params import PoolingParams
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import chunk_list
+from vllm.utils.serial_utils import (
+    EmbedDType,
+    EncodingFormat,
+    Endianness,
+    encode_pooling_bytes,
+    encode_pooling_output,
+)
+
+logger = init_logger(__name__)
+
+
+class EmbeddingMixin(OpenAIServing):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        pooler_config = self.model_config.pooler_config
+
+        # Avoid repeated attribute lookups
+        self.supports_chunked_processing = bool(
+            pooler_config and pooler_config.enable_chunked_processing
+        )
+        self.max_embed_len = (
+            pooler_config.max_embed_len
+            if pooler_config and pooler_config.max_embed_len
+            else None
+        )
+
+    @override
+    async def _preprocess(
+        self,
+        ctx: ServeContext,
+    ) -> ErrorResponse | None:
+        ctx = cast(EmbeddingServeContext, ctx)
+        try:
+            ctx.lora_request = self._maybe_get_adapters(ctx.request)
+
+            tokenizer = await self.engine_client.get_tokenizer()
+            renderer = self._get_renderer(tokenizer)
+
+            if isinstance(ctx.request, EmbeddingChatRequest):
+                _, ctx.engine_prompts = await self._preprocess_chat(
+                    ctx.request,
+                    tokenizer,
+                    ctx.request.messages,
+                    chat_template=ctx.request.chat_template or ctx.chat_template,
+                    chat_template_content_format=ctx.chat_template_content_format,
+                    add_generation_prompt=ctx.request.add_generation_prompt,
+                    continue_final_message=False,
+                    add_special_tokens=ctx.request.add_special_tokens,
+                )
+            else:
+                ctx.engine_prompts = await renderer.render_prompt(
+                    prompt_or_prompts=ctx.request.input,
+                    config=self._build_render_config(ctx.request),
+                )
+            return None
+        except (ValueError, TypeError) as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
+
+    def _build_render_config(self, request: EmbeddingCompletionRequest) -> RenderConfig:
+        # Set max_length based on chunked processing capability
+        if self._should_use_chunked_processing(request):
+            max_length = None
+        else:
+            max_length = self.max_embed_len or self.max_model_len
+
+        return RenderConfig(
+            max_length=max_length,
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
+            add_special_tokens=request.add_special_tokens,
+        )
+
+    @override
+    def _build_response(
+        self,
+        ctx: ServeContext,
+    ) -> EmbeddingResponse | Response | ErrorResponse:
+        final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch)
+
+        encoding_format: EncodingFormat = ctx.request.encoding_format
+        embed_dtype: EmbedDType = ctx.request.embed_dtype
+        endianness: Endianness = ctx.request.endianness
+
+        def encode_float_base64():
+            items: list[EmbeddingResponseData] = []
+            num_prompt_tokens = 0
+
+            for idx, final_res in enumerate(final_res_batch_checked):
+                item = EmbeddingResponseData(
+                    index=idx,
+                    embedding=encode_pooling_output(
+                        final_res,
+                        encoding_format=encoding_format,
+                        embed_dtype=embed_dtype,
+                        endianness=endianness,
+                    ),
+                )
+                prompt_token_ids = final_res.prompt_token_ids
+
+                items.append(item)
+                num_prompt_tokens += len(prompt_token_ids)
+
+            usage = UsageInfo(
+                prompt_tokens=num_prompt_tokens,
+                total_tokens=num_prompt_tokens,
+            )
+
+            return EmbeddingResponse(
+                id=ctx.request_id,
+                created=ctx.created_time,
+                model=ctx.model_name,
+                data=items,
+                usage=usage,
+            )
+
+        def encode_bytes(bytes_only: bool) -> EmbeddingBytesResponse:
+            content, items, usage = encode_pooling_bytes(
+                pooling_outputs=final_res_batch_checked,
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+            )
+
+            headers = (
+                None
+                if bytes_only
+                else {
+                    "metadata": json.dumps(
+                        {
+                            "id": ctx.request_id,
+                            "created": ctx.created_time,
+                            "model": ctx.model_name,
+                            "data": items,
+                            "usage": usage,
+                        }
+                    )
+                }
+            )
+
+            return EmbeddingBytesResponse(content=content, headers=headers)
+
+        if encoding_format == "float" or encoding_format == "base64":
+            return encode_float_base64()
+        elif encoding_format == "bytes" or encoding_format == "bytes_only":
+            return encode_bytes(bytes_only=encoding_format == "bytes_only")
+        else:
+            assert_never(encoding_format)
+
+    def _get_max_position_embeddings(self) -> int:
+        """Get the model's effective maximum sequence length for chunking."""
+        return self.model_config.max_model_len
+
+    def _should_use_chunked_processing(self, request) -> bool:
+        """Check if chunked processing should be used for this request."""
+        return (
+            isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest))
+            and self.supports_chunked_processing
+        )
+
+    async def _process_chunked_request(
+        self,
+        ctx: EmbeddingServeContext,
+        token_ids: list[int],
+        pooling_params,
+        trace_headers,
+        prompt_idx: int,
+    ) -> list[AsyncGenerator[PoolingRequestOutput, None]]:
+        """Process a single prompt using chunked processing."""
+        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
+
+        # Split into chunks using max_position_embeddings
+        max_pos_embeddings = self._get_max_position_embeddings()
+        # Process all chunks for MEAN aggregation
+        for chunk_idx, chunk_tokens in enumerate(
+            chunk_list(token_ids, max_pos_embeddings)
+        ):
+            # Create a request ID for this chunk
+            chunk_request_id = f"{ctx.request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"
+
+            # Create engine prompt for this chunk
+            chunk_engine_prompt = TokensPrompt(prompt_token_ids=chunk_tokens)
+
+            # Log the chunk
+            self._log_inputs(
+                chunk_request_id,
+                chunk_engine_prompt,
+                params=pooling_params,
+                lora_request=ctx.lora_request,
+            )
+
+            # Create generator for this chunk and wrap it to return indices
+            original_generator = self.engine_client.encode(
+                chunk_engine_prompt,
+                pooling_params,
+                chunk_request_id,
+                lora_request=ctx.lora_request,
+                trace_headers=trace_headers,
+                priority=getattr(ctx.request, "priority", 0),
+            )
+
+            generators.append(original_generator)
+
+        return generators
+
+    def _validate_input(
+        self,
+        request,
+        input_ids: list[int],
+        input_text: str,
+    ) -> TokensPrompt:
+        """Override to support chunked processing for embedding requests."""
+        token_num = len(input_ids)
+
+        # Note: EmbeddingRequest doesn't have max_tokens
+        if isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest)):
+            # Check if chunked processing is enabled for pooling models
+            enable_chunked = self._should_use_chunked_processing(request)
+
+            # Use max_position_embeddings for chunked processing decisions
+            max_pos_embeddings = self._get_max_position_embeddings()
+
+            # Determine the effective max length for validation
+            if self.max_embed_len is not None:
+                # Use max_embed_len for validation instead of max_model_len
+                length_type = "maximum embedding input length"
+                max_length_value = self.max_embed_len
+            else:
+                # Fall back to max_model_len validation (original behavior)
+                length_type = "maximum context length"
+                max_length_value = self.max_model_len
+
+            validation_error_msg = (
+                "This model's {length_type} is {max_length_value} tokens. "
+                "However, you requested {token_num} tokens in the input for "
+                "embedding generation. Please reduce the length of the input."
+            )
+
+            chunked_processing_error_msg = (
+                "This model's {length_type} is {max_length_value} tokens. "
+                "However, you requested {token_num} tokens in the input for "
+                "embedding generation. Please reduce the length of the input "
+                "or enable chunked processing."
+            )
+
+            # Check if input exceeds max length
+            if token_num > max_length_value:
+                raise ValueError(
+                    validation_error_msg.format(
+                        length_type=length_type,
+                        max_length_value=max_length_value,
+                        token_num=token_num,
+                    )
+                )
+
+            # Check for chunked processing
+            # when exceeding max_position_embeddings
+            if token_num > max_pos_embeddings:
+                if enable_chunked:
+                    # Allow long inputs when chunked processing is enabled
+                    logger.info(
+                        "Input length %s exceeds max_position_embeddings "
+                        "%s, will use chunked processing",
+                        token_num,
+                        max_pos_embeddings,
+                    )
+                else:
+                    raise ValueError(
+                        chunked_processing_error_msg.format(
+                            length_type="maximum position embeddings length",
+                            max_length_value=max_pos_embeddings,
+                            token_num=token_num,
+                        )
+                    )
+
+            return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
+
+        # For other request types, use the parent's implementation
+        return super()._validate_input(request, input_ids, input_text)
+
+    async def _create_single_prompt_generator(
+        self,
+        ctx: EmbeddingServeContext,
+        engine_prompt: TokensPrompt,
+        pooling_params: PoolingParams,
+        trace_headers: Mapping[str, str] | None,
+        prompt_index: int,
+    ) -> AsyncGenerator[RequestOutput | PoolingRequestOutput, None]:
+        """Create a generator for a single prompt using standard processing."""
+        request_id_item = f"{ctx.request_id}-{prompt_index}"
+
+        self._log_inputs(
+            request_id_item,
+            engine_prompt,
+            params=pooling_params,
+            lora_request=ctx.lora_request,
+        )
+
+        # Return the original generator without wrapping
+        return self.engine_client.encode(
+            engine_prompt,
+            pooling_params,
+            request_id_item,
+            lora_request=ctx.lora_request,
+            trace_headers=trace_headers,
+            priority=getattr(ctx.request, "priority", 0),
+        )
+
+    @override
+    async def _prepare_generators(
+        self,
+        ctx: ServeContext,
+    ) -> ErrorResponse | None:
+        """Override to support chunked processing."""
+        ctx = cast(EmbeddingServeContext, ctx)
+
+        # Check if we should use chunked processing
+        use_chunked = self._should_use_chunked_processing(ctx.request)
+
+        # If no chunked processing needed, delegate to parent class
+        if not use_chunked:
+            return await super()._prepare_generators(ctx)
+
+        # Custom logic for chunked processing
+        generators: list[
+            AsyncGenerator[RequestOutput | PoolingRequestOutput, None]
+        ] = []
+
+        try:
+            trace_headers = (
+                None
+                if ctx.raw_request is None
+                else await self._get_trace_headers(ctx.raw_request.headers)
+            )
+
+            pooling_params = self._create_pooling_params(ctx)
+            if isinstance(pooling_params, ErrorResponse):
+                return pooling_params
+
+            # Verify and set the task for pooling params
+            try:
+                pooling_params.verify("embed", self.model_config)
+            except ValueError as e:
+                return self.create_error_response(str(e))
+
+            if ctx.engine_prompts is None:
+                return self.create_error_response("Engine prompts not available")
+
+            max_pos_embeddings = self._get_max_position_embeddings()
+
+            for i, engine_prompt in enumerate(ctx.engine_prompts):
+                # Check if this specific prompt needs chunked processing
+                if "prompt_token_ids" in engine_prompt:
+                    prompt_token_ids = engine_prompt["prompt_token_ids"]
+                    if len(prompt_token_ids) > max_pos_embeddings:
+                        # Use chunked processing for this prompt
+                        chunk_generators = await self._process_chunked_request(
+                            ctx,
+                            prompt_token_ids,
+                            pooling_params,
+                            trace_headers,
+                            i,
+                        )
+                        generators.extend(chunk_generators)
+                        continue
+
+                # Normal processing for short prompts or non-token prompts
+                generator = await self._create_single_prompt_generator(
+                    ctx, engine_prompt, pooling_params, trace_headers, i
+                )
+                generators.append(generator)
+
+            ctx.result_generator = merge_async_iterators(*generators)
+
+            return None
+
+        except Exception as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+    @override
+    async def _collect_batch(
+        self,
+        ctx: ServeContext,
+    ) -> ErrorResponse | None:
+        """Collect and aggregate batch results
+        with support for chunked processing.
+
+        For chunked requests, performs online aggregation to
+        minimize memory usage.
+        For regular requests, collects results normally.
+        """
+        ctx = cast(EmbeddingServeContext, ctx)
+        try:
+            if ctx.engine_prompts is None:
+                return self.create_error_response("Engine prompts not available")
+
+            # Check if we used chunked processing
+            use_chunked = self._should_use_chunked_processing(ctx.request)
+
+            if not use_chunked:
+                return await super()._collect_batch(ctx=ctx)
+
+            if ctx.result_generator is None:
+                return self.create_error_response("Result generator not available")
+
+            # Online aggregation for chunked requests to
+            # minimize memory usage
+            # Track aggregation state for each prompt
+            prompt_aggregators: dict[int, dict[str, Any]] = {}
+            short_prompts_results: dict[int, PoolingRequestOutput] = {}
+
+            async for result_idx, result in ctx.result_generator:
+                if "-chunk-" in result.request_id:
+                    # Extract prompt_idx from chunked request_id
+                    parts = result.request_id.split("-")
+                    try:
+                        prompt_idx = int(parts[parts.index("prompt") + 1])
+                    except (ValueError, IndexError):
+                        # Fallback: extract from result_idx if parsing fails
+                        prompt_idx = result_idx
+
+                    # Initialize aggregator for this prompt if needed
+                    if prompt_idx not in prompt_aggregators:
+                        prompt_aggregators[prompt_idx] = {
+                            "weighted_sum": None,
+                            "total_weight": 0,
+                            "chunk_count": 0,
+                            "request_id": result.request_id.split("-chunk-")[0],
+                        }
+
+                    aggregator = prompt_aggregators[prompt_idx]
+
+                    # MEAN pooling with online weighted averaging
+                    # Ensure result is PoolingRequestOutput
+                    # for embedding processing
+                    if not isinstance(result, PoolingRequestOutput):
+                        return self.create_error_response(
+                            f"Expected PoolingRequestOutput for "
+                            f"chunked embedding, got "
+                            f"{type(result).__name__}"
+                        )
+
+                    # Handle both PoolingOutput and
+                    # EmbeddingOutput types
+                    if hasattr(result.outputs, "data"):
+                        # PoolingOutput case
+                        embedding_data = result.outputs.data
+                    elif hasattr(result.outputs, "embedding"):
+                        # EmbeddingOutput case -
+                        # convert embedding list to tensor
+                        embedding_data = result.outputs.embedding
+                    else:
+                        return self.create_error_response(
+                            f"Unsupported output type: {type(result.outputs).__name__}"
+                        )
+
+                    if not isinstance(embedding_data, torch.Tensor):
+                        embedding_data = torch.tensor(
+                            embedding_data, dtype=torch.float32
+                        )
+
+                    if result.prompt_token_ids is None:
+                        return self.create_error_response(
+                            "prompt_token_ids cannot be None for chunked processing"
+                        )
+                    weight = len(result.prompt_token_ids)
+
+                    weighted_embedding = embedding_data.to(dtype=torch.float32) * weight
+
+                    if aggregator["weighted_sum"] is None:
+                        # First chunk
+                        aggregator["weighted_sum"] = weighted_embedding
+                    else:
+                        # Accumulate
+                        aggregator["weighted_sum"] += weighted_embedding
+
+                    aggregator["total_weight"] += weight
+                    aggregator["chunk_count"] += 1
+                else:
+                    # Non-chunked result - extract prompt_idx from request_id
+                    parts = result.request_id.split("-")
+                    try:
+                        # Last part should be prompt index
+                        prompt_idx = int(parts[-1])
+                    except (ValueError, IndexError):
+                        prompt_idx = result_idx  # Fallback to result_idx
+
+                    short_prompts_results[prompt_idx] = cast(
+                        PoolingRequestOutput, result
+                    )
+
+            # Finalize aggregated results
+            final_res_batch: list[PoolingRequestOutput | EmbeddingRequestOutput] = []
+            num_prompts = len(ctx.engine_prompts)
+
+            for prompt_idx in range(num_prompts):
+                if prompt_idx in prompt_aggregators:
+                    # Finalize MEAN aggregation for this chunked prompt
+                    aggregator = prompt_aggregators[prompt_idx]
+
+                    weighted_sum = aggregator["weighted_sum"]
+                    total_weight = aggregator["total_weight"]
+
+                    if (
+                        weighted_sum is not None
+                        and isinstance(weighted_sum, torch.Tensor)
+                        and isinstance(total_weight, (int, float))
+                        and total_weight > 0
+                    ):
+                        # Compute final mean embedding
+                        final_embedding = weighted_sum / total_weight
+
+                        # Create a PoolingRequestOutput
+                        # for the aggregated result
+                        pooling_output_data = PoolingOutput(data=final_embedding)
+
+                        # Get original prompt token IDs for this prompt
+                        original_prompt = ctx.engine_prompts[prompt_idx]
+                        if "prompt_token_ids" not in original_prompt:
+                            return self.create_error_response(
+                                f"Chunked prompt {prompt_idx} does not contain "
+                                "token IDs"
+                            )
+
+                        original_token_ids = original_prompt["prompt_token_ids"]
+
+                        pooling_request_output = PoolingRequestOutput(
+                            request_id=aggregator["request_id"],
+                            prompt_token_ids=original_token_ids,
+                            outputs=pooling_output_data,
+                            num_cached_tokens=0,
+                            finished=True,
+                        )
+
+                        final_res_batch.append(pooling_request_output)
+                    else:
+                        return self.create_error_response(
+                            f"Failed to aggregate chunks for prompt {prompt_idx}"
+                        )
+                elif prompt_idx in short_prompts_results:
+                    final_res_batch.append(
+                        cast(PoolingRequestOutput, short_prompts_results[prompt_idx])
+                    )
+                else:
+                    return self.create_error_response(
+                        f"Result not found for prompt {prompt_idx}"
+                    )
+
+            ctx.final_res_batch = cast(
+                list[RequestOutput | PoolingRequestOutput], final_res_batch
+            )
+
+            return None
+
+        except Exception as e:
+            return self.create_error_response(str(e))
+
+
+class OpenAIServingEmbedding(EmbeddingMixin):
+    request_id_prefix = "embd"
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
+        log_error_stack: bool = False,
+    ) -> None:
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
+
+        self.chat_template = chat_template
+        self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
+
+    async def create_embedding(
+        self,
+        request: EmbeddingRequest,
+        raw_request: Request | None = None,
+    ) -> EmbeddingResponse | ErrorResponse:
+        """
+        Embedding API similar to OpenAI's API.
+
+        See https://platform.openai.com/docs/api-reference/embeddings/create
+        for the API specification. This API mimics the OpenAI Embedding API.
+        """
+        model_name = self.models.model_name()
+        request_id = (
+            f"{self.request_id_prefix}-"
+            f"{self._base_request_id(raw_request, request.request_id)}"
+        )
+
+        ctx = EmbeddingServeContext(
+            request=request,
+            raw_request=raw_request,
+            model_name=model_name,
+            request_id=request_id,
+            chat_template=self.chat_template,
+            chat_template_content_format=self.chat_template_content_format,
+        )
+
+        return await super().handle(ctx)  # type: ignore
+
+    @override
+    def _create_pooling_params(
+        self,
+        ctx: ServeContext[EmbeddingRequest],
+    ) -> PoolingParams | ErrorResponse:
+        pooling_params = super()._create_pooling_params(ctx)
+        if isinstance(pooling_params, ErrorResponse):
+            return pooling_params
+
+        try:
+            pooling_params.verify("embed", self.model_config)
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        return pooling_params
+
+    async def _preprocess(
+        self,
+        ctx: ServeContext,
+    ) -> ErrorResponse | None:
+        if isinstance(ctx.request, EmbeddingChatRequest):
+            error_check_ret = self._validate_chat_template(
+                request_chat_template=ctx.request.chat_template,
+                chat_template_kwargs=ctx.request.chat_template_kwargs,
+                trust_request_chat_template=self.trust_request_chat_template,
+            )
+            if error_check_ret is not None:
+                return error_check_ret
+        return await super()._preprocess(ctx)
--- a/vllm/entrypoints/pooling/pooling/init.py
+++ b/vllm/entrypoints/pooling/pooling/init.py
--- a/vllm/entrypoints/pooling/pooling/api_router.py
+++ b/vllm/entrypoints/pooling/pooling/api_router.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+from typing_extensions import assert_never
+
+from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.pooling.pooling.protocol import (
+    IOProcessorResponse,
+    PoolingBytesResponse,
+    PoolingRequest,
+    PoolingResponse,
+)
+from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling
+from vllm.entrypoints.utils import load_aware_call, with_cancellation
+
+router = APIRouter()
+
+
+def pooling(request: Request) -> OpenAIServingPooling | None:
+    return request.app.state.openai_serving_pooling
+
+
+@router.post(
+    "/pooling",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_pooling(request: PoolingRequest, raw_request: Request):
+    handler = pooling(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Pooling API"
+        )
+    try:
+        generator = await handler.create_pooling(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+    elif isinstance(generator, (PoolingResponse, IOProcessorResponse)):
+        return JSONResponse(content=generator.model_dump())
+    elif isinstance(generator, PoolingBytesResponse):
+        return StreamingResponse(
+            content=generator.content,
+            headers=generator.headers,
+            media_type=generator.media_type,
+        )
+
+    assert_never(generator)
--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
+from typing import Generic, TypeAlias, TypeVar
+
+from pydantic import (
+    Field,
+)
+
+from vllm import PoolingParams
+from vllm.config.pooler import get_use_activation
+from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.entrypoints.pooling.embed.protocol import (
+    EmbeddingChatRequest,
+    EmbeddingCompletionRequest,
+)
+from vllm.tasks import PoolingTask
+from vllm.utils import random_uuid
+from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness
+
+
+class PoolingCompletionRequest(EmbeddingCompletionRequest):
+    task: PoolingTask | None = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "If it is a classify or token_classify task, the default is True; "
+        "for other tasks, this value should be None.",
+    )
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            dimensions=self.dimensions,
+            normalize=self.normalize,
+            use_activation=get_use_activation(self),
+        )
+
+
+class PoolingChatRequest(EmbeddingChatRequest):
+    task: PoolingTask | None = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "If it is a classify or token_classify task, the default is True; "
+        "for other tasks, this value should be None.",
+    )
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            dimensions=self.dimensions,
+            normalize=self.normalize,
+            use_activation=get_use_activation(self),
+        )
+
+
+T = TypeVar("T")
+
+
+class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
+    model: str | None = None
+
+    priority: int = Field(default=0)
+    """
+    The priority of the request (lower means earlier handling;
+    default: 0). Any priority other than 0 will raise an error
+    if the served model does not use priority scheduling.
+    """
+    data: T
+
+    task: PoolingTask = "plugin"
+    encoding_format: EncodingFormat = "float"
+    embed_dtype: EmbedDType = Field(
+        default="float32",
+        description=(
+            "What dtype to use for encoding. Default to using float32 for base64 "
+            "encoding to match the OpenAI python client behavior. "
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    endianness: Endianness = Field(
+        default="native",
+        description=(
+            "What endianness to use for encoding. Default to using native for "
+            "base64 encoding to match the OpenAI python client behavior."
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+
+    def to_pooling_params(self):
+        return PoolingParams()
+
+
+class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
+    request_id: str | None = None
+    """
+    The request_id associated with this response
+    """
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+
+    data: T
+    """
+    When using plugins IOProcessor plugins, the actual output is generated
+    by the plugin itself. Hence, we use a generic type for the response data
+    """
+
+
+PoolingRequest: TypeAlias = (
+    PoolingCompletionRequest | PoolingChatRequest | IOProcessorRequest
+)
+
+
+class PoolingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "pooling"
+    data: list[list[float]] | list[float] | str
+
+
+class PoolingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[PoolingResponseData]
+    usage: UsageInfo
+
+
+class PoolingBytesResponse(OpenAIBaseModel):
+    content: list[bytes]
+    headers: dict[str, str] | None = None
+    media_type: str = "application/octet-stream"
--- a/vllm/entrypoints/pooling/pooling/serving.py
+++ b/vllm/entrypoints/pooling/pooling/serving.py
@@ -0,0 +1,354 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import json
+import time
+from collections.abc import AsyncGenerator, Sequence
+from typing import Final, cast
+
+import jinja2
+from fastapi import Request
+from typing_extensions import assert_never
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.pooling.pooling.protocol import (
+    IOProcessorRequest,
+    IOProcessorResponse,
+    PoolingBytesResponse,
+    PoolingChatRequest,
+    PoolingCompletionRequest,
+    PoolingRequest,
+    PoolingResponse,
+    PoolingResponseData,
+)
+from vllm.entrypoints.renderer import RenderConfig
+from vllm.entrypoints.utils import _validate_truncation_size
+from vllm.logger import init_logger
+from vllm.outputs import PoolingRequestOutput
+from vllm.tasks import PoolingTask, SupportedTask
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.serial_utils import (
+    EmbedDType,
+    EncodingFormat,
+    Endianness,
+    encode_pooling_bytes,
+    encode_pooling_output,
+)
+
+logger = init_logger(__name__)
+
+
+class OpenAIServingPooling(OpenAIServing):
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        supported_tasks: tuple[SupportedTask, ...],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
+        log_error_stack: bool = False,
+    ) -> None:
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
+
+        self.supported_tasks = supported_tasks
+        self.chat_template = chat_template
+        self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
+
+    async def create_pooling(
+        self,
+        request: PoolingRequest,
+        raw_request: Request | None = None,
+    ) -> PoolingResponse | IOProcessorResponse | PoolingBytesResponse | ErrorResponse:
+        """
+        See https://platform.openai.com/docs/api-reference/embeddings/create
+        for the API specification. This API mimics the OpenAI Embedding API.
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        model_name = self.models.model_name()
+
+        request_id = f"pool-{self._base_request_id(raw_request)}"
+        created_time = int(time.time())
+
+        is_io_processor_request = isinstance(request, IOProcessorRequest)
+        try:
+            lora_request = self._maybe_get_adapters(request)
+
+            if self.model_config.skip_tokenizer_init:
+                tokenizer = None
+            else:
+                tokenizer = await self.engine_client.get_tokenizer()
+            renderer = self._get_renderer(tokenizer)
+
+            if getattr(request, "dimensions", None) is not None:
+                return self.create_error_response(
+                    "dimensions is currently not supported"
+                )
+
+            truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
+            truncate_prompt_tokens = _validate_truncation_size(
+                self.max_model_len, truncate_prompt_tokens
+            )
+
+            if is_io_processor_request:
+                if self.io_processor is None:
+                    raise ValueError(
+                        "No IOProcessor plugin installed. Please refer "
+                        "to the documentation and to the "
+                        "'prithvi_geospatial_mae_io_processor' "
+                        "offline inference example for more details."
+                    )
+
+                validated_prompt = self.io_processor.parse_request(request)
+
+                engine_prompts = await self.io_processor.pre_process_async(
+                    prompt=validated_prompt, request_id=request_id
+                )
+                if not isinstance(engine_prompts, Sequence) or isinstance(
+                    engine_prompts, (str, bytes, bytearray)
+                ):
+                    engine_prompts = [engine_prompts]
+
+            elif isinstance(request, PoolingChatRequest):
+                error_check_ret = self._validate_chat_template(
+                    request_chat_template=request.chat_template,
+                    chat_template_kwargs=request.chat_template_kwargs,
+                    trust_request_chat_template=self.trust_request_chat_template,
+                )
+                if error_check_ret is not None:
+                    return error_check_ret
+
+                _, engine_prompts = await self._preprocess_chat(
+                    request,
+                    tokenizer,
+                    request.messages,
+                    chat_template=request.chat_template or self.chat_template,
+                    chat_template_content_format=self.chat_template_content_format,
+                    # In pooling requests, we are not generating tokens,
+                    # so there is no need to append extra tokens to the input
+                    add_generation_prompt=False,
+                    continue_final_message=False,
+                    add_special_tokens=request.add_special_tokens,
+                )
+            elif isinstance(request, PoolingCompletionRequest):
+                engine_prompts = await renderer.render_prompt(
+                    prompt_or_prompts=request.input,
+                    config=self._build_render_config(request),
+                )
+            else:
+                raise ValueError(f"Unsupported request of type {type(request)}")
+        except (ValueError, TypeError, jinja2.TemplateError) as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
+
+        # Schedule the request and get the result generator.
+        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
+        try:
+            if is_io_processor_request:
+                assert self.io_processor is not None and isinstance(
+                    request, IOProcessorRequest
+                )
+                pooling_params = self.io_processor.validate_or_generate_params()
+            else:
+                pooling_params = request.to_pooling_params()
+
+            pooling_task: PoolingTask
+            if request.task is None:
+                if "token_embed" in self.supported_tasks:
+                    pooling_task = "token_embed"
+                elif "token_classify" in self.supported_tasks:
+                    pooling_task = "token_classify"
+                elif "plugin" in self.supported_tasks:
+                    pooling_task = "plugin"
+                else:
+                    return self.create_error_response(
+                        f"pooling_task must be one of {self.supported_tasks}."
+                    )
+            else:
+                pooling_task = request.task
+
+            if pooling_task not in self.supported_tasks:
+                return self.create_error_response(
+                    f"Task {pooling_task} is not supported, it"
+                    f" must be one of {self.supported_tasks}."
+                )
+
+            try:
+                pooling_params.verify(pooling_task, self.model_config)
+            except ValueError as e:
+                return self.create_error_response(str(e))
+
+            for i, engine_prompt in enumerate(engine_prompts):
+                request_id_item = f"{request_id}-{i}"
+
+                self._log_inputs(
+                    request_id_item,
+                    engine_prompt,
+                    params=pooling_params,
+                    lora_request=lora_request,
+                )
+
+                trace_headers = (
+                    None
+                    if raw_request is None
+                    else await self._get_trace_headers(raw_request.headers)
+                )
+
+                generator = self.engine_client.encode(
+                    engine_prompt,
+                    pooling_params,
+                    request_id_item,
+                    lora_request=lora_request,
+                    trace_headers=trace_headers,
+                    priority=request.priority,
+                )
+
+                generators.append(generator)
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+        result_generator = merge_async_iterators(*generators)
+
+        if is_io_processor_request:
+            assert self.io_processor is not None
+            output = await self.io_processor.post_process_async(
+                model_output=result_generator,
+                request_id=request_id,
+            )
+            return self.io_processor.output_to_response(output)
+
+        assert isinstance(request, (PoolingCompletionRequest, PoolingChatRequest))
+        num_prompts = len(engine_prompts)
+
+        # Non-streaming response
+        final_res_batch: list[PoolingRequestOutput | None]
+        final_res_batch = [None] * num_prompts
+        try:
+            async for i, res in result_generator:
+                final_res_batch[i] = res
+
+            assert all(final_res is not None for final_res in final_res_batch)
+
+            final_res_batch_checked = cast(list[PoolingRequestOutput], final_res_batch)
+
+            response = self.request_output_to_pooling_response(
+                final_res_batch_checked,
+                request_id,
+                created_time,
+                model_name,
+                request.encoding_format,
+                request.embed_dtype,
+                request.endianness,
+            )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+        return response
+
+    def request_output_to_pooling_response(
+        self,
+        final_res_batch: list[PoolingRequestOutput],
+        request_id: str,
+        created_time: int,
+        model_name: str,
+        encoding_format: EncodingFormat,
+        embed_dtype: EmbedDType,
+        endianness: Endianness,
+    ) -> PoolingResponse | PoolingBytesResponse:
+        def encode_float_base64():
+            items: list[PoolingResponseData] = []
+            num_prompt_tokens = 0
+
+            for idx, final_res in enumerate(final_res_batch):
+                item = PoolingResponseData(
+                    index=idx,
+                    data=encode_pooling_output(
+                        final_res,
+                        encoding_format=encoding_format,
+                        embed_dtype=embed_dtype,
+                        endianness=endianness,
+                    ),
+                )
+                prompt_token_ids = final_res.prompt_token_ids
+
+                items.append(item)
+                num_prompt_tokens += len(prompt_token_ids)
+
+            usage = UsageInfo(
+                prompt_tokens=num_prompt_tokens,
+                total_tokens=num_prompt_tokens,
+            )
+
+            return PoolingResponse(
+                id=request_id,
+                created=created_time,
+                model=model_name,
+                data=items,
+                usage=usage,
+            )
+
+        def encode_bytes(bytes_only: bool) -> PoolingBytesResponse:
+            content, items, usage = encode_pooling_bytes(
+                pooling_outputs=final_res_batch,
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+            )
+
+            headers = (
+                None
+                if bytes_only
+                else {
+                    "metadata": json.dumps(
+                        {
+                            "id": request_id,
+                            "created": created_time,
+                            "model": model_name,
+                            "data": items,
+                            "usage": usage,
+                        }
+                    )
+                }
+            )
+
+            return PoolingBytesResponse(
+                content=content,
+                headers=headers,
+            )
+
+        if encoding_format == "float" or encoding_format == "base64":
+            return encode_float_base64()
+        elif encoding_format == "bytes" or encoding_format == "bytes_only":
+            return encode_bytes(bytes_only=encoding_format == "bytes_only")
+        else:
+            assert_never(encoding_format)
+
+    def _build_render_config(self, request: PoolingCompletionRequest) -> RenderConfig:
+        return RenderConfig(
+            max_length=self.max_model_len,
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
+            add_special_tokens=request.add_special_tokens,
+        )
--- a/vllm/entrypoints/pooling/score/init.py
+++ b/vllm/entrypoints/pooling/score/init.py
--- a/vllm/entrypoints/pooling/score/api_router.py
+++ b/vllm/entrypoints/pooling/score/api_router.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse
+from typing_extensions import assert_never
+
+from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.pooling.score.protocol import (
+    RerankRequest,
+    RerankResponse,
+    ScoreRequest,
+    ScoreResponse,
+)
+from vllm.entrypoints.pooling.score.serving import ServingScores
+from vllm.entrypoints.utils import load_aware_call, with_cancellation
+from vllm.logger import init_logger
+
+router = APIRouter()
+
+logger = init_logger(__name__)
+
+
+def score(request: Request) -> ServingScores | None:
+    return request.app.state.openai_serving_scores
+
+
+def rerank(request: Request) -> ServingScores | None:
+    return request.app.state.openai_serving_scores
+
+
+@router.post(
+    "/score",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_score(request: ScoreRequest, raw_request: Request):
+    handler = score(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Score API"
+        )
+
+    try:
+        generator = await handler.create_score(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+    elif isinstance(generator, ScoreResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post(
+    "/v1/score",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_score_v1(request: ScoreRequest, raw_request: Request):
+    logger.warning(
+        "To indicate that Score API is not part of standard OpenAI API, we "
+        "have moved it to `/score`. Please update your client accordingly."
+    )
+
+    return await create_score(request, raw_request)
+
+
+@router.post(
+    "/rerank",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def do_rerank(request: RerankRequest, raw_request: Request):
+    handler = rerank(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Rerank (Score) API"
+        )
+    try:
+        generator = await handler.do_rerank(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+    elif isinstance(generator, RerankResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post(
+    "/v1/rerank",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+async def do_rerank_v1(request: RerankRequest, raw_request: Request):
+    logger.warning_once(
+        "To indicate that the rerank API is not part of the standard OpenAI"
+        " API, we have located it at `/rerank`. Please update your client "
+        "accordingly. (Note: Conforms to JinaAI rerank API)"
+    )
+
+    return await do_rerank(request, raw_request)
+
+
+@router.post(
+    "/v2/rerank",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+async def do_rerank_v2(request: RerankRequest, raw_request: Request):
+    return await do_rerank(request, raw_request)
--- a/vllm/entrypoints/pooling/score/protocol.py
+++ b/vllm/entrypoints/pooling/score/protocol.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
+from typing import Annotated, Any
+
+from pydantic import (
+    BaseModel,
+    Field,
+)
+
+from vllm import PoolingParams
+from vllm.config.pooler import get_use_activation
+from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam
+from vllm.utils import random_uuid
+
+
+class ScoreRequest(OpenAIBaseModel):
+    model: str | None = None
+    text_1: list[str] | str | ScoreMultiModalParam
+    text_2: list[str] | str | ScoreMultiModalParam
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+
+    # --8<-- [start:score-extra-params]
+
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
+    # --8<-- [end:score-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            use_activation=get_use_activation(self),
+        )
+
+
+class RerankRequest(OpenAIBaseModel):
+    model: str | None = None
+    query: str | ScoreMultiModalParam
+    documents: list[str] | ScoreMultiModalParam
+    top_n: int = Field(default_factory=lambda: 0)
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+
+    # --8<-- [start:rerank-extra-params]
+
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
+    # --8<-- [end:rerank-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            use_activation=get_use_activation(self),
+        )
+
+
+class RerankDocument(BaseModel):
+    text: str | None = None
+    multi_modal: ScoreContentPartParam | None = None
+
+
+class RerankResult(BaseModel):
+    index: int
+    document: RerankDocument
+    relevance_score: float
+
+
+class RerankUsage(BaseModel):
+    prompt_tokens: int
+    total_tokens: int
+
+
+class RerankResponse(OpenAIBaseModel):
+    id: str
+    model: str
+    usage: RerankUsage
+    results: list[RerankResult]
+
+
+class ScoreResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "score"
+    score: float
+
+
+class ScoreResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ScoreResponseData]
+    usage: UsageInfo
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -0,0 +1,508 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import time
+from collections.abc import AsyncGenerator, Mapping
+from typing import Any
+
+from fastapi import Request
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.pooling.score.protocol import (
+    RerankDocument,
+    RerankRequest,
+    RerankResponse,
+    RerankResult,
+    RerankUsage,
+    ScoreRequest,
+    ScoreResponse,
+    ScoreResponseData,
+)
+from vllm.entrypoints.score_utils import (
+    ScoreContentPartParam,
+    ScoreMultiModalParam,
+    _cosine_similarity,
+    _validate_score_input_lens,
+    compress_token_type_ids,
+    get_score_prompt,
+)
+from vllm.entrypoints.utils import _validate_truncation_size
+from vllm.inputs.data import TokensPrompt
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
+from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers.mistral import MistralTokenizer
+from vllm.utils.async_utils import make_async, merge_async_iterators
+
+logger = init_logger(__name__)
+
+
+class ServingScores(OpenAIServing):
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        request_logger: RequestLogger | None,
+        log_error_stack: bool = False,
+    ) -> None:
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
+
+    async def _embedding_score(
+        self,
+        tokenizer: TokenizerLike,
+        texts_1: list[str],
+        texts_2: list[str],
+        request: RerankRequest | ScoreRequest,
+        request_id: str,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        lora_request: LoRARequest | None | None = None,
+        trace_headers: Mapping[str, str] | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
+        input_texts = texts_1 + texts_2
+
+        engine_prompts: list[TokensPrompt] = []
+        tokenize_async = make_async(
+            tokenizer.__call__, executor=self._tokenizer_executor
+        )
+
+        tokenization_kwargs = tokenization_kwargs or {}
+        tokenized_prompts = await asyncio.gather(
+            *(tokenize_async(t, **tokenization_kwargs) for t in input_texts)
+        )
+
+        for tok_result, input_text in zip(tokenized_prompts, input_texts):
+            text_token_prompt = self._validate_input(
+                request, tok_result["input_ids"], input_text
+            )
+
+            engine_prompts.append(
+                TokensPrompt(prompt_token_ids=text_token_prompt["prompt_token_ids"])
+            )
+
+        # Schedule the request and get the result generator.
+        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
+        pooling_params = request.to_pooling_params()
+
+        try:
+            pooling_params.verify("embed", self.model_config)
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        for i, engine_prompt in enumerate(engine_prompts):
+            request_id_item = f"{request_id}-{i}"
+
+            self._log_inputs(
+                request_id_item,
+                input_texts[i],
+                params=pooling_params,
+                lora_request=lora_request,
+            )
+
+            generators.append(
+                self.engine_client.encode(
+                    engine_prompt,
+                    pooling_params,
+                    request_id_item,
+                    lora_request=lora_request,
+                    trace_headers=trace_headers,
+                    priority=request.priority,
+                )
+            )
+
+        result_generator = merge_async_iterators(*generators)
+
+        # Non-streaming response
+        final_res_batch: list[PoolingRequestOutput] = []
+
+        embeddings: list[PoolingRequestOutput | None] = [None] * len(engine_prompts)
+
+        async for i, res in result_generator:
+            embeddings[i] = res
+
+        emb_texts_1: list[PoolingRequestOutput] = []
+        emb_texts_2: list[PoolingRequestOutput] = []
+
+        for i in range(0, len(texts_1)):
+            assert (emb := embeddings[i]) is not None
+            emb_texts_1.append(emb)
+
+        for i in range(len(texts_1), len(embeddings)):
+            assert (emb := embeddings[i]) is not None
+            emb_texts_2.append(emb)
+
+        if len(emb_texts_1) == 1:
+            emb_texts_1 = emb_texts_1 * len(emb_texts_2)
+
+        final_res_batch = _cosine_similarity(
+            tokenizer=tokenizer, embed_1=emb_texts_1, embed_2=emb_texts_2
+        )
+
+        return final_res_batch
+
+    def _preprocess_score(
+        self,
+        request: RerankRequest | ScoreRequest,
+        tokenizer: TokenizerLike,
+        tokenization_kwargs: dict[str, Any],
+        data_1: str | ScoreContentPartParam,
+        data_2: str | ScoreContentPartParam,
+    ) -> tuple[str, TokensPrompt]:
+        model_config = self.model_config
+
+        full_prompt, engine_prompt = get_score_prompt(
+            model_config=model_config,
+            data_1=data_1,
+            data_2=data_2,
+            tokenizer=tokenizer,
+            tokenization_kwargs=tokenization_kwargs,
+        )
+        self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
+        if request.mm_processor_kwargs is not None:
+            engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
+
+        return full_prompt, engine_prompt
+
+    async def _cross_encoding_score(
+        self,
+        tokenizer: TokenizerLike,
+        data_1: list[str] | list[ScoreContentPartParam],
+        data_2: list[str] | list[ScoreContentPartParam],
+        request: RerankRequest | ScoreRequest,
+        request_id: str,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        lora_request: LoRARequest | None | None = None,
+        trace_headers: Mapping[str, str] | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
+        request_prompts: list[str] = []
+        engine_prompts: list[TokensPrompt] = []
+
+        if len(data_1) == 1:
+            data_1 = data_1 * len(data_2)
+
+        if isinstance(tokenizer, MistralTokenizer):
+            raise ValueError("MistralTokenizer not supported for cross-encoding")
+
+        tokenization_kwargs = tokenization_kwargs or {}
+
+        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
+
+        preprocess_async = make_async(
+            self._preprocess_score, executor=self._tokenizer_executor
+        )
+
+        preprocessed_prompts = await asyncio.gather(
+            *(
+                preprocess_async(
+                    request=request,
+                    tokenizer=tokenizer,
+                    tokenization_kwargs=tokenization_kwargs,
+                    data_1=t1,
+                    data_2=t2,
+                )
+                for t1, t2 in input_pairs
+            )
+        )
+
+        for full_prompt, engine_prompt in preprocessed_prompts:
+            request_prompts.append(full_prompt)
+            engine_prompts.append(engine_prompt)
+
+        # Schedule the request and get the result generator.
+        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
+
+        default_pooling_params = request.to_pooling_params()
+
+        try:
+            default_pooling_params.verify("score", self.model_config)
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        for i, engine_prompt in enumerate(engine_prompts):
+            request_id_item = f"{request_id}-{i}"
+
+            self._log_inputs(
+                request_id_item,
+                request_prompts[i],
+                params=default_pooling_params,
+                lora_request=lora_request,
+            )
+
+            if token_type_ids := engine_prompt.pop("token_type_ids", None):
+                pooling_params = default_pooling_params.clone()
+                compressed = compress_token_type_ids(token_type_ids)
+                pooling_params.extra_kwargs = {"compressed_token_type_ids": compressed}
+            else:
+                pooling_params = default_pooling_params
+
+            generator = self.engine_client.encode(
+                engine_prompt,
+                pooling_params,
+                request_id_item,
+                lora_request=lora_request,
+                trace_headers=trace_headers,
+                priority=request.priority,
+            )
+
+            generators.append(generator)
+
+        result_generator = merge_async_iterators(*generators)
+
+        # Non-streaming response
+        final_res_batch: list[PoolingRequestOutput | None] = [None] * len(
+            engine_prompts
+        )
+
+        async for i, res in result_generator:
+            final_res_batch[i] = res
+
+        return [out for out in final_res_batch if out is not None]
+
+    async def _run_scoring(
+        self,
+        data_1: list[str] | str | ScoreMultiModalParam,
+        data_2: list[str] | str | ScoreMultiModalParam,
+        request: ScoreRequest | RerankRequest,
+        request_id: str,
+        raw_request: Request | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
+        lora_request = self._maybe_get_adapters(request)
+
+        tokenizer = await self.engine_client.get_tokenizer()
+
+        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
+
+        tokenization_kwargs: dict[str, Any] = {}
+        _validate_truncation_size(
+            self.max_model_len, truncate_prompt_tokens, tokenization_kwargs
+        )
+
+        trace_headers = (
+            None
+            if raw_request is None
+            else await self._get_trace_headers(raw_request.headers)
+        )
+
+        if not self.model_config.is_multimodal_model and (
+            isinstance(data_1, dict) or isinstance(data_2, dict)
+        ):
+            raise ValueError(
+                f"MultiModalParam is not supported for {self.model_config.architecture}"  # noqa: E501
+            )
+
+        if isinstance(data_1, str):
+            data_1 = [data_1]
+        elif isinstance(data_1, dict):
+            data_1 = data_1.get("content")  # type: ignore[assignment]
+
+        if isinstance(data_2, str):
+            data_2 = [data_2]
+        elif isinstance(data_2, dict):
+            data_2 = data_2.get("content")  # type: ignore[assignment]
+
+        _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]
+
+        if self.model_config.is_cross_encoder:
+            return await self._cross_encoding_score(
+                tokenizer=tokenizer,
+                data_1=data_1,  # type: ignore[arg-type]
+                data_2=data_2,  # type: ignore[arg-type]
+                request=request,
+                request_id=request_id,
+                tokenization_kwargs=tokenization_kwargs,
+                lora_request=lora_request,
+                trace_headers=trace_headers,
+            )
+
+        else:
+            return await self._embedding_score(
+                tokenizer=tokenizer,
+                texts_1=data_1,  # type: ignore[arg-type]
+                texts_2=data_2,  # type: ignore[arg-type]
+                request=request,
+                request_id=request_id,
+                tokenization_kwargs=tokenization_kwargs,
+                lora_request=lora_request,
+                trace_headers=trace_headers,
+            )
+
+    async def create_score(
+        self,
+        request: ScoreRequest,
+        raw_request: Request | None = None,
+    ) -> ScoreResponse | ErrorResponse:
+        """
+        Score API similar to Sentence Transformers cross encoder
+
+        See https://sbert.net/docs/package_reference/cross_encoder
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        request_id = f"score-{self._base_request_id(raw_request)}"
+        created_time = int(time.time())
+
+        try:
+            final_res_batch = await self._run_scoring(
+                request.text_1,
+                request.text_2,
+                request,
+                request_id,
+                raw_request,
+            )
+            if isinstance(final_res_batch, ErrorResponse):
+                return final_res_batch
+
+            return self.request_output_to_score_response(
+                final_res_batch,
+                request_id,
+                created_time,
+                self.models.model_name(),
+            )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+    async def do_rerank(
+        self, request: RerankRequest, raw_request: Request | None = None
+    ) -> RerankResponse | ErrorResponse:
+        """
+        Rerank API based on JinaAI's rerank API; implements the same
+        API interface. Designed for compatibility with off-the-shelf
+        tooling, since this is a common standard for reranking APIs
+
+        See example client implementations at
+        https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py
+        numerous clients use this standard.
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        request_id = f"rerank-{self._base_request_id(raw_request)}"
+        documents = request.documents
+        top_n = (
+            request.top_n
+            if request.top_n > 0
+            else (
+                len(documents)
+                if isinstance(documents, list)
+                else len(documents["content"])
+            )
+        )
+
+        try:
+            final_res_batch = await self._run_scoring(
+                request.query,
+                documents,
+                request,
+                request_id,
+                raw_request,
+            )
+            if isinstance(final_res_batch, ErrorResponse):
+                return final_res_batch
+
+            return self.request_output_to_rerank_response(
+                final_res_batch,
+                request_id,
+                self.models.model_name(),
+                documents,
+                top_n,
+            )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+    def request_output_to_score_response(
+        self,
+        final_res_batch: list[PoolingRequestOutput],
+        request_id: str,
+        created_time: int,
+        model_name: str,
+    ) -> ScoreResponse:
+        items: list[ScoreResponseData] = []
+        num_prompt_tokens = 0
+
+        for idx, final_res in enumerate(final_res_batch):
+            classify_res = ScoringRequestOutput.from_base(final_res)
+
+            item = ScoreResponseData(
+                index=idx,
+                score=classify_res.outputs.score,
+            )
+            prompt_token_ids = final_res.prompt_token_ids
+
+            items.append(item)
+            num_prompt_tokens += len(prompt_token_ids)
+
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            total_tokens=num_prompt_tokens,
+        )
+
+        return ScoreResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            data=items,
+            usage=usage,
+        )
+
+    def request_output_to_rerank_response(
+        self,
+        final_res_batch: list[PoolingRequestOutput],
+        request_id: str,
+        model_name: str,
+        documents: list[str] | ScoreMultiModalParam,
+        top_n: int,
+    ) -> RerankResponse:
+        """
+        Convert the output of do_rank to a RerankResponse
+        """
+        results: list[RerankResult] = []
+        num_prompt_tokens = 0
+        for idx, final_res in enumerate(final_res_batch):
+            classify_res = ScoringRequestOutput.from_base(final_res)
+
+            result = RerankResult(
+                index=idx,
+                document=RerankDocument(text=documents[idx])
+                if isinstance(documents, list)
+                else RerankDocument(multi_modal=documents["content"][idx]),
+                relevance_score=classify_res.outputs.score,
+            )
+            results.append(result)
+            prompt_token_ids = final_res.prompt_token_ids
+            num_prompt_tokens += len(prompt_token_ids)
+
+        # sort by relevance, then return the top n if set
+        results.sort(key=lambda x: x.relevance_score, reverse=True)
+        if top_n < len(documents):
+            results = results[:top_n]
+
+        return RerankResponse(
+            id=request_id,
+            model=model_name,
+            results=results,
+            usage=RerankUsage(
+                total_tokens=num_prompt_tokens, prompt_tokens=num_prompt_tokens
+            ),
+        )