Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -38,6 +38,7 @@ from vllm.logprobs import Logprob
from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs
from vllm.sampling_params import (
BeamSearchParams,
RepetitionDetectionParams,
RequestOutputKind,
SamplingParams,
StructuredOutputsParams,
@@ -336,6 +337,16 @@ class ChatCompletionRequest(OpenAIBaseModel):
),
)
repetition_detection: RepetitionDetectionParams | None = Field(
default=None,
description="Parameters for detecting repetitive N-gram patterns "
"in output tokens. If such repetition is detected, generation will "
"be ended early. LLMs can sometimes generate repetitive, unhelpful "
"token patterns, stopping only when they hit the maximum output length "
"(e.g. 'abcdabcdabcd...' or '\emoji \emoji \emoji ...'). This feature "
"can detect such behavior and terminate early, saving time and tokens.",
)
# --8<-- [end:chat-completion-extra-params]
def build_chat_params(
@@ -490,7 +501,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
skip_special_tokens=self.skip_special_tokens,
spaces_between_special_tokens=self.spaces_between_special_tokens,
include_stop_str_in_output=self.include_stop_str_in_output,
truncate_prompt_tokens=self.truncate_prompt_tokens,
output_kind=RequestOutputKind.DELTA
if self.stream
else RequestOutputKind.FINAL_ONLY,
@@ -500,8 +510,37 @@ class ChatCompletionRequest(OpenAIBaseModel):
allowed_token_ids=self.allowed_token_ids,
extra_args=extra_args or None,
skip_clone=True, # Created fresh per request, safe to skip clone
repetition_detection=self.repetition_detection,
)
@model_validator(mode="before")
@classmethod
def validate_response_format(cls, data):
response_format = data.get("response_format")
if response_format is None:
return data
rf_type = (
response_format.get("type")
if isinstance(response_format, dict)
else getattr(response_format, "type", None)
)
if rf_type == "json_schema":
json_schema = (
response_format.get("json_schema")
if isinstance(response_format, dict)
else getattr(response_format, "json_schema", None)
)
if json_schema is None:
raise VLLMValidationError(
"When response_format type is 'json_schema', the "
"'json_schema' field must be provided.",
parameter="response_format",
)
return data
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):

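Below is a hedged sketch of how a client could exercise the new `repetition_detection` request field through the OpenAI Python SDK's `extra_body`. The inner parameter names are illustrative placeholders only; the real field names are defined by `RepetitionDetectionParams` in `vllm.sampling_params`, which this diff does not show.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Count to ten."}],
    # Field names inside repetition_detection are assumptions for illustration only.
    extra_body={"repetition_detection": {"ngram_size": 4, "max_repeats": 8}},
)
print(resp.choices[0].message.content)
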
View File

@@ -1249,13 +1249,23 @@ class OpenAIServingChat(OpenAIServing):
)
# get the expected call based on partial JSON
# parsing which "autocompletes" the JSON
expected_call = json.dumps(
tool_parser.prev_tool_call_arr[index].get(
"arguments", {}
),
ensure_ascii=False,
# parsing which "autocompletes" the JSON.
# Tool parsers (e.g. Qwen3Coder) store
# arguments as a JSON string in
# prev_tool_call_arr. Calling json.dumps()
# on an already-serialized string would
# double-serialize it (e.g. '{"k":1}' becomes
# '"{\\"k\\":1}"'), which then causes the
# replace() below to fail and append the
# entire double-serialized string as a
# spurious final delta.
args = tool_parser.prev_tool_call_arr[index].get(
"arguments", {}
)
if isinstance(args, str):
expected_call = args
else:
expected_call = json.dumps(args, ensure_ascii=False)
# get what we've streamed so far for arguments
# for the current tool

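A minimal, self-contained illustration of the double-serialization problem the comment above describes: calling json.dumps() on arguments that are already a JSON string escapes them a second time, so the streamed prefix no longer matches and replace() appends the whole string as a spurious final delta.

import json

args_as_dict = {"k": 1}
args_as_str = '{"k": 1}'  # what parsers such as Qwen3Coder may store

print(json.dumps(args_as_dict, ensure_ascii=False))  # {"k": 1}      -> correct
print(json.dumps(args_as_str, ensure_ascii=False))   # "{\"k\": 1}"  -> double-serialized

# The fix in the hunk above: only serialize when the value is not already a string.
args = args_as_str
expected_call = args if isinstance(args, str) else json.dumps(args, ensure_ascii=False)
print(expected_call)  # {"k": 1}
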
View File

@@ -143,7 +143,8 @@ class BaseFrontendArgs:
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If set to True, log model outputs (generations).
Requires --enable-log-requests."""
Requires `--enable-log-requests`. As with `--enable-log-requests`,
information is only logged at INFO level at maximum."""
enable_log_deltas: bool = True
"""If set to False, output deltas will not be logged. Relevant only if
--enable-log-outputs is set.
@@ -277,6 +278,10 @@ class FrontendArgs(BaseFrontendArgs):
Enable offline FastAPI documentation for air-gapped environments.
Uses vendored static assets bundled with vLLM.
"""
use_gpu_for_pooling_score: bool = False
"""If set, run pooling score MaxSim on GPU in the API server process.
Can significantly improve late-interaction scoring performance.
https://github.com/vllm-project/vllm/pull/35330"""
@classmethod
def _customize_cli_kwargs(

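A hedged sketch of how these frontend options would be enabled on the command line; this is not vLLM's actual CLI wiring. `--enable-log-requests` and `--enable-log-outputs` are confirmed by the docstrings in this hunk, while `--use-gpu-for-pooling-score` is only the assumed kebab-case form of the new `use_gpu_for_pooling_score` field.

import subprocess

cmd = [
    "vllm", "serve", "my-model",    # "my-model" is a placeholder
    "--enable-log-requests",        # prerequisite for output logging
    "--enable-log-outputs",         # log generations (INFO level at most)
    "--use-gpu-for-pooling-score",  # assumed flag name, see PR #35330
]
# subprocess.run(cmd)  # launching a server is out of scope for this sketch
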
View File

@@ -26,6 +26,7 @@ from vllm.logprobs import Logprob
from vllm.renderers import TokenizeParams
from vllm.sampling_params import (
BeamSearchParams,
RepetitionDetectionParams,
RequestOutputKind,
SamplingParams,
StructuredOutputsParams,
@@ -166,6 +167,16 @@ class CompletionRequest(OpenAIBaseModel):
),
)
repetition_detection: RepetitionDetectionParams | None = Field(
default=None,
description="Parameters for detecting repetitive N-gram patterns "
"in output tokens. If such repetition is detected, generation will "
"be ended early. LLMs can sometimes generate repetitive, unhelpful "
"token patterns, stopping only when they hit the maximum output length "
"(e.g. 'abcdabcdabcd...' or '\emoji \emoji \emoji ...'). This feature "
"can detect such behavior and terminate early, saving time and tokens.",
)
# --8<-- [end:completion-extra-params]
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
@@ -259,7 +270,7 @@ class CompletionRequest(OpenAIBaseModel):
structured_outputs_kwargs["json"] = json_schema.json_schema
elif response_format.type == "structural_tag":
structural_tag = response_format
assert structural_tag is not None and isinstance(
assert isinstance(
structural_tag,
(
LegacyStructuralTagResponseFormat,
@@ -302,7 +313,6 @@ class CompletionRequest(OpenAIBaseModel):
skip_special_tokens=self.skip_special_tokens,
spaces_between_special_tokens=self.spaces_between_special_tokens,
include_stop_str_in_output=self.include_stop_str_in_output,
truncate_prompt_tokens=self.truncate_prompt_tokens,
output_kind=RequestOutputKind.DELTA
if self.stream
else RequestOutputKind.FINAL_ONLY,
@@ -311,8 +321,37 @@ class CompletionRequest(OpenAIBaseModel):
allowed_token_ids=self.allowed_token_ids,
extra_args=extra_args or None,
skip_clone=True, # Created fresh per request, safe to skip clone
repetition_detection=self.repetition_detection,
)
@model_validator(mode="before")
@classmethod
def validate_response_format(cls, data):
response_format = data.get("response_format")
if response_format is None:
return data
rf_type = (
response_format.get("type")
if isinstance(response_format, dict)
else getattr(response_format, "type", None)
)
if rf_type == "json_schema":
json_schema = (
response_format.get("json_schema")
if isinstance(response_format, dict)
else getattr(response_format, "json_schema", None)
)
if json_schema is None:
raise VLLMValidationError(
"When response_format type is 'json_schema', the "
"'json_schema' field must be provided.",
parameter="response_format",
)
return data
@model_validator(mode="before")
@classmethod
def check_structured_outputs_count(cls, data):

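A minimal sketch of the contract the new validate_response_format validator enforces, shown as plain request dicts. The nested json_schema payload shape follows the usual OpenAI convention and is illustrative only.

accepted = {
    "model": "my-model",
    "prompt": "Return JSON.",
    "response_format": {
        "type": "json_schema",
        "json_schema": {"name": "answer", "schema": {"type": "object"}},
    },
}

rejected = {
    "model": "my-model",
    "prompt": "Return JSON.",
    # type is "json_schema" but the "json_schema" field is missing, so the
    # validator raises VLLMValidationError(parameter="response_format")
    # before the request ever reaches sampling.
    "response_format": {"type": "json_schema"},
}
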
View File

@@ -62,11 +62,6 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranscriptionResponse,
TranslationRequest,
)
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
ClassificationResponse,
)
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingChatRequest,
@@ -161,7 +156,6 @@ CompletionLikeRequest: TypeAlias = (
| TokenizeCompletionRequest
| DetokenizeRequest
| EmbeddingCompletionRequest
| ClassificationCompletionRequest
| RerankRequest
| ScoreRequest
| PoolingCompletionRequest
@@ -171,7 +165,6 @@ ChatLikeRequest: TypeAlias = (
ChatCompletionRequest
| TokenizeChatRequest
| EmbeddingChatRequest
| ClassificationChatRequest
| PoolingChatRequest
)
@@ -194,12 +187,10 @@ AnyResponse: TypeAlias = (
| TranscriptionResponse
| TokenizeResponse
| PoolingResponse
| ClassificationResponse
| ScoreResponse
| GenerateResponse
)
RequestT = TypeVar("RequestT", bound=AnyRequest)
@@ -223,8 +214,8 @@ class ServeContext(Generic[RequestT]):
class OpenAIServing:
request_id_prefix: ClassVar[str] = """
A short string prepended to every requests ID (e.g. "embd", "classify")
so you can easily tell “this ID came from Embedding vs Classification.”
A short string prepended to every requests ID (e.g. "embd")
so you can easily tell “this ID came from Embedding.”
"""
def __init__(
@@ -456,7 +447,7 @@ class OpenAIServing:
) -> ErrorResponse | None:
"""
Default preprocessing hook. Subclasses may override
to prepare `ctx` (classification, embedding, etc.).
to prepare `ctx` (embedding, etc.).
"""
return None
@@ -817,7 +808,7 @@ class OpenAIServing:
token_num = len(input_ids)
max_model_len = self.model_config.max_model_len
# Note: EmbeddingRequest, ClassificationRequest,
# Note: EmbeddingRequest,
# and ScoreRequest don't have max_tokens
if isinstance(
request,
@@ -828,8 +819,6 @@ class OpenAIServing:
ScoreTextRequest,
ScoreQueriesDocumentsRequest,
RerankRequest,
ClassificationCompletionRequest,
ClassificationChatRequest,
),
):
# Note: input length can be up to the entire model context length
@@ -839,8 +828,6 @@ class OpenAIServing:
ScoreDataRequest: "score",
ScoreTextRequest: "score",
ScoreQueriesDocumentsRequest: "score",
ClassificationCompletionRequest: "classification",
ClassificationChatRequest: "classification",
}
operation = operations.get(type(request), "embedding generation")
raise VLLMValidationError(

View File

@@ -328,8 +328,9 @@ class ResponsesRequest(OpenAIBaseModel):
# Also check text.format for OpenAI-style json_schema
if self.text is not None and self.text.format is not None:
if structured_outputs is not None:
raise ValueError(
"Cannot specify both structured_outputs and text.format"
raise VLLMValidationError(
"Cannot specify both structured_outputs and text.format",
parameter="structured_outputs",
)
response_format = self.text.format
if (
@@ -378,14 +379,19 @@ class ResponsesRequest(OpenAIBaseModel):
)
@model_validator(mode="before")
@classmethod
def validate_background(cls, data):
if not data.get("background"):
return data
if not data.get("store", True):
raise ValueError("background can only be used when `store` is true")
raise VLLMValidationError(
"background can only be used when `store` is true",
parameter="background",
)
return data
@model_validator(mode="before")
@classmethod
def validate_prompt(cls, data):
if data.get("prompt") is not None:
raise VLLMValidationError(
@@ -394,16 +400,19 @@ class ResponsesRequest(OpenAIBaseModel):
return data
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None and (
not isinstance(data["cache_salt"], str) or not data["cache_salt"]
):
raise ValueError(
"Parameter 'cache_salt' must be a non-empty string if provided."
raise VLLMValidationError(
"Parameter 'cache_salt' must be a non-empty string if provided.",
parameter="cache_salt",
)
return data
@model_validator(mode="before")
@classmethod
def function_call_parsing(cls, data):
"""Parse function_call dictionaries into ResponseFunctionToolCall objects.
This ensures Pydantic can properly resolve union types in the input field.

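A small self-contained mirror of the rules the validators above enforce, written against plain dicts so the rejection cases are easy to see. The real code raises VLLMValidationError with a parameter= argument; plain ValueError is used here only to keep the sketch dependency-free.

def check_responses_request(data: dict) -> dict:
    # background requires store (which defaults to True)
    if data.get("background") and not data.get("store", True):
        raise ValueError("background can only be used when `store` is true")
    # cache_salt, when given, must be a non-empty string
    cache_salt = data.get("cache_salt")
    if cache_salt is not None and (not isinstance(cache_salt, str) or not cache_salt):
        raise ValueError("Parameter 'cache_salt' must be a non-empty string if provided.")
    return data

check_responses_request({"background": True})         # accepted (store defaults to True)
check_responses_request({"cache_salt": "tenant-42"})  # accepted
# check_responses_request({"background": True, "store": False})  # would raise
# check_responses_request({"cache_salt": ""})                    # would raise
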
View File

@@ -85,6 +85,8 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponseCreatedEvent,
ResponseInProgressEvent,
ResponseInputOutputMessage,
ResponseReasoningPartAddedEvent,
ResponseReasoningPartDoneEvent,
ResponsesRequest,
ResponsesResponse,
ResponseUsage,
@@ -1339,6 +1341,19 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
yield _increment_sequence_number_and_return(
ResponseReasoningPartAddedEvent(
type="response.reasoning_part.added",
sequence_number=-1,
output_index=current_output_index,
item_id=current_item_id,
content_index=current_content_index,
part=ResponseReasoningTextContent(
text="",
type="reasoning_text",
),
)
)
else:
yield _increment_sequence_number_and_return(
ResponseOutputItemAddedEvent(
@@ -1354,22 +1369,21 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
yield _increment_sequence_number_and_return(
ResponseContentPartAddedEvent(
type="response.content_part.added",
sequence_number=-1,
output_index=current_output_index,
item_id=current_item_id,
content_index=current_content_index,
part=ResponseOutputText(
type="output_text",
text="",
annotations=[],
logprobs=[],
),
yield _increment_sequence_number_and_return(
ResponseContentPartAddedEvent(
type="response.content_part.added",
sequence_number=-1,
output_index=current_output_index,
item_id=current_item_id,
content_index=current_content_index,
part=ResponseOutputText(
type="output_text",
text="",
annotations=[],
logprobs=[],
),
)
)
)
current_content_index += 1
first_delta_sent = True
# todo(kebe7jun) tool call support
@@ -1397,6 +1411,19 @@ class OpenAIServingResponses(OpenAIServing):
text=reason_content,
)
)
yield _increment_sequence_number_and_return(
ResponseReasoningPartDoneEvent(
type="response.reasoning_part.done",
sequence_number=-1,
item_id=current_item_id,
output_index=current_output_index,
content_index=current_content_index,
part=ResponseReasoningTextContent(
text=reason_content,
type="reasoning_text",
),
)
)
current_content_index = 0
reasoning_item = ResponseReasoningItem(
type="reasoning",
@@ -1418,6 +1445,8 @@ class OpenAIServingResponses(OpenAIServing):
item=reasoning_item,
)
)
current_output_index += 1
current_item_id = str(uuid.uuid4())
yield _increment_sequence_number_and_return(
ResponseOutputItemAddedEvent(
type="response.output_item.added",
@@ -1432,8 +1461,6 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
current_output_index += 1
current_item_id = str(uuid.uuid4())
yield _increment_sequence_number_and_return(
ResponseContentPartAddedEvent(
type="response.content_part.added",
@@ -1449,7 +1476,6 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
current_content_index += 1
# reset previous delta messages
previous_delta_messages = []
@@ -1485,7 +1511,6 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
current_content_index += 1
previous_delta_messages.append(delta_message)
if previous_delta_messages:
@@ -1505,7 +1530,19 @@ class OpenAIServingResponses(OpenAIServing):
text=reason_content,
)
)
current_content_index += 1
yield _increment_sequence_number_and_return(
ResponseReasoningPartDoneEvent(
type="response.reasoning_part.done",
sequence_number=-1,
item_id=current_item_id,
output_index=current_output_index,
content_index=current_content_index,
part=ResponseReasoningTextContent(
text=reason_content,
type="reasoning_text",
),
)
)
reasoning_item = ResponseReasoningItem(
type="reasoning",
content=[
@@ -1543,7 +1580,6 @@ class OpenAIServingResponses(OpenAIServing):
item_id=current_item_id,
)
)
current_content_index += 1
part = ResponseOutputText(
text=final_content,
type="output_text",
@@ -1559,7 +1595,6 @@ class OpenAIServingResponses(OpenAIServing):
part=part,
)
)
current_content_index += 1
item = ResponseOutputMessage(
type="message",
role="assistant",

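A hedged sketch of where the two new reasoning-part events land in a stream that contains reasoning followed by regular text, based only on the ordering visible in this hunk; intermediate delta events and any surrounding events not shown here are elided.

expected_order = [
    "response.output_item.added",     # reasoning item opened
    "response.reasoning_part.added",  # new: empty reasoning_text part announced
    # ... reasoning deltas stream here ...
    "response.reasoning_part.done",   # new: carries the final reasoning text
    # the reasoning item completes, then output_index is incremented and a fresh
    # item_id is generated *before* the message item is announced:
    "response.output_item.added",     # message item
    "response.content_part.added",    # empty output_text part announced
    # ... output_text deltas stream here ...
]
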
View File

@@ -11,6 +11,7 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast
import numpy as np
from fastapi import Request
from soundfile import LibsndfileError
from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
@@ -57,6 +58,14 @@ try:
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
TranscriptionResponseVerbose | TranslationResponseVerbose
@@ -315,9 +324,15 @@ class OpenAISpeechToText(OpenAIServing):
)
with io.BytesIO(audio_data) as bytes_:
# NOTE resample to model SR here for efficiency. This is also a
# pre-requisite for chunking, as it assumes Whisper SR.
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
try:
# NOTE resample to model SR here for efficiency. This is also a
# pre-requisite for chunking, as it assumes Whisper SR.
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
except LibsndfileError as exc:
# Distinguish client errors (invalid audio) from server errors
if exc.code in _BAD_SF_CODES:
raise ValueError("Invalid or unsupported audio file.") from exc
raise
duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = (

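A standalone sketch of the error-classification pattern added above: decode with librosa (soundfile backend) and treat only the libsndfile codes listed in _BAD_SF_CODES as client errors, re-raising anything else so it surfaces as a server-side failure. The 16 kHz default below is an assumption standing in for the model's configured sample rate.

import io

import librosa
from soundfile import LibsndfileError

BAD_SF_CODES = {1, 3, 4}  # unrecognised format, malformed file, unsupported encoding


def load_audio_or_client_error(audio_bytes: bytes, sample_rate: int = 16000):
    with io.BytesIO(audio_bytes) as buf:
        try:
            return librosa.load(buf, sr=sample_rate)
        except LibsndfileError as exc:
            if exc.code in BAD_SF_CODES:
                # invalid/unsupported upload -> client error (HTTP 400 territory)
                raise ValueError("Invalid or unsupported audio file.") from exc
            raise  # anything else is unexpected: let it surface as a server error
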
View File

@@ -1,12 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"The 'vllm.entrypoints.openai.translations' module has been renamed to "
"'vllm.entrypoints.openai.speech_to_text'. Please update your imports. "
"This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.api_router' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.api_router'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.api_router import * # noqa: F401,F403,E402

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.protocol' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.protocol'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.protocol import * # noqa: F401,F403,E402

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.serving' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.serving'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.serving import * # noqa: F401,F403,E402

View File

@@ -1,15 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.speech_to_text' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.speech_to_text'. Please update "
"your imports. This backward-compatible alias will be removed in version "
"0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.speech_to_text import * # noqa: F401,F403,E402