Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -38,6 +38,7 @@ from vllm.logprobs import Logprob
from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs
from vllm.sampling_params import (
BeamSearchParams,
RepetitionDetectionParams,
RequestOutputKind,
SamplingParams,
StructuredOutputsParams,
@@ -336,6 +337,16 @@ class ChatCompletionRequest(OpenAIBaseModel):
),
)
repetition_detection: RepetitionDetectionParams | None = Field(
default=None,
description="Parameters for detecting repetitive N-gram patterns "
"in output tokens. If such repetition is detected, generation will "
"be ended early. LLMs can sometimes generate repetitive, unhelpful "
"token patterns, stopping only when they hit the maximum output length "
"(e.g. 'abcdabcdabcd...' or '\emoji \emoji \emoji ...'). This feature "
"can detect such behavior and terminate early, saving time and tokens.",
)
# --8<-- [end:chat-completion-extra-params]
def build_chat_params(
@@ -490,7 +501,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
skip_special_tokens=self.skip_special_tokens,
spaces_between_special_tokens=self.spaces_between_special_tokens,
include_stop_str_in_output=self.include_stop_str_in_output,
truncate_prompt_tokens=self.truncate_prompt_tokens,
output_kind=RequestOutputKind.DELTA
if self.stream
else RequestOutputKind.FINAL_ONLY,
@@ -500,8 +510,37 @@ class ChatCompletionRequest(OpenAIBaseModel):
allowed_token_ids=self.allowed_token_ids,
extra_args=extra_args or None,
skip_clone=True, # Created fresh per request, safe to skip clone
repetition_detection=self.repetition_detection,
)
@model_validator(mode="before")
@classmethod
def validate_response_format(cls, data):
response_format = data.get("response_format")
if response_format is None:
return data
rf_type = (
response_format.get("type")
if isinstance(response_format, dict)
else getattr(response_format, "type", None)
)
if rf_type == "json_schema":
json_schema = (
response_format.get("json_schema")
if isinstance(response_format, dict)
else getattr(response_format, "json_schema", None)
)
if json_schema is None:
raise VLLMValidationError(
"When response_format type is 'json_schema', the "
"'json_schema' field must be provided.",
parameter="response_format",
)
return data
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):

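Below is a hedged sketch of how a client could exercise the new `repetition_detection` request field through the OpenAI Python SDK's `extra_body`. The inner parameter names are illustrative placeholders only; the real field names are defined by `RepetitionDetectionParams` in `vllm.sampling_params`, which this diff does not show.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Count to ten."}],
    # Field names inside repetition_detection are assumptions for illustration only.
    extra_body={"repetition_detection": {"ngram_size": 4, "max_repeats": 8}},
)
print(resp.choices[0].message.content)
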
View File

@@ -1249,13 +1249,23 @@ class OpenAIServingChat(OpenAIServing):
)
# get the expected call based on partial JSON
# parsing which "autocompletes" the JSON
expected_call = json.dumps(
tool_parser.prev_tool_call_arr[index].get(
"arguments", {}
),
ensure_ascii=False,
# parsing which "autocompletes" the JSON.
# Tool parsers (e.g. Qwen3Coder) store
# arguments as a JSON string in
# prev_tool_call_arr. Calling json.dumps()
# on an already-serialized string would
# double-serialize it (e.g. '{"k":1}' becomes
# '"{\\"k\\":1}"'), which then causes the
# replace() below to fail and append the
# entire double-serialized string as a
# spurious final delta.
args = tool_parser.prev_tool_call_arr[index].get(
"arguments", {}
)
if isinstance(args, str):
expected_call = args
else:
expected_call = json.dumps(args, ensure_ascii=False)
# get what we've streamed so far for arguments
# for the current tool

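A minimal, self-contained illustration of the double-serialization problem the comment above describes: calling json.dumps() on arguments that are already a JSON string escapes them a second time, so the streamed prefix no longer matches and replace() appends the whole string as a spurious final delta.

import json

args_as_dict = {"k": 1}
args_as_str = '{"k": 1}'  # what parsers such as Qwen3Coder may store

print(json.dumps(args_as_dict, ensure_ascii=False))  # {"k": 1}      -> correct
print(json.dumps(args_as_str, ensure_ascii=False))   # "{\"k\": 1}"  -> double-serialized

# The fix in the hunk above: only serialize when the value is not already a string.
args = args_as_str
expected_call = args if isinstance(args, str) else json.dumps(args, ensure_ascii=False)
print(expected_call)  # {"k": 1}
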
View File

@@ -143,7 +143,8 @@ class BaseFrontendArgs:
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If set to True, log model outputs (generations).
Requires --enable-log-requests."""
Requires `--enable-log-requests`. As with `--enable-log-requests`,
information is only logged at INFO level at maximum."""
enable_log_deltas: bool = True
"""If set to False, output deltas will not be logged. Relevant only if
--enable-log-outputs is set.
@@ -277,6 +278,10 @@ class FrontendArgs(BaseFrontendArgs):
Enable offline FastAPI documentation for air-gapped environments.
Uses vendored static assets bundled with vLLM.
"""
use_gpu_for_pooling_score: bool = False
"""If set, run pooling score MaxSim on GPU in the API server process.
Can significantly improve late-interaction scoring performance.
https://github.com/vllm-project/vllm/pull/35330"""
@classmethod
def _customize_cli_kwargs(

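A hedged sketch of how these frontend options would be enabled on the command line; this is not vLLM's actual CLI wiring. `--enable-log-requests` and `--enable-log-outputs` are confirmed by the docstrings in this hunk, while `--use-gpu-for-pooling-score` is only the assumed kebab-case form of the new `use_gpu_for_pooling_score` field.

import subprocess

cmd = [
    "vllm", "serve", "my-model",    # "my-model" is a placeholder
    "--enable-log-requests",        # prerequisite for output logging
    "--enable-log-outputs",         # log generations (INFO level at most)
    "--use-gpu-for-pooling-score",  # assumed flag name, see PR #35330
]
# subprocess.run(cmd)  # launching a server is out of scope for this sketch
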
View File

@@ -26,6 +26,7 @@ from vllm.logprobs import Logprob
from vllm.renderers import TokenizeParams
from vllm.sampling_params import (
BeamSearchParams,
RepetitionDetectionParams,
RequestOutputKind,
SamplingParams,
StructuredOutputsParams,
@@ -166,6 +167,16 @@ class CompletionRequest(OpenAIBaseModel):
),
)
repetition_detection: RepetitionDetectionParams | None = Field(
default=None,
description="Parameters for detecting repetitive N-gram patterns "
"in output tokens. If such repetition is detected, generation will "
"be ended early. LLMs can sometimes generate repetitive, unhelpful "
"token patterns, stopping only when they hit the maximum output length "
"(e.g. 'abcdabcdabcd...' or '\emoji \emoji \emoji ...'). This feature "
"can detect such behavior and terminate early, saving time and tokens.",
)
# --8<-- [end:completion-extra-params]
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
@@ -259,7 +270,7 @@ class CompletionRequest(OpenAIBaseModel):
structured_outputs_kwargs["json"] = json_schema.json_schema
elif response_format.type == "structural_tag":
structural_tag = response_format
assert structural_tag is not None and isinstance(
assert isinstance(
structural_tag,
(
LegacyStructuralTagResponseFormat,
@@ -302,7 +313,6 @@ class CompletionRequest(OpenAIBaseModel):
skip_special_tokens=self.skip_special_tokens,
spaces_between_special_tokens=self.spaces_between_special_tokens,
include_stop_str_in_output=self.include_stop_str_in_output,
truncate_prompt_tokens=self.truncate_prompt_tokens,
output_kind=RequestOutputKind.DELTA
if self.stream
else RequestOutputKind.FINAL_ONLY,
@@ -311,8 +321,37 @@ class CompletionRequest(OpenAIBaseModel):
allowed_token_ids=self.allowed_token_ids,
extra_args=extra_args or None,
skip_clone=True, # Created fresh per request, safe to skip clone
repetition_detection=self.repetition_detection,
)
@model_validator(mode="before")
@classmethod
def validate_response_format(cls, data):
response_format = data.get("response_format")
if response_format is None:
return data
rf_type = (
response_format.get("type")
if isinstance(response_format, dict)
else getattr(response_format, "type", None)
)
if rf_type == "json_schema":
json_schema = (
response_format.get("json_schema")
if isinstance(response_format, dict)
else getattr(response_format, "json_schema", None)
)
if json_schema is None:
raise VLLMValidationError(
"When response_format type is 'json_schema', the "
"'json_schema' field must be provided.",
parameter="response_format",
)
return data
@model_validator(mode="before")
@classmethod
def check_structured_outputs_count(cls, data):

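A minimal sketch of the contract the new validate_response_format validator enforces, shown as plain request dicts. The nested json_schema payload shape follows the usual OpenAI convention and is illustrative only.

accepted = {
    "model": "my-model",
    "prompt": "Return JSON.",
    "response_format": {
        "type": "json_schema",
        "json_schema": {"name": "answer", "schema": {"type": "object"}},
    },
}

rejected = {
    "model": "my-model",
    "prompt": "Return JSON.",
    # type is "json_schema" but the "json_schema" field is missing, so the
    # validator raises VLLMValidationError(parameter="response_format")
    # before the request ever reaches sampling.
    "response_format": {"type": "json_schema"},
}
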
View File

@@ -62,11 +62,6 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranscriptionResponse,
TranslationRequest,
)
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
ClassificationResponse,
)
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingChatRequest,
@@ -161,7 +156,6 @@ CompletionLikeRequest: TypeAlias = (
| TokenizeCompletionRequest
| DetokenizeRequest
| EmbeddingCompletionRequest
| ClassificationCompletionRequest
| RerankRequest
| ScoreRequest
| PoolingCompletionRequest
@@ -171,7 +165,6 @@ ChatLikeRequest: TypeAlias = (
ChatCompletionRequest
| TokenizeChatRequest
| EmbeddingChatRequest
| ClassificationChatRequest
| PoolingChatRequest
)
@@ -194,12 +187,10 @@ AnyResponse: TypeAlias = (
| TranscriptionResponse
| TokenizeResponse
| PoolingResponse
| ClassificationResponse
| ScoreResponse
| GenerateResponse
)
RequestT = TypeVar("RequestT", bound=AnyRequest)
@@ -223,8 +214,8 @@ class ServeContext(Generic[RequestT]):
class OpenAIServing:
request_id_prefix: ClassVar[str] = """
A short string prepended to every requests ID (e.g. "embd", "classify")
so you can easily tell “this ID came from Embedding vs Classification.”
A short string prepended to every requests ID (e.g. "embd")
so you can easily tell “this ID came from Embedding.”
"""
def __init__(
@@ -456,7 +447,7 @@ class OpenAIServing:
) -> ErrorResponse | None:
"""
Default preprocessing hook. Subclasses may override
to prepare `ctx` (classification, embedding, etc.).
to prepare `ctx` (embedding, etc.).
"""
return None
@@ -817,7 +808,7 @@ class OpenAIServing:
token_num = len(input_ids)
max_model_len = self.model_config.max_model_len
# Note: EmbeddingRequest, ClassificationRequest,
# Note: EmbeddingRequest,
# and ScoreRequest don't have max_tokens
if isinstance(
request,
@@ -828,8 +819,6 @@ class OpenAIServing:
ScoreTextRequest,
ScoreQueriesDocumentsRequest,
RerankRequest,
ClassificationCompletionRequest,
ClassificationChatRequest,
),
):
# Note: input length can be up to the entire model context length
@@ -839,8 +828,6 @@ class OpenAIServing:
ScoreDataRequest: "score",
ScoreTextRequest: "score",
ScoreQueriesDocumentsRequest: "score",
ClassificationCompletionRequest: "classification",
ClassificationChatRequest: "classification",
}
operation = operations.get(type(request), "embedding generation")
raise VLLMValidationError(

View File

@@ -328,8 +328,9 @@ class ResponsesRequest(OpenAIBaseModel):
# Also check text.format for OpenAI-style json_schema
if self.text is not None and self.text.format is not None:
if structured_outputs is not None:
raise ValueError(
"Cannot specify both structured_outputs and text.format"
raise VLLMValidationError(
"Cannot specify both structured_outputs and text.format",
parameter="structured_outputs",
)
response_format = self.text.format
if (
@@ -378,14 +379,19 @@ class ResponsesRequest(OpenAIBaseModel):
)
@model_validator(mode="before")
@classmethod
def validate_background(cls, data):
if not data.get("background"):
return data
if not data.get("store", True):
raise ValueError("background can only be used when `store` is true")
raise VLLMValidationError(
"background can only be used when `store` is true",
parameter="background",
)
return data
@model_validator(mode="before")
@classmethod
def validate_prompt(cls, data):
if data.get("prompt") is not None:
raise VLLMValidationError(
@@ -394,16 +400,19 @@ class ResponsesRequest(OpenAIBaseModel):
return data
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None and (
not isinstance(data["cache_salt"], str) or not data["cache_salt"]
):
raise ValueError(
"Parameter 'cache_salt' must be a non-empty string if provided."
raise VLLMValidationError(
"Parameter 'cache_salt' must be a non-empty string if provided.",
parameter="cache_salt",
)
return data
@model_validator(mode="before")
@classmethod
def function_call_parsing(cls, data):
"""Parse function_call dictionaries into ResponseFunctionToolCall objects.
This ensures Pydantic can properly resolve union types in the input field.

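A small self-contained mirror of the rules the validators above enforce, written against plain dicts so the rejection cases are easy to see. The real code raises VLLMValidationError with a parameter= argument; plain ValueError is used here only to keep the sketch dependency-free.

def check_responses_request(data: dict) -> dict:
    # background requires store (which defaults to True)
    if data.get("background") and not data.get("store", True):
        raise ValueError("background can only be used when `store` is true")
    # cache_salt, when given, must be a non-empty string
    cache_salt = data.get("cache_salt")
    if cache_salt is not None and (not isinstance(cache_salt, str) or not cache_salt):
        raise ValueError("Parameter 'cache_salt' must be a non-empty string if provided.")
    return data

check_responses_request({"background": True})         # accepted (store defaults to True)
check_responses_request({"cache_salt": "tenant-42"})  # accepted
# check_responses_request({"background": True, "store": False})  # would raise
# check_responses_request({"cache_salt": ""})                    # would raise
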
View File

@@ -85,6 +85,8 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponseCreatedEvent,
ResponseInProgressEvent,
ResponseInputOutputMessage,
ResponseReasoningPartAddedEvent,
ResponseReasoningPartDoneEvent,
ResponsesRequest,
ResponsesResponse,
ResponseUsage,
@@ -1339,6 +1341,19 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
yield _increment_sequence_number_and_return(
ResponseReasoningPartAddedEvent(
type="response.reasoning_part.added",
sequence_number=-1,
output_index=current_output_index,
item_id=current_item_id,
content_index=current_content_index,
part=ResponseReasoningTextContent(
text="",
type="reasoning_text",
),
)
)
else:
yield _increment_sequence_number_and_return(
ResponseOutputItemAddedEvent(
@@ -1354,22 +1369,21 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
yield _increment_sequence_number_and_return(
ResponseContentPartAddedEvent(
type="response.content_part.added",
sequence_number=-1,
output_index=current_output_index,
item_id=current_item_id,
content_index=current_content_index,
part=ResponseOutputText(
type="output_text",
text="",
annotations=[],
logprobs=[],
),
yield _increment_sequence_number_and_return(
ResponseContentPartAddedEvent(
type="response.content_part.added",
sequence_number=-1,
output_index=current_output_index,
item_id=current_item_id,
content_index=current_content_index,
part=ResponseOutputText(
type="output_text",
text="",
annotations=[],
logprobs=[],
),
)
)
)
current_content_index += 1
first_delta_sent = True
# todo(kebe7jun) tool call support
@@ -1397,6 +1411,19 @@ class OpenAIServingResponses(OpenAIServing):
text=reason_content,
)
)
yield _increment_sequence_number_and_return(
ResponseReasoningPartDoneEvent(
type="response.reasoning_part.done",
sequence_number=-1,
item_id=current_item_id,
output_index=current_output_index,
content_index=current_content_index,
part=ResponseReasoningTextContent(
text=reason_content,
type="reasoning_text",
),
)
)
current_content_index = 0
reasoning_item = ResponseReasoningItem(
type="reasoning",
@@ -1418,6 +1445,8 @@ class OpenAIServingResponses(OpenAIServing):
item=reasoning_item,
)
)
current_output_index += 1
current_item_id = str(uuid.uuid4())
yield _increment_sequence_number_and_return(
ResponseOutputItemAddedEvent(
type="response.output_item.added",
@@ -1432,8 +1461,6 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
current_output_index += 1
current_item_id = str(uuid.uuid4())
yield _increment_sequence_number_and_return(
ResponseContentPartAddedEvent(
type="response.content_part.added",
@@ -1449,7 +1476,6 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
current_content_index += 1
# reset previous delta messages
previous_delta_messages = []
@@ -1485,7 +1511,6 @@ class OpenAIServingResponses(OpenAIServing):
),
)
)
current_content_index += 1
previous_delta_messages.append(delta_message)
if previous_delta_messages:
@@ -1505,7 +1530,19 @@ class OpenAIServingResponses(OpenAIServing):
text=reason_content,
)
)
current_content_index += 1
yield _increment_sequence_number_and_return(
ResponseReasoningPartDoneEvent(
type="response.reasoning_part.done",
sequence_number=-1,
item_id=current_item_id,
output_index=current_output_index,
content_index=current_content_index,
part=ResponseReasoningTextContent(
text=reason_content,
type="reasoning_text",
),
)
)
reasoning_item = ResponseReasoningItem(
type="reasoning",
content=[
@@ -1543,7 +1580,6 @@ class OpenAIServingResponses(OpenAIServing):
item_id=current_item_id,
)
)
current_content_index += 1
part = ResponseOutputText(
text=final_content,
type="output_text",
@@ -1559,7 +1595,6 @@ class OpenAIServingResponses(OpenAIServing):
part=part,
)
)
current_content_index += 1
item = ResponseOutputMessage(
type="message",
role="assistant",

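A hedged sketch of where the two new reasoning-part events land in a stream that contains reasoning followed by regular text, based only on the ordering visible in this hunk; intermediate delta events and any surrounding events not shown here are elided.

expected_order = [
    "response.output_item.added",     # reasoning item opened
    "response.reasoning_part.added",  # new: empty reasoning_text part announced
    # ... reasoning deltas stream here ...
    "response.reasoning_part.done",   # new: carries the final reasoning text
    # the reasoning item completes, then output_index is incremented and a fresh
    # item_id is generated *before* the message item is announced:
    "response.output_item.added",     # message item
    "response.content_part.added",    # empty output_text part announced
    # ... output_text deltas stream here ...
]
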
View File

@@ -11,6 +11,7 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast
import numpy as np
from fastapi import Request
from soundfile import LibsndfileError
from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
@@ -57,6 +58,14 @@ try:
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
TranscriptionResponseVerbose | TranslationResponseVerbose
@@ -315,9 +324,15 @@ class OpenAISpeechToText(OpenAIServing):
)
with io.BytesIO(audio_data) as bytes_:
# NOTE resample to model SR here for efficiency. This is also a
# pre-requisite for chunking, as it assumes Whisper SR.
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
try:
# NOTE resample to model SR here for efficiency. This is also a
# pre-requisite for chunking, as it assumes Whisper SR.
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
except LibsndfileError as exc:
# Distinguish client errors (invalid audio) from server errors
if exc.code in _BAD_SF_CODES:
raise ValueError("Invalid or unsupported audio file.") from exc
raise
duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = (

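A standalone sketch of the error-classification pattern added above: decode with librosa (soundfile backend) and treat only the libsndfile codes listed in _BAD_SF_CODES as client errors, re-raising anything else so it surfaces as a server-side failure. The 16 kHz default below is an assumption standing in for the model's configured sample rate.

import io

import librosa
from soundfile import LibsndfileError

BAD_SF_CODES = {1, 3, 4}  # unrecognised format, malformed file, unsupported encoding


def load_audio_or_client_error(audio_bytes: bytes, sample_rate: int = 16000):
    with io.BytesIO(audio_bytes) as buf:
        try:
            return librosa.load(buf, sr=sample_rate)
        except LibsndfileError as exc:
            if exc.code in BAD_SF_CODES:
                # invalid/unsupported upload -> client error (HTTP 400 territory)
                raise ValueError("Invalid or unsupported audio file.") from exc
            raise  # anything else is unexpected: let it surface as a server error
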
View File

@@ -1,12 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"The 'vllm.entrypoints.openai.translations' module has been renamed to "
"'vllm.entrypoints.openai.speech_to_text'. Please update your imports. "
"This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.api_router' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.api_router'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.api_router import * # noqa: F401,F403,E402

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.protocol' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.protocol'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.protocol import * # noqa: F401,F403,E402

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.serving' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.serving'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.serving import * # noqa: F401,F403,E402

View File

@@ -1,15 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.speech_to_text' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.speech_to_text'. Please update "
"your imports. This backward-compatible alias will be removed in version "
"0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.speech_to_text import * # noqa: F401,F403,E402