Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -38,6 +38,7 @@ from vllm.logprobs import Logprob
 from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs
 from vllm.sampling_params import (
     BeamSearchParams,
+    RepetitionDetectionParams,
     RequestOutputKind,
     SamplingParams,
     StructuredOutputsParams,
@@ -336,6 +337,16 @@ class ChatCompletionRequest(OpenAIBaseModel):
         ),
     )
 
+    repetition_detection: RepetitionDetectionParams | None = Field(
+        default=None,
+        description="Parameters for detecting repetitive N-gram patterns "
+        "in output tokens. If such repetition is detected, generation will "
+        "be ended early. LLMs can sometimes generate repetitive, unhelpful "
+        "token patterns, stopping only when they hit the maximum output length "
+        "(e.g. 'abcdabcdabcd...' or '\emoji \emoji \emoji ...'). This feature "
+        "can detect such behavior and terminate early, saving time and tokens.",
+    )
+
     # --8<-- [end:chat-completion-extra-params]
 
     def build_chat_params(
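For context, a sketch of how a client might exercise the new repetition_detection field through the OpenAI-compatible endpoint. Since the field is a vLLM extension, it goes through extra_body; the inner parameter names ("ngram_size", "max_repeats") are assumptions for illustration, not taken from this diff:

```python
# Hypothetical usage sketch. "ngram_size" and "max_repeats" are assumed
# field names for RepetitionDetectionParams; this commit does not show them.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Repeat 'abcd' forever."}],
    max_tokens=512,
    # extra_body forwards non-standard fields to vLLM's ChatCompletionRequest
    extra_body={"repetition_detection": {"ngram_size": 4, "max_repeats": 8}},
)
print(resp.choices[0].message.content)
```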
@@ -490,7 +501,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
             skip_special_tokens=self.skip_special_tokens,
             spaces_between_special_tokens=self.spaces_between_special_tokens,
             include_stop_str_in_output=self.include_stop_str_in_output,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             output_kind=RequestOutputKind.DELTA
             if self.stream
             else RequestOutputKind.FINAL_ONLY,
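The output_kind ternary above decides how results are reported back: streaming requests receive incremental deltas, non-streaming requests only the finished output. A minimal standalone sketch of the same selection:

```python
from vllm.sampling_params import RequestOutputKind

def pick_output_kind(stream: bool) -> RequestOutputKind:
    # Streaming clients want incremental DELTA chunks; non-streaming
    # clients only need the final aggregated result.
    return RequestOutputKind.DELTA if stream else RequestOutputKind.FINAL_ONLY
```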
@@ -500,8 +510,37 @@ class ChatCompletionRequest(OpenAIBaseModel):
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
             skip_clone=True,  # Created fresh per request, safe to skip clone
+            repetition_detection=self.repetition_detection,
         )
 
+    @model_validator(mode="before")
+    @classmethod
+    def validate_response_format(cls, data):
+        response_format = data.get("response_format")
+        if response_format is None:
+            return data
+
+        rf_type = (
+            response_format.get("type")
+            if isinstance(response_format, dict)
+            else getattr(response_format, "type", None)
+        )
+
+        if rf_type == "json_schema":
+            json_schema = (
+                response_format.get("json_schema")
+                if isinstance(response_format, dict)
+                else getattr(response_format, "json_schema", None)
+            )
+            if json_schema is None:
+                raise VLLMValidationError(
+                    "When response_format type is 'json_schema', the "
+                    "'json_schema' field must be provided.",
+                    parameter="response_format",
+                )
+
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def validate_stream_options(cls, data):
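A standalone recreation of the new validator's accept/reject behavior, with VLLMValidationError swapped for ValueError so the snippet runs without vLLM installed:

```python
def validate_response_format(data: dict) -> dict:
    # Mirrors the validator above for plain-dict payloads.
    response_format = data.get("response_format")
    if response_format is None:
        return data
    if (response_format.get("type") == "json_schema"
            and response_format.get("json_schema") is None):
        raise ValueError(
            "When response_format type is 'json_schema', the "
            "'json_schema' field must be provided."
        )
    return data

validate_response_format({"response_format": {"type": "json_object"}})  # ok
validate_response_format(
    {"response_format": {"type": "json_schema",
                         "json_schema": {"name": "s", "schema": {}}}}
)  # ok
validate_response_format({"response_format": {"type": "json_schema"}})  # raises
```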
@@ -1249,13 +1249,23 @@ class OpenAIServingChat(OpenAIServing):
                         )
 
                         # get the expected call based on partial JSON
-                        # parsing which "autocompletes" the JSON
-                        expected_call = json.dumps(
-                            tool_parser.prev_tool_call_arr[index].get(
-                                "arguments", {}
-                            ),
-                            ensure_ascii=False,
-                        )
+                        # parsing which "autocompletes" the JSON.
+                        # Tool parsers (e.g. Qwen3Coder) store
+                        # arguments as a JSON string in
+                        # prev_tool_call_arr. Calling json.dumps()
+                        # on an already-serialized string would
+                        # double-serialize it (e.g. '{"k":1}' becomes
+                        # '"{\\"k\\":1}"'), which then causes the
+                        # replace() below to fail and append the
+                        # entire double-serialized string as a
+                        # spurious final delta.
+                        args = tool_parser.prev_tool_call_arr[index].get(
+                            "arguments", {}
+                        )
+                        if isinstance(args, str):
+                            expected_call = args
+                        else:
+                            expected_call = json.dumps(args, ensure_ascii=False)
 
                         # get what we've streamed so far for arguments
                         # for the current tool
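The bug this hunk fixes is easy to reproduce in isolation: json.dumps() applied to arguments that are already a JSON string escapes them a second time, which is why the new code serializes only non-string values:

```python
import json

args_as_dict = {"k": 1}
args_as_str = '{"k": 1}'  # what parsers such as Qwen3Coder store

print(json.dumps(args_as_dict, ensure_ascii=False))  # {"k": 1}      (correct)
print(json.dumps(args_as_str, ensure_ascii=False))   # "{\"k\": 1}"  (double-serialized)

# The fix: only serialize when the stored arguments are not already a string.
for args in (args_as_dict, args_as_str):
    expected_call = args if isinstance(args, str) else json.dumps(args, ensure_ascii=False)
    print(expected_call)  # {"k": 1} both times
```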