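Fix issues: reject n > max_num_seqs, report cached prompt tokens in usage, and abort the engine request on client disconnect (among other changes)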

This commit is contained in:
2026-06-26 12:55:02 +08:00
parent 3d62430fd7
commit c84151eef9
9 changed files with 1879 additions and 5 deletions

View File

@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import (
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata,
ToolCall, UsageInfo)
PromptTokensDetails, ToolCall, UsageInfo)
from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
LoRAModulePath,
OpenAIServing,
@@ -179,6 +179,16 @@ class OpenAIServingChat(OpenAIServing):
logger.exception("Error in loading multi-modal data")
return self.create_error_response(str(e))
# Deadlock guard for n > max_num_seqs: the scheduler uses break (not
# continue) when can_schedule(num_new_seqs=n) fails, so a request whose n
# exceeds max_num_seqs permanently blocks the entire waiting queue without
# raising any error. Reject such requests up front instead.
_sched_cfg = await self.engine_client.get_scheduler_config()
_max_seqs = _sched_cfg.max_num_seqs
if request.n is not None and request.n > _max_seqs:
return self.create_error_response(
f"n={request.n} exceeds max_num_seqs={_max_seqs}. "
f"Use n<={_max_seqs} or omit n.")
# validation for OpenAI tools
# tool_choice = "required" is not supported
if request.tool_choice == "required":
@@ -318,6 +328,7 @@ class OpenAIServingChat(OpenAIServing):
previous_num_tokens = [0] * num_choices
finish_reason_sent = [False] * num_choices
num_prompt_tokens = 0
num_cached_tokens: Optional[int] = None
if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
tool_choice_function_name = request.tool_choice.function.name
@@ -385,6 +396,10 @@ class OpenAIServingChat(OpenAIServing):
num_prompt_tokens = len(res.prompt_token_ids)
if res.encoder_prompt_token_ids is not None:
num_prompt_tokens += len(res.encoder_prompt_token_ids)
if (num_cached_tokens is None
and res.metrics is not None
and res.metrics.num_cached_tokens is not None):
num_cached_tokens = res.metrics.num_cached_tokens
# We need to do it here, because if there are exceptions in
# the result_generator, it needs to be sent as the FIRST
@@ -691,6 +706,9 @@ class OpenAIServingChat(OpenAIServing):
completion_tokens=completion_tokens,
total_tokens=num_prompt_tokens + completion_tokens,
reasoning_tokens=total_reasoning,
prompt_tokens_details=(
PromptTokensDetails(cached_tokens=num_cached_tokens)
if num_cached_tokens is not None else None),
)
final_usage_chunk = ChatCompletionStreamResponse(
@@ -713,6 +731,10 @@ class OpenAIServingChat(OpenAIServing):
total_tokens=num_prompt_tokens + num_completion_tokens,
reasoning_tokens=total_reasoning)
except asyncio.CancelledError:
# Client disconnected; abort the engine request so GPU is freed.
await self.engine_client.abort(request_id)
return
except ValueError as e:
# TODO: Use a vllm-specific Validation Error
logger.error("error in chat completion stream generator: %s", e)
@@ -739,6 +761,7 @@ class OpenAIServingChat(OpenAIServing):
async for res in result_generator:
final_res = res
except asyncio.CancelledError:
await self.engine_client.abort(request_id)
return self.create_error_response("Client disconnected")
assert final_res is not None
@@ -881,11 +904,16 @@ class OpenAIServingChat(OpenAIServing):
total_reasoning_tokens = sum(
rp.count_reasoning_tokens(list(output.token_ids))
for output in final_res.outputs)
num_cached_tokens = (final_res.metrics.num_cached_tokens
if final_res.metrics is not None else None)
usage = UsageInfo(
prompt_tokens=num_prompt_tokens,
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
reasoning_tokens=total_reasoning_tokens,
prompt_tokens_details=(
PromptTokensDetails(cached_tokens=num_cached_tokens)
if num_cached_tokens is not None else None),
)
request_metadata.final_usage_info = usage