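Fix issues: reject n > max_num_seqs, report cached prompt tokens in usage, abort engine request on client disconnect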
This commit is contained in:
@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
|
||||
ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
|
||||
DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata,
|
||||
ToolCall, UsageInfo)
|
||||
PromptTokensDetails, ToolCall, UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
|
||||
LoRAModulePath,
|
||||
OpenAIServing,
|
||||
@@ -179,6 +179,16 @@ class OpenAIServingChat(OpenAIServing):
|
||||
logger.exception("Error in loading multi-modal data")
|
||||
return self.create_error_response(str(e))
|
||||
|
||||
# n > max_num_seqs deadlock guard: scheduler uses break (not continue)
|
||||
# when can_schedule(num_new_seqs=n) fails, so an n that exceeds
|
||||
# max_num_seqs permanently blocks the entire waiting queue with no error.
|
||||
_sched_cfg = await self.engine_client.get_scheduler_config()
|
||||
_max_seqs = _sched_cfg.max_num_seqs
|
||||
if request.n is not None and request.n > _max_seqs:
|
||||
return self.create_error_response(
|
||||
f"n={request.n} exceeds max_num_seqs={_max_seqs}. "
|
||||
f"Use n<={_max_seqs} or omit n.")
|
||||
|
||||
# validation for OpenAI tools
|
||||
# tool_choice = "required" is not supported
|
||||
if request.tool_choice == "required":
|
||||
@@ -318,6 +328,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
previous_num_tokens = [0] * num_choices
|
||||
finish_reason_sent = [False] * num_choices
|
||||
num_prompt_tokens = 0
|
||||
num_cached_tokens: Optional[int] = None
|
||||
|
||||
if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
|
||||
tool_choice_function_name = request.tool_choice.function.name
|
||||
@@ -385,6 +396,10 @@ class OpenAIServingChat(OpenAIServing):
|
||||
num_prompt_tokens = len(res.prompt_token_ids)
|
||||
if res.encoder_prompt_token_ids is not None:
|
||||
num_prompt_tokens += len(res.encoder_prompt_token_ids)
|
||||
if (num_cached_tokens is None
|
||||
and res.metrics is not None
|
||||
and res.metrics.num_cached_tokens is not None):
|
||||
num_cached_tokens = res.metrics.num_cached_tokens
|
||||
|
||||
# We need to do it here, because if there are exceptions in
|
||||
# the result_generator, it needs to be sent as the FIRST
|
||||
@@ -691,6 +706,9 @@ class OpenAIServingChat(OpenAIServing):
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=num_prompt_tokens + completion_tokens,
|
||||
reasoning_tokens=total_reasoning,
|
||||
prompt_tokens_details=(
|
||||
PromptTokensDetails(cached_tokens=num_cached_tokens)
|
||||
if num_cached_tokens is not None else None),
|
||||
)
|
||||
|
||||
final_usage_chunk = ChatCompletionStreamResponse(
|
||||
@@ -713,6 +731,10 @@ class OpenAIServingChat(OpenAIServing):
|
||||
total_tokens=num_prompt_tokens + num_completion_tokens,
|
||||
reasoning_tokens=total_reasoning)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
# Client disconnected; abort the engine request so GPU is freed.
|
||||
await self.engine_client.abort(request_id)
|
||||
return
|
||||
except ValueError as e:
|
||||
# TODO: Use a vllm-specific Validation Error
|
||||
logger.error("error in chat completion stream generator: %s", e)
|
||||
@@ -739,6 +761,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
async for res in result_generator:
|
||||
final_res = res
|
||||
except asyncio.CancelledError:
|
||||
await self.engine_client.abort(request_id)
|
||||
return self.create_error_response("Client disconnected")
|
||||
|
||||
assert final_res is not None
|
||||
@@ -881,11 +904,16 @@ class OpenAIServingChat(OpenAIServing):
|
||||
total_reasoning_tokens = sum(
|
||||
rp.count_reasoning_tokens(list(output.token_ids))
|
||||
for output in final_res.outputs)
|
||||
num_cached_tokens = (final_res.metrics.num_cached_tokens
|
||||
if final_res.metrics is not None else None)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=num_generated_tokens,
|
||||
total_tokens=num_prompt_tokens + num_generated_tokens,
|
||||
reasoning_tokens=total_reasoning_tokens,
|
||||
prompt_tokens_details=(
|
||||
PromptTokensDetails(cached_tokens=num_cached_tokens)
|
||||
if num_cached_tokens is not None else None),
|
||||
)
|
||||
|
||||
request_metadata.final_usage_info = usage
|
||||
|
||||
Reference in New Issue
Block a user