fix issues

2026-06-26 12:55:02 +08:00
parent 3d62430fd7
commit c84151eef9
9 changed files with 1879 additions and 5 deletions
--- a/qwen3_6_scripts/serving_chat.py
+++ b/qwen3_6_scripts/serving_chat.py
@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import (
    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
    DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata,
-    ToolCall, UsageInfo)
+    PromptTokensDetails, ToolCall, UsageInfo)
 from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
                                                    LoRAModulePath,
                                                    OpenAIServing,
@@ -179,6 +179,16 @@ class OpenAIServingChat(OpenAIServing):
            logger.exception("Error in loading multi-modal data")
            return self.create_error_response(str(e))

+        # n > max_num_seqs deadlock guard: scheduler uses break (not continue)
+        # when can_schedule(num_new_seqs=n) fails, so an n that exceeds
+        # max_num_seqs permanently blocks the entire waiting queue with no error.
+        _sched_cfg = await self.engine_client.get_scheduler_config()
+        _max_seqs = _sched_cfg.max_num_seqs
+        if request.n is not None and request.n > _max_seqs:
+            return self.create_error_response(
+                f"n={request.n} exceeds max_num_seqs={_max_seqs}. "
+                f"Use n<={_max_seqs} or omit n.")
+
        # validation for OpenAI tools
        # tool_choice = "required" is not supported
        if request.tool_choice == "required":
@@ -318,6 +328,7 @@ class OpenAIServingChat(OpenAIServing):
        previous_num_tokens = [0] * num_choices
        finish_reason_sent = [False] * num_choices
        num_prompt_tokens = 0
+        num_cached_tokens: Optional[int] = None

        if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
            tool_choice_function_name = request.tool_choice.function.name
@@ -385,6 +396,10 @@ class OpenAIServingChat(OpenAIServing):
                    num_prompt_tokens = len(res.prompt_token_ids)
                    if res.encoder_prompt_token_ids is not None:
                        num_prompt_tokens += len(res.encoder_prompt_token_ids)
+                if (num_cached_tokens is None
+                        and res.metrics is not None
+                        and res.metrics.num_cached_tokens is not None):
+                    num_cached_tokens = res.metrics.num_cached_tokens

                # We need to do it here, because if there are exceptions in
                # the result_generator, it needs to be sent as the FIRST
@@ -691,6 +706,9 @@ class OpenAIServingChat(OpenAIServing):
                    completion_tokens=completion_tokens,
                    total_tokens=num_prompt_tokens + completion_tokens,
                    reasoning_tokens=total_reasoning,
+                    prompt_tokens_details=(
+                        PromptTokensDetails(cached_tokens=num_cached_tokens)
+                        if num_cached_tokens is not None else None),
                )

                final_usage_chunk = ChatCompletionStreamResponse(
@@ -713,6 +731,10 @@ class OpenAIServingChat(OpenAIServing):
                total_tokens=num_prompt_tokens + num_completion_tokens,
                reasoning_tokens=total_reasoning)

+        except asyncio.CancelledError:
+            # Client disconnected; abort the engine request so GPU is freed.
+            await self.engine_client.abort(request_id)
+            return
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            logger.error("error in chat completion stream generator: %s", e)
@@ -739,6 +761,7 @@ class OpenAIServingChat(OpenAIServing):
            async for res in result_generator:
                final_res = res
        except asyncio.CancelledError:
+            await self.engine_client.abort(request_id)
            return self.create_error_response("Client disconnected")

        assert final_res is not None
@@ -881,11 +904,16 @@ class OpenAIServingChat(OpenAIServing):
            total_reasoning_tokens = sum(
                rp.count_reasoning_tokens(list(output.token_ids))
                for output in final_res.outputs)
+        num_cached_tokens = (final_res.metrics.num_cached_tokens
+                             if final_res.metrics is not None else None)
        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
            reasoning_tokens=total_reasoning_tokens,
+            prompt_tokens_details=(
+                PromptTokensDetails(cached_tokens=num_cached_tokens)
+                if num_cached_tokens is not None else None),
        )

        request_metadata.final_usage_info = usage