Ensure Usage Data in Streaming Responses Aligns with vLLM’s Implementation (#3814)
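Summary of the change: in both streaming endpoints (v1_completions and v1_chat_completions), the final usage-only chunk now reuses the request id from content["meta_info"]["id"] instead of minting a fresh UUID, and its serialization drops exclude_unset=True so that default-valued fields are no longer stripped from the payload, matching vLLM's output.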
@@ -820,13 +820,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                     )

                 final_usage_chunk = CompletionStreamResponse(
-                    id=str(uuid.uuid4().hex),
+                    id=content["meta_info"]["id"],
                     choices=[],
                     model=request.model,
                     usage=usage,
                 )
                 final_usage_data = final_usage_chunk.model_dump_json(
-                    exclude_unset=True, exclude_none=True
+                    exclude_none=True
                 )
                 yield f"data: {final_usage_data}\n\n"
         except ValueError as e:
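Why dropping exclude_unset=True matters: with Pydantic, exclude_unset removes every field the constructor never touched, including fields that carry meaningful defaults, so the final usage chunk could lose keys that vLLM emits. Below is a minimal, self-contained sketch; the model is a simplified stand-in for the real CompletionStreamResponse, not the actual class.

    from typing import List, Optional

    from pydantic import BaseModel


    class UsageInfo(BaseModel):
        prompt_tokens: int = 0
        completion_tokens: int = 0
        total_tokens: int = 0


    class CompletionStreamResponse(BaseModel):
        # Simplified stand-in for the real response model.
        id: str
        object: str = "text_completion"  # default, left unset below
        model: str
        choices: List[dict] = []
        usage: Optional[UsageInfo] = None


    chunk = CompletionStreamResponse(
        id="cmpl-123",
        choices=[],
        model="my-model",
        usage=UsageInfo(prompt_tokens=5, completion_tokens=2, total_tokens=7),
    )

    # Old serialization: `object` is dropped because it was never explicitly set.
    print(chunk.model_dump_json(exclude_unset=True, exclude_none=True))

    # New serialization: only None-valued fields are dropped; defaults such as
    # `object` stay in the payload, matching vLLM's final usage chunk.
    print(chunk.model_dump_json(exclude_none=True))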
@@ -1495,13 +1495,13 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )

                 final_usage_chunk = ChatCompletionStreamResponse(
-                    id=str(uuid.uuid4().hex),
+                    id=content["meta_info"]["id"],
                     choices=[],
                     model=request.model,
                     usage=usage,
                 )
                 final_usage_data = final_usage_chunk.model_dump_json(
-                    exclude_unset=True, exclude_none=True
+                    exclude_none=True
                 )
                 yield f"data: {final_usage_data}\n\n"
         except ValueError as e:
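For context, a hedged client-side sketch of how the fix is observed on the wire; the endpoint URL and model name are placeholders, and it assumes an OpenAI-compatible server plus the openai Python client. With stream_options={"include_usage": True}, the stream ends with one chunk that has empty choices and a populated usage, and after this commit its id matches the other chunks of the same request.

    from openai import OpenAI

    # Placeholder endpoint/credentials for a local OpenAI-compatible server.
    client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

    stream = client.chat.completions.create(
        model="my-model",  # placeholder model name
        messages=[{"role": "user", "content": "Hello"}],
        stream=True,
        stream_options={"include_usage": True},
    )

    for chunk in stream:
        if chunk.choices:
            # Ordinary delta chunks carry the generated text.
            print(chunk.choices[0].delta.content or "", end="")
        elif chunk.usage is not None:
            # The final usage-only chunk: empty choices, populated usage,
            # and the same id as the rest of the stream.
            print("\nusage:", chunk.usage)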