Ensure Usage Data in Streaming Responses Aligns with vLLM’s Implementation (#3814)

2025-03-13 13:12:55 +08:00
parent ad46550d25
commit 4014804157
1 changed file with 4 additions and 4 deletions
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -820,13 +820,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                    )

                    final_usage_chunk = CompletionStreamResponse(
-                        id=str(uuid.uuid4().hex),
+                        id=content["meta_info"]["id"],
                        choices=[],
                        model=request.model,
                        usage=usage,
                    )
                    final_usage_data = final_usage_chunk.model_dump_json(
-                        exclude_unset=True, exclude_none=True
+                        exclude_none=True
                    )
                    yield f"data: {final_usage_data}\n\n"
            except ValueError as e:
@@ -1495,13 +1495,13 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                    )

                    final_usage_chunk = ChatCompletionStreamResponse(
-                        id=str(uuid.uuid4().hex),
+                        id=content["meta_info"]["id"],
                        choices=[],
                        model=request.model,
                        usage=usage,
                    )
                    final_usage_data = final_usage_chunk.model_dump_json(
-                        exclude_unset=True, exclude_none=True
+                        exclude_none=True
                    )
                    yield f"data: {final_usage_data}\n\n"
            except ValueError as e: