[Bugfix] Fix ChatCompletion endpoint of mini_lb when stream is set (#6703)

Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
shangmingc
2025-05-28 21:33:36 +08:00
committed by GitHub
parent c7588d593e
commit e9fd11c0d1

View File

@@ -117,8 +117,8 @@ class MiniLoadBalancer:
) as session:
# Create the tasks for both prefill and decode requests
tasks = [
-                session.post(f"{prefill_server}/generate", json=modified_request),
-                session.post(f"{decode_server}/generate", json=modified_request),
+                session.post(f"{prefill_server}/{endpoint}", json=modified_request),
+                session.post(f"{decode_server}/{endpoint}", json=modified_request),
]
# Wait for both responses to complete. Since this is streaming, they return immediately.
prefill_response, decode_response = await asyncio.gather(*tasks)