[Bugfix] Fix ChatCompletion endpoint of mini_lb when stream is set (#6703)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
@@ -117,8 +117,8 @@ class MiniLoadBalancer:
|
|||||||
) as session:
|
) as session:
|
||||||
# Create the tasks for both prefill and decode requests
|
# Create the tasks for both prefill and decode requests
|
||||||
tasks = [
|
tasks = [
|
||||||
session.post(f"{prefill_server}/generate", json=modified_request),
|
session.post(f"{prefill_server}/{endpoint}", json=modified_request),
|
||||||
session.post(f"{decode_server}/generate", json=modified_request),
|
session.post(f"{decode_server}/{endpoint}", json=modified_request),
|
||||||
]
|
]
|
||||||
# Wait for both responses to complete. Since this is streaming, they return immediately.
|
# Wait for both responses to complete. Since this is streaming, they return immediately.
|
||||||
prefill_response, decode_response = await asyncio.gather(*tasks)
|
prefill_response, decode_response = await asyncio.gather(*tasks)
|
||||||
|
|||||||
Reference in New Issue
Block a user