Support v1/responses and use harmony in serving_chat (#8837)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
Chang Su
2025-08-06 16:20:34 -07:00
committed by GitHub
parent cbbd685a46
commit 92cc32d9fc
16 changed files with 2878 additions and 43 deletions

View File

@@ -750,7 +750,11 @@ class TokenizerManager:
try:
await asyncio.wait_for(state.event.wait(), timeout=4)
except asyncio.TimeoutError:
if request is not None and await request.is_disconnected():
if (
request is not None
and not obj.background
and await request.is_disconnected()
):
# Abort the request for disconnected requests (non-streaming, waiting queue)
self.abort_request(obj.rid)
# Use exception to kill the whole call stack and asyncio task
@@ -805,7 +809,11 @@ class TokenizerManager:
if obj.stream:
yield out
else:
if request is not None and await request.is_disconnected():
if (
request is not None
and not obj.background
and await request.is_disconnected()
):
# Abort the request for disconnected requests (non-streaming, running)
self.abort_request(obj.rid)
# Use exception to kill the whole call stack and asyncio task
@@ -1548,8 +1556,17 @@ class TokenizerManager:
if isinstance(recv_obj, BatchStrOut):
state.text += recv_obj.output_strs[i]
if state.obj.stream:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids[state.last_output_offset :]
state.last_output_offset = len(state.output_ids)
else:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids.copy()
out_dict = {
"text": state.text,
"output_ids": output_token_ids,
"meta_info": meta_info,
}
elif isinstance(recv_obj, BatchTokenIDOut):