Fix streaming (#600)
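This commit removes the `incomplete_text` plumbing from the streaming path. Previously the detokenizer split every update into a printable prefix and an unprintable tail (`incomplete_strs`), shipped both to the client, and the client had to yield the leftover tail after the stream ended. Now the detokenizer holds back unprintable text only while a request is still running (`finished_reason is None`) and flushes the complete text once the request finishes, so the streamed `text` field is always cumulative and self-contained. The OpenAI-compatible endpoints are adjusted to advance their stream buffer by the delta they emit.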
```diff
@@ -180,7 +180,6 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         pos = 0
 
-        incomplete_text = ""
         for chunk in res.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
             if chunk and chunk.startswith("data:"):
@@ -188,14 +187,10 @@ class RuntimeEndpoint(BaseBackend):
                     break
                 data = json.loads(chunk[5:].strip("\n"))
                 chunk_text = data["text"][pos:]
-                incomplete_text = data["incomplete_text"]
                 meta_info = data["meta_info"]
                 pos += len(chunk_text)
                 yield chunk_text, meta_info
 
-        if len(incomplete_text) > 0:
-            yield incomplete_text, meta_info
-
     def select(
         self,
         s: StreamExecutor,
```
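With `incomplete_text` gone, the client loop reduces to slicing the cumulative `text` field at `pos`. Below is a minimal self-contained sketch of consuming such an SSE stream; the URL and payload are illustrative, and the end-of-stream sentinel is assumed to be the usual `data: [DONE]`:

```python
import json

import requests


def stream_generate(url: str, payload: dict):
    """Yield (new_text, meta_info) pairs from an SSE generation stream.

    Mirrors the fixed RuntimeEndpoint loop: every "data:" event carries
    the cumulative text, so the unseen part is text[pos:].
    """
    res = requests.post(url, json=payload, stream=True)
    res.raise_for_status()
    pos = 0
    for chunk in res.iter_lines(decode_unicode=False):
        chunk = chunk.decode("utf-8")
        if chunk and chunk.startswith("data:"):
            if chunk == "data: [DONE]":
                break
            data = json.loads(chunk[5:].strip("\n"))
            chunk_text = data["text"][pos:]
            pos += len(chunk_text)
            yield chunk_text, data["meta_info"]
```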
```diff
@@ -55,13 +55,11 @@ class DetokenizerManager:
             # Trim stop str
             # TODO(lmzheng): handle the case where multiple stop strs are hit
             output_strs = []
-            incomplete_strs = []
             for i in range(len(recv_obj.rids)):
                 new_text = read_texts[i][len(surr_texts[i]) :]
-                complete_new_text = find_printable_text(new_text)
-                incomplete_new_text = new_text[len(complete_new_text) :]
-                output_strs.append(recv_obj.decoded_texts[i] + complete_new_text)
-                incomplete_strs.append(incomplete_new_text)
+                if recv_obj.finished_reason[i] is None:
+                    new_text = find_printable_text(new_text)
+                output_strs.append(recv_obj.decoded_texts[i] + new_text)
 
                 if isinstance(recv_obj.finished_reason[i], FINISH_MATCHED_STR):
                     pos = output_strs[i].find(recv_obj.finished_reason[i].matched)
```
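The trimming now applies only while a request is unfinished. `find_printable_text` itself is not part of this diff; below is a sketch of the usual heuristic (as in Hugging Face's `TextStreamer`, from which helpers like this are commonly adapted) — the repo's actual implementation may differ:

```python
def _is_cjk_char(cp: int) -> bool:
    # Simplified check for CJK Unified Ideographs (illustrative only).
    return 0x4E00 <= cp <= 0x9FFF


def find_printable_text(text: str) -> str:
    """Return the prefix of `text` that is safe to flush to the user now."""
    if text.endswith("\n"):
        # A newline closes a logical unit; flush everything.
        return text
    elif len(text) > 0 and _is_cjk_char(ord(text[-1])):
        # CJK text has no word spaces; whole characters are printable.
        return text
    else:
        # Hold back the last, possibly incomplete, word.
        return text[: text.rfind(" ") + 1]
```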
```diff
@@ -72,7 +70,6 @@ class DetokenizerManager:
                 BatchStrOut(
                     rids=recv_obj.rids,
                     output_strs=output_strs,
-                    incomplete_strs=incomplete_strs,
                     meta_info=recv_obj.meta_info,
                     finished_reason=recv_obj.finished_reason,
                 )
```
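The effect of the `finished_reason is None` guard, using the `find_printable_text` sketch above (toy values):

```python
def emit(new_text: str, finished: bool) -> str:
    """What the detokenizer appends for one update after this commit."""
    if not finished:
        new_text = find_printable_text(new_text)
    return new_text


assert emit("Hello wor", finished=False) == "Hello "        # tail held back
assert emit("Hello world", finished=True) == "Hello world"  # flushed whole
```

The tail that used to travel separately in `incomplete_strs` now simply arrives in the final regular chunk.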
```diff
@@ -123,7 +123,6 @@ class BatchTokenIDOut:
 class BatchStrOut:
     rids: List[str]
     output_strs: List[str]
-    incomplete_strs: List[str]
     meta_info: List[Dict]
     finished_reason: List[BaseFinishReason]
 
```
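After the removal, the IPC message carries just the cumulative strings plus metadata. Reconstructed from the fields visible in this hunk; the `@dataclass` decorator and the placeholder for `BaseFinishReason` are assumptions:

```python
from dataclasses import dataclass
from typing import Dict, List

BaseFinishReason = object  # stand-in for the repo's finish-reason base class


@dataclass
class BatchStrOut:
    rids: List[str]                          # request ids in this batch
    output_strs: List[str]                   # cumulative decoded text per request
    meta_info: List[Dict]                    # per-request metadata
    finished_reason: List[BaseFinishReason]  # None while still generating
```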
```diff
@@ -317,7 +317,6 @@ class TokenizerManager:
                 recv_obj.meta_info[i]["id"] = rid
                 out_dict = {
                     "text": recv_obj.output_strs[i],
-                    "incomplete_text": recv_obj.incomplete_strs[i],
                     "meta_info": recv_obj.meta_info[i],
                 }
                 state.out_list.append(out_dict)
```
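Each entry appended to `state.out_list` (and hence each stream event) now has exactly the two keys the `RuntimeEndpoint` loop reads. An illustrative event, with invented values:

```python
event = {
    "text": "The answer is 42",     # cumulative: includes all earlier chunks
    "meta_info": {"id": "abc123"},  # per-request metadata (keys invented)
}
```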
```diff
@@ -164,7 +164,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 logprobs = None
 
                 delta = text[len(stream_buffer) :]
-                stream_buffer = content["text"]
+                stream_buffer = stream_buffer + delta
                 choice_data = CompletionResponseStreamChoice(
                     index=0,
                     text=delta,
```
```diff
@@ -323,7 +323,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 
                 text = content["text"]
                 delta = text[len(stream_buffer) :]
-                stream_buffer = text
+                stream_buffer = stream_buffer + delta
                 choice_data = ChatCompletionResponseStreamChoice(
                     index=0,
                     delta=DeltaMessage(content=delta),
```
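Both OpenAI-compatible routes now advance the buffer by the delta they actually emitted instead of overwriting it with the server's cumulative text, keeping `stream_buffer` equal to exactly what the client has been sent. In isolation, a sketch with names following the handlers above:

```python
def next_delta(stream_buffer: str, cumulative_text: str) -> tuple[str, str]:
    """Compute the next SSE delta and the advanced stream buffer."""
    delta = cumulative_text[len(stream_buffer):]
    return delta, stream_buffer + delta


buf = ""
for text in ["Hel", "Hello", "Hello world"]:  # cumulative snapshots
    delta, buf = next_delta(buf, text)
    print(repr(delta))  # 'Hel', then 'lo', then ' world'
```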