diff --git a/python/sglang/backend/runtime_endpoint.py b/python/sglang/backend/runtime_endpoint.py index 6f11d5492..97812941d 100644 --- a/python/sglang/backend/runtime_endpoint.py +++ b/python/sglang/backend/runtime_endpoint.py @@ -180,7 +180,6 @@ class RuntimeEndpoint(BaseBackend): self._assert_success(res) pos = 0 - incomplete_text = "" for chunk in res.iter_lines(decode_unicode=False): chunk = chunk.decode("utf-8") if chunk and chunk.startswith("data:"): @@ -188,14 +187,10 @@ class RuntimeEndpoint(BaseBackend): break data = json.loads(chunk[5:].strip("\n")) chunk_text = data["text"][pos:] - incomplete_text = data["incomplete_text"] meta_info = data["meta_info"] pos += len(chunk_text) yield chunk_text, meta_info - if len(incomplete_text) > 0: - yield incomplete_text, meta_info - def select( self, s: StreamExecutor, diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index be1eb4d44..3e0183b1b 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -55,13 +55,11 @@ class DetokenizerManager: # Trim stop str # TODO(lmzheng): handle the case where multiple stop strs are hit output_strs = [] - incomplete_strs = [] for i in range(len(recv_obj.rids)): new_text = read_texts[i][len(surr_texts[i]) :] - complete_new_text = find_printable_text(new_text) - incomplete_new_text = new_text[len(complete_new_text) :] - output_strs.append(recv_obj.decoded_texts[i] + complete_new_text) - incomplete_strs.append(incomplete_new_text) + if recv_obj.finished_reason[i] is None: + new_text = find_printable_text(new_text) + output_strs.append(recv_obj.decoded_texts[i] + new_text) if isinstance(recv_obj.finished_reason[i], FINISH_MATCHED_STR): pos = output_strs[i].find(recv_obj.finished_reason[i].matched) @@ -72,7 +70,6 @@ class DetokenizerManager: BatchStrOut( rids=recv_obj.rids, output_strs=output_strs, - incomplete_strs=incomplete_strs, meta_info=recv_obj.meta_info, finished_reason=recv_obj.finished_reason, ) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 681e888a9..7b26a4f2d 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -123,7 +123,6 @@ class BatchTokenIDOut: class BatchStrOut: rids: List[str] output_strs: List[str] - incomplete_strs: List[str] meta_info: List[Dict] finished_reason: List[BaseFinishReason] diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 90aebcd4b..3f3c848e0 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -317,7 +317,6 @@ class TokenizerManager: recv_obj.meta_info[i]["id"] = rid out_dict = { "text": recv_obj.output_strs[i], - "incomplete_text": recv_obj.incomplete_strs[i], "meta_info": recv_obj.meta_info[i], } state.out_list.append(out_dict) diff --git a/python/sglang/srt/openai_api_adapter.py b/python/sglang/srt/openai_api_adapter.py index 75656f324..4306950f0 100644 --- a/python/sglang/srt/openai_api_adapter.py +++ b/python/sglang/srt/openai_api_adapter.py @@ -164,7 +164,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request): logprobs = None delta = text[len(stream_buffer) :] - stream_buffer = content["text"] + stream_buffer = stream_buffer + delta choice_data = CompletionResponseStreamChoice( index=0, text=delta, @@ -323,7 +323,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request): text = content["text"] delta = text[len(stream_buffer) :] - stream_buffer = text + stream_buffer = stream_buffer + delta choice_data = ChatCompletionResponseStreamChoice( index=0, delta=DeltaMessage(content=delta),