From a95d5589c3bbfeecaec9a1109e601785b24d014c Mon Sep 17 00:00:00 2001
From: Gleb Drozdov <159446314+g-drozdov@users.noreply.github.com>
Date: Thu, 17 Oct 2024 22:06:52 +0400
Subject: [PATCH] Add matched_stop token or str to distinguish between eos or
 stop str finish_reason generation (#1684)

---
 python/sglang/srt/openai_api/adapter.py  |  70 +++++++-----
 python/sglang/srt/openai_api/protocol.py |   4 +
 test/srt/test_matched_stop.py            | 139 +++++++++++++++++++++++
 3 files changed, 186 insertions(+), 27 deletions(-)
 create mode 100644 test/srt/test_matched_stop.py

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index b4727dfd7..0b820a8b0 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -621,16 +621,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         else:
             logprobs = None
 
+        finish_reason = ret_item["meta_info"]["finish_reason"]
+
         if to_file:
             # to make the choise data json serializable
             choice_data = {
                 "index": 0,
                 "text": text,
                 "logprobs": logprobs,
-                "finish_reason": (
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "matched_stop": (
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             }
         else:
@@ -638,10 +641,11 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 index=idx,
                 text=text,
                 logprobs=logprobs,
-                finish_reason=(
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             )
 
@@ -771,14 +775,16 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                     delta = text[len(stream_buffer) :]
                     stream_buffer = stream_buffer + delta
 
+                    finish_reason = content["meta_info"]["finish_reason"]
                     choice_data = CompletionResponseStreamChoice(
                         index=index,
                         text=delta,
                         logprobs=logprobs,
-                        finish_reason=(
-                            content["meta_info"]["finish_reason"]["type"]
-                            if content["meta_info"]["finish_reason"]
-                            else ""
+                        finish_reason=(finish_reason["type"] if finish_reason else ""),
+                        matched_stop=(
+                            finish_reason["matched"]
+                            if finish_reason and "matched" in finish_reason
+                            else None
                         ),
                     )
                     chunk = CompletionStreamResponse(
@@ -1016,16 +1022,19 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
         else:
             choice_logprobs = None
 
+        finish_reason = ret_item["meta_info"]["finish_reason"]
+
         if to_file:
             # to make the choice data json serializable
             choice_data = {
                 "index": 0,
                 "message": {"role": "assistant", "content": ret_item["text"]},
                 "logprobs": choice_logprobs,
-                "finish_reason": (
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "matched_stop": (
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             }
         else:
@@ -1033,10 +1042,11 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
                 index=idx,
                 message=ChatMessage(role="assistant", content=ret_item["text"]),
                 logprobs=choice_logprobs,
-                finish_reason=(
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             )
 
@@ -1159,6 +1169,8 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     else:
                         choice_logprobs = None
 
+                    finish_reason = content["meta_info"]["finish_reason"]
+
                     if is_first:
                         # First chunk with role
                         is_first = False
@@ -1166,9 +1178,12 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
                             delta=DeltaMessage(role="assistant"),
                             finish_reason=(
-                                content["meta_info"]["finish_reason"]["type"]
-                                if content["meta_info"]["finish_reason"]
-                                else ""
+                                finish_reason["type"] if finish_reason else ""
+                            ),
+                            matched_stop=(
+                                finish_reason["matched"]
+                                if finish_reason and "matched" in finish_reason
+                                else None
                             ),
                             logprobs=choice_logprobs,
                         )
@@ -1185,10 +1200,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=DeltaMessage(content=delta),
-                        finish_reason=(
-                            content["meta_info"]["finish_reason"]["type"]
-                            if content["meta_info"]["finish_reason"]
-                            else ""
+                        finish_reason=(finish_reason["type"] if finish_reason else ""),
+                        matched_stop=(
+                            finish_reason["matched"]
+                            if finish_reason and "matched" in finish_reason
+                            else None
                         ),
                         logprobs=choice_logprobs,
                     )
diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py
index 349944f70..583db66c4 100644
--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -184,6 +184,7 @@ class CompletionResponseChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
+    matched_stop: Union[None, int, str] = None
 
 
 class CompletionResponse(BaseModel):
@@ -200,6 +201,7 @@ class CompletionResponseStreamChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
+    matched_stop: Union[None, int, str] = None
 
 
 class CompletionStreamResponse(BaseModel):
@@ -291,6 +293,7 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: str
+    matched_stop: Union[None, int, str] = None
 
 
 class ChatCompletionResponse(BaseModel):
@@ -312,6 +315,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
    delta: DeltaMessage
    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
    finish_reason: Optional[str] = None
+    matched_stop: Union[None, int, str] = None
 
 
 class ChatCompletionStreamResponse(BaseModel):
diff --git a/test/srt/test_matched_stop.py b/test/srt/test_matched_stop.py
new file mode 100644
index 000000000..a3399687d
--- /dev/null
+++ b/test/srt/test_matched_stop.py
@@ -0,0 +1,139 @@
+import json
+import unittest
+
+import requests
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+MANY_NEW_TOKENS_PROMPT = """
+Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
+Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
+Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
+The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
+"""
+
+
+class TestMatchedStop(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=300,
+            other_args=["--max-running-requests", "10"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid)
+
+    def run_completions_generation(
+        self,
+        prompt=MANY_NEW_TOKENS_PROMPT,
+        max_tokens=1,
+        stop=None,
+        finish_reason=None,
+        matched_stop=None,
+    ):
+        payload = {
+            "prompt": prompt,
+            "model": self.model,
+            "temperature": 0,
+            "top_p": 1,
+            "max_tokens": max_tokens,
+        }
+
+        if stop is not None:
+            payload["stop"] = stop
+
+        response_completions = requests.post(
+            self.base_url + "/v1/completions",
+            json=payload,
+        )
+        print(json.dumps(response_completions.json()))
+        print("=" * 100)
+
+        assert (
+            response_completions.json()["choices"][0]["finish_reason"] == finish_reason
+        )
+        assert response_completions.json()["choices"][0]["matched_stop"] == matched_stop
+
+    def run_chat_completions_generation(
+        self,
+        prompt=MANY_NEW_TOKENS_PROMPT,
+        max_tokens=1,
+        stop=None,
+        finish_reason=None,
+        matched_stop=None,
+    ):
+        chat_payload = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": prompt},
+            ],
+            "temperature": 0,
+            "top_p": 1,
+            "max_tokens": max_tokens,
+        }
+
+        if stop is not None:
+            chat_payload["stop"] = stop
+
+        response_chat = requests.post(
+            self.base_url + "/v1/chat/completions",
+            json=chat_payload,
+        )
+        print(json.dumps(response_chat.json()))
+        print("=" * 100)
+
+        assert response_chat.json()["choices"][0]["finish_reason"] == finish_reason
+        assert response_chat.json()["choices"][0]["matched_stop"] == matched_stop
+
+    def test_finish_stop_str(self):
+        self.run_completions_generation(
+            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
+        )
+        self.run_chat_completions_generation(
+            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
+        )
+
+    def test_finish_stop_eos(self):
+        llama_format_prompt = """
+        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+        You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+        What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+        """
+        eos_token_id = 128009
+        self.run_completions_generation(
+            prompt=llama_format_prompt,
+            max_tokens=1000,
+            finish_reason="stop",
+            matched_stop=eos_token_id,
+        )
+        self.run_chat_completions_generation(
+            prompt="What is 2 + 2?",
+            max_tokens=1000,
+            finish_reason="stop",
+            matched_stop=eos_token_id,
+        )
+
+    def test_finish_length(self):
+        self.run_completions_generation(
+            max_tokens=5, finish_reason="length", matched_stop=None
+        )
+        self.run_chat_completions_generation(
+            max_tokens=5, finish_reason="length", matched_stop=None
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
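---

Note (not part of the patch): below is a minimal sketch of how a client might consume the new field, assuming an sglang OpenAI-compatible server is already listening at the placeholder URL. The URL, model name, and prompt are illustrative only. Per the protocol change above, matched_stop is a str when a stop string ended generation, an int (the EOS token id) when the end-of-sequence token did, and None otherwise (e.g. a "length" finish), which is exactly what test_matched_stop.py asserts.

import requests

# Placeholder base URL; substitute wherever your sglang server is running.
BASE_URL = "http://localhost:30000"

resp = requests.post(
    BASE_URL + "/v1/completions",
    json={
        "model": "default",  # placeholder model name
        "prompt": "Count from one to five:",
        "max_tokens": 64,
        "stop": ["\n"],
    },
).json()

choice = resp["choices"][0]
if choice["finish_reason"] == "stop":
    # str means a stop string matched; int means the EOS token id matched.
    print("stopped by:", repr(choice["matched_stop"]))
elif choice["finish_reason"] == "length":
    # Generation was cut off by max_tokens, so nothing matched.
    assert choice["matched_stop"] is None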