diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 64be034a4..8f74007fd 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -995,7 +995,8 @@ def v1_chat_generate_request(
         image_data = conv.image_data
         audio_data = conv.audio_data
         modalities = conv.modalities
-        stop = conv.stop_str or []
+        stop = (conv.stop_str or []) if not request.ignore_eos else []
+
         if request.stop:
             if isinstance(request.stop, str):
                 stop.append(request.stop)
diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py
index fe6646d60..3488fee60 100644
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -676,5 +676,79 @@ class TestOpenAIEmbedding(CustomTestCase):
         self.assertTrue(len(response.data[1].embedding) > 0)
 
 
+class TestOpenAIServerIgnoreEOS(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=["--chat-template=llama_3_vision"],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_ignore_eos(self):
+        """
+        Test that ignore_eos=True allows generation to continue beyond the EOS
+        token and reach the max_tokens limit.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        max_tokens = 200
+
+        response_default = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count from 1 to 20."},
+            ],
+            temperature=0,
+            max_tokens=max_tokens,
+            extra_body={"ignore_eos": False},
+        )
+
+        response_ignore_eos = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count from 1 to 20."},
+            ],
+            temperature=0,
+            max_tokens=max_tokens,
+            extra_body={"ignore_eos": True},
+        )
+
+        default_tokens = len(
+            self.tokenizer.encode(response_default.choices[0].message.content)
+        )
+        ignore_eos_tokens = len(
+            self.tokenizer.encode(response_ignore_eos.choices[0].message.content)
+        )
+
+        # The ignore_eos response should either:
+        # 1. have more tokens than the default response (the default stopped at
+        #    EOS before reaching max_tokens), or
+        # 2. have at least max_tokens (it ran into the max_tokens limit).
+        self.assertTrue(
+            ignore_eos_tokens > default_tokens or ignore_eos_tokens >= max_tokens,
+            f"ignore_eos did not generate more tokens: {ignore_eos_tokens} vs {default_tokens}",
+        )
+
+        self.assertEqual(
+            response_ignore_eos.choices[0].finish_reason,
+            "length",
+            f"Expected finish_reason='length' for ignore_eos=True, got {response_ignore_eos.choices[0].finish_reason}",
+        )
+
+
 if __name__ == "__main__":
     unittest.main()