Fix ignore_eos parameter when loading a chat template (#5264)

2025-04-15 17:09:45 -07:00
parent 8ec0bb7d55
commit 27a009bb00
2 changed files with 76 additions and 1 deletions
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -995,7 +995,8 @@ def v1_chat_generate_request(
                image_data = conv.image_data
                audio_data = conv.audio_data
                modalities = conv.modalities
-                stop = conv.stop_str or []
+                stop = conv.stop_str or [] if not request.ignore_eos else []
                if request.stop:
                    if isinstance(request.stop, str):
                        stop.append(request.stop)
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -676,5 +676,79 @@ class TestOpenAIEmbedding(CustomTestCase):
        self.assertTrue(len(response.data[1].embedding) > 0)
 class TestOpenAIServerIgnoreEOS(CustomTestCase):
    """Exercise the ``ignore_eos`` request option on the chat completions API.

    The server under test is started with ``--chat-template=llama_3_vision``
    so that requests are served with an explicitly loaded chat template.
    """

    @classmethod
    def setUpClass(cls):
        """Start the server once for the class and prepare a tokenizer."""
        launch_url = DEFAULT_URL_FOR_TEST
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = launch_url
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            launch_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=["--chat-template=llama_3_vision"],
        )
        # The server is launched on the bare URL; the OpenAI client is
        # pointed at the "/v1" prefix.
        cls.base_url = f"{launch_url}/v1"
        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
        """Terminate the server process started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_ignore_eos(self):
        """
        Test that ignore_eos=True allows generation to continue beyond EOS token
        and reach the max_tokens limit.
        """
        max_tokens = 200
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)

        def request_completion(ignore_eos):
            # Both requests are identical apart from the ignore_eos flag.
            return client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "Count from 1 to 20."},
                ],
                temperature=0,
                max_tokens=max_tokens,
                extra_body={"ignore_eos": ignore_eos},
            )

        def count_tokens(response):
            # Re-tokenize the returned text to measure the reply length.
            text = response.choices[0].message.content
            return len(self.tokenizer.encode(text))

        response_default = request_completion(False)
        response_ignore_eos = request_completion(True)

        default_tokens = count_tokens(response_default)
        ignore_eos_tokens = count_tokens(response_ignore_eos)

        # The ignore_eos reply passes when either condition holds:
        # 1. it is longer than the default reply (the default stopped early), or
        # 2. it is at least max_tokens long (generation ran to the limit).
        longer_than_default = ignore_eos_tokens > default_tokens
        reached_limit = ignore_eos_tokens >= max_tokens
        self.assertTrue(
            longer_than_default or reached_limit,
            f"ignore_eos did not generate more tokens: {ignore_eos_tokens} vs {default_tokens}",
        )

        # With EOS ignored, generation can only end by exhausting max_tokens.
        finish_reason = response_ignore_eos.choices[0].finish_reason
        self.assertEqual(
            finish_reason,
            "length",
            f"Expected finish_reason='length' for ignore_eos=True, got {finish_reason}",
        )
 # Run the test cases in this module when the file is executed as a script.
 if __name__ == "__main__":
    unittest.main()