[Test] Add basic matched stop for beta eagle (#11833)

2025-10-20 01:17:00 +08:00
parent 48738af7f9
commit 7a020e0f3b
4 changed files with 201 additions and 222 deletions
--- a/python/sglang/test/kit_matched_stop.py
+++ b/python/sglang/test/kit_matched_stop.py
@@ -0,0 +1,157 @@
 import json
 import requests
 # Default prompt for the request helpers in this module. It asks for a very
 # long story (at least 3,000 words), presumably so that generation does not end
 # on its own before a stop condition or the max_tokens limit applies.
 MANY_NEW_TOKENS_PROMPT = """
 Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
 Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
 Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
 The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
 """
 class MatchedStopMixin:
    def _run_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        stop_regex=None,
        finish_reason=None,
        matched_stop=None,
    ):
        payload = {
            "prompt": prompt,
            "model": self.model,
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            payload["stop"] = stop
        if stop_regex is not None:
            payload["stop_regex"] = stop_regex
        response_completions = requests.post(
            self.base_url + "/v1/completions",
            json=payload,
        )
        res = response_completions.json()
        print(json.dumps(res))
        print("=" * 100)
        if not isinstance(matched_stop, list):
            matched_stop = [matched_stop]
        assert (
            res["choices"][0]["finish_reason"] == finish_reason
        ), f"Expected finish_reason: {finish_reason}, but got: {res['choices'][0]['finish_reason']}"
        assert (
            res["choices"][0]["matched_stop"] in matched_stop
        ), f"Expected matched_stop: {matched_stop}, but got: {res['choices'][0]['matched_stop']}"
    def _run_chat_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        stop_regex=None,
        finish_reason=None,
        matched_stop=None,
    ):
        chat_payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are a helpful AI assistant"},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            chat_payload["stop"] = stop
        if stop_regex is not None:
            chat_payload["stop_regex"] = stop_regex
        response_chat = requests.post(
            self.base_url + "/v1/chat/completions",
            json=chat_payload,
        )
        res = response_chat.json()
        print(json.dumps(res))
        print("=" * 100)
        if not isinstance(matched_stop, list):
            matched_stop = [matched_stop]
        assert (
            res["choices"][0]["finish_reason"] == finish_reason
        ), f"Expected finish_reason: {finish_reason}, but got: {res['choices'][0]['finish_reason']}"
        assert (
            res["choices"][0]["matched_stop"] in matched_stop
        ), f"Expected matched_stop: {matched_stop}, but got: {res['choices'][0]['matched_stop']}"
    def test_finish_stop_str(self):
        self._run_completions_generation(
            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
        )
        self._run_chat_completions_generation(
            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
        )
    def test_finish_stop_regex_str(self):
        STOP_REGEX_STR = r"and|or"
        self._run_completions_generation(
            max_tokens=1000,
            stop_regex=STOP_REGEX_STR,
            finish_reason="stop",
            matched_stop=STOP_REGEX_STR,
        )
        self._run_chat_completions_generation(
            max_tokens=1000,
            stop_regex=STOP_REGEX_STR,
            finish_reason="stop",
            matched_stop=STOP_REGEX_STR,
        )
        # Match a complete sentence
        STOP_REGEX_STR_SENTENCE = r"[.!?]\s*$"
        self._run_chat_completions_generation(
            max_tokens=1000,
            stop_regex=STOP_REGEX_STR_SENTENCE,
            finish_reason="stop",
            matched_stop=STOP_REGEX_STR_SENTENCE,
        )
    def test_finish_stop_eos(self):
        llama_format_prompt = """\
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>
 You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
 What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
        """
        eos_token_ids = [128000, 128009, 2]
        self._run_completions_generation(
            prompt=llama_format_prompt,
            max_tokens=1000,
            finish_reason="stop",
            matched_stop=eos_token_ids,
        )
        self._run_chat_completions_generation(
            prompt="What is 2 + 2?",
            max_tokens=1000,
            finish_reason="stop",
            matched_stop=eos_token_ids,
        )
    def test_finish_length(self):
        self._run_completions_generation(
            max_tokens=5, finish_reason="length", matched_stop=None
        )
        self._run_chat_completions_generation(
            max_tokens=5, finish_reason="length", matched_stop=None
        )
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -1622,6 +1622,9 @@ class CustomTestCase(unittest.TestCase):
            max_retry=max_retry,
        )
    def setUp(self):
        """Announce the test method that is about to run, flushing stdout at once."""
        current_test = self._testMethodName
        print(f"[Test Method] {current_test}", flush=True)
 def dump_bench_raw_result(
    path: str,
--- a/test/srt/openai_server/validation/test_matched_stop.py
+++ b/test/srt/openai_server/validation/test_matched_stop.py
@@ -1,10 +1,8 @@
 import json
 import unittest
 import requests
 from sglang.srt.sampling.sampling_params import MAX_LEN, get_max_seq_length
 from sglang.srt.utils import kill_process_tree
 from sglang.test.kit_matched_stop import MatchedStopMixin
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
@@ -12,15 +10,8 @@ from sglang.test.test_utils import (
    popen_launch_server,
 )
 # Default prompt for this module's request helpers. It asks for a very long
 # story (at least 3,000 words), presumably so that generation does not end on
 # its own before a stop condition or the max_tokens limit applies.
 MANY_NEW_TOKENS_PROMPT = """
 Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
 Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
 Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
 The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
 """
-
+class TestMatchedStop(CustomTestCase, MatchedStopMixin):
 class TestMatchedStop(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
@@ -36,138 +27,6 @@ class TestMatchedStop(CustomTestCase):
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)
    def run_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        stop_regex=None,
        finish_reason=None,
        matched_stop=None,
    ):
        """Send ``prompt`` to ``/v1/completions`` and check how it finished.

        Args:
            prompt: Raw prompt text.
            max_tokens: Value of the ``max_tokens`` field.
            stop: Optional ``stop`` field; omitted from the payload when None.
            stop_regex: Optional ``stop_regex`` field; omitted when None.
            finish_reason: Expected ``finish_reason`` of the first choice.
            matched_stop: Expected ``matched_stop`` of the first choice.

        Raises:
            AssertionError: If either field differs from the expectation.
        """
        payload = {
            "prompt": prompt,
            "model": self.model,
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            payload["stop"] = stop
        if stop_regex is not None:
            payload["stop_regex"] = stop_regex
        response_completions = requests.post(
            self.base_url + "/v1/completions",
            json=payload,
        )
        # Decode the body once and reuse it for printing and for both checks.
        res = response_completions.json()
        print(json.dumps(res))
        print("=" * 100)
        choice = res["choices"][0]
        assert (
            choice["finish_reason"] == finish_reason
        ), f"Expected finish_reason: {finish_reason}, but got: {choice['finish_reason']}"
        assert (
            choice["matched_stop"] == matched_stop
        ), f"Expected matched_stop: {matched_stop}, but got: {choice['matched_stop']}"
    def run_chat_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        stop_regex=None,
        finish_reason=None,
        matched_stop=None,
    ):
        """Send ``prompt`` as a user turn to ``/v1/chat/completions`` and check it.

        Args:
            prompt: Content of the user message (a fixed system message is
                sent before it).
            max_tokens: Value of the ``max_tokens`` field.
            stop: Optional ``stop`` field; omitted from the payload when None.
            stop_regex: Optional ``stop_regex`` field; omitted when None.
            finish_reason: Expected ``finish_reason`` of the first choice.
            matched_stop: Expected ``matched_stop`` of the first choice.

        Raises:
            AssertionError: If either field differs from the expectation.
        """
        chat_payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are a helpful AI assistant"},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            chat_payload["stop"] = stop
        if stop_regex is not None:
            chat_payload["stop_regex"] = stop_regex
        response_chat = requests.post(
            self.base_url + "/v1/chat/completions",
            json=chat_payload,
        )
        # Decode the body once and reuse it for printing and for both checks.
        res = response_chat.json()
        print(json.dumps(res))
        print("=" * 100)
        choice = res["choices"][0]
        assert (
            choice["finish_reason"] == finish_reason
        ), f"Expected finish_reason: {finish_reason}, but got: {choice['finish_reason']}"
        assert (
            choice["matched_stop"] == matched_stop
        ), f"Expected matched_stop: {matched_stop}, but got: {choice['matched_stop']}"
    def test_finish_stop_str(self):
        # Same expectation for both endpoints: stopping on "\n" must be reported
        # as finish_reason "stop" with the stop string echoed in matched_stop.
        expectation = {
            "max_tokens": 1000,
            "stop": "\n",
            "finish_reason": "stop",
            "matched_stop": "\n",
        }
        self.run_completions_generation(**expectation)
        self.run_chat_completions_generation(**expectation)
    def test_finish_stop_regex_str(self):
        # For a stop_regex the expected matched_stop is the pattern itself.
        def regex_case(pattern):
            return {
                "max_tokens": 1000,
                "stop_regex": pattern,
                "finish_reason": "stop",
                "matched_stop": pattern,
            }

        alternation = r"and|or"
        self.run_completions_generation(**regex_case(alternation))
        self.run_chat_completions_generation(**regex_case(alternation))

        # Match a complete sentence
        sentence_end = r"[.!?]\s*$"
        self.run_chat_completions_generation(**regex_case(sentence_end))
    def test_finish_stop_eos(self):
        # No stop / stop_regex is sent; both runs are expected to end with
        # finish_reason "stop" and matched_stop equal to the token id below.
        llama_format_prompt = """
        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
        What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
        """
        shared_expectation = {
            "max_tokens": 1000,
            "finish_reason": "stop",
            "matched_stop": 128009,
        }
        self.run_completions_generation(
            prompt=llama_format_prompt, **shared_expectation
        )
        self.run_chat_completions_generation(
            prompt="What is 2 + 2?", **shared_expectation
        )
    def test_finish_length(self):
        # With only 5 tokens allowed, both endpoints are expected to report a
        # "length" finish and no matched stop.
        expectation = {
            "max_tokens": 5,
            "finish_reason": "length",
            "matched_stop": None,
        }
        self.run_completions_generation(**expectation)
        self.run_chat_completions_generation(**expectation)
 class TestRegexPatternMaxLength(unittest.TestCase):
    @classmethod
--- a/test/srt/test_eagle_infer_beta.py
+++ b/test/srt/test_eagle_infer_beta.py
@@ -3,7 +3,10 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.kit_matched_stop import MatchedStopMixin
 from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
@@ -11,93 +14,50 @@ from sglang.test.test_utils import (
 )
-class TestEagleBS1(CustomTestCase):
+class TestEagleServerBase(CustomTestCase, MatchedStopMixin):
    num_questions = 60
    @classmethod
    def setUpClass(cls):
        """Start one server process for the class and keep its handle on ``cls``.

        Sets ``cls.model``, ``cls.base_url`` and ``cls.process``; the server is
        launched with the EAGLE speculative-decoding flags listed below and a
        single running request.
        """
        cls.model = "meta-llama/Llama-2-7b-chat-hf"
        cls.base_url = DEFAULT_URL_FOR_TEST

        # Flag/value pairs for the speculative-decoding setup, in launch order.
        speculative_flags = [
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-draft-model",
            "lmzheng/sglang-EAGLE-llama2-chat-7B",
            "--speculative-num-steps",
            "5",
            "--speculative-eagle-topk",
            "1",
            "--speculative-num-draft-tokens",
            "6",
        ]
        server_flags = [
            "--trust-remote-code",
            "--attention-backend",
            "triton",
            "--enable-beta-spec",
            *speculative_flags,
            "--max-running-requests",
            "1",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=server_flags,
        )
    @classmethod
    def tearDownClass(cls):
        """Kill the process tree rooted at the server started for this class."""
        server_pid = cls.process.pid
        kill_process_tree(server_pid)
    def test_gsm8k(self):
        # Runs the few-shot GSM8K evaluation against the launched server and
        # requires an accuracy above 0.33.
        eval_args = SimpleNamespace()
        eval_args.num_shots = 5
        eval_args.data_path = None
        eval_args.num_questions = self.num_questions
        eval_args.max_new_tokens = 512
        eval_args.parallel = 128
        eval_args.host = "http://127.0.0.1"
        # The port is whatever follows the last ":" of the base URL.
        eval_args.port = int(self.base_url.rsplit(":", 1)[-1])

        metrics = run_eval(eval_args)
        print(f"TestEagleBS1 -- {metrics=}")

        # 0.3333 for 60 questions; 0.234 for 1319 questions
        accuracy = metrics["accuracy"]
        self.assertGreater(accuracy, 0.33)
 class TestEagleLargeBS(CustomTestCase):
    num_questions = 10000
    max_running_requests = 64
-    other_args = [
+    attention_backend = "triton"
-        "--trust-remote-code",
+    spec_steps = 5
-        "--attention-backend",
+    spec_topk = 1
-        "triton",
+    spec_draft_tokens = 6
-        "--enable-beta-spec",
+    page_size = 1
-        "--speculative-algorithm",
+    other_launch_args = []
-        "EAGLE",
+    model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
-        "--speculative-draft-model",
+    draft_model = DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST
        "lmzheng/sglang-EAGLE-llama2-chat-7B",
        "--speculative-num-steps",
        "5",
        "--speculative-eagle-topk",
        "1",
        "--speculative-num-draft-tokens",
        "6",
        "--mem-fraction-static",
        "0.75",
        "--max-running-requests",
        str(max_running_requests),
        "--cuda-graph-bs",
        *[str(i) for i in range(1, max_running_requests + 1)],
    ]
    @classmethod
    def setUpClass(cls):
        """Start one EAGLE beta-spec server configured from class attributes.

        The launch flags are built from ``attention_backend``, ``draft_model``,
        ``spec_steps``, ``spec_topk``, ``spec_draft_tokens``, ``page_size`` and
        ``max_running_requests``; ``other_launch_args`` is appended last. The
        resulting process handle is stored on ``cls.process``.
        """
        # NOTE(review): this assignment replaces any class-level ``model``
        # value on the class - confirm that the hard-coded name is intended.
        cls.model = "meta-llama/Llama-2-7b-chat-hf"
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_args = [
            "--enable-beta-spec",
            "--trust-remote-code",
            "--attention-backend",
            cls.attention_backend,
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-draft-model",
            cls.draft_model,
            # Numeric settings are stringified so every element of the argument
            # list is a string, like page_size and max_running_requests below.
            "--speculative-num-steps",
            str(cls.spec_steps),
            "--speculative-eagle-topk",
            str(cls.spec_topk),
            "--speculative-num-draft-tokens",
            str(cls.spec_draft_tokens),
            "--page-size",
            str(cls.page_size),
            "--mem-fraction-static",
            "0.75",
            "--max-running-requests",
            str(cls.max_running_requests),
            # One --cuda-graph-bs value per batch size 1..max_running_requests.
            "--cuda-graph-bs",
            *[str(i) for i in range(1, cls.max_running_requests + 1)],
        ]
        # Extend the local list only, so the shared class-level
        # ``other_launch_args`` list is never mutated.
        launch_args.extend(cls.other_launch_args)
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_args,
        )
    @classmethod
@@ -108,7 +68,7 @@ class TestEagleLargeBS(CustomTestCase):
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
-            num_questions=self.num_questions,
+            num_questions=1000,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",