[Test] Add basic matched stop for beta eagle (#11833)

2025-10-20 01:17:00 +08:00
parent 48738af7f9
commit 7a020e0f3b
4 changed files with 201 additions and 222 deletions
--- a/test/srt/test_eagle_infer_beta.py
+++ b/test/srt/test_eagle_infer_beta.py
@@ -3,7 +3,10 @@ from types import SimpleNamespace

 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.kit_matched_stop import MatchedStopMixin
 from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
@@ -11,93 +14,50 @@ from sglang.test.test_utils import (
 )


-class TestEagleBS1(CustomTestCase):
-    num_questions = 60
-
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "meta-llama/Llama-2-7b-chat-hf"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--attention-backend",
-                "triton",
-                "--enable-beta-spec",
-                "--speculative-algorithm",
-                "EAGLE",
-                "--speculative-draft-model",
-                "lmzheng/sglang-EAGLE-llama2-chat-7B",
-                "--speculative-num-steps",
-                "5",
-                "--speculative-eagle-topk",
-                "1",
-                "--speculative-num-draft-tokens",
-                "6",
-                "--max-running-requests",
-                "1",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=self.num_questions,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval(args)
-        print(f"TestEagleBS1 -- {metrics=}")
-        self.assertGreater(
-            metrics["accuracy"], 0.33
-        )  # 0.3333 for 60 questions; 0.234 for 1319 questions
-
-
-class TestEagleLargeBS(CustomTestCase):
-    num_questions = 10000
+class TestEagleServerBase(CustomTestCase, MatchedStopMixin):
    max_running_requests = 64
-    other_args = [
-        "--trust-remote-code",
-        "--attention-backend",
-        "triton",
-        "--enable-beta-spec",
-        "--speculative-algorithm",
-        "EAGLE",
-        "--speculative-draft-model",
-        "lmzheng/sglang-EAGLE-llama2-chat-7B",
-        "--speculative-num-steps",
-        "5",
-        "--speculative-eagle-topk",
-        "1",
-        "--speculative-num-draft-tokens",
-        "6",
-        "--mem-fraction-static",
-        "0.75",
-        "--max-running-requests",
-        str(max_running_requests),
-        "--cuda-graph-bs",
-        *[str(i) for i in range(1, max_running_requests + 1)],
-    ]
+    attention_backend = "triton"
+    spec_steps = 5
+    spec_topk = 1
+    spec_draft_tokens = 6
+    page_size = 1
+    other_launch_args = []
+    model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
+    draft_model = DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST

    @classmethod
    def setUpClass(cls):
-        cls.model = "meta-llama/Llama-2-7b-chat-hf"
        cls.base_url = DEFAULT_URL_FOR_TEST
+        launch_args = [
+            "--enable-beta-spec",
+            "--trust-remote-code",
+            "--attention-backend",
+            cls.attention_backend,
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-draft-model",
+            cls.draft_model,
+            "--speculative-num-steps",
+            cls.spec_steps,
+            "--speculative-eagle-topk",
+            cls.spec_topk,
+            "--speculative-num-draft-tokens",
+            cls.spec_draft_tokens,
+            "--page-size",
+            str(cls.page_size),
+            "--mem-fraction-static",
+            "0.75",
+            "--max-running-requests",
+            str(cls.max_running_requests),
+            "--cuda-graph-bs",
+            *[str(i) for i in range(1, cls.max_running_requests + 1)],
+        ]
+        launch_args.extend(cls.other_launch_args)
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=cls.other_args,
+            other_args=launch_args,
        )

    @classmethod
@@ -108,7 +68,7 @@ class TestEagleLargeBS(CustomTestCase):
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
-            num_questions=self.num_questions,
+            num_questions=1000,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",