Simplify eagle tests and TP sync in grammar backend (#4066)

Author:    Lianmin Zheng
Date:      2025-03-04 13:40:40 -08:00
Committer: GitHub
Parent:    03b0364f76
Commit:    77a3954bf7

14 changed files with 122 additions and 126 deletions
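The "TP sync" half of this change concerns keeping grammar-constrained decoding consistent across tensor-parallel workers. As a rough illustration of what such a sync involves (a hypothetical sketch assuming a rank-0-owned vocab mask and torch.distributed; not the actual sglang code):

import torch
import torch.distributed as dist

def sync_grammar_vocab_mask(vocab_mask: torch.Tensor, tp_group=None) -> torch.Tensor:
    # Hypothetical sketch: rank 0 owns the grammar state and computes the
    # vocab bitmask; broadcasting it keeps every TP rank sampling under
    # identical constraints. Illustrative only, not sglang's implementation.
    if dist.is_initialized() and dist.get_world_size(group=tp_group) > 1:
        dist.broadcast(vocab_mask, src=0, group=tp_group)
    return vocab_mask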


@@ -39,7 +39,7 @@ class TestEAGLEEngine(unittest.TestCase):
         self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"]
         ref_engine.shutdown()
 
-    def test_eagle_accuracy(self):
+    def test_correctness(self):
         configs = [
             self.BASE_CONFIG,
             {**self.BASE_CONFIG, "disable_cuda_graph": True},
@@ -95,67 +95,6 @@ class TestEAGLEEngine(unittest.TestCase):
             print("-" * 40)
 
-
-class TestEAGLEEngineTokenMap(unittest.TestCase):
-    BASE_CONFIG = {
-        "model_path": "meta-llama/Meta-Llama-3-8B-Instruct",
-        "speculative_draft_model_path": "lmzheng/sglang-EAGLE-LLaMA3-Instruct-8B",
-        "speculative_algorithm": "EAGLE",
-        "speculative_num_steps": 5,
-        "speculative_eagle_topk": 8,
-        "speculative_num_draft_tokens": 64,
-        "mem_fraction_static": 0.7,
-        "cuda_graph_max_bs": 4,
-        "dtype": "float16",
-    }
-
-    def setUp(self):
-        self.prompt = "Today is a sunny day and I like"
-        self.sampling_params = {"temperature": 0, "max_new_tokens": 8}
-        ref_engine = sgl.Engine(model_path=self.BASE_CONFIG["model_path"])
-        self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"]
-        ref_engine.shutdown()
-
-    def test_token_map_accuracy(self):
-        configs = [
-            self.BASE_CONFIG,
-            {
-                **self.BASE_CONFIG,
-                "speculative_token_map": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt",
-            },
-        ]
-
-        for config in configs:
-            print("testing config: ", config)
-            with self.subTest(cuda_graph="enabled"):
-                engine = sgl.Engine(**config)
-                try:
-                    self._test_basic_generation(engine)
-                    self._test_batch_generation(engine)
-                finally:
-                    engine.shutdown()
-
-    def _test_basic_generation(self, engine):
-        output = engine.generate(self.prompt, self.sampling_params)["text"]
-        print(f"{output=}, {self.ref_output=}")
-        self.assertEqual(output, self.ref_output)
-
-    def _test_batch_generation(self, engine):
-        prompts = [
-            "Hello, my name is",
-            "The president of the United States is",
-            "The capital of France is",
-            "The future of AI is",
-        ]
-        params = {"temperature": 0, "max_new_tokens": 30}
-        outputs = engine.generate(prompts, params)
-
-        for prompt, output in zip(prompts, outputs):
-            print(f"Prompt: {prompt}")
-            print(f"Generated: {output['text']}")
-            print("-" * 40)
-
-
 prompts = [
     "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]",
     '[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]',
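The speculative_token_map option used in these tests points at a frequency-ranked token list (FR-Spec), which restricts the draft model's output head to a high-frequency vocabulary subset. A minimal sketch of the idea, with illustrative names rather than sglang's implementation:

import torch

def expand_draft_logits(
    reduced_logits: torch.Tensor,  # [..., k]: draft logits over the frequent subset
    token_map: torch.Tensor,       # [k]: full-vocab ids of the frequent tokens
    vocab_size: int,
) -> torch.Tensor:
    # Scatter reduced-vocab logits back to their full-vocab positions;
    # tokens outside the frequent subset get -inf and are never drafted.
    # A file like freq_32768.pt would supply a token_map of length 32768.
    full = reduced_logits.new_full(
        (*reduced_logits.shape[:-1], vocab_size), float("-inf")
    )
    full[..., token_map] = reduced_logits
    return full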
@@ -222,7 +161,7 @@ class TestEAGLEServer(unittest.TestCase):
                     "max_new_tokens": 1024,
                 },
             }
-            # set timeout = 1s,mock disconnected
+            # set timeout = 1s, mock disconnected
            requests.post(url, json=data, timeout=1)
         except Exception as e:
             print(e)
@@ -273,18 +212,71 @@ class TestEAGLEServerTriton(TestEAGLEServer):
                 "--speculative-num-steps",
                 "5",
                 "--speculative-eagle-topk",
-                "8",
+                "4",
                 "--speculative-num-draft-tokens",
-                "64",
+                "8",
                 "--mem-fraction-static",
                 "0.7",
                 "--attention-backend",
                 "triton",
                 "--cuda-graph-max-bs",
-                "32",
+                "16",
             ],
         )
 
 
+class TestEAGLEEngineTokenMap(unittest.TestCase):
+    def setUp(self):
+        self.prompt = "Today is a sunny day and I like"
+        self.sampling_params = {"temperature": 0, "max_new_tokens": 8}
+        ref_engine = sgl.Engine(
+            model_path="meta-llama/Meta-Llama-3-8B-Instruct", cuda_graph_max_bs=2
+        )
+        self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"]
+        ref_engine.shutdown()
+
+    def test_correctness(self):
+        config = {
+            "model_path": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "speculative_draft_model_path": "lmsys/sglang-EAGLE-LLaMA3-Instruct-8B",
+            "speculative_algorithm": "EAGLE",
+            "speculative_num_steps": 5,
+            "speculative_eagle_topk": 4,
+            "speculative_num_draft_tokens": 8,
+            "speculative_token_map": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt",
+            "mem_fraction_static": 0.7,
+            "cuda_graph_max_bs": 4,
+            "dtype": "bfloat16",
+        }
+
+        engine = sgl.Engine(**config)
+        try:
+            self._test_basic_generation(engine)
+            self._test_batch_generation(engine)
+        finally:
+            engine.shutdown()
+
+    def _test_basic_generation(self, engine):
+        output = engine.generate(self.prompt, self.sampling_params)["text"]
+        print(f"{output=}, {self.ref_output=}")
+        self.assertEqual(output, self.ref_output)
+
+    def _test_batch_generation(self, engine):
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        params = {"temperature": 0, "max_new_tokens": 30}
+        outputs = engine.generate(prompts, params)
+
+        for prompt, output in zip(prompts, outputs):
+            print(f"Prompt: {prompt}")
+            print(f"Generated: {output['text']}")
+            print("-" * 40)
+
+
 if __name__ == "__main__":
     unittest.main()
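For local spot-checking, the updated Triton-backend server flags map onto sgl.Engine keyword arguments roughly as follows (a sketch: the model paths are assumed from the other tests in this file, and the flag-to-kwarg mapping is the usual dash-to-underscore convention):

import sglang as sgl

# Offline-engine equivalent of the updated TestEAGLEServerTriton flags.
engine = sgl.Engine(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed target model
    speculative_draft_model_path="lmsys/sglang-EAGLE-LLaMA3-Instruct-8B",  # assumed draft model
    speculative_algorithm="EAGLE",
    speculative_num_steps=5,
    speculative_eagle_topk=4,
    speculative_num_draft_tokens=8,
    mem_fraction_static=0.7,
    attention_backend="triton",
    cuda_graph_max_bs=16,
)
try:
    out = engine.generate(
        "Today is a sunny day and I like",
        {"temperature": 0, "max_new_tokens": 8},
    )
    print(out["text"])
finally:
    engine.shutdown()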