From f127355a3066ff3d4c8dea7899a2e25335fd9890 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Wed, 28 May 2025 07:32:05 +0800 Subject: [PATCH] Add batch test for draft extend (#6672) --- test/srt/test_eagle_infer.py | 62 ++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py index 4384d2c64..23c7ebdfe 100644 --- a/test/srt/test_eagle_infer.py +++ b/test/srt/test_eagle_infer.py @@ -577,5 +577,67 @@ class TestEAGLEServerTriton(TestEAGLEServer): ) +class TestEAGLEDraftExtend(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, + "--speculative-num-steps", + 1, + "--speculative-eagle-topk", + 1, + "--speculative-num-draft-tokens", + 2, + "--max-running-requests", + 4, + "--attention-backend", + "fa3", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_one_batch_accept_length(self): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + url = self.base_url + "/generate" + data = { + "text": prompts, + "sampling_params": { + "temperature": 0, + "max_new_tokens": 512, + }, + } + response = requests.post(url, json=data) + self.assertEqual(response.status_code, 200) + outputs = response.json() + for i in range(len(prompts)): + output = outputs[i] + if "spec_verify_ct" in output["meta_info"]: + acc_length = ( + output["meta_info"]["completion_tokens"] + / output["meta_info"]["spec_verify_ct"] + ) + else: + acc_length = 1.0 + + print(f"{acc_length=}") + self.assertGreater(acc_length, 1.50) + + if __name__ == "__main__": unittest.main()