Fix a draft model accuracy bug in eagle; support step=1; return logprob in eagle (#4134)

Co-authored-by: Sehoon Kim <kssteven418@gmail.com>
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: Sehoon Kim <sehoon@x.ai>
2025-03-06 06:13:59 -08:00
parent 3a3918121f
commit bc1534ff32
11 changed files with 304 additions and 106 deletions
--- a/test/srt/test_gptqmodel_dynamic.py
+++ b/test/srt/test_gptqmodel_dynamic.py
@@ -143,11 +143,11 @@ class TestGPTQModelDynamic(unittest.TestCase):

        print(f"result = `{result}`")

-        assert "paris" in result["text"].lower()
+        self.assertIn("paris", result["text"].lower())

        throughput = max_tokens / (tok - tic)
        print(f"Throughput: {throughput} tokens/s")
-        assert throughput >= 140
+        self.assertGreaterEqual(throughput, 140)

    def test_gptq_module(self):
        check_quant_method(self.MODEL_PATH, use_marlin_kernel=False)