[Eagle] Fix kernel call after updating speculative sampling kernels (#7231)

2025-06-16 07:25:59 -07:00
parent 7ddf8e83d2
commit 53a525bf33
7 changed files with 24 additions and 33 deletions
--- a/test/srt/test_fa3.py
+++ b/test/srt/test_fa3.py
@@ -143,7 +143,7 @@ class TestFlashAttention3SpeculativeDecode(BaseFlashAttentionTest):
        args.extend(
            [
                "--cuda-graph-max-bs",
-                "2",
+                "4",
                "--speculative-algorithm",
                "EAGLE3",
                "--speculative-draft",
@@ -169,7 +169,7 @@ class TestFlashAttention3SpeculativeDecodeTopk(BaseFlashAttentionTest):
    model = DEFAULT_MODEL_NAME_FOR_TEST
    accuracy_threshold = 0.65
    speculative_decode = True
-    spec_decode_threshold = 1.5
+    spec_decode_threshold = 1.6

    @classmethod
    def get_server_args(cls):
@@ -177,7 +177,7 @@ class TestFlashAttention3SpeculativeDecodeTopk(BaseFlashAttentionTest):
        args.extend(
            [
                "--cuda-graph-max-bs",
-                "2",
+                "4",
                "--speculative-algorithm",
                "EAGLE3",
                "--speculative-draft",
@@ -201,7 +201,7 @@ class TestFlashAttention3MLASpeculativeDecode(BaseFlashAttentionTest):
    model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
    accuracy_threshold = 0.60
    speculative_decode = True
-    spec_decode_threshold = 1.5
+    spec_decode_threshold = 2.5

    @classmethod
    def get_server_args(cls):
@@ -209,7 +209,7 @@ class TestFlashAttention3MLASpeculativeDecode(BaseFlashAttentionTest):
        args.extend(
            [
                "--cuda-graph-max-bs",
-                "2",
+                "4",
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft",
@@ -233,7 +233,7 @@ class TestFlashAttention3MLASpeculativeDecodeTopk(BaseFlashAttentionTest):
    model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
    accuracy_threshold = 0.60
    speculative_decode = True
-    spec_decode_threshold = 1.5
+    spec_decode_threshold = 2.95

    @classmethod
    def get_server_args(cls):
@@ -241,7 +241,7 @@ class TestFlashAttention3MLASpeculativeDecodeTopk(BaseFlashAttentionTest):
        args.extend(
            [
                "--cuda-graph-max-bs",
-                "2",
+                "4",
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft",