[Eagle] Fix kernel call after updating speculative sampling kernels (#7231)

This commit is contained in:
Lianmin Zheng
2025-06-16 07:25:59 -07:00
committed by GitHub
parent 7ddf8e83d2
commit 53a525bf33
7 changed files with 24 additions and 33 deletions

View File

@@ -143,7 +143,7 @@ class TestFlashAttention3SpeculativeDecode(BaseFlashAttentionTest):
args.extend(
[
"--cuda-graph-max-bs",
-"2",
+"4",
"--speculative-algorithm",
"EAGLE3",
"--speculative-draft",
@@ -169,7 +169,7 @@ class TestFlashAttention3SpeculativeDecodeTopk(BaseFlashAttentionTest):
model = DEFAULT_MODEL_NAME_FOR_TEST
accuracy_threshold = 0.65
speculative_decode = True
-spec_decode_threshold = 1.5
+spec_decode_threshold = 1.6
@classmethod
def get_server_args(cls):
@@ -177,7 +177,7 @@ class TestFlashAttention3SpeculativeDecodeTopk(BaseFlashAttentionTest):
args.extend(
[
"--cuda-graph-max-bs",
-"2",
+"4",
"--speculative-algorithm",
"EAGLE3",
"--speculative-draft",
@@ -201,7 +201,7 @@ class TestFlashAttention3MLASpeculativeDecode(BaseFlashAttentionTest):
model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
accuracy_threshold = 0.60
speculative_decode = True
-spec_decode_threshold = 1.5
+spec_decode_threshold = 2.5
@classmethod
def get_server_args(cls):
@@ -209,7 +209,7 @@ class TestFlashAttention3MLASpeculativeDecode(BaseFlashAttentionTest):
args.extend(
[
"--cuda-graph-max-bs",
-"2",
+"4",
"--speculative-algorithm",
"EAGLE",
"--speculative-draft",
@@ -233,7 +233,7 @@ class TestFlashAttention3MLASpeculativeDecodeTopk(BaseFlashAttentionTest):
model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
accuracy_threshold = 0.60
speculative_decode = True
-spec_decode_threshold = 1.5
+spec_decode_threshold = 2.95
@classmethod
def get_server_args(cls):
@@ -241,7 +241,7 @@ class TestFlashAttention3MLASpeculativeDecodeTopk(BaseFlashAttentionTest):
args.extend(
[
"--cuda-graph-max-bs",
-"2",
+"4",
"--speculative-algorithm",
"EAGLE",
"--speculative-draft",