[fix] enable flashmla when using draft model P/D attention selection (#11012)

This commit is contained in:
Hank Han
2025-10-04 20:59:34 +08:00
committed by GitHub
parent d01b921482
commit 666da3d59f
3 changed files with 14 additions and 5 deletions

View File

@@ -103,11 +103,11 @@ class TestFlashMLAMTP(CustomTestCase):
"--speculative-draft-model-path",
"lmsys/sglang-ci-dsv3-test-NextN",
"--speculative-num-steps",
"1",
"2",
"--speculative-eagle-topk",
"1",
"--speculative-num-draft-tokens",
"2",
"3",
"--attention-backend",
"flashmla",
]
@@ -146,7 +146,7 @@ class TestFlashMLAMTP(CustomTestCase):
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 1.8)
self.assertGreater(avg_spec_accept_length, 2.4)
if __name__ == "__main__":