DP Attention with Auto DeepEP Dispatch (#7222)

This commit is contained in:
Cheng Wan
2025-07-05 01:54:24 -07:00
committed by GitHub
parent 75354d9ae9
commit 8fc910db03
13 changed files with 136 additions and 90 deletions

View File

@@ -539,8 +539,9 @@ class Test10(CustomTestCase):
"8",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -593,8 +594,9 @@ class Test11(CustomTestCase):
"4",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -647,8 +649,9 @@ class Test12(CustomTestCase):
"8",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -700,8 +703,9 @@ class Test13(CustomTestCase):
"1",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -756,8 +760,9 @@ class Test14(CustomTestCase):
"1",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -812,8 +817,9 @@ class Test15(CustomTestCase):
"1",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -867,8 +873,9 @@ class Test16(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -922,8 +929,9 @@ class Test17(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -979,8 +987,9 @@ class Test18(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -1036,8 +1045,9 @@ class Test19(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"128",
],
)
@@ -2213,8 +2223,11 @@ class Test40(CustomTestCase):
"8",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2277,8 +2290,11 @@ class Test41(CustomTestCase):
"4",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2341,8 +2357,11 @@ class Test42(CustomTestCase):
"8",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2404,8 +2423,11 @@ class Test43(CustomTestCase):
"1",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2470,8 +2492,11 @@ class Test44(CustomTestCase):
"1",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2536,8 +2561,11 @@ class Test45(CustomTestCase):
"1",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2601,8 +2629,11 @@ class Test46(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2666,8 +2697,11 @@ class Test47(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2733,8 +2767,11 @@ class Test48(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",
@@ -2800,8 +2837,11 @@ class Test49(CustomTestCase):
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
-"normal",
-"--disable-cuda-graph",
+"auto",
+"--cuda-graph-max-bs",
+"32",
+"--max-running-requests",
+"32",
"--speculative-algo",
"NEXTN",
"--speculative-draft",