[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)

This commit is contained in:
Cheng Wan
2025-08-01 01:20:03 -07:00
committed by GitHub
parent c8d3a402c1
commit 6c88f6c8d9
38 changed files with 342 additions and 299 deletions

View File

@@ -33,7 +33,8 @@ class TestDeepseek(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--enable-two-batch-overlap",
"--ep-num-redundant-experts",
"32",
@@ -88,7 +89,8 @@ class TestDeepseekMTP(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--enable-two-batch-overlap",
"--ep-num-redundant-experts",
"32",

View File

@@ -31,7 +31,8 @@ class TestPureDP(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
@@ -77,7 +78,8 @@ class TestHybridDPTP(CustomTestCase):
"--enable-dp-attention",
"--dp",
"2",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
@@ -118,7 +120,8 @@ class TestTP(CustomTestCase):
"--trust-remote-code",
"--tp",
"4",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
@@ -166,7 +169,8 @@ class TestNoGatherdBuffer(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -212,7 +216,8 @@ class TestTBO(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--enable-two-batch-overlap",
"--cuda-graph-max-bs",
"128",
@@ -259,7 +264,8 @@ class TestMTP(CustomTestCase):
"--dp",
"2",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -326,7 +332,8 @@ class TestMTPWithTBO(CustomTestCase):
"--dp-size",
"4",
"--enable-two-batch-overlap",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--trust-remote-code",
"--speculative-algorithm",
"EAGLE",

View File

@@ -34,7 +34,8 @@ class _BaseTestDynamicEPLB(CustomTestCase):
"--dp",
"2",
"--enable-dp-attention",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph",
@@ -96,8 +97,7 @@ class TestStaticEPLB(CustomTestCase):
trust_remote_code=True,
ep_num_redundant_experts=4,
enable_dp_attention=True,
-enable_deepep_moe=True,
-deepep_mode="normal",
+moe_a2a_backend="deepep",
disable_cuda_graph=True,
expert_distribution_recorder_mode="stat",
tp_size=2,

View File

@@ -407,9 +407,8 @@ class Test10(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -449,9 +448,8 @@ class Test11(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -491,9 +489,8 @@ class Test12(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -532,9 +529,8 @@ class Test13(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -576,9 +572,8 @@ class Test14(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -620,9 +615,8 @@ class Test15(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -663,9 +657,8 @@ class Test16(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -706,9 +699,8 @@ class Test17(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -751,9 +743,8 @@ class Test18(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -796,9 +787,8 @@ class Test19(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],
@@ -835,7 +825,8 @@ class Test20(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -873,7 +864,8 @@ class Test21(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -911,7 +903,8 @@ class Test22(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -948,7 +941,8 @@ class Test23(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -988,7 +982,8 @@ class Test24(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -1028,7 +1023,8 @@ class Test25(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -1067,7 +1063,8 @@ class Test26(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -1106,7 +1103,8 @@ class Test27(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -1147,7 +1145,8 @@ class Test28(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -1188,7 +1187,8 @@ class Test29(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
],
)
@@ -1701,9 +1701,8 @@ class Test40(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -1755,9 +1754,8 @@ class Test41(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -1809,9 +1807,8 @@ class Test42(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -1862,9 +1859,8 @@ class Test43(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -1918,9 +1914,8 @@ class Test44(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -1974,9 +1969,8 @@ class Test45(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -2029,9 +2023,8 @@ class Test46(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -2084,9 +2077,8 @@ class Test47(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -2141,9 +2133,8 @@ class Test48(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -2198,9 +2189,8 @@ class Test49(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-deepep-moe",
-"--deepep-mode",
-"auto",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
@@ -2249,7 +2239,8 @@ class Test50(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2297,7 +2288,8 @@ class Test51(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2345,7 +2337,8 @@ class Test52(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2392,7 +2385,8 @@ class Test53(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2442,7 +2436,8 @@ class Test54(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2492,7 +2487,8 @@ class Test55(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2541,7 +2537,8 @@ class Test56(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2590,7 +2587,8 @@ class Test57(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2641,7 +2639,8 @@ class Test58(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
@@ -2692,7 +2691,8 @@ class Test59(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
-"--enable-ep-moe",
+"--ep",
+"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",

View File

@@ -27,7 +27,8 @@ class TestPureTP(CustomTestCase):
"--trust-remote-code",
"--tp",
"2",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--disable-cuda-graph",
],
)
@@ -65,7 +66,8 @@ class TestDPAttn(unittest.TestCase):
"--dp",
"2",
"--enable-dp-attention",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph",

View File

@@ -31,7 +31,8 @@ class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--cuda-graph-max-bs",
"128",
],

View File

@@ -27,7 +27,6 @@ class TestEpMoE(CustomTestCase):
"2",
"--ep-size",
"2",
-"--enable-ep-moe",
],
)
@@ -75,7 +74,6 @@ class TestEpMoEFP8(CustomTestCase):
"2",
"--ep-size",
"2",
-"--enable-ep-moe",
"--quantization",
"fp8",
],

View File

@@ -33,7 +33,8 @@ class TestTwoBatchOverlap(unittest.TestCase):
"--dp",
"2",
"--enable-dp-attention",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph
@@ -122,7 +123,8 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
"--dp",
"2",
"--enable-dp-attention",
-"--enable-deepep-moe",
+"--moe-a2a-backend",
+"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph