[fix]: fix cutlass moe ut and Opt H20 cutlass groupGemm performance (#9272)

Author: kousakawang
Co-authored-by: wanghanpei <wanghanpei@bytedance.com>
Date: 2025-08-18 04:09:49 +08:00
Committed by: GitHub
Parent: b3c1f2e4f2
Commit: 0fc54b971e
3 changed files with 132 additions and 41 deletions

@@ -153,9 +153,8 @@ def run_test(tp_size, batch_size, model_config, check=False):
         x,
         w1,
         w2,
-        topk_weights,
-        topk_ids,
-        inplace=False,  # Use False for benchmarking to avoid side effects if run multiple times
+        (topk_weights, topk_ids, "dummy"),
+        inplace=False,
         activation="silu",  # Assuming SiLU activation common in MoEs
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
@@ -221,8 +220,7 @@ def run_test(tp_size, batch_size, model_config, check=False):
         x,
         w1,  # Original shape
         w2,  # Original shape
-        topk_weights,
-        topk_ids,
+        (topk_weights, topk_ids, "dummy"),
         inplace=False,  # Important: Use False to get output tensor
         activation="silu",
         use_fp8_w8a8=True,
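
Both hunks above adapt the benchmark to a fused_experts signature that takes the top-k routing result as a single positional tuple instead of separate topk_weights and topk_ids arguments; the "dummy" string fills the tuple's third slot, which this code path does not read. A minimal sketch of the repacking, assuming the tuple follows a (topk_weights, topk_ids, router_logits) layout (the field names and wrapper below are illustrative assumptions, not code from the diff):

# Sketch only: a NamedTuple standing in for the top-k output bundle, plus a
# wrapper showing old-style arguments repacked for the new-style call.
from typing import Any, NamedTuple

class TopKOutput(NamedTuple):
    topk_weights: Any   # [num_tokens, topk] float tensor in the real benchmark
    topk_ids: Any       # [num_tokens, topk] int tensor in the real benchmark
    router_logits: Any  # unused on this path, hence the "dummy" placeholder

def call_fused_experts(fused_experts, x, w1, w2, topk_weights, topk_ids, **kwargs):
    # Old call: fused_experts(x, w1, w2, topk_weights, topk_ids, **kwargs)
    # New call: the two routing tensors travel together as one tuple.
    topk_output = TopKOutput(topk_weights, topk_ids, "dummy")
    return fused_experts(x, w1, w2, topk_output, **kwargs)
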
@@ -266,7 +264,7 @@ if __name__ == "__main__":
"--batch-sizes",
type=int,
nargs="+",
default=[1, 4, 8, 16, 32, 64, 128, 256, 512], # Adjusted default
default=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024], # Adjusted default
help="List of batch sizes to test",
)
parser.add_argument("--check", action="store_true", help="Enable check mode")