diff --git a/benchmark/kernels/fused_moe_triton/benchmark_ep_pre_reorder_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_ep_pre_reorder_triton.py
deleted file mode 100644
index 89c7ce067..000000000
--- a/benchmark/kernels/fused_moe_triton/benchmark_ep_pre_reorder_triton.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import argparse
-import itertools
-
-import pandas as pd
-import torch
-import triton
-
-from sglang.srt.layers.moe.ep_moe.kernels import pre_reorder_triton_kernel
-
-
-def benchmark_pre_reorder(batch_size, topk, model_config):
-    hidden_size = model_config["hidden_size"]
-    block_size = model_config["block_size"]
-    expert_range = model_config["expert_range"]
-
-    input_ptr = torch.randn(batch_size, hidden_size, dtype=torch.float16, device="cuda")
-    gateup_input_ptr = torch.zeros(
-        batch_size * topk, hidden_size, dtype=torch.float16, device="cuda"
-    )
-    src2dst_ptr = torch.randint(
-        0, batch_size * topk, (batch_size, topk), dtype=torch.int32, device="cuda"
-    )
-    topk_ids_ptr = torch.randint(
-        expert_range[0],
-        expert_range[1] + 1,
-        (batch_size, topk),
-        dtype=torch.int32,
-        device="cuda",
-    )
-    a1_scales_ptr = torch.rand(
-        expert_range[1] - expert_range[0] + 1, dtype=torch.float32, device="cuda"
-    )
-
-    input_ptr = input_ptr.view(-1)
-    gateup_input_ptr = gateup_input_ptr.view(-1)
-    src2dst_ptr = src2dst_ptr.view(-1)
-    topk_ids_ptr = topk_ids_ptr.view(-1)
-
-    def run_kernel():
-        pre_reorder_triton_kernel[(batch_size,)](
-            input_ptr,
-            gateup_input_ptr,
-            src2dst_ptr,
-            topk_ids_ptr,
-            a1_scales_ptr,
-            expert_range[0],
-            expert_range[1],
-            topk,
-            hidden_size,
-            block_size,
-            use_per_token_if_dynamic=True,
-        )
-
-    for _ in range(10):
-        run_kernel()
-    torch.cuda.synchronize()
-
-    ms, _, _ = triton.testing.do_bench(run_kernel, quantiles=[0.5, 0.2, 0.8])
-    return ms
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--hidden-size", type=int, required=True)
-    parser.add_argument("--block-size", type=int, default=512)
-    args = parser.parse_args()
-
-    model_config = {
-        "hidden_size": args.hidden_size,
-        "block_size": args.block_size,
-        "expert_range": (0, 255),
-    }
-
-    batch_sizes = [64, 128, 256, 512, 640, 768, 1024]
-    topks = [2, 4, 8]
-    configs = list(itertools.product(batch_sizes, topks))
-
-    # Prepare results dict: keys = topk, each row is indexed by batch_size
-    results_dict = {topk: {} for topk in topks}
-
-    for batch_size, topk in configs:
-        ms = benchmark_pre_reorder(batch_size, topk, model_config)
-        results_dict[topk][batch_size] = ms
-
-    # Build dataframe
-    df = pd.DataFrame(
-        {
-            "batch_size": batch_sizes,
-            **{
-                f"TopK={topk}": [results_dict[topk].get(bs, None) for bs in batch_sizes]
-                for topk in topks
-            },
-        }
-    )
-
-    print("\npre-reorder-performance:")
-    print(df.to_string(index=False, float_format="%.6f"))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
index bf3f80bb5..2b4faa24b 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -37,11 +37,15 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-        E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        E = config.text_config.num_local_experts
+        topk = config.text_config.num_experts_per_tok
+        intermediate_size = config.text_config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in [
         "Grok1ForCausalLM",
         "Grok1ImgGen",
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
index f6ddeb13b..390d33f56 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
@@ -51,6 +51,11 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        E = config.text_config.num_local_experts
+        topk = config.text_config.num_experts_per_tok
+        intermediate_size = config.text_config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in [
         "Grok1ForCausalLM",
         "Grok1ImgGen",
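
Note (illustrative, not part of the patch): both new Llama4ForConditionalGeneration branches read the MoE shape parameters from the nested text_config, since Llama 4 checkpoints are multimodal and keep the language-model hyperparameters one level below the top-level config object. A minimal sketch of that lookup, assuming a Hugging Face AutoConfig; the model id and tp_size below are placeholders for illustration, not values taken from the benchmarks:

    from transformers import AutoConfig

    # Placeholder checkpoint; any Llama 4 model with a nested text_config works the same way.
    config = AutoConfig.from_pretrained("meta-llama/Llama-4-Scout-17B-16E-Instruct")

    if config.architectures[0] == "Llama4ForConditionalGeneration":
        # MoE hyperparameters live on the nested text_config, not the top level.
        E = config.text_config.num_local_experts  # routed experts per MoE layer
        topk = config.text_config.num_experts_per_tok  # experts activated per token
        intermediate_size = config.text_config.intermediate_size
        # The fused gate/up projection is 2x intermediate_size wide and is
        # split evenly across the tensor-parallel ranks.
        tp_size = 8  # assumed TP degree for this sketch
        shard_intermediate_size = 2 * intermediate_size // tp_size
        print(E, topk, shard_intermediate_size)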