[8/N] MoE Refactor: deprecate EPMoE (#11211)

2025-10-07 21:51:41 -07:00
parent 7c3f07dbcb
commit 3c06b673af
19 changed files with 526 additions and 1808 deletions
--- a/benchmark/kernels/fbgemm/README.md
+++ b/benchmark/kernels/fbgemm/README.md
@@ -1,29 +0,0 @@
-## Benchmark FBGEMM Grouped GEMM
-
-Benchmark FBGEMM Grouped GEMM in both Triton and CUDA version and SGLang Triton Grouped GEMM, it will be used to compare the bandwidth of different implementations.
-
-### Requirements
-
-```shell
-pip install fbgemm-gpu-genai
-```
-
-### Usage
-
-```bash
-python3 benchmark/fbgemm/benchmark_fbgemm_grouped_gemm.py --model Qwen/Qwen2-57B-A14B-Instruct --tp-size 4 --use-fp8-w8a8
-```
-
-For example, in H200, the Qwen2-57B-A14B-Instruct TP4 fp8w8a8 grouped gemm bandwidth result is as follows:
-
-```shell
-grouped-gemm-performance:
-   batch_size  FBGEMM Triton Grouped GEMM FP8  FBGEMM CUTLASS F8F8BF16 Rowwise  SGLang Grouped GEMM FP8
-0       256.0                     3704.841339                      3042.626402              2254.725030
-1       512.0                     3691.426346                      3029.065684              2269.504543
-2      1024.0                     3653.938629                      2258.471467              2358.319020
-3      2048.0                     3596.644313                      2271.611904              2476.895397
-4      4096.0                     3468.496435                      2231.283986              2179.473910
-```
-
-The theoretical peak bandwidth of H200 is 4.8 TB/s. Taking batch_size 256 as an example, the bandwidth of FBGEMM Triton Grouped GEMM FP8 is 3704.841339 GB/s, the bandwidth of FBGEMM CUTLASS F8F8BF16 Rowwise is 3042.626402 GB/s, and the bandwidth of SGLang Grouped GEMM FP8 is 2254.725030 GB/s. Therefore, FBGEMM Triton Grouped GEMM FP8 achieves 77.9% of H200's theoretical peak bandwidth, FBGEMM CUTLASS F8F8BF16 Rowwise achieves 63.4% of H200's theoretical peak bandwidth, and SGLang Grouped GEMM FP8 achieves 46.9% of H200's theoretical peak bandwidth.
--- a/benchmark/kernels/fbgemm/benchmark_fbgemm_grouped_gemm.py
+++ b/benchmark/kernels/fbgemm/benchmark_fbgemm_grouped_gemm.py
@@ -1,516 +0,0 @@
-# python3 benchmark/fbgemm/benchmark_fbgemm_grouped_gemm.py --model Qwen/Qwen2-57B-A14B-Instruct --tp-size 4 --use-fp8-w8a8
-import argparse
-
-import torch
-import triton
-from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
-    quantize_fp8_row,
-    triton_quantize_fp8_row,
-)
-from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import (
-    grouped_gemm as fbgemm_grouped_gemm,
-)
-from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import (
-    grouped_gemm_fp8_rowwise as fbgemm_grouped_gemm_fp8_rowwise,
-)
-from transformers import AutoConfig
-
-from sglang.srt.layers.moe.ep_moe.kernels import (
-    grouped_gemm_triton as sglang_grouped_gemm,
-)
-
-
-def get_model_config(model_name: str, tp_size: int):
-    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-
-    if config.architectures[0] == "DbrxForCausalLM":
-        num_groups = config.ffn_config.moe_num_experts
-        intermediate_size = config.ffn_config.ffn_hidden_size
-    elif config.architectures[0] == "JambaForCausalLM":
-        num_groups = config.num_experts
-        intermediate_size = config.intermediate_size
-    elif config.architectures[0] == "Qwen2MoeForCausalLM":
-        num_groups = config.num_experts
-        intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] == "Qwen3MoeForCausalLM":
-        num_groups = config.num_experts
-        intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] in [
-        "DeepseekV2ForCausalLM",
-        "DeepseekV3ForCausalLM",
-    ]:
-        num_groups = config.n_routed_experts
-        intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] == "Llama4ForConditionalGeneration":
-        num_groups = config.text_config.num_local_experts
-        intermediate_size = config.text_config.intermediate_size
-    elif config.architectures[0] in [
-        "Grok1ForCausalLM",
-        "Grok1ImgGen",
-        "Grok1AForCausalLM",
-    ]:
-        num_groups = config.num_local_experts
-        intermediate_size = config.moe_intermediate_size
-    else:
-        num_groups = config.num_local_experts
-        intermediate_size = config.intermediate_size
-
-    shape_configs = {
-        "num_groups": num_groups,
-        "hidden_size": config.hidden_size,
-        "intermediate_size": intermediate_size,
-        "dtype": config.torch_dtype,
-    }
-    print(f"{shape_configs=}")
-    return shape_configs
-
-
-def create_test_data(batch_size, num_groups, hidden_size, intermediate_size):
-    torch.manual_seed(42)
-
-    tokens_per_group = batch_size // num_groups
-    m_sizes = torch.full(
-        (num_groups,), tokens_per_group, dtype=torch.int32, device="cuda"
-    )
-
-    x = torch.randn(batch_size, hidden_size, dtype=torch.bfloat16, device="cuda")
-
-    base_weights = torch.randn(
-        num_groups, intermediate_size, hidden_size, dtype=torch.bfloat16, device="cuda"
-    )
-
-    w_fbgemm = base_weights.reshape(num_groups * intermediate_size, hidden_size)
-    w_sglang = base_weights
-
-    c_fbgemm = torch.empty(
-        batch_size, intermediate_size, dtype=torch.bfloat16, device="cuda"
-    )
-    c_sglang = torch.empty(
-        batch_size, intermediate_size, dtype=torch.bfloat16, device="cuda"
-    )
-
-    seg_indptr = torch.zeros(num_groups + 1, dtype=torch.int32, device="cuda")
-    for i in range(1, num_groups + 1):
-        seg_indptr[i] = seg_indptr[i - 1] + tokens_per_group
-
-    weight_indices = torch.arange(num_groups, dtype=torch.int32, device="cuda")
-
-    return (
-        x,
-        w_fbgemm,
-        w_sglang,
-        c_fbgemm,
-        c_sglang,
-        m_sizes,
-        seg_indptr,
-        weight_indices,
-    )
-
-
-def create_fp8_test_data(
-    batch_size, num_groups, hidden_size, intermediate_size, backend="triton"
-):
-    """
-    Create test data for FP8 grouped GEMM operations.
-
-    Args:
-        batch_size: Total batch size
-        num_groups: Number of groups
-        hidden_size: Hidden dimension size
-        intermediate_size: Intermediate dimension size
-        backend: "triton" for Triton GEMM, "cutlass" for CUTLASS GEMM
-
-    Returns:
-        For triton: (x_fp8, w_fp8, m_sizes, x_scale, w_scale)
-        For cutlass: (x, wq, w_scale, m_sizes)
-    """
-    torch.manual_seed(42)
-
-    tokens_per_group = batch_size // num_groups
-
-    # Create weight matrices for each group
-    w_list = []
-    for _ in range(num_groups):
-        w = torch.randn(
-            intermediate_size, hidden_size, dtype=torch.float16, device="cuda"
-        )
-        w_list.append(w)
-
-    # Quantize weights using quantize_fp8_row for each group
-    wq_list, w_scale_list = zip(*[quantize_fp8_row(w) for w in w_list])
-
-    if backend == "triton":
-        # Triton format: concatenated weights
-        w_fp8 = torch.concat(wq_list, dim=0).contiguous()
-        w_scale = torch.concat(w_scale_list, dim=0).contiguous()
-
-        # Create m_sizes as int32 for triton
-        m_sizes = torch.full(
-            (num_groups,), tokens_per_group, dtype=torch.int32, device="cuda"
-        )
-
-        # Create and quantize input
-        x_fp16 = torch.randn(
-            batch_size, hidden_size, dtype=torch.float16, device="cuda"
-        )
-        x_fp8, x_scale = triton_quantize_fp8_row(x_fp16)
-        x_scale = x_scale.view(batch_size, -1)
-
-        return x_fp8, w_fp8, m_sizes, x_scale, w_scale
-
-    elif backend == "cutlass":
-        # CUTLASS format: stacked weights
-        wq = torch.stack(wq_list, dim=0).contiguous()
-        w_scale = torch.stack(w_scale_list, dim=0).contiguous()
-
-        # Create m_sizes as int64 for cutlass
-        m_values = [tokens_per_group] * num_groups
-        m_sizes = torch.tensor(m_values).to(dtype=torch.int64, device="cuda")
-
-        # Create input data - separate for each group then concat
-        x_list = []
-        for _ in range(num_groups):
-            x = torch.randn(
-                tokens_per_group, hidden_size, dtype=torch.float16, device="cuda"
-            )
-            x_list.append(x)
-
-        # Concatenate inputs into single tensor
-        x = torch.concat(x_list, dim=0).contiguous()
-
-        return x, wq, w_scale, m_sizes
-
-    else:
-        raise ValueError(f"Unsupported backend: {backend}")
-
-
-def calculate_memory_bandwidth(m_sizes, hidden_size, intermediate_size, dtype):
-    """
-    Calculate memory bandwidth based on accessed expert weights.
-
-    Args:
-        m_sizes: Tensor containing batch sizes for each group
-        hidden_size: Hidden dimension size
-        intermediate_size: Intermediate dimension size
-        dtype: Data type of weights
-
-    Returns:
-        Memory size in bytes for accessed expert weights
-    """
-    # Count non-zero groups (active experts)
-    if hasattr(m_sizes, "cpu"):
-        active_experts = torch.count_nonzero(m_sizes).item()
-    else:
-        active_experts = sum(1 for m in m_sizes if m > 0)
-
-    # Calculate bytes per element based on dtype
-    if dtype in [torch.float16, torch.bfloat16]:
-        bytes_per_element = 2
-    elif dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
-        bytes_per_element = 1
-    elif dtype == torch.float32:
-        bytes_per_element = 4
-    else:
-        # Default to 2 bytes for unknown dtypes
-        bytes_per_element = 2
-
-    # Memory per expert weight matrix
-    memory_per_expert = hidden_size * intermediate_size * bytes_per_element
-
-    # Total memory for active experts
-    total_memory_bytes = active_experts * memory_per_expert
-
-    return total_memory_bytes
-
-
-def get_benchmark_config(use_fp8_w8a8=False):
-    if use_fp8_w8a8:
-        return {
-            "line_vals": [
-                "fbgemm_triton_grouped_gemm_fp8",
-                "fbgemm_cutlass_f8f8bf16_rowwise",
-                "sglang_grouped_gemm",
-            ],
-            "line_names": [
-                "FBGEMM Triton Grouped GEMM FP8",
-                "FBGEMM CUTLASS F8F8BF16 Rowwise",
-                "SGLang Grouped GEMM FP8",
-            ],
-            "styles": [("blue", "-"), ("orange", "-"), ("red", "-")],
-        }
-    else:
-        return {
-            "line_vals": ["fbgemm_triton_grouped_gemm", "sglang_grouped_gemm"],
-            "line_names": [
-                "FBGEMM Triton Grouped GEMM BF16",
-                "SGLang Grouped GEMM BF16",
-            ],
-            "styles": [("blue", "-"), ("green", "-")],
-        }
-
-
-def run_benchmark(
-    model_config, use_fp8_w8a8=False, save_path="./benchmark_grouped_gemm/"
-):
-    config = get_benchmark_config(use_fp8_w8a8)
-
-    benchmark_config = triton.testing.Benchmark(
-        x_names=["batch_size"],
-        x_vals=[256, 512, 1024, 2048, 4096],
-        line_arg="provider",
-        line_vals=config["line_vals"],
-        line_names=config["line_names"],
-        styles=config["styles"],
-        ylabel="Bandwidth (GB/s)",
-        plot_name="grouped-gemm-performance",
-        args={},
-    )
-
-    @triton.testing.perf_report(benchmark_config)
-    def dynamic_benchmark(batch_size, provider, model_config, use_fp8_w8a8=False):
-        print(f"Benchmarking {provider} with batch_size={batch_size}")
-        torch.cuda.manual_seed_all(0)
-
-        num_groups = model_config["num_groups"]
-        hidden_size = model_config["hidden_size"]
-        intermediate_size = model_config["intermediate_size"]
-
-        if provider == "fbgemm_triton_grouped_gemm_fp8":
-            try:
-                test_data = create_fp8_test_data(
-                    batch_size,
-                    num_groups,
-                    hidden_size,
-                    intermediate_size,
-                    backend="triton",
-                )
-                x_fp8, w_fp8, m_sizes, x_scale, w_scale = test_data
-
-                # Calculate memory bandwidth
-                memory_bytes = calculate_memory_bandwidth(
-                    m_sizes, hidden_size, intermediate_size, torch.float8_e4m3fn
-                )
-
-                def run_func():
-                    return fbgemm_grouped_gemm_fp8_rowwise(
-                        x_fp8, w_fp8, m_sizes, x_scale, w_scale, use_fast_accum=True
-                    )
-
-            except Exception as e:
-                print(f"FP8 not supported, skipping: {e}")
-                return float("inf"), float("inf"), float("inf")
-
-        elif provider == "fbgemm_cutlass_f8f8bf16_rowwise":
-            try:
-                test_data = create_fp8_test_data(
-                    batch_size,
-                    num_groups,
-                    hidden_size,
-                    intermediate_size,
-                    backend="cutlass",
-                )
-                x, wq, w_scale, m_sizes = test_data
-
-                # Calculate memory bandwidth
-                memory_bytes = calculate_memory_bandwidth(
-                    m_sizes, hidden_size, intermediate_size, torch.float8_e4m3fn
-                )
-
-                # Quantize input using triton_quantize_fp8_row
-                xq, x_scale = triton_quantize_fp8_row(x)
-                x_scale = x_scale.view(batch_size, -1)
-
-                def run_func():
-                    return torch.ops.fbgemm.f8f8bf16_rowwise_grouped_stacked(
-                        xq, wq, x_scale, w_scale, m_sizes
-                    )
-
-            except Exception as e:
-                print(
-                    f"CUTLASS f8f8bf16_rowwise_grouped_stacked not supported, "
-                    f"skipping: {e}"
-                )
-                return float("inf"), float("inf"), float("inf")
-        else:
-            test_data = create_test_data(
-                batch_size, num_groups, hidden_size, intermediate_size
-            )
-            (
-                x,
-                w_fbgemm,
-                w_sglang,
-                c_fbgemm,
-                c_sglang,
-                m_sizes,
-                seg_indptr,
-                weight_indices,
-            ) = test_data
-
-            # Calculate memory bandwidth for BF16 operations
-            memory_bytes = calculate_memory_bandwidth(
-                m_sizes, hidden_size, intermediate_size, torch.bfloat16
-            )
-
-            if provider == "fbgemm_triton_grouped_gemm":
-
-                def run_func():
-                    return fbgemm_grouped_gemm(
-                        x, w_fbgemm, m_sizes, use_fast_accum=True
-                    )
-
-            else:
-
-                def run_func():
-                    return sglang_grouped_gemm(
-                        x,
-                        w_sglang,
-                        c_sglang,
-                        num_groups,
-                        weight_column_major=True,
-                        seg_indptr=seg_indptr,
-                        weight_indices=weight_indices,
-                        c_dtype=c_sglang.dtype,
-                    )
-
-        for _ in range(10):
-            try:
-                run_func()
-            except Exception as e:
-                print(f"Error during warmup for {provider}: {e}")
-                return float("inf"), float("inf"), float("inf")
-
-        torch.cuda.synchronize()
-
-        try:
-            quantiles = [0.5, 0.2, 0.8]
-            ms, min_ms, max_ms = triton.testing.do_bench(run_func, quantiles=quantiles)
-
-            # Convert time (ms) to bandwidth (GB/s)
-            # Bandwidth = Memory (bytes) / Time (seconds)
-            # Convert ms to seconds and bytes to GB (1e9)
-            gb_per_s = (memory_bytes / 1e9) / (ms / 1000)
-            # min bandwidth = max time, max bandwidth = min time
-            min_gb_per_s = (memory_bytes / 1e9) / (max_ms / 1000)
-            max_gb_per_s = (memory_bytes / 1e9) / (min_ms / 1000)
-
-            return gb_per_s, min_gb_per_s, max_gb_per_s
-        except Exception as e:
-            print(f"Error during benchmarking for {provider}: {e}")
-            return 0.0, 0.0, 0.0
-
-    dynamic_benchmark.run(
-        show_plots=True,
-        print_data=True,
-        save_path=save_path,
-        model_config=model_config,
-        use_fp8_w8a8=use_fp8_w8a8,
-    )
-
-
-def verify_correctness(model_config):
-    print("Verifying correctness...")
-    batch_size = 128
-    num_groups = model_config["num_groups"]
-    hidden_size = model_config["hidden_size"]
-    intermediate_size = model_config["intermediate_size"]
-
-    test_data = create_test_data(batch_size, num_groups, hidden_size, intermediate_size)
-    (
-        x,
-        w_fbgemm,
-        w_sglang,
-        c_fbgemm,
-        c_sglang,
-        m_sizes,
-        seg_indptr,
-        weight_indices,
-    ) = test_data
-
-    result_fbgemm = fbgemm_grouped_gemm(x, w_fbgemm, m_sizes, use_fast_accum=True)
-
-    result_sglang = sglang_grouped_gemm(
-        x,
-        w_sglang,
-        c_sglang,
-        num_groups,
-        weight_column_major=True,
-        seg_indptr=seg_indptr,
-        weight_indices=weight_indices,
-        c_dtype=c_sglang.dtype,
-    )
-
-    if torch.allclose(result_fbgemm, result_sglang, rtol=1e-3, atol=1e-3):
-        print("✓ BF16 Correctness verification passed!")
-    else:
-        max_diff = torch.max(torch.abs(result_fbgemm - result_sglang))
-        print(f"✗ BF16 Correctness verification failed! Max diff: {max_diff}")
-        return False
-
-    return True
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Benchmark FBGEMM vs SGLang Grouped GEMM"
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="mistralai/Mixtral-8x7B-Instruct-v0.1",
-        help="Model name to get configuration from",
-    )
-    parser.add_argument(
-        "--tp-size", type=int, default=1, help="Tensor parallelism size"
-    )
-    parser.add_argument(
-        "--use-fp8-w8a8", action="store_true", help="Enable FP8 W8A8 benchmark"
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./benchmark_grouped_gemm/",
-        help="Path to save benchmark results",
-    )
-    parser.add_argument(
-        "--verify-correctness",
-        action="store_true",
-        help="Verify correctness before benchmarking",
-    )
-
-    args = parser.parse_args()
-
-    try:
-        model_config = get_model_config(args.model, args.tp_size)
-    except Exception as e:
-        print(f"Failed to get model config: {e}")
-        print("Using default configuration...")
-        model_config = {
-            "num_groups": 8,
-            "hidden_size": 4096,
-            "intermediate_size": 14336,
-            "dtype": torch.bfloat16,
-        }
-
-    print("Running benchmark with:")
-    print(f"  num_groups: {model_config['num_groups']}")
-    print(f"  hidden_size: {model_config['hidden_size']}")
-    print(f"  intermediate_size: {model_config['intermediate_size']}")
-    print(f"  use_fp8_w8a8: {args.use_fp8_w8a8}")
-
-    if args.verify_correctness:
-        if not verify_correctness(model_config):
-            print("Correctness verification failed. Exiting...")
-            return
-
-    try:
-        run_benchmark(
-            model_config=model_config,
-            use_fp8_w8a8=args.use_fp8_w8a8,
-            save_path=args.save_path,
-        )
-    except Exception as e:
-        print(f"Benchmark failed: {e}")
-
-
-if __name__ == "__main__":
-    main()