Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/csrc/moe/marlin_moe_wna16/.gitignore
+++ b/csrc/moe/marlin_moe_wna16/.gitignore
@@ -0,0 +1,2 @@
+sm*_kernel_*.cu
+kernel_selector.h
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -0,0 +1,286 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import glob
+import itertools
+import os
+import subprocess
+import sys
+
+import jinja2
+
+ARCHS = []
+SUPPORT_FP8 = False
+for arch in sys.argv[1].split(","):
+    arch = arch[: arch.index(".") + 2].replace(".", "")
+    arch = int(arch)
+    # only SM89 and SM120 fully support
+    # mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
+    # SM90 and SM100 can use this PTX, but it’s simulated
+    # with FP16 MMA, so it cannot achieve any acceleration.
+    if arch in [89, 120]:
+        SUPPORT_FP8 = True
+
+FILE_HEAD_COMMENT = """
+// auto generated by generate_kernels.py
+// clang-format off
+""".lstrip()
+
+FILE_HEAD = (
+    FILE_HEAD_COMMENT
+    + """
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+"""
+)
+
+TEMPLATE = (
+    "template __global__ void Marlin<"
+    "{{a_type_id}}, "
+    "{{b_type_id}}, "
+    "{{c_type_id}}, "
+    "{{s_type_id}}, "
+    "{{threads}}, "
+    "{{thread_m_blocks}}, "
+    "{{thread_n_blocks}}, "
+    "{{thread_k_blocks}}, "
+    "{{m_block_size_8}}, "
+    "{{stages}}, "
+    "{{group_blocks}}, "
+    "{{is_zp_float}}>"
+    "( MARLIN_KERNEL_PARAMS );"
+)
+
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
+
+THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
+
+QUANT_CONFIGS = [
+    # AWQ-INT4
+    {
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4
+    {
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 0, 2, 4, 8],
+    },
+    # AWQ-INT8
+    {
+        "b_type": "kU8B128",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 0, 2, 4, 8],
+    },
+    # FP8
+    {
+        "b_type": "kFE4M3fn",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 8],
+    },
+    # NVFP4
+    {
+        "b_type": "kFE2M1f",
+        "s_type": "kFE4M3fn",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [1],
+    },
+    # MXFP4
+    {
+        "a_type": ["kBFloat16"],
+        "b_type": "kFE2M1f",
+        "s_type": "kFE8M0fnu",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [2],
+    },
+    # AWQ-INT4 with INT8 activation
+    {
+        "a_type": ["kS8"],
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4 with INT8 activation
+    {
+        "a_type": ["kS8"],
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # AWQ-INT4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # MXFP4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kFE2M1f",
+        "c_type": ["kBFloat16"],
+        "s_type": "kFE8M0fnu",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [2],
+    },
+]
+
+
+def remove_old_kernels():
+    for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
+        subprocess.call(["rm", "-f", filename])
+
+    filename = os.path.dirname(__file__) + "/kernel_selector.h"
+    subprocess.call(["rm", "-f", filename])
+
+
+def generate_new_kernels():
+    result_dict = {}
+
+    for quant_config in QUANT_CONFIGS:
+        c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
+        a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
+        b_type = quant_config["b_type"]
+        all_group_blocks = quant_config["group_blocks"]
+        all_m_blocks = quant_config["thread_m_blocks"]
+        all_thread_configs = quant_config["thread_configs"]
+
+        for a_type, c_type in itertools.product(a_types, c_types):
+            if not SUPPORT_FP8 and a_type == "kFE4M3fn":
+                continue
+            if "16" in a_type and "16" in c_type and a_type != c_type:
+                continue
+            s_type = quant_config.get("s_type", c_type)
+            if (a_type, b_type, c_type) not in result_dict:
+                result_dict[(a_type, b_type, c_type)] = []
+
+            for group_blocks, m_blocks, thread_configs in itertools.product(
+                all_group_blocks, all_m_blocks, all_thread_configs
+            ):
+                thread_k, thread_n, threads = thread_configs
+
+                if threads == 256:
+                    # for small batch (m_blocks == 1),
+                    #     we only need (128, 128, 256)
+                    # for large batch (m_blocks > 1),
+                    #     we only need (64, 256, 256)
+                    if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
+                        continue
+                    if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
+                        continue
+
+                config = {
+                    "threads": threads,
+                    "s_type": s_type,
+                    "thread_m_blocks": max(m_blocks, 1),
+                    "thread_k_blocks": thread_k // 16,
+                    "thread_n_blocks": thread_n // 16,
+                    "m_block_size_8": "true" if m_blocks == 0.5 else "false",
+                    "stages": "pipe_stages",
+                    "group_blocks": group_blocks,
+                    "is_zp_float": "false",
+                }
+
+                result_dict[(a_type, b_type, c_type)].append(config)
+
+    kernel_selector_str = FILE_HEAD_COMMENT
+
+    for (a_type, b_type, c_type), config_list in result_dict.items():
+        all_template_str_list = []
+        for config in config_list:
+            s_type = config["s_type"]
+            template_str = jinja2.Template(TEMPLATE).render(
+                a_type_id=f"vllm::{a_type}.id()",
+                b_type_id=f"vllm::{b_type}.id()",
+                c_type_id=f"vllm::{c_type}.id()",
+                s_type_id=f"vllm::{s_type}.id()",
+                **config,
+            )
+            all_template_str_list.append(template_str)
+
+            conditions = [
+                f"a_type == vllm::{a_type}",
+                f"b_type == vllm::{b_type}",
+                f"c_type == vllm::{c_type}",
+                f"s_type == vllm::{s_type}",
+                f"threads == {config['threads']}",
+                f"thread_m_blocks == {config['thread_m_blocks']}",
+                f"thread_n_blocks == {config['thread_n_blocks']}",
+                f"thread_k_blocks == {config['thread_k_blocks']}",
+                f"m_block_size_8 == {config['m_block_size_8']}",
+                f"group_blocks == {config['group_blocks']}",
+                f"is_zp_float == {config['is_zp_float']}",
+            ]
+            conditions = " && ".join(conditions)
+
+            if kernel_selector_str == FILE_HEAD_COMMENT:
+                kernel_selector_str += f"if ({conditions})\n  kernel = "
+            else:
+                kernel_selector_str += f"else if ({conditions})\n  kernel = "
+
+            kernel_template2 = (
+                "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+                "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+                "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+                "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+                "{{is_zp_float}}>;"
+            )
+
+            kernel_selector_str += (
+                jinja2.Template(kernel_template2).render(
+                    a_type_id=f"vllm::{a_type}.id()",
+                    b_type_id=f"vllm::{b_type}.id()",
+                    c_type_id=f"vllm::{c_type}.id()",
+                    s_type_id=f"vllm::{s_type}.id()",
+                    **config,
+                )
+                + "\n"
+            )
+
+        file_content = FILE_HEAD + "\n\n"
+        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+        if a_type == "kFE4M3fn":
+            filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+        else:
+            filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+
+        filename = filename.lower()
+
+        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+            f.write(file_content)
+
+    if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
+        kernel_selector_str += (
+            "else if (a_type == vllm::kFE4M3fn)\n"
+            "  TORCH_CHECK(false, "
+            '"marlin kernel with fp8 activation is not built.");'
+        )
+
+    with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
+        f.write(kernel_selector_str)
+
+
+if __name__ == "__main__":
+    remove_old_kernels()
+    generate_new_kernels()
--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@@ -0,0 +1,47 @@
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
+#endif
+
+#include "quantization/gptq_marlin/marlin.cuh"
+#include "quantization/gptq_marlin/marlin_dtypes.cuh"
+#include "core/scalar_type.hpp"
+
+#define MARLIN_KERNEL_PARAMS                                                  \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,                     \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
+      const int4 *__restrict__ b_bias_ptr,                                    \
+      const float *__restrict__ a_scales_ptr,                                 \
+      const int4 *__restrict__ scales_ptr,                                    \
+      const uint16_t *__restrict__ global_scale_ptr,                          \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,         \
+      const int32_t *__restrict__ sorted_token_ids_ptr,                       \
+      const int32_t *__restrict__ expert_ids_ptr,                             \
+      const int32_t *__restrict__ num_tokens_past_padded_ptr,                 \
+      const float *__restrict__ topk_weights_ptr, int top_k,                  \
+      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
+      int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
+      bool use_fp32_reduce
+
+namespace MARLIN_NAMESPACE_NAME {
+template <const vllm::ScalarTypeId a_type_id,  // A ScalarType id
+          const vllm::ScalarTypeId b_type_id,  // B ScalarType id
+          const vllm::ScalarTypeId c_type_id,  // C ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // B_SCALE ScalarType id
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const bool m_block_size_8,  // whether m_block_size == 8
+                                      // only works when thread_m_blocks == 1
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks,  // number of consecutive 16x16 blocks
+                                   // with a separate quantization scale
+          const bool is_zp_float   // is zero point of float16 type?
+          >
+__global__ void Marlin(MARLIN_KERNEL_PARAMS);
+
+}
--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@@ -0,0 +1,863 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
+#endif
+
+#include "kernel.h"
+#include "core/registration.h"
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
+  static_assert(std::is_same<scalar_t, half>::value ||          \
+                    std::is_same<scalar_t, nv_bfloat16>::value, \
+                "only float16 and bfloat16 is supported");
+
+namespace MARLIN_NAMESPACE_NAME {
+
+__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
+
+using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
+
+// For a given "a" of size [M,K] performs a permutation of the K columns based
+// on the given "perm" indices.
+template <int moe_block_size>
+__global__ void permute_cols_kernel(
+    int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
+    int4* __restrict__ out_int4_ptr,
+    const int32_t* __restrict__ sorted_token_ids_ptr,
+    const int32_t* __restrict__ expert_ids_ptr,
+    const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
+    int size_k, int top_k) {
+  int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
+  int num_moe_blocks = div_ceil(num_tokens_past_padded, moe_block_size);
+  int32_t block_sorted_ids[moe_block_size];
+  int block_num_valid_tokens = 0;
+  int64_t old_expert_id = 0;
+  int64_t expert_id = 0;
+  int row_stride = size_k * sizeof(half) / 16;
+
+  auto read_moe_block_data = [&](int block_id) {
+    block_num_valid_tokens = moe_block_size;
+    int4* tmp_block_sorted_ids = reinterpret_cast<int4*>(block_sorted_ids);
+    for (int i = 0; i < moe_block_size / 4; i++) {
+      tmp_block_sorted_ids[i] =
+          ((int4*)sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i];
+    }
+    for (int i = 0; i < moe_block_size; i++) {
+      if (block_sorted_ids[i] >= size_m * top_k) {
+        block_num_valid_tokens = i;
+        break;
+      };
+    }
+  };
+
+  auto permute_row = [&](int row) {
+    int iters = size_k / default_threads;
+    int rest = size_k % default_threads;
+
+    int in_offset = (row / top_k) * row_stride;
+    int out_offset = row * row_stride;
+
+    half const* a_row_half =
+        reinterpret_cast<half const*>(a_int4_ptr + in_offset);
+    half* out_half = reinterpret_cast<half*>(out_int4_ptr + out_offset);
+
+    int base_k = 0;
+
+    for (int i = 0; i < iters; i++) {
+      auto cur_k = base_k + threadIdx.x;
+      int src_pos = perm_int_ptr[cur_k];
+
+      out_half[cur_k] = a_row_half[src_pos];
+
+      base_k += default_threads;
+    }
+
+    if (rest) {
+      if (threadIdx.x < rest) {
+        auto cur_k = base_k + threadIdx.x;
+        int src_pos = perm_int_ptr[cur_k];
+
+        out_half[cur_k] = a_row_half[src_pos];
+      }
+    }
+  };
+
+  for (int index = blockIdx.x; index < num_moe_blocks; index += gridDim.x) {
+    old_expert_id = expert_id;
+    int tmp_expert_id = expert_ids_ptr[index];
+    if (tmp_expert_id == -1) continue;
+    expert_id = tmp_expert_id;
+    perm_int_ptr += (expert_id - old_expert_id) * size_k;
+    read_moe_block_data(index);
+
+    for (int i = 0; i < block_num_valid_tokens; i++)
+      permute_row(block_sorted_ids[i]);
+  }
+}
+
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+
+thread_config_t small_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {128, 128, 256},
+    {64, 128, 128}};
+
+thread_config_t large_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},
+    {64, 128, 128}};
+
+typedef struct {
+  int blocks_per_sm;
+  thread_config_t tb_cfg;
+} exec_config_t;
+
+int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
+                          int prob_n, int prob_k, int num_bits, int group_size,
+                          bool has_act_order, bool is_k_full) {
+  bool cache_scales_chunk = has_act_order && !is_k_full;
+
+  int tb_n = th_config.thread_n;
+  int tb_k = th_config.thread_k;
+
+  // Get max scale groups per thread-block
+  int tb_groups;
+  if (group_size == -1) {
+    tb_groups = 1;
+  } else if (group_size == 0) {
+    tb_groups = div_ceil(tb_k, 32);  // Worst case is 32 group size
+  } else {
+    tb_groups = div_ceil(tb_k, group_size);
+  }
+
+  if (cache_scales_chunk) {
+    int load_groups =
+        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
+    load_groups = max(load_groups, 32);  // We load at least 32 scale groups
+    return load_groups * tb_n * 2;
+  } else {
+    int tb_scales = tb_groups * tb_n * 2;
+
+    return tb_scales * pipe_stages;
+  }
+}
+
+int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
+                          int thread_m_blocks, int prob_m, int prob_n,
+                          int prob_k, int num_bits, int group_size,
+                          bool has_act_order, bool is_k_full, int has_zp,
+                          int is_zp_float, bool is_a_8bit) {
+  int pack_factor = 32 / num_bits;
+
+  // Get B size
+  int tb_k = th_config.thread_k;
+  int tb_n = th_config.thread_n;
+  int tb_m = thread_m_blocks * 16;
+
+  // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
+  // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
+  int sh_block_meta_size = tb_m * 16;
+  int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
+  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_red_size = tb_m * (tb_n + 8) * 2;
+  int sh_bias_size = tb_n * 2;
+  int tmp_size =
+      (sh_b_size > sh_red_size ? sh_red_size : sh_b_size) + sh_bias_size;
+  tmp_size = max(max(sh_b_size, sh_red_size), tmp_size);
+
+  int sh_s_size =
+      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
+                            group_size, has_act_order, is_k_full);
+  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+  int sh_zp_size = 0;
+  if (has_zp) {
+    if (is_zp_float)
+      sh_zp_size = sh_s_size;
+    else if (num_bits == 4)
+      sh_zp_size = sh_s_size / 4;
+    else if (num_bits == 8)
+      sh_zp_size = sh_s_size / 2;
+  }
+
+  int total_size = tmp_size + sh_a_size + sh_s_size + sh_zp_size +
+                   sh_g_idx_size + sh_block_meta_size;
+
+  return total_size;
+}
+
+bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
+                     int thread_m_blocks, int prob_m, int prob_n, int prob_k,
+                     int num_bits, int group_size, bool has_act_order,
+                     bool is_k_full, int has_zp, int is_zp_float,
+                     int max_shared_mem, bool is_a_8bit) {
+  // Sanity
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
+      th_config.num_threads == -1) {
+    return false;
+  }
+
+  // Verify K/N are divisible by thread K/N
+  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
+    return false;
+  }
+
+  // Verify min for thread K/N
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+
+  // Check that pipeline fits into cache
+  int cache_size =
+      get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
+                            prob_n, prob_k, num_bits, group_size, has_act_order,
+                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+  return cache_size <= max_shared_mem;
+}
+
+MarlinFuncPtr get_marlin_kernel(
+    const vllm::ScalarType a_type, const vllm::ScalarType b_type,
+    const vllm::ScalarType c_type, const vllm::ScalarType s_type,
+    int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
+    bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
+    int threads, bool is_zp_float) {
+  int num_bits = b_type.size_bits();
+  auto kernel = MarlinDefault;
+
+#include "kernel_selector.h"
+
+  return kernel;
+}
+
+exec_config_t determine_exec_config(
+    const vllm::ScalarType& a_type, const vllm::ScalarType& b_type,
+    const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
+    int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
+    bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
+    bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
+    bool is_a_8bit) {
+  exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
+  thread_config_t* thread_configs = thread_m_blocks > 1
+                                        ? large_batch_thread_configs
+                                        : small_batch_thread_configs;
+  int thread_configs_size =
+      thread_m_blocks > 1
+          ? sizeof(large_batch_thread_configs) / sizeof(thread_config_t)
+          : sizeof(small_batch_thread_configs) / sizeof(thread_config_t);
+
+  int count = 0;
+  constexpr int device_max_reg_size = 255 * 1024;
+  for (int i = 0; i < thread_configs_size; i++) {
+    thread_config_t th_config = thread_configs[i];
+
+    if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
+                         prob_n, prob_k, num_bits, group_size, has_act_order,
+                         is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
+                         is_a_8bit)) {
+      continue;
+    }
+
+    int cache_size = get_kernel_cache_size(
+        th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
+        num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
+        is_a_8bit);
+
+    int group_blocks = 0;
+    if (!has_act_order) {
+      group_blocks = group_size == -1 ? -1 : (group_size / 16);
+    }
+
+    auto kernel =
+        get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
+                          th_config.thread_n / 16, th_config.thread_k / 16,
+                          m_block_size_8, has_act_order, has_zp, group_blocks,
+                          th_config.num_threads, is_zp_float);
+
+    if (kernel == MarlinDefault) continue;
+
+    cudaFuncAttributes attr;
+    cudaFuncGetAttributes(&attr, kernel);
+    int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
+    int allow_count = min(device_max_reg_size / reg_size,
+                          max_shared_mem / (cache_size + 1536));
+    if (thread_m_blocks == 1)
+      allow_count = max(min(allow_count, 4), 1);
+    else
+      allow_count = max(min(allow_count, 2), 1);
+
+    if (prob_n / th_config.thread_n * prob_m * top_k * 4 < sms * allow_count) {
+      allow_count =
+          max(prob_n / th_config.thread_n * prob_m * top_k * 4 / sms, 1);
+    }
+
+    if (allow_count > count) {
+      count = allow_count;
+      exec_cfg = {count, th_config};
+    };
+  }
+
+  return exec_cfg;
+}
+
+void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
+               void* a_s, void* b_s, void* g_s, void* zp, void* g_idx,
+               void* perm, void* a_tmp, void* sorted_token_ids,
+               void* expert_ids, void* num_tokens_past_padded,
+               void* topk_weights, int moe_block_size, int num_experts,
+               int top_k, bool mul_topk_weights, bool is_ep, int prob_m,
+               int prob_n, int prob_k, void* workspace,
+               vllm::ScalarType const& a_type, vllm::ScalarType const& b_type,
+               vllm::ScalarType const& c_type, vllm::ScalarType const& s_type,
+               bool has_bias, bool has_act_order, bool is_k_full, bool has_zp,
+               int num_groups, int group_size, int dev, cudaStream_t stream,
+               int thread_k, int thread_n, int sms, int blocks_per_sm,
+               bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
+  int thread_m_blocks = div_ceil(moe_block_size, 16);
+  bool m_block_size_8 = moe_block_size == 8;
+  bool is_a_8bit = a_type.size_bits() == 8;
+
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
+              ", ", prob_n, ", ", prob_k, "]");
+
+  int group_blocks = 0;
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(group_size != -1);
+      group_blocks = group_size / 16;
+      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
+                  " is not divisible by group_blocks = ", group_blocks);
+    } else {
+      TORCH_CHECK(group_size == 0);
+      group_blocks = 0;
+    }
+  } else {
+    if (group_size == -1) {
+      group_blocks = -1;
+    } else {
+      group_blocks = group_size / 16;
+      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
+                  " is not divisible by group_blocks = ", group_blocks);
+    }
+  }
+
+  int num_bits = b_type.size_bits();
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  int4* C_ptr = (int4*)C;
+  int4* C_tmp_ptr = (int4*)C_tmp;
+  const int4* bias_ptr = (const int4*)b_bias;
+  const float* a_s_ptr = (const float*)a_s;
+  const int4* b_s_ptr = (const int4*)b_s;
+  const uint16_t* g_s_ptr = (const uint16_t*)g_s;
+  const int4* zp_ptr = (const int4*)zp;
+  const int* g_idx_ptr = (const int*)g_idx;
+  const int* perm_ptr = (const int*)perm;
+  int4* a_tmp_ptr = (int4*)a_tmp;
+  const int32_t* sorted_token_ids_ptr = (const int32_t*)sorted_token_ids;
+  const int32_t* expert_ids_ptr = (const int32_t*)expert_ids;
+  const int32_t* num_tokens_past_padded_ptr =
+      (const int32_t*)num_tokens_past_padded;
+  const float* topk_weights_ptr = (const float*)topk_weights;
+  int* locks = (int*)workspace;
+
+  if (has_act_order) {
+    // Permute A columns
+    auto kernel = permute_cols_kernel<8>;
+    if (moe_block_size == 8) {
+    } else if (moe_block_size == 16)
+      kernel = permute_cols_kernel<16>;
+    else if (moe_block_size == 32)
+      kernel = permute_cols_kernel<32>;
+    else if (moe_block_size == 48)
+      kernel = permute_cols_kernel<48>;
+    else if (moe_block_size == 64)
+      kernel = permute_cols_kernel<64>;
+    else
+      TORCH_CHECK(false, "unsupported moe_block_size ", moe_block_size);
+
+    // avoid ">>>" being formatted to "> > >"
+    // clang-format off
+    kernel<<<sms, default_threads, 0, stream>>>(
+        A_ptr, perm_ptr, a_tmp_ptr, sorted_token_ids_ptr, expert_ids_ptr,
+        num_tokens_past_padded_ptr, prob_m, prob_k, top_k);
+    // clang-format on
+    A_ptr = a_tmp_ptr;
+    prob_m = prob_m * top_k;
+    top_k = 1;
+
+    // If we have a full K, then we can run the non-act-order version of Marlin
+    // (since the weight rows are reordered by increasing group ids, and by
+    // having a full K, we have full original groups)
+    if (is_k_full) has_act_order = false;
+  }
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  int major_capability, minor_capability;
+  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
+                         dev);
+  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
+                         dev);
+  TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
+              "marlin kernel only support Ampere or newer GPUs.");
+  if (a_type == vllm::kFE4M3fn) {
+    TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
+                "FP8 only support Ada Lovelace or newer GPUs.");
+    TORCH_CHECK(
+        major_capability * 10 + minor_capability == 89 ||
+            major_capability * 10 + minor_capability == 120,
+        "Marlin W4A8-FP8 only support SM89 or SM120 device (It is slower than "
+        "Marlin W4A16 on other devices).");
+  }
+
+  // Set thread config
+  exec_config_t exec_cfg;
+  thread_config_t thread_tfg;
+  if (thread_k != -1 && thread_n != -1) {
+    thread_tfg = thread_config_t{thread_k, thread_n, thread_k * thread_n / 64};
+    if (blocks_per_sm == -1) blocks_per_sm = 1;
+    exec_cfg = exec_config_t{blocks_per_sm, thread_tfg};
+    TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
+                " is not divisible by thread_n = ", thread_n);
+    TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
+                " is not divisible by thread_k = ", thread_k);
+  } else {
+    // Auto config
+    exec_cfg = determine_exec_config(
+        a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
+        top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
+        has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
+        is_a_8bit);
+    thread_tfg = exec_cfg.tb_cfg;
+  }
+
+  int num_threads = thread_tfg.num_threads;
+  thread_k = thread_tfg.thread_k;
+  thread_n = thread_tfg.thread_n;
+  int blocks = sms * exec_cfg.blocks_per_sm;
+  if (exec_cfg.blocks_per_sm > 1)
+    max_shared_mem = max_shared_mem / exec_cfg.blocks_per_sm - 1024;
+
+  int thread_k_blocks = thread_k / 16;
+  int thread_n_blocks = thread_n / 16;
+
+  TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
+                              prob_m, prob_n, prob_k, num_bits, group_size,
+                              has_act_order, is_k_full, has_zp, is_zp_float,
+                              max_shared_mem, is_a_8bit),
+              "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
+              ", thread_k = ", thread_tfg.thread_k,
+              ", thread_n = ", thread_tfg.thread_n,
+              ", num_threads = ", thread_tfg.num_threads, " for MKN = [",
+              prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
+              ", group_size = ", group_size,
+              ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
+              ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
+              ", max_shared_mem = ", max_shared_mem);
+
+  int sh_cache_size =
+      get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
+                            prob_n, prob_k, num_bits, group_size, has_act_order,
+                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+
+  auto kernel = get_marlin_kernel(
+      a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
+      thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
+      num_threads, is_zp_float);
+
+  if (kernel == MarlinDefault) {
+    TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
+                ", ", prob_k, "]", ", has_act_order = ", has_act_order,
+                ", num_groups = ", num_groups, ", group_size = ", group_size,
+                ", thread_m_blocks = ", thread_m_blocks,
+                ", thread_n_blocks = ", thread_n_blocks,
+                ", thread_k_blocks = ", thread_k_blocks,
+                ", num_bits = ", num_bits);
+  }
+
+  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                       max_shared_mem);
+  // avoid ">>>" being formatted to "> > >"
+  // clang-format off
+  kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
+      A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
+      sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
+      topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
+      prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce);
+  // clang-format on
+}
+
+}  // namespace MARLIN_NAMESPACE_NAME
+
+torch::Tensor moe_wna16_marlin_gemm(
+    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
+    torch::Tensor& b_q_weight,
+    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& a_scales_or_none,
+    std::optional<torch::Tensor> const& global_scale_or_none,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
+    torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
+    torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
+    int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
+    vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
+    int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
+    bool is_zp_float, int64_t thread_k, int64_t thread_n,
+    int64_t blocks_per_sm) {
+  vllm::ScalarTypeId a_type_id, c_type_id, s_type_id;
+
+  auto c_dtype = a.dtype();
+  if (a.scalar_type() == at::ScalarType::Half) {
+    a_type_id = vllm::kFloat16.id();
+    c_type_id = vllm::kFloat16.id();
+  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
+    a_type_id = vllm::kBFloat16.id();
+    c_type_id = vllm::kBFloat16.id();
+  } else {
+    c_dtype = b_scales.dtype();
+    if (b_scales.scalar_type() == at::ScalarType::Half) {
+      c_type_id = vllm::kFloat16.id();
+    } else if (b_scales.scalar_type() == at::ScalarType::BFloat16) {
+      c_type_id = vllm::kBFloat16.id();
+    } else {
+      c_type_id = vllm::kBFloat16.id();
+
+      TORCH_CHECK(c_or_none.has_value(), "c must be passed for W4A8-FP4");
+      torch::Tensor c = c_or_none.value();
+      c_dtype = c.dtype();
+
+      if (c.scalar_type() == at::ScalarType::Half) {
+        c_type_id = vllm::kFloat16.id();
+      } else if (c.scalar_type() == at::ScalarType::BFloat16) {
+        c_type_id = vllm::kBFloat16.id();
+      } else {
+        TORCH_CHECK(false, "unsupported c dtype");
+      }
+    }
+
+    if (a.scalar_type() == at::ScalarType::Float8_e4m3fn) {
+      a_type_id = vllm::kFE4M3fn.id();
+    } else if (a.scalar_type() == at::ScalarType::Char) {
+      a_type_id = vllm::kS8.id();
+    } else {
+      TORCH_CHECK(false, "unsupported `a` scalar_type");
+    }
+  }
+
+  s_type_id = c_type_id;
+  if (b_type_id == vllm::kFE2M1f.id()) {
+    if (b_scales.scalar_type() == at::ScalarType::Float8_e4m3fn) {
+      s_type_id = vllm::kFE4M3fn.id();
+    } else if (b_scales.scalar_type() == at::ScalarType::Float8_e8m0fnu) {
+      s_type_id = vllm::kFE8M0fnu.id();
+    } else {
+      TORCH_CHECK(false,
+                  "When b_type = float4_e2m1f, b_scale scalar type must be",
+                  "float8_e4m3fn (for NVFP4) or float8_e8m0fnu (for MXFP4).");
+    }
+  }
+
+  vllm::ScalarType a_type = vllm::ScalarType::from_id(a_type_id);
+  vllm::ScalarType b_type = vllm::ScalarType::from_id(b_type_id);
+  vllm::ScalarType c_type = vllm::ScalarType::from_id(c_type_id);
+  vllm::ScalarType s_type = vllm::ScalarType::from_id(s_type_id);
+
+  int pack_factor = 32 / b_type.size_bits();
+  int num_experts = b_q_weight.size(0);
+
+  if (moe_block_size != 8) {
+    TORCH_CHECK(moe_block_size % 16 == 0,
+                "unsupported moe_block_size=", moe_block_size);
+    TORCH_CHECK(moe_block_size >= 16 && moe_block_size <= 64,
+                "unsupported moe_block_size=", moe_block_size);
+  }
+
+  // Verify A
+  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
+              ", size_m = ", size_m);
+  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
+              ", size_k = ", size_k);
+
+  // Verify B
+  TORCH_CHECK(
+      size_k % MARLIN_NAMESPACE_NAME::tile_size == 0, "size_k = ", size_k,
+      " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK((size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(1),
+              "Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1),
+              ", size_k = ", size_k,
+              ", tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK(
+      b_q_weight.size(2) % MARLIN_NAMESPACE_NAME::tile_size == 0,
+      "b_q_weight.size(2) = ", b_q_weight.size(2),
+      " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
+  int actual_size_n =
+      (b_q_weight.size(2) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor;
+  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
+              ", actual_size_n = ", actual_size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  torch::Tensor a_scales;
+  auto options = torch::TensorOptions().dtype(c_dtype).device(a.device());
+  auto options_fp32 =
+      torch::TensorOptions().dtype(at::kFloat).device(a.device());
+
+  if (a_scales_or_none.has_value()) {
+    a_scales = a_scales_or_none.value();
+    TORCH_CHECK(a_type.size_bits() == 8,
+                "a_scales can only be used for 8bit activation.");
+  } else {
+    a_scales = torch::empty({0}, options_fp32);
+    TORCH_CHECK(a_type.size_bits() != 8,
+                "the a_scales parameter must be passed for 8bit activation.");
+  }
+
+  // sms: number of SMs to use for the kernel
+  int sms = -1;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  torch::Tensor c;
+  if (c_or_none.has_value()) {
+    c = c_or_none.value();
+    TORCH_CHECK(c.device().is_cuda(), "c is not on GPU");
+    TORCH_CHECK(c.is_contiguous(), "c is not contiguous");
+    TORCH_CHECK(c.size(0) == size_m * top_k,
+                "Shape mismatch: c.size(0) = ", c.size(0),
+                ", size_m * topk = ", size_m * top_k);
+    TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1),
+                ", size_n = ", size_n);
+  } else {
+    c = torch::empty({size_m * top_k, size_n}, options);
+  }
+
+  // Alloc C tmp buffer that is going to be used for the global reduce
+  torch::Tensor c_tmp;
+  if (use_fp32_reduce && !use_atomic_add) {
+    // max num of threadblocks is sms * 4
+    long max_c_tmp_size = min(
+        (long)size_n * sorted_token_ids.size(0),
+        (long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n);
+    if (moe_block_size == 8) max_c_tmp_size *= 2;
+    c_tmp = torch::empty({max_c_tmp_size}, options_fp32);
+  } else {
+    c_tmp = torch::empty({0}, options_fp32);
+  }
+
+  // Detect groupsize and act_order
+  int num_groups = -1;
+  int group_size = -1;
+
+  int rank = b_scales.sizes().size();
+  TORCH_CHECK(rank == 3, "b_scales rank = ", rank, " is not 3");
+  TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2),
+              " is not size_n = ", size_n);
+  num_groups = b_scales.size(1);
+
+  torch::Tensor g_idx, perm, a_tmp;
+  if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
+    g_idx = g_idx_or_none.value();
+    perm = perm_or_none.value();
+
+    TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
+    TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
+    TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+    TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+
+    // Verify g_idx and perm
+    TORCH_CHECK((g_idx.size(-1) == 0 && perm.size(-1) == 0) ||
+                    (g_idx.size(-1) == size_k && perm.size(-1) == size_k),
+                "Unexpected g_idx.size(-1) = ", g_idx.size(-1),
+                " and perm.size(-1) = ", perm.size(-1),
+                ", where size_k = ", size_k);
+  } else {
+    g_idx = torch::empty({0}, options);
+    perm = torch::empty({0}, options);
+    a_tmp = torch::empty({0}, options);
+  }
+  bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0;
+
+  if (has_act_order) {
+    a_tmp = torch::empty({size_m * top_k, size_k}, options);
+    if (is_k_full) {
+      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
+      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
+                  ", is not divisible by num_groups = ", num_groups);
+      group_size = size_k / num_groups;
+    } else {
+      group_size = 0;
+    }
+
+  } else {
+    a_tmp = torch::empty({0}, options);
+    if (num_groups > 1) {
+      TORCH_CHECK(
+          size_k % num_groups == 0, "size_k = ", size_k,
+          ", is not divisible by b_scales.size(1) = ", b_scales.size(1));
+      group_size = size_k / num_groups;
+    } else {
+      group_size = -1;
+    }
+  }
+
+  torch::Tensor global_scale;
+  if (global_scale_or_none.has_value()) {
+    global_scale = global_scale_or_none.value();
+    TORCH_CHECK(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn,
+                "global_scale can only be used for nvfp4 format.");
+  } else {
+    global_scale = torch::empty({0}, options);
+    TORCH_CHECK(!(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn),
+                "the global_scale parameter must be passed for nvfp4 format.");
+  }
+
+  bool has_bias = b_bias_or_none.has_value();
+  torch::Tensor b_bias;
+  if (has_bias) {
+    b_bias = b_bias_or_none.value();
+    TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
+    TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
+    TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
+    TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
+  } else {
+    b_bias = torch::empty({0}, options);
+  }
+
+  torch::Tensor b_zeros;
+  if (b_zeros_or_none.has_value()) {
+    b_zeros = b_zeros_or_none.value();
+    TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU");
+    TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous");
+  } else {
+    b_zeros = torch::empty({0}, options);
+  }
+  bool has_zp = b_zeros.size(-1) > 0;
+  if (has_zp) {
+    TORCH_CHECK(
+        b_type == vllm::kU4 || b_type == vllm::kU8,
+        "b_type must be u4 or u8 when has_zp = True. Got = ", b_type.str());
+  } else {
+    TORCH_CHECK(b_type == vllm::kU4B8 || b_type == vllm::kU8B128 ||
+                    b_type == vllm::kS4 || b_type == vllm::kS8 ||
+                    b_type == vllm::kFE4M3fn || b_type == vllm::kFE2M1f,
+                "b_type must be uint4b8, uint8b128, int4, int8, "
+                "float8_e4m3fn or float4_e2m1f when has_zp = False. Got = ",
+                b_type.str());
+  }
+
+  if (has_zp && is_zp_float) {
+    TORCH_CHECK(a.scalar_type() == at::ScalarType::Half,
+                "Computation type must be float16 (half) when using float zero "
+                "points.");
+  }
+
+  // Verify b_zeros
+  if (has_zp) {
+    int rank = b_zeros.sizes().size();
+    TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3");
+    if (is_zp_float) {
+      TORCH_CHECK(b_zeros.size(2) == size_n,
+                  "b_zeros dim 2 = ", b_zeros.size(2),
+                  " is not size_n = ", size_n);
+      TORCH_CHECK(num_groups == b_zeros.size(1),
+                  "b_zeros dim 1 = ", b_zeros.size(1),
+                  " is not num_groups = ", num_groups);
+      TORCH_CHECK(num_groups != -1, "num_groups must be != -1");
+    } else {
+      TORCH_CHECK(b_zeros.size(1) == num_groups,
+                  "b_zeros dim 1 = ", b_zeros.size(1),
+                  " is not num_groups = ", num_groups);
+      TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor,
+                  "b_zeros dim 2 = ", b_zeros.size(2),
+                  " is not size_n / pack_factor = ", size_n / pack_factor);
+    }
+  }
+
+  // Verify workspace size
+  TORCH_CHECK(size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0,
+              "size_n = ", size_n, ", is not divisible by min_thread_n = ",
+              MARLIN_NAMESPACE_NAME::min_thread_n);
+
+  int max_n_tiles = size_n / MARLIN_NAMESPACE_NAME::min_thread_n;
+  int min_workspace_size = min(
+      max_n_tiles * (int)(sorted_token_ids.size(0) / moe_block_size), sms * 4);
+  TORCH_CHECK(workspace.numel() >= min_workspace_size,
+              "workspace.numel = ", workspace.numel(),
+              " is below min_workspace_size = ", min_workspace_size);
+
+  int dev = a.get_device();
+
+  TORCH_CHECK(a_scales.scalar_type() == at::ScalarType::Float,
+              "scalar type of a_scales must be float");
+  TORCH_CHECK(global_scale.scalar_type() == c.scalar_type(),
+              "scalar type of global_scale must be the same with c");
+  if (a_type.size_bits() == 16) {
+    TORCH_CHECK(
+        a.scalar_type() == c.scalar_type(),
+        "scalar type of a must be the same with c for 16 bit activation");
+  }
+
+  MARLIN_NAMESPACE_NAME::marlin_mm(
+      a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), c_tmp.data_ptr(),
+      b_bias.data_ptr(), a_scales.data_ptr(), b_scales.data_ptr(),
+      global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(),
+      perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(),
+      expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
+      topk_weights.data_ptr(), moe_block_size, num_experts, top_k,
+      mul_topk_weights, is_ep, size_m, size_n, size_k, workspace.data_ptr(),
+      a_type, b_type, c_type, s_type, has_bias, has_act_order, is_k_full,
+      has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
+      thread_k, thread_n, sms, blocks_per_sm, use_atomic_add, use_fp32_reduce,
+      is_zp_float);
+
+  return c;
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
+}