Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -0,0 +1,735 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cmath>
+#include "core/math.hpp"
+#include "../cuda_compat.h"
+#include "dispatch_utils.h"
+
+#include "quantization/w8a8/fp8/common.cuh"
+
+#include <c10/util/Float8_e4m3fn.h>
+
+#ifndef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+  #include <cuda_fp8.h>
+#else
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+  #include <hip/hip_fp8.h>
+
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat16_raw __nv_bfloat16_raw;
+  #if defined(HIP_FP8_TYPE_OCP)
+typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
+typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
+  #else
+// ROCm 6.2 fallback: only *_fnuz types exist
+typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
+typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
+  #endif
+#endif
+
+#include "core/registration.h"
+namespace vllm {
+
+template <typename T>
+__device__ __forceinline__ T silu_kernel(const T& x) {
+  // x * sigmoid(x)
+  return (T)(((float)x) / (1.0f + expf((float)-x)));
+}
+
+// Activation and gating kernel template.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
+          typename fp8_type>
+__global__ void act_and_mul_quant_kernel(
+    fp8_type* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const float* scale, const int d) {
+  const int32_t blocks_per_token = gridDim.y;
+
+  const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t);
+
+  // We don't expect the hidden dimension to exceed 32 bits so int32 should
+  // be safe here.
+  const int32_t tgt_elems_per_block = div_ceil(d, blocks_per_token);
+  const int32_t elems_per_block =
+      round_to_next_multiple_of(tgt_elems_per_block, elems_per_128bit_load);
+  const int32_t block_start = blockIdx.y * elems_per_block;
+  int32_t block_end = block_start + elems_per_block;
+  block_end = block_end > d ? d : block_end;
+
+  // token_idx is 64 bit to prevent 32 bit overflow when the number of tokens
+  // is very large
+  const int64_t token_idx = blockIdx.x;
+  const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d;
+  const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d;
+  fp8_type* __restrict__ out_ptr = out + token_idx * d;
+
+  // 128-bit vectorized code
+  const int32_t vec_loop_end =
+      round_to_previous_multiple_of(elems_per_128bit_load, block_end);
+  const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load;
+  const int32_t vec_start_idx = block_start / elems_per_128bit_load;
+
+  const int4* __restrict__ x_128bit_ptr = reinterpret_cast<const int4*>(x_ptr);
+  const int4* __restrict__ y_128bit_ptr = reinterpret_cast<const int4*>(y_ptr);
+  int2* __restrict__ out_128bit_ptr = reinterpret_cast<int2*>(out_ptr);
+
+  float inverted_scale = 1 / *scale;
+#pragma unroll
+  for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx;
+       vec_idx += blockDim.x) {
+    const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]);
+    const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]);
+    using scalar_128bit_vec_t = std::array<scalar_t, elems_per_128bit_load>;
+    using scalar_64bit_vec_t = std::array<fp8_type, elems_per_128bit_load>;
+
+    scalar_64bit_vec_t out_vec;
+    const auto x_vec = reinterpret_cast<scalar_128bit_vec_t const&>(x_128bit);
+    const auto y_vec = reinterpret_cast<scalar_128bit_vec_t const&>(y_128bit);
+
+#pragma unroll
+    for (int i = 0; i < elems_per_128bit_load; i++) {
+      out_vec[i] = scaled_fp8_conversion<true, fp8_type>(
+          ACT_FN(x_vec[i]) * y_vec[i], inverted_scale);
+    }
+
+    out_128bit_ptr[vec_idx] = reinterpret_cast<const int2&>(out_vec);
+  }
+
+  // Scalar cleanup code
+  if (block_end > vec_loop_end) {
+    for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end;
+         idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] =
+          scaled_fp8_conversion<true, fp8_type>(ACT_FN(x) * y, inverted_scale);
+    }
+  }
+}
+
+__device__ __forceinline__ float silu(float x) {
+  return __fdividef(x, (1.f + expf(-x)));
+}
+
+__device__ __forceinline__ float2 silu2(float2 x) {
+  return make_float2(silu(x.x), silu(x.y));
+}
+
+__device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) {
+#ifndef USE_ROCM
+  return make_bfloat162(__float2bfloat16_rn(silu(x.x)),
+                        __float2bfloat16_rn(silu(x.y)));
+#else
+  return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y)));
+#endif
+}
+
+#ifndef USE_ROCM
+__device__ __forceinline__ float warp_max(float v) {
+  static constexpr unsigned FULL_MASK = 0xffffffffu;
+  for (int offset = 1; offset < WARP_SIZE; offset *= 2) {
+    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, offset));
+  }
+  return v;
+}
+
+__device__ __forceinline__ __nv_bfloat16 warp_max(__nv_bfloat16 v) {
+  static constexpr unsigned FULL_MASK = 0xffffffffu;
+  for (int offset = 1; offset < WARP_SIZE; offset *= 2) {
+    v = __hmax(v, __shfl_xor_sync(FULL_MASK, v, offset));
+  }
+  return v;
+}
+#endif
+
+template <typename T, typename U>
+__device__ __forceinline__ void cp_async4(T* _smem_ptr, const U* _glob_ptr) {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  auto smem_ptr = reinterpret_cast<void*>(_smem_ptr);
+  auto glob_ptr = reinterpret_cast<const void*>(_glob_ptr);
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+#else
+  _smem_ptr[0] = _glob_ptr[0];
+#endif
+}
+
+__device__ __forceinline__ void cp_async_fence() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.commit_group;\n" ::);
+#else
+#endif
+}
+
+template <int N>
+__device__ __forceinline__ void cp_async_wait() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+#else
+#endif
+}
+
+template <>
+__device__ __forceinline__ void cp_async_wait<0>() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_all;\n" ::);
+#else
+#endif
+}
+
+__device__ __forceinline__ float clip(float v, float mmin, float mmax) {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  return fminf(mmax, fmaxf(v, mmin));
+#else
+#endif
+}
+
+__device__ __forceinline__ __nv_bfloat16 clip(__nv_bfloat16 v,
+                                              __nv_bfloat16 mmin,
+                                              __nv_bfloat16 mmax) {
+  return __hmin(mmax, __hmax(v, mmin));
+}
+
+__device__ __forceinline__ __nv_bfloat162 clip(__nv_bfloat162 v,
+                                               __nv_bfloat162 mmin,
+                                               __nv_bfloat162 mmax) {
+  return __hmin2(mmax, __hmax2(v, mmin));
+}
+
+// We use the following values for fp8 min/max:
+//  __nv_fp8_e4m3 = (-448, +448)
+//  __nv_fp8_e4m3uz = (-240.0, +240.0)
+// It is currently assumed that only
+template <class T>
+constexpr __nv_bfloat16 get_fp8_max() {
+  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
+                std::is_same_v<T, c10::Float8_e4m3fnuz>);
+  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17376});
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17264});
+  }
+}
+
+template <class T>
+constexpr __nv_bfloat16 get_fp8_min() {
+  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
+                std::is_same_v<T, c10::Float8_e4m3fnuz>);
+  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50144});
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
+  }
+}
+
+template <typename Idx_t>
+__device__ __forceinline__ int warp_expert_search(
+    int idx, int n, const Idx_t* __restrict__ input, Idx_t val) {
+  const Idx_t* input_ptr = input + idx;
+  int base_offset = 0;
+
+  for (;;) {
+    bool move_on = (idx < n && *input_ptr <= val);
+
+    unsigned mask = __ballot_sync(0xffffffff, move_on);
+
+    if (mask != 0xffffffffu) {
+      int last_lane = 31 - __clz(mask);
+      return base_offset + last_lane;
+    }
+
+    input_ptr += 32;
+    base_offset += 32;
+    idx += 32;
+  }
+}
+
+template <int num_parallel_tokens>
+__device__ __forceinline__ void token_bounds(int32_t n_tokens,
+                                             int32_t worker_id,
+                                             int32_t& n_tokens_lower,
+                                             int32_t& n_tokens_upper) {
+  if (n_tokens < num_parallel_tokens && worker_id < n_tokens) {
+    if (worker_id >= num_parallel_tokens) return;
+    n_tokens_lower = worker_id;
+    n_tokens_upper = worker_id + 1;
+  } else {
+    int32_t chunk_size = n_tokens / num_parallel_tokens;
+    int32_t residual = n_tokens - chunk_size * num_parallel_tokens;
+    auto calc_id = [&](int32_t id) {
+      if (id < residual)
+        return min(n_tokens, id * (chunk_size + 1));
+      else
+        return min(n_tokens, id * chunk_size + residual);
+    };
+    n_tokens_lower = calc_id(worker_id);
+    n_tokens_upper = calc_id(worker_id + 1);
+  }
+}
+
+template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
+          typename scale_t, int THREADS, typename Idx_t, bool CEIL_UE8M0,
+          int GROUP_SIZE = 128, int NUM_STAGES = 3>
+__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
+    const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
+    scale_t* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
+    // sizes
+    Idx_t E, Idx_t T, Idx_t H,
+    // strides (in elements)
+    Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
+    Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
+    Idx_t stride_ys_g, Idx_t stride_ys_p, Idx_t stride_counts_e) {
+#ifndef USE_ROCM
+  static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
+
+  static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8;
+  static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE;
+
+  static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4;
+  static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES;
+
+  extern __shared__ __align__(16) __int128_t smem_128[];
+
+  int* s_expert_offsets =
+      reinterpret_cast<int*>(smem_128 + (SMEM_SIZE_BYTES_Y / 16));
+
+  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
+  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
+  // We assign EPS with it's 16-bit unsigned counterpart to allow constexpr.
+  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
+  int tid = threadIdx.x;
+  int warp_id = tid >> 5;
+  int lane_id = tid & 0x1f;
+
+  int running_sum{};
+  if (!warp_id) {
+    for (int i = 0; i < E; i += WARP_SIZE) {
+      bool valid = (i + threadIdx.x) < E;
+      int value =
+          (valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) +
+          (!lane_id ? running_sum : 0);
+
+      for (int offset = 1; offset < 32; offset *= 2) {
+        int n = __shfl_up_sync(0xFFFFFFFFu, value, offset);
+        if (lane_id >= offset) value += n;
+      }
+
+      if (valid) {
+        s_expert_offsets[i + threadIdx.x + 1] = value;
+      }
+
+      running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1);
+    }
+
+    if (!lane_id) {
+      s_expert_offsets[0] = 0;
+    }
+  }
+
+  __syncthreads();
+
+  int32_t total_tokens = s_expert_offsets[E];
+
+  const int warp_position_yq = warp_id * (H / NUM_WARPS);
+  const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS));
+
+  // A single block will handle tokens_per_block tokens.
+  // Each block i iterates over tokens of a slice of n_tokens =
+  // expert_counts[i], with the size of chunk being
+  // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
+  // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
+
+  // Each warp will get space to store its hidden dim for gate and up.
+  __int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES);
+  __int128_t* smem_load_ptr = s_hidden_load + lane_id;
+
+  const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max);
+
+  int32_t compute_pipeline_offset_64 = 0;
+  int32_t load_stage_offset{};
+  const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f);
+
+  __int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) +
+                                warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) +
+                                lane_id;
+  __int64_t* s_gate64_ptr = smem_compute_ptr;
+  __int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4;
+
+  int tokens_lower, tokens_upper;
+
+  token_bounds<BLOCK_COUNT>(total_tokens, blockIdx.x, tokens_lower,
+                            tokens_upper);
+
+  Idx_t expert_id{}, expert_offset{}, next_expert_offset{};
+  int token_id = tokens_lower;
+  int32_t t_load{};
+
+  if (token_id < tokens_upper) {
+    expert_id = warp_expert_search<int>(lane_id, E, s_expert_offsets, token_id);
+    expert_offset = s_expert_offsets[expert_id];
+    next_expert_offset = s_expert_offsets[expert_id + 1];
+  } else {
+    // This thread block has no work to do.
+    return;
+  }
+
+  int t_load_bound = H / (GROUP_SIZE * NUM_WARPS);
+
+  Idx_t base_i = ((expert_id * stride_i_e) / 8) +
+                 (token_id - expert_offset) * stride_i_t / 8;
+  const Idx_t gate_warp_offset =
+      warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111);
+
+  const __int128_t* input_128_ptr =
+      reinterpret_cast<const __int128_t*>(_input) + gate_warp_offset +
+      ((lane_id < 16) ? 0 : ((H * stride_i_h) / 8));
+  __int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
+
+  auto token_offset = token_id - expert_offset;
+
+  auto load_and_advance_y_pred = [&] {
+    if (t_load < t_load_bound) {
+      // Here we are simply continuing to load data
+      // from the current token.
+      auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
+
+      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
+      // unnecessary ALU ops.
+      load_stage_offset += LOAD_STAGE_SIZE;
+      load_stage_offset %= LOAD_STAGE_MOD;
+
+      cp_async4(smem_load_ptr_staged, load_ptr);
+      load_ptr += GROUP_SIZE / 8;
+      ++t_load;
+    } else if (token_id + 1 < tokens_upper) {
+      // We loaded everything from the current token, let's move on
+      // to the next one, and we checked that we have more tokens to load.
+      ++token_id;
+      t_load = 0;
+      if (token_id >= next_expert_offset) {
+        // We need to find the next expert.
+        do {
+          // This is a loop because it's possible
+          // that some experts are assigned 0 tokens.
+          // NOTE: We are guaranteed that there's at least
+          // one more token left so we don't have to check for
+          // expert_id bounds.
+          ++expert_id;
+          // This skips 1 memory read.
+          expert_offset = next_expert_offset;
+          next_expert_offset = s_expert_offsets[expert_id + 1];
+        } while (next_expert_offset == expert_offset);
+
+        base_i = expert_id * (stride_i_e / 8);
+        token_offset = 0;
+        load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
+      } else {
+        // We remain within the same expert, so just
+        // move by H/4 __int128_t (2 * H/8).
+        base_i += stride_yq_t / 4;
+        token_offset++;
+      }
+
+      load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
+
+      auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
+
+      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
+      // unnecessary ALU ops.
+      load_stage_offset += LOAD_STAGE_SIZE;
+      load_stage_offset %= LOAD_STAGE_MOD;
+
+      cp_async4(smem_load_ptr_staged, load_ptr);
+      load_ptr += GROUP_SIZE / 8;
+      ++t_load;
+    }
+    // We fence even if there is nothing to load to simplify pipelining.
+    cp_async_fence();
+  };
+
+  // We need to warm-up the pipeline.
+  #pragma unroll
+  for (int i = 0; i < NUM_STAGES - 1; i++) {
+    load_and_advance_y_pred();
+  }
+
+  __nv_fp8x4_e4m3* y_q_base_ptr =
+      reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
+
+  Idx_t scale_group_offset = 0;
+  if constexpr (std::is_same<scale_t, uint8_t>::value) {
+    // packed int32_t format
+    int pack_id = warp_position_scales / 4;
+    int scale_in_pack = warp_position_scales % 4;
+    scale_group_offset = pack_id * stride_ys_p + scale_in_pack * stride_ys_g;
+  } else {
+    scale_group_offset = warp_position_scales * stride_ys_g;
+  }
+
+  scale_t* const y_scale_base_ptr = _y_s + scale_group_offset;
+
+  for (auto j = tokens_lower; j < tokens_upper; j++) {
+    int current_group_id = warp_position_scales;  // Running count of which
+                                                  // group is being processed
+    const Idx_t base_ys = expert_id * stride_ys_e;
+    auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
+    __nv_fp8x4_e4m3* y_q_ptr =
+        y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t +
+                        warp_position_yq * stride_yq_h) /
+                           4;
+    const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS);
+
+    for (int i = 0; i < COMPUTE_LIMIT; i++) {
+      cp_async_wait<NUM_STAGES - 2>();
+      __syncthreads();
+      load_and_advance_y_pred();
+
+      __int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64;
+      __int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64;
+
+      // COMPUTE_STAGE_SIZE/MOD must also be constexpr!
+      compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE;
+      compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD;
+
+      __int64_t gate64 = *gate64_ptr;
+      __int64_t up64 = *up64_ptr;
+
+      // Compute
+      __nv_bfloat162 res[2];
+      __nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64);
+      __nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64);
+
+  #pragma unroll
+      for (int32_t k = 0; k < 2; ++k) {
+        __nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k]));
+        res[k] = __hmul2(gate, s_up_comp[k]);
+      }
+
+      auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1]));
+
+      _y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS);
+
+      __nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
+
+      if constexpr (CEIL_UE8M0) {
+        y_s = hexp2(hceil(hlog2(y_s)));
+      }
+
+      __nv_bfloat16 inv_y = __hdiv(one_bf16, y_s);
+
+      auto y_s2 = make_bfloat162(inv_y, inv_y);
+
+  #pragma unroll
+      for (int32_t k = 0; k < 2; ++k) {
+        res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min),
+                      __bfloat162bfloat162(fp8_max));
+      }
+
+      *y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]);
+      y_q_ptr += WARP_SIZE * stride_yq_h;
+
+      if (!lane_id) {
+        // Store scales.
+        if constexpr (std::is_same<scale_t, uint8_t>::value) {
+          // Packed UE8MO format. Remove Mantissa.
+          *y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
+
+          bool const jump_pack = (current_group_id + 1) % 4 == 0;
+          // Minus 3 because we need to get to the first group in the
+          // next pack.
+          y_s_ptr += jump_pack ? (stride_ys_p - 3) : stride_ys_g;
+
+        } else {
+          // float32 format
+          static_assert(std::is_same<scale_t, float>::value);
+          *y_s_ptr = y_s;
+          y_s_ptr += stride_ys_g;
+        }
+
+        current_group_id += 1;
+      }
+    }
+  }
+#endif
+}
+
+}  // namespace vllm
+
+// Launch activation, gating, and quantize kernel.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                               \
+  int d = input.size(-1) / 2;                                               \
+  int64_t num_tokens = input.numel() / input.size(-1);                      \
+  dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 1 : 2 : 4);     \
+  dim3 block(std::min(d, 512));                                             \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));         \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();             \
+  VLLM_DISPATCH_FLOATING_TYPES(                                             \
+      input.scalar_type(), "act_and_mul_kernel", [&] {                      \
+        VLLM_DISPATCH_FP8_TYPES(                                            \
+            out.scalar_type(), "fused_add_rms_norm_kernel_fp8_type", [&] {  \
+              vllm::act_and_mul_quant_kernel<scalar_t, KERNEL<scalar_t>,    \
+                                             fp8_t>                         \
+                  <<<grid, block, 0, stream>>>(out.data_ptr<fp8_t>(),       \
+                                               input.data_ptr<scalar_t>(),  \
+                                               scale.data_ptr<float>(), d); \
+            });                                                             \
+      });
+
+void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
+                        torch::Tensor& input,  // [..., 2 * d]
+                        torch::Tensor& scale) {
+  TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn ||
+              out.dtype() == torch::kFloat8_e4m3fnuz);
+  TORCH_CHECK(input.dtype() == torch::kFloat16 ||
+              input.dtype() == torch::kBFloat16);
+  TORCH_CHECK(input.size(-1) % 2 == 0);
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
+}
+
+void persistent_masked_m_silu_mul_quant(
+    const at::Tensor& input,              // (E, T, 2*H)
+    const at::Tensor& tokens_per_expert,  // (E)
+    at::Tensor& y_q,                      // (E, T, H) [OUT]
+    at::Tensor& y_s,                      // (E, T, H//group_size) [OUT]
+    bool cast_scale_ue8m0) {
+#ifndef USE_ROCM
+
+  // This kernel currently only supports H % 128 == 0 and assumes a
+  // fixed GROUP_SIZE of 128.
+  static constexpr int GROUP_SIZE = 128;
+
+  TORCH_CHECK(input.dtype() == torch::kBFloat16);
+  TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
+              y_q.dtype() == torch::kFloat8_e4m3fnuz);
+  TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0);
+
+  bool const is_packed_ue8m0 =
+      (y_s.dtype() == torch::kInt32 && cast_scale_ue8m0);
+  TORCH_CHECK(y_s.dtype() == torch::kFloat32 || is_packed_ue8m0);
+
+  using Idx_t = int64_t;
+
+  Idx_t E = input.size(0);
+  Idx_t T = input.size(1);
+  Idx_t H = input.size(2) / 2;
+  Idx_t stride_i_e = input.stride(0);
+  Idx_t stride_i_t = input.stride(1);
+  Idx_t stride_i_h = input.stride(2);
+  Idx_t stride_yq_e = y_q.stride(0);
+  Idx_t stride_yq_t = y_q.stride(1);
+  Idx_t stride_yq_h = y_q.stride(2);
+
+  Idx_t stride_counts_e = tokens_per_expert.stride(0);
+
+  int const NUM_GROUPS = H / GROUP_SIZE;
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // TODO: Get this from cuda_arch ?
+  static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
+
+  #define KERNEL(BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G,  \
+                 STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, STAGES)                \
+    static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE;                 \
+    int sms = SILU_V2_BLOCK_COUNT;                                             \
+    static constexpr int max_shared_mem_bytes =                                \
+        GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2;                               \
+    dim3 grid(sms), block(THREAD_COUNT);                                       \
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));          \
+    VLLM_DISPATCH_FP8_TYPES(                                                   \
+        y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] {        \
+          vllm::silu_mul_fp8_quant_deep_gemm_kernel<                           \
+              BLOCK_COUNT, max_shared_mem_bytes, fp8_t, scale_t, THREAD_COUNT, \
+              Idx_t, CEIL_UE8M0, GROUP_SIZE, STAGES>                           \
+              <<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>(  \
+                  reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),          \
+                  (fp8_t*)y_q.data_ptr(),                                      \
+                  reinterpret_cast<scale_t*>(y_s.data_ptr()),                  \
+                  reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
+                  T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e,       \
+                  stride_yq_t, stride_yq_h, STRIDE_YS_E, STRIDE_YS_T,          \
+                  STRIDE_YS_G, STRIDE_YS_P, stride_counts_e);                  \
+        });
+
+  #define LAUNCH_ON_H(scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G,         \
+                      STRIDE_YS_P, CEIL_UE8M0)                                \
+    if (H >= 4096 && (NUM_GROUPS % 8) == 0) {                                 \
+      /* 8 warp config */                                                     \
+      static constexpr int NUM_STAGES = 4;                                    \
+      static constexpr int THREAD_COUNT = 256;                                \
+      KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T,          \
+             STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, NUM_STAGES); \
+    } else {                                                                  \
+      /* 1 warp config */                                                     \
+      static constexpr int THREAD_COUNT = 32;                                 \
+      KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T,          \
+             STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, 2);          \
+    }
+
+  Idx_t stride_ys_e = y_s.stride(0);
+  Idx_t stride_ys_t = y_s.stride(1);
+  Idx_t stride_ys_g = y_s.stride(2);
+  Idx_t stride_ys_p = 0;
+  if (!cast_scale_ue8m0) {
+    TORCH_CHECK(!is_packed_ue8m0);
+    LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
+                false);
+    return;
+  }
+
+  if (!is_packed_ue8m0) {
+    // UE8M0 but not packed
+    LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
+                true);
+    return;
+  }
+
+  TORCH_CHECK(cast_scale_ue8m0 && is_packed_ue8m0);
+  TORCH_CHECK(y_s.dtype() == torch::kInt32);
+
+  // Int32 packed ue8m0 scales tensor.
+  // Let E, T, G be the number to experts, number of tokens and number of groups
+  // respectively. Let, E = 2, T = 4, G = 6, in this case the int32 scales
+  // tensor are of shape [1, 4, 2] and stride [8, 1, 4]. The scales are expected
+  // to be arranged as follows,
+  // [[T0G0-T0G1-T0G2-T0G3, T0G4-T0G5-X-X,],
+  //  [T1G0-T1G1-T1G2-T1G3, T1G4-T1G5-X-X,]
+  //  [T2G0-T2G1-T2G2-T2G3, T2G4-T2G5-X-X,]
+  //  [T3G0-T3G1-T3G2-T3G3, T3G4-T3G5-X-X,]]
+  // where, TxGy is the scale ue8m0 scale value of Token x, Group y.
+  //
+  // In memory (in bytes) the scale values are arranged as,
+  //  [T0G0, T0G1, T0G2, T0G3, T1G0, T1G2, T1G3, T1G4, T2G0, T2G1, T2G3, T2G4,
+  //   T3G0, T3G1, T3G2, T3G3, T0G4, T0G5, X, X, T1G4, T1G5, X, X, T2G4, T2G5,
+  //   X, X, T3G4, T3G5, X, X]
+  //
+  // An Int32 tensor of size [1, 4, 2] and stride [8, 1, 4] can be represented
+  // as an uint8 tensor of shape [1, 2, 4, 4] and stride [32, 16, 4, 1]. In
+  // english, ignoring the Experts dimension, the original int32 tensor is
+  // simply treated as two packed [4, 4] uint8 tensor (or two [4, 1] int32
+  // tensor). The following strides setting reflects this change. Caveat: This
+  // means that the G dimension is no longer contiguous. i.e. Note that to move
+  // from G3 to G4, we need to jump along the packing dimension. The kernel
+  // handles this case.
+
+  stride_ys_e *= sizeof(int32_t);
+  stride_ys_p = T * sizeof(int32_t);  // Packing dimension
+  stride_ys_t = sizeof(int32_t);
+  stride_ys_g = 1;
+
+  LAUNCH_ON_H(uint8_t, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
+              true);
+
+#endif
+}
--- a/csrc/quantization/awq/dequantize.cuh
+++ b/csrc/quantization/awq/dequantize.cuh
@@ -0,0 +1,102 @@
+/*
+Adapted from https://github.com/mit-han-lab/llm-awq
+Modified from NVIDIA FasterTransformer:
+https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+@article{lin2023awq,
+  title={AWQ: Activation-aware Weight Quantization for LLM Compression and
+Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang,
+Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
+}
+*/
+
+#pragma once
+
+namespace vllm {
+namespace awq {
+
+__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
+  assert(false);
+#else
+  uint4 result;
+
+  uint32_t* h = reinterpret_cast<uint32_t*>(&result);
+  uint32_t const i4s = reinterpret_cast<uint32_t const&>(source);
+
+  // First, we extract the i4s and construct an intermediate fp16 number.
+  static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
+  static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
+  static constexpr uint32_t TOP_MASK = 0x00f000f0;
+  static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;
+
+  // Note that the entire sequence only requires 1 shift instruction. This is
+  // thanks to the register packing format and the fact that we force our
+  // integers to be unsigned, and account for this in the fp16 subtractions. In
+  // addition, I exploit the fact that sub and fma have the same throughput in
+  // order to convert elt_23 and elt_67 to fp16 without having to shift them to
+  // the bottom bits before hand.
+
+  // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW
+  // dependency if we issue immediately before required.
+  const uint32_t top_i4s = i4s >> 8;
+  // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[0])
+               : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                 "n"(immLut));
+  // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[1])
+               : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                 "n"(immLut));
+  // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[2])
+               : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                 "n"(immLut));
+  // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[3])
+               : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                 "n"(immLut));
+
+  // I use inline PTX below because I am not sure if the compiler will emit
+  // float2half instructions if I use the half2 ctor. In this case, I chose
+  // performance reliability over code readability.
+
+  // This is the half2 {1032, 1032} represented as an integer.
+  // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408;
+  // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7]
+  static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
+  // This is the half2 {1 / 16, 1 / 16} represented as an integer.
+  static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
+  // This is the half2 {-72, -72} represented as an integer.
+  // static constexpr uint32_t NEG_72 = 0xd480d480;
+  // Haotian: Let's use {-64, -64}.
+  static constexpr uint32_t NEG_64 = 0xd400d400;
+
+  // Finally, we construct the output numbers.
+  // Convert elt_01
+  asm volatile("sub.f16x2 %0, %1, %2;\n"
+               : "=r"(h[0])
+               : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
+  // Convert elt_23
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+               : "=r"(h[1])
+               : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
+  // Convert elt_45
+  asm volatile("sub.f16x2 %0, %1, %2;\n"
+               : "=r"(h[2])
+               : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
+  // Convert elt_67
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+               : "=r"(h[3])
+               : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
+
+  return result;
+#endif
+  __builtin_unreachable();  // Suppress missing return statement warning
+}
+
+}  // namespace awq
+}  // namespace vllm
--- a/csrc/quantization/awq/gemm_kernels.cu
+++ b/csrc/quantization/awq/gemm_kernels.cu
@@ -0,0 +1,526 @@
+/*
+Adapted from https://github.com/mit-han-lab/llm-awq
+@article{lin2023awq,
+  title={AWQ: Activation-aware Weight Quantization for LLM Compression and
+Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang,
+Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
+}
+ */
+
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "dequantize.cuh"
+
+#include <cuda_fp16.h>
+
+namespace vllm {
+namespace awq {
+
+template <int N>
+__global__ void __launch_bounds__(64)
+    gemm_forward_4bit_cuda_m16nXk32(int G, int split_k_iters,
+                                    half* __restrict__ A, int* __restrict__ B,
+                                    half* __restrict__ scaling_factors,
+                                    int* __restrict__ zeros, int M, int IC,
+                                    int OC, half* __restrict__ C) {
+  // Only support matrix n = 64 or 128
+  assert(N == 64 || N == 128);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
+  assert(false);
+#else
+  static constexpr uint32_t ZERO = 0x0;
+  float C_warp[32];
+  __shared__ half A_shared[16 * (32 + 8)];
+  __shared__ half B_shared[32 * (N + 8)];
+
+  int j_factors1 = ((OC + N - 1) / N);
+  int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1);
+  int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1);
+
+  half A_shared_warp[8];
+  half B_shared_warp[N / 4];
+  for (int j_0_4_init = 0; j_0_4_init < N / 32; ++j_0_4_init) {
+    for (int i = 0; i < 8; ++i) {
+      C_warp[(j_0_4_init * 8) + i] = 0.0;
+    }
+  }
+
+  static constexpr int row_stride_warp = 32 * 8 / 32;
+  static constexpr int row_stride = 2 * 32 * 8 / N;
+  // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
+  bool ld_A_flag =
+      (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp +
+       threadIdx.x * 8 / 32) < M;  // threadIdx.y is warp_id
+  // bool wb_C_flag = (threadIdx.x / 4) < M;
+
+  half* A_ptr =
+      A +
+      (((int)blockIdx_y) / j_factors1 * 16 +
+       (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) *
+          IC +
+      (((int)threadIdx.x) % (32 / 8)) * 8;
+
+  int* B_ptr = B + ((int)threadIdx.y) * (OC / 8) * (256 / N) +
+               (((int)threadIdx.x) / (N / 8)) * (OC / 8) +
+               (((int)blockIdx_y) % j_factors1) * (N / 8) +
+               (((int)threadIdx.x) % (N / 8)) * 1;
+  // Why * 1 in the above line?
+
+  half* A_shared_ptr = A_shared +
+                       ((int)threadIdx.y) * row_stride_warp * (32 + 8) +
+                       (((int)threadIdx.x) / (32 / 8)) * (32 + 8) +
+                       (((int)threadIdx.x) % (32 / 8)) * 8;
+
+  half* B_shared_ptr = B_shared +
+                       ((int)threadIdx.y) * (row_stride / 2) * (N + 8) +
+                       (((int)threadIdx.x) / (N / 8)) * (N + 8) +
+                       (((int)threadIdx.x) % (N / 8)) * 8;
+
+  int* zeros_ptr = zeros + (((int)blockIdx_y) % j_factors1) * (N / 8) +
+                   ((int)threadIdx.x) % (N / 8);
+
+  half* scaling_factors_ptr = scaling_factors +
+                              (((int)blockIdx_y) % j_factors1) * N +
+                              (((int)threadIdx.x) % (N / 8)) * 8;
+
+  half* C_ptr =
+      C +
+      static_cast<long long>(blockIdx_z) * M * OC  // blockIdz.x -> split_k dim
+      + (((int)blockIdx_y) % j_factors1) * N + ((int)threadIdx.y) * (N / 2) +
+      (((int)threadIdx.x) % 4) * 2;
+
+  // preload s.f. and zeros
+  int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters;
+  if ((k_bound - 1) * split_k_iters * 32 + blockIdx_z * 32 >= IC) k_bound -= 1;
+  for (int _k_0_0 = 0; _k_0_0 < k_bound; ++_k_0_0) {
+    int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z;
+    __syncthreads();
+    // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
+    if (ld_A_flag) {
+      *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32));
+    } else {
+      *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0);
+    }
+
+    // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) {
+    uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8));
+    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+    uint4 B_loaded_scale =
+        *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC));
+    /*
+    if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 &&
+    threadIdx.y == 0){ printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x,
+    B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x,
+    B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w);
+    }
+    */
+    // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0);
+    int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8);
+
+    for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < N / 16; ++ax0_ax1_fused_0) {
+      // B: 32 x 136 (128+8) float16
+      // each warp: 32 x 4
+      // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus
+      // zero -> WB UINT4
+      // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) *
+      // 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15)
+      // * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 *
+      // 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) *
+      // 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) *
+      // 8))); row stride in shared memory: (NWARPS * 32 * 8 / cta_N)
+      uint32_t B_loaded =
+          *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8));
+      uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+
+      // - zero and * scale
+      // TODO (Haotian): can save 4 assembly instructions if sormulate as deq =
+      // q * scale - zero * scale.
+      asm volatile("sub.f16x2 %0, %1, %2;\n"
+                   : "=r"(B_loaded_fp16.x)
+                   : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x));
+      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+                   : "=r"(B_loaded_fp16.x)
+                   : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO));
+      asm volatile("sub.f16x2 %0, %1, %2;\n"
+                   : "=r"(B_loaded_fp16.y)
+                   : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y));
+      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+                   : "=r"(B_loaded_fp16.y)
+                   : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO));
+      asm volatile("sub.f16x2 %0, %1, %2;\n"
+                   : "=r"(B_loaded_fp16.z)
+                   : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z));
+      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+                   : "=r"(B_loaded_fp16.z)
+                   : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO));
+      asm volatile("sub.f16x2 %0, %1, %2;\n"
+                   : "=r"(B_loaded_fp16.w)
+                   : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w));
+      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+                   : "=r"(B_loaded_fp16.w)
+                   : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO));
+      /*
+      if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 ==
+      0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n",
+      B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w);
+      }
+      */
+
+      // write back
+      *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (N + 8)) =
+          B_loaded_fp16;
+    }
+    __syncthreads();
+
+    for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) {
+      {
+        unsigned int addr;
+        __asm__ __volatile__(
+            "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
+            "addr; }\n"
+            : "=r"(addr)
+            : "l"((void*)((&(A_shared[(k_0_1 * 16)])) +
+                          (((((int)threadIdx.x) & 15) * 40) +
+                           ((((int)threadIdx.x) >> 4) * 8)))));
+
+        __asm__ __volatile__(
+            "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
+            "{%0, %1, %2, %3}, [%4];\n"
+            : "=r"(((unsigned*)(A_shared_warp + 0))[0]),
+              "=r"(((unsigned*)(A_shared_warp + 0))[1]),
+              "=r"(((unsigned*)(A_shared_warp + 0))[2]),
+              "=r"(((unsigned*)(A_shared_warp + 0))[3])
+            : "r"(addr));
+      }
+
+      for (int ax1_0 = 0; ax1_0 < N / 32; ++ax1_0) {
+        {
+          unsigned int addr;
+          __asm__ __volatile__(
+              "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
+              "addr; }\n"
+              : "=r"(addr)
+              : "l"((void*)((&(B_shared[(((k_0_1 * (N * 16 + 128)) +
+                                          (((int)threadIdx.y) * (N / 2))) +
+                                         (ax1_0 * 16))])) +
+                            (((((int)threadIdx.x) & 15) * (N + 8)) +
+                             ((((int)threadIdx.x) >> 4) * 8)))));
+          __asm__ __volatile__(
+              "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
+              "{%0, %1, %2, %3}, [%4];\n"
+              : "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[0]),
+                "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[1]),
+                "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[2]),
+                "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[3])
+              : "r"(addr));
+        }
+      }
+      for (int j_0_4 = 0; j_0_4 < N / 32; ++j_0_4) {
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+        {
+          __asm__ __volatile__(
+              "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+              "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
+              : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[3])
+              : "r"(((unsigned*)(A_shared_warp + 0))[0]),
+                "r"(((unsigned*)(A_shared_warp + 0))[1]),
+                "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[2]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[3]));
+        }
+
+        {
+          __asm__ __volatile__(
+              "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+              "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
+              : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])
+              : "r"(((unsigned*)(A_shared_warp + 0))[0]),
+                "r"(((unsigned*)(A_shared_warp + 0))[1]),
+                "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]));
+        }
+
+        {
+          __asm__ __volatile__(
+              "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+              "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
+              : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[3])
+              : "r"(((unsigned*)(A_shared_warp + 0))[2]),
+                "r"(((unsigned*)(A_shared_warp + 0))[3]),
+                "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[2]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[3]));
+        }
+
+        {
+          __asm__ __volatile__(
+              "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+              "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
+              : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])
+              : "r"(((unsigned*)(A_shared_warp + 0))[2]),
+                "r"(((unsigned*)(A_shared_warp + 0))[3]),
+                "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]));
+        }
+  #else
+        {
+          __asm__ __volatile__(
+              "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+              "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, "
+              "%13};\n"
+              : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]),
+                "=f"(((float*)(C_warp + (j_0_4 * 8)))[3])
+              : "r"(((unsigned*)(A_shared_warp + 0))[0]),
+                "r"(((unsigned*)(A_shared_warp + 0))[1]),
+                "r"(((unsigned*)(A_shared_warp + 0))[2]),
+                "r"(((unsigned*)(A_shared_warp + 0))[3]),
+                "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]),
+                "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[2]),
+                "f"(((float*)(C_warp + (j_0_4 * 8)))[3]));
+        }
+
+        {
+          __asm__ __volatile__(
+              "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+              "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, "
+              "%13};\n"
+              : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]),
+                "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])
+              : "r"(((unsigned*)(A_shared_warp + 0))[0]),
+                "r"(((unsigned*)(A_shared_warp + 0))[1]),
+                "r"(((unsigned*)(A_shared_warp + 0))[2]),
+                "r"(((unsigned*)(A_shared_warp + 0))[3]),
+                "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]),
+                "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]),
+                "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]));
+        }
+
+  #endif
+      }
+    }
+  }
+
+  // TODO: Shang: Hoist loop invariance.
+  for (int ax1_0_1 = 0; ax1_0_1 < (N / 32); ++ax1_0_1) {
+    for (int local_id = 0; local_id < 8; ++local_id) {
+      int row_offset = (((int)blockIdx_y) / j_factors1) * 16 +
+                       ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8;
+      if (row_offset < M) {
+        *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 +
+          local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]);
+      }
+    }
+  }
+#endif
+}
+
+__global__ void __launch_bounds__(64)
+    dequantize_weights(int* __restrict__ B, half* __restrict__ scaling_factors,
+                       int* __restrict__ zeros, half* __restrict__ C, int G) {
+  static constexpr uint32_t ZERO = 0x0;
+  half B_shared[32 * (128 + 8)];
+
+  half* B_shared_ptr2 = B_shared;
+
+  int N = blockDim.x * gridDim.x;  // 2
+  int col = (blockIdx.x * blockDim.x + threadIdx.x);
+  int row = blockIdx.y * blockDim.y + threadIdx.y;
+  int index1 = 8 * col + 8 * row * N;
+  half* C_ptr2 = C + index1;
+
+  int index2 = col + row * N;
+  int* B_ptr2 = B + index2;
+
+  int index3 = col + (int)(row / G) * N;
+  int* zeros_ptr2 = zeros + index3;
+  int index4 = 8 * col + (int)(row / G) * N * 8;
+  half* scaling_factors_ptr2 = scaling_factors + index4;
+
+  uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr2);
+  uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+  uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr2);
+
+  uint32_t B_loaded = *(uint32_t*)B_ptr2;
+  uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+  asm volatile("sub.f16x2 %0, %1, %2;\n"
+               : "=r"(B_loaded_fp16.x)
+               : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x));
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+               : "=r"(B_loaded_fp16.x)
+               : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO));
+  asm volatile("sub.f16x2 %0, %1, %2;\n"
+               : "=r"(B_loaded_fp16.y)
+               : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y));
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+               : "=r"(B_loaded_fp16.y)
+               : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO));
+  asm volatile("sub.f16x2 %0, %1, %2;\n"
+               : "=r"(B_loaded_fp16.z)
+               : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z));
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+               : "=r"(B_loaded_fp16.z)
+               : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO));
+  asm volatile("sub.f16x2 %0, %1, %2;\n"
+               : "=r"(B_loaded_fp16.w)
+               : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w));
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+               : "=r"(B_loaded_fp16.w)
+               : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO));
+
+  *(uint4*)B_shared_ptr2 = B_loaded_fp16;
+
+  for (int i = 0; i < 8; ++i) {
+    *(C_ptr2 + i) = B_shared[i];
+  }
+}
+
+}  // namespace awq
+}  // namespace vllm
+
+torch::Tensor awq_dequantize(torch::Tensor _kernel,
+                             torch::Tensor _scaling_factors,
+                             torch::Tensor _zeros, int64_t split_k_iters,
+                             int64_t thx, int64_t thy) {
+  int in_c = _kernel.size(0);
+  int qout_c = _kernel.size(1);
+  int out_c = qout_c * 8;
+  int G = in_c / _scaling_factors.size(0);
+
+  int x_thread = thx;
+  int y_thread = thy;
+
+  int x_blocks = 1;
+  int y_blocks = 1;
+  if (thx == 0) {
+    x_thread = qout_c;
+  }
+  if (thy == 0) {
+    y_thread = in_c;
+  }
+  if (thx == 0 && thy == 0) {
+    x_thread = 8;
+    y_thread = 8;
+    x_blocks = (int)(qout_c / 8);
+    y_blocks = (int)(in_c / 8);
+  }
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors));
+
+  auto options = torch::TensorOptions()
+                     .dtype(_scaling_factors.dtype())
+                     .device(_scaling_factors.device());
+  at::Tensor _de_kernel = torch::empty({in_c, out_c}, options);
+
+  auto kernel = reinterpret_cast<int*>(_kernel.data_ptr<int>());
+  auto de_kernel = reinterpret_cast<half*>(_de_kernel.data_ptr<at::Half>());
+  auto scaling_factors =
+      reinterpret_cast<half*>(_scaling_factors.data_ptr<at::Half>());
+  auto zeros = reinterpret_cast<int*>(_zeros.data_ptr<int>());
+
+  dim3 num_blocks(x_blocks, y_blocks);
+  dim3 threads_per_block(x_thread, y_thread);
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  vllm::awq::dequantize_weights<<<num_blocks, threads_per_block, 0, stream>>>(
+      kernel, scaling_factors, zeros, de_kernel, G);
+
+  return _de_kernel;
+}
+
+// in_feats: M, IC [float16]
+// kernel: IC, OC // 8 [int32] -> cast to IC, OC [uint4b]
+// scaling_factors: IC // G, OC [float16]
+// zeros: IC // G, OC // 8 [int32] -> cast to IC // G, OC [uint4b]
+// assume that batch_size < 16 for now
+
+torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
+                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
+                       int64_t split_k_iters) {
+  int num_in_feats = _in_feats.size(0);
+  int num_in_channels = _in_feats.size(1);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats));
+
+  auto options = torch::TensorOptions()
+                     .dtype(_in_feats.dtype())
+                     .device(_in_feats.device());
+  at::Tensor _out_feats =
+      torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options);
+  int num_out_feats = _out_feats.size(-2);
+  int num_out_channels = _out_feats.size(-1);
+
+  auto in_feats = reinterpret_cast<half*>(_in_feats.data_ptr<at::Half>());
+  auto kernel = reinterpret_cast<int*>(_kernel.data_ptr<int>());
+  auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr<at::Half>());
+  auto scaling_factors =
+      reinterpret_cast<half*>(_scaling_factors.data_ptr<at::Half>());
+  auto zeros = reinterpret_cast<int*>(_zeros.data_ptr<int>());
+  int group_size = num_in_channels / _scaling_factors.size(0);
+
+  if (num_out_channels % 64 != 0)
+    throw std::invalid_argument("OC is not multiple of cta_N = 64");
+  if (num_out_channels % 8 != 0)
+    throw std::invalid_argument("OC is not multiple of pack_num = 8");
+  if (group_size % 32 != 0)
+    throw std::invalid_argument("Group size should be a multiple of 32");
+  if (num_out_channels % group_size != 0)
+    throw std::invalid_argument("OC is not multiple of Group size");
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (num_out_channels % 128 == 0) {
+    int j_factors1 = num_out_channels / 128 / 1;
+    dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters);
+    // threadIdx.x: 32
+    // threadIdx.y: i_factors[2] * j_factors[2]
+    dim3 threads_per_block(32, 2);
+    vllm::awq::gemm_forward_4bit_cuda_m16nXk32<128>
+        <<<num_blocks, threads_per_block, 0, stream>>>(
+            group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros,
+            num_in_feats, num_in_channels, num_out_channels, out_feats);
+  } else if (num_out_channels % 64 == 0) {
+    int j_factors1 = num_out_channels / 64 / 1;
+    dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 *
+                    split_k_iters);
+
+    // threadIdx.x: 32
+    // threadIdx.y: i_factors[2] * j_factors[2]
+    dim3 threads_per_block(32, 2);
+    vllm::awq::gemm_forward_4bit_cuda_m16nXk32<64>
+        <<<num_blocks, threads_per_block, 0, stream>>>(
+            group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros,
+            num_in_feats, num_in_channels, num_out_channels, out_feats);
+  }
+  return _out_feats.sum(0);
+}
--- a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
+++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
@@ -0,0 +1,104 @@
+// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
+#pragma once
+
+#include <cuda.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include "core/scalar_type.hpp"
+#include "cutlass/bfloat16.h"
+#include "cutlass/float8.h"
+
+// ElementB is int32 (packed int4)
+// ElementGroupScale is cutlass::Array<cutlass::float_e4m3_t, 8> (packed fp8)
+template <typename ElementA, typename ElementB, typename ElementC,
+          typename ElementAccumulator, typename ElementGroupScale>
+__global__ void get_group_gemm_starts(
+    int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
+    ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
+    ElementAccumulator** b_scales_offsets,
+    ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
+    ElementB* b_base_as_int, ElementC* out_base_as_int,
+    ElementAccumulator* a_scales_base_as_int,
+    ElementAccumulator* b_scales_base_as_int,
+    ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
+    int64_t scale_k) {
+  int expert_id = threadIdx.x;
+
+  int64_t expert_offset = expert_offsets[expert_id];
+
+  // same as w8a8
+  a_offsets[expert_id] = a_base_as_int + expert_offset * k;
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset;
+  b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id);
+
+  // w4a8 specific
+  constexpr int pack_factor = 8;  // pack 8 int4 into int32
+  b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor);
+  b_group_scales_offsets[expert_id] =
+      b_group_scales_base_as_int + (expert_id * scale_k * n);
+}
+
+#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                  \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                       \
+    get_group_gemm_starts<cutlass::float_e4m3_t, int32_t, C_TYPE, float, \
+                          cutlass::Array<cutlass::float_e4m3_t, 8>>      \
+        <<<1, num_experts, 0, stream>>>(                                 \
+            static_cast<int64_t*>(expert_offsets.data_ptr()),            \
+            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),     \
+            static_cast<int32_t**>(b_ptrs.data_ptr()),                   \
+            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                  \
+            static_cast<float**>(a_scales_ptrs.data_ptr()),              \
+            static_cast<float**>(b_scales_ptrs.data_ptr()),              \
+            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>**>(     \
+                b_group_scales_ptrs.data_ptr()),                         \
+            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),   \
+            static_cast<int32_t*>(b_tensors.data_ptr()),                 \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                \
+            static_cast<float*>(a_scales.data_ptr()),                    \
+            static_cast<float*>(b_scales.data_ptr()),                    \
+            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>*>(      \
+                b_group_scales.data_ptr()),                              \
+            n, k, scale_k);                                              \
+  }
+
+namespace {
+
+void run_get_group_gemm_starts(
+    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
+    torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
+    torch::Tensor const& a_scales, torch::Tensor const& b_scales,
+    torch::Tensor const& b_group_scales, const int64_t b_group_size) {
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);  // int4 8x packed into int32
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_group_scales.dtype() ==
+              torch::kFloat8_e4m3fn);  // the underlying torch type is e4m3
+  TORCH_CHECK(out_tensors.dtype() ==
+              torch::kBFloat16);  // only support bf16 for now
+  // expect int64_t to avoid overflow during offset calculations
+  TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
+
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  // logical k, n
+  int64_t n = out_tensors.size(1);
+  int64_t k = a_tensors.size(1);
+  int64_t scale_k = cutlass::ceil_div(k, b_group_size);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  if (false) {
+  }
+  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
+  __CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+
+}  // namespace
--- a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
@@ -0,0 +1,483 @@
+#include <vector>
+#include <tuple>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/mixed_dtype_utils.hpp"
+
+// vllm includes
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+#include "cutlass_extensions/torch_utils.hpp"
+#include "cutlass_extensions/common.hpp"
+
+#include "core/registration.h"
+#include "get_group_starts.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+#include "w4a8_utils.cuh"
+
+namespace vllm::cutlass_w4a8_moe {
+
+using namespace cute;
+
+// -------------------------------------------------------------------------------------
+// Static configuration shared across all instantiations
+// -------------------------------------------------------------------------------------
+using ProblemShape =
+    cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;  // <M,N,K> per
+                                                             // group
+using MmaType = cutlass::float_e4m3_t;
+using QuantType = cutlass::int4b_t;
+
+constexpr int TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
+static int constexpr PackFactor = 8;  // 8 int4 packed into int32
+
+// A matrix configuration
+using ElementA = MmaType;
+using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
+constexpr int AlignmentA =
+    128 /
+    cutlass::sizeof_bits<ElementA>::value;  // Alignment of A matrix in units of
+                                            // elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = QuantType;  // Element type for B matrix operand
+using LayoutB =
+    cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
+constexpr int AlignmentB =
+    128 / cutlass::sizeof_bits<
+              ElementB>::value;  // Memory access granularity/alignment of B
+                                 // matrix in units of elements (up to 16 bytes)
+
+// This example manually swaps and transposes, so keep transpose of input
+// layouts
+using LayoutA_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+using LayoutB_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+// Need to pass a pointer type to make the 3rd dimension of Stride be _0
+using StrideA =
+    cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
+using StrideB =
+    cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
+
+// Define the CuTe layout for reoredered quantized tensor B
+// LayoutAtomQuant places values that will be read by the same thread in
+// contiguous locations in global memory. It specifies the reordering within a
+// single warp's fragment
+using LayoutAtomQuant =
+    decltype(cutlass::compute_memory_reordering_atom<MmaType>());
+using LayoutB_Reordered = decltype(cute::tile_to_shape(
+    LayoutAtomQuant{}, Layout<Shape<int, int, Int<1>>, StrideB>{}));
+
+using ElementScale = cutlass::float_e4m3_t;
+using LayoutScale = cutlass::layout::RowMajor;
+
+// C/D matrix configuration
+using ElementC =
+    cutlass::bfloat16_t;  // Element type for C and D matrix operands
+using LayoutC =
+    cutlass::layout::RowMajor;  // Layout type for C and D matrix operands
+constexpr int AlignmentC =
+    128 / cutlass::sizeof_bits<
+              ElementC>::value;  // Memory access granularity/alignment of C
+                                 // matrix in units of elements (up to 16 bytes)
+
+// D matrix configuration
+using ElementD = ElementC;
+using LayoutD = LayoutC;
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+// Core kernel configurations
+using ElementAccumulator = float;     // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm90;  // Tag indicating the minimum SM that
+                                      // supports the intended feature
+using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
+using StageCountType =
+    cutlass::gemm::collective::StageCountAuto;  // Stage count maximized based
+                                                // on the tile size
+
+// per-channel and per-token scales for epilogue
+using ElementSChannel = float;
+
+template <class TileShape_MN, class ClusterShape_MNK, class KernelSchedule,
+          class EpilogueSchedule>
+struct W4A8GroupedGemmKernel {
+  using TileShape =
+      decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
+  using ClusterShape = ClusterShape_MNK;
+
+  // per-channel, per-token scales epilogue
+  using ChTokScalesEpilogue =
+      typename vllm::c3x::ScaledEpilogueArray<ElementAccumulator, ElementD,
+                                              TileShape>;
+  using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, TileShape, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementSChannel, ElementC,
+          typename cutlass::layout::LayoutTranspose<LayoutC>::type*, AlignmentC,
+          ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*,
+          AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  // =========================================================== MIXED INPUT
+  // WITH SCALES
+  // ===========================================================================
+  // The Scale information must get paired with the operand that will be scaled.
+  // In this example, B is scaled so we make a tuple of B's information and the
+  // scale information.
+  using CollectiveMainloopShuffled =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass,
+          cute::tuple<ElementB, cutlass::Array<ElementScale, 8>>,
+          LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
+          AlignmentA, ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
+      ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;
+
+  using GemmShuffled =
+      cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
+
+  using StrideC = typename GemmKernelShuffled::InternalStrideC;
+  using StrideD = typename GemmKernelShuffled::InternalStrideD;
+
+  using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
+  using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
+  using StrideS = typename CollectiveMainloopShuffled::StrideScale;
+  using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
+
+  // static asserts for passing in strides/layouts
+  // pack to 2x int64
+  static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
+  // pack to 3xint32,
+  static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
+                "LayoutB_Reordered size must be divisible by 4 bytes");
+
+  static void grouped_mm(
+      torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+      const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+      const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+      const int64_t b_group_size, const torch::Tensor& expert_offsets,
+      const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
+      const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+      const torch::Tensor& group_scale_strides) {
+    auto device = a_tensors.device();
+    auto device_id = device.index();
+    const at::cuda::OptionalCUDAGuard device_guard(device);
+    auto stream = at::cuda::getCurrentCUDAStream(device_id);
+
+    int num_experts = static_cast<int>(expert_offsets.size(0));
+    int n = static_cast<int>(b_tensors.size(1));
+    int k = static_cast<int>(b_tensors.size(2)) * PackFactor;
+
+    auto options_int =
+        torch::TensorOptions().dtype(torch::kInt64).device(device);
+    torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);
+
+    // get the correct offsets to pass to gemm
+    run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
+                              a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
+                              a_tensors, b_tensors, out_tensors, a_scales,
+                              b_scales, b_group_scales, b_group_size);
+
+    // construct args
+    using Args = typename GemmShuffled::Arguments;
+    using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
+    using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
+    Args arguments;
+
+    ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
+        static_cast<ProblemShape::UnderlyingProblemShape*>(
+            problem_sizes_torch.data_ptr());
+    ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
+
+    // SwapAB so B operands come first
+    MainloopArguments mainloop_arguments{
+        static_cast<const QuantType**>(b_ptrs.data_ptr()),
+        static_cast<LayoutB_Reordered*>(b_strides.data_ptr()),
+        static_cast<const MmaType**>(a_ptrs.data_ptr()),
+        static_cast<StrideA*>(a_strides.data_ptr()),
+        static_cast<const cutlass::Array<ElementScale, 8>**>(
+            b_group_scales_ptrs.data_ptr()),
+        static_cast<StrideS*>(group_scale_strides.data_ptr()),
+        static_cast<int>(b_group_size)};
+
+    EpilogueArguments epilogue_arguments{
+        // since we are doing SwapAB the channel scales comes first, then token
+        // scales
+        ChTokScalesEpilogue::prepare_args(  // see ScaledEpilogueArray
+            static_cast<const ElementAccumulator**>(
+                b_scales_ptrs.data_ptr()),  // per-channel
+            static_cast<const ElementAccumulator**>(
+                a_scales_ptrs.data_ptr()),  // per-token
+            true, true),
+        nullptr,                                       // C
+        static_cast<StrideC*>(c_strides.data_ptr()),   // C
+        static_cast<ElementD**>(out_ptrs.data_ptr()),  // D
+        static_cast<StrideC*>(c_strides.data_ptr())    // D
+    };
+
+    static const cutlass::KernelHardwareInfo hw_info{
+        device_id,
+        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+            device_id)};
+
+    arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
+                     mainloop_arguments, epilogue_arguments, hw_info};
+
+    // Allocate workspace
+    size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
+    torch::Tensor workspace =
+        torch::empty(workspace_size,
+                     torch::TensorOptions().dtype(torch::kU8).device(device));
+
+    // Run GEMM
+    GemmShuffled gemm;
+    CUTLASS_CHECK(gemm.can_implement(arguments));
+    CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+    CUTLASS_CHECK(gemm.run(stream));
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Kernel instantiations and dispatch logic
+// ----------------------------------------------------------------------------
+using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
+using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
+
+// Kernel_TileShape_ClusterShape_Schedule
+using Kernel_128x16_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_128x16_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x16_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x16_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x32_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x32_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x64_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x64_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x128_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x128_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_128x256_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+void mm_dispatch(
+    torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+    const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+    const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+    const int64_t b_group_size, const torch::Tensor& expert_offsets,
+    const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+    const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+    const torch::Tensor& group_scale_strides, const std::string& schedule) {
+  if (schedule == "Kernel_128x16_1x1x1_Coop") {
+    Kernel_128x16_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_128x16_2x1x1_Coop") {
+    Kernel_128x16_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x16_1x1x1_Coop") {
+    Kernel_256x16_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x16_2x1x1_Coop") {
+    Kernel_256x16_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x32_1x1x1_Coop") {
+    Kernel_256x32_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x32_2x1x1_Coop") {
+    Kernel_256x32_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x64_1x1x1_Coop") {
+    Kernel_256x64_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x64_2x1x1_Coop") {
+    Kernel_256x64_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x128_1x1x1_Coop") {
+    Kernel_256x128_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x128_2x1x1_Coop") {
+    Kernel_256x128_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_128x256_2x1x1_Coop") {
+    Kernel_128x256_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else {
+    TORCH_CHECK(false,
+                "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
+  }
+}
+
+void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+        const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+        const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+        const int64_t b_group_size, const torch::Tensor& expert_offsets,
+        const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+        const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+        const torch::Tensor& group_scale_strides,
+        std::optional<std::string> maybe_schedule) {
+  // user has specified a schedule
+  if (maybe_schedule) {
+    mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+                b_group_scales, b_group_size, expert_offsets, problem_sizes,
+                a_strides, b_strides, c_strides, group_scale_strides,
+                *maybe_schedule);
+    return;
+  }
+
+  // use heuristic
+  int m_full = a_tensors.size(0);
+  int n = b_tensors.size(1);
+  int k = b_tensors.size(2) * PackFactor;  // logical k
+  int num_experts = b_tensors.size(0);
+  // per-expert batch size assuming uniform distribution
+  int m_expert = m_full / num_experts;
+
+  std::string schedule;
+  if (m_expert <= 16) {
+    schedule = "Kernel_128x16_2x1x1_Coop";
+  } else if (m_expert <= 32) {
+    schedule = "Kernel_256x32_1x1x1_Coop";
+  } else if (m_expert <= 64) {
+    schedule = "Kernel_256x64_1x1x1_Coop";
+  } else if (m_expert <= 128) {
+    schedule = "Kernel_256x128_2x1x1_Coop";
+  } else {  // m_expert > 128
+    schedule = "Kernel_128x256_2x1x1_Coop";
+  }
+
+  mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+              b_group_scales, b_group_size, expert_offsets, problem_sizes,
+              a_strides, b_strides, c_strides, group_scale_strides, schedule);
+}
+
+std::tuple<torch::Tensor, torch::Tensor> encode_and_reorder_int4b(
+    torch::Tensor const& b_tensors) {
+  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);
+  TORCH_CHECK(b_tensors.dim() == 3);  // (experts, n, k)
+  TORCH_CHECK(b_tensors.is_contiguous());
+  TORCH_CHECK(b_tensors.is_cuda());
+
+  int n = static_cast<int>(b_tensors.size(1));
+  int k = static_cast<int>(b_tensors.size(2)) * PackFactor;  // logical k
+
+  // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
+  // These misalignments cause silent OOB unless run under Compute Sanitizer.
+  TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
+  TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");
+
+  // we will store the layout to an int32 tensor;
+  // this is the number of elements we need per layout
+  constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);
+
+  torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
+  int num_experts = static_cast<int>(b_tensors.size(0));
+
+  auto b_ptr = static_cast<QuantType const*>(b_tensors.const_data_ptr());
+  auto b_packed_ptr = static_cast<QuantType*>(b_tensors_packed.data_ptr());
+
+  // multiply by ull so result does not overflow int32
+  size_t num_int4_elems = 1ull * num_experts * n * k;
+  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
+                                                           num_int4_elems);
+  TORCH_CHECK(ok, "unified_encode_int4b failed");
+
+  // construct the layout once; assumes each expert has the same layout
+  using LayoutType = LayoutB_Reordered;
+  std::vector<LayoutType> layout_B_reordered_host(num_experts);
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
+  auto shape_B = cute::make_shape(n, k, Int<1>{});
+  auto layout_B = make_layout(shape_B, stride_B);
+  LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);
+
+  // reorder weights for each expert
+  for (int i = 0; i < num_experts; i++) {
+    // since the storage type of int4b is 1 byte but one element is 4 bits
+    // we need to adjust the offset
+    int64_t offset =
+        1ull * i * n * k * cutlass::sizeof_bits<QuantType>::value / 8;
+    cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
+                            layout_B_reordered);
+  }
+
+  // save the packed layout to torch tensor so we can re-use it
+  auto cpu_opts =
+      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
+  torch::Tensor layout_cpu =
+      torch::empty({num_experts, layout_width}, cpu_opts);
+
+  int32_t* layout_data = layout_cpu.data_ptr<int32_t>();
+  for (int i = 0; i < num_experts; ++i) {
+    std::memcpy(layout_data + i * layout_width,  // dst (int32*)
+                &layout_B_reordered,             // src (LayoutType*)
+                sizeof(LayoutType));             // number of bytes
+  }
+
+  torch::Tensor packed_layout =
+      layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);
+
+  return {b_tensors_packed, packed_layout};
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_w4a8_moe_mm", &mm);
+  m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
+}
+
+}  // namespace vllm::cutlass_w4a8_moe
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@@ -0,0 +1,430 @@
+//
+// Based off of:
+//   https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu
+//
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+#include "cutlass_extensions/torch_utils.hpp"
+#include "w4a8_utils.cuh"
+
+#include "core/registration.h"
+
+#include "cutlass/cutlass.h"
+#include <limits>
+
+#include "cute/tensor.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/mixed_dtype_utils.hpp"
+
+#include "cutlass_extensions/common.hpp"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+#include <cuda_runtime.h>
+
+namespace vllm::cutlass_w4a8 {
+
+using namespace cute;
+
+// -------------------------------------------------------------------------------------
+// Static configuration shared across all instantiations
+// -------------------------------------------------------------------------------------
+using MmaType = cutlass::float_e4m3_t;  // A/scale element type
+using QuantType = cutlass::int4b_t;     // B element type (packed int4)
+
+static int constexpr TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
+static int constexpr ScalePackSize = 8;  // pack 8 scale elements together
+static int constexpr PackFactor = 8;     // 8 4-bit packed into int32
+
+// A matrix configuration
+using ElementA = MmaType;                   // Element type for A matrix operand
+using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
+using LayoutA_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+constexpr int AlignmentA =
+    128 / cutlass::sizeof_bits<
+              ElementA>::value;  // Memory access granularity/alignment of A
+                                 // matrix in units of elements (up to 16 bytes)
+using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
+
+// B matrix configuration
+using ElementB = QuantType;  // Element type for B matrix operand
+using LayoutB =
+    cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
+using LayoutB_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+constexpr int AlignmentB =
+    128 / cutlass::sizeof_bits<
+              ElementB>::value;  // Memory access granularity/alignment of B
+                                 // matrix in units of elements (up to 16 bytes)
+using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
+
+// Define the CuTe layout for reordered quantized tensor B
+// LayoutAtomQuant places values that will be read by the same thread in
+// contiguous locations in global memory. It specifies the reordering within a
+// single warp's fragment
+using LayoutAtomQuant =
+    decltype(cutlass::compute_memory_reordering_atom<MmaType>());
+using LayoutB_Reordered = decltype(cute::tile_to_shape(
+    LayoutAtomQuant{}, Layout<Shape<int, int, int>, StrideB>{}));
+
+// Group-wise scales
+using ElementScale = MmaType;
+using LayoutScale = cutlass::layout::RowMajor;
+
+// Per-tok, per-chan scales
+using ElementSChannel = float;
+
+// C/D matrix configuration
+using ElementC =
+    cutlass::bfloat16_t;  // Element type for C and D matrix operands
+using LayoutC =
+    cutlass::layout::RowMajor;  // Layout type for C and D matrix operands
+constexpr int AlignmentC =
+    128 / cutlass::sizeof_bits<
+              ElementC>::value;  // Memory access granularity/alignment of C
+                                 // matrix in units of elements (up to 16 bytes)
+
+using ElementD = ElementC;
+using LayoutD = LayoutC;
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+// Core kernel configurations
+using ElementAccumulator = float;     // Element type for internal accumulation
+using ElementCompute = float;         // Element type for epilogue computation
+using ArchTag = cutlass::arch::Sm90;  // Tag indicating the minimum SM that
+                                      // supports the intended feature
+using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
+using KernelSchedule =
+    cutlass::gemm::KernelTmaWarpSpecializedCooperative;  // Kernel to launch
+                                                         // based on the default
+                                                         // setting in the
+                                                         // Collective Builder
+using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
+
+// ----------------------------------------------------------------------------
+// Kernel template — Tile/Cluster shapes
+// ----------------------------------------------------------------------------
+template <class TileShape_MN, class ClusterShape_MNK>
+struct W4A8GemmKernel {
+  using TileShape =
+      decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
+  using ClusterShape = ClusterShape_MNK;
+
+  // Epilogue per-tok, per-chan scales
+  using ChTokScalesEpilogue =
+      typename vllm::c3x::ScaledEpilogue<ElementAccumulator, ElementD,
+                                         TileShape>;
+  using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType,
+          ElementAccumulator, ElementSChannel,
+          // Transpose layout of D here since we use explicit swap + transpose
+          // the void type for C tells the builder to allocate 0 smem for the C
+          // matrix. We can enable this if beta == 0 by changing ElementC to
+          // void below.
+          ElementC, typename cutlass::layout::LayoutTranspose<LayoutC>::type,
+          AlignmentC, ElementD,
+          typename cutlass::layout::LayoutTranspose<LayoutD>::type, AlignmentD,
+          EpilogueSchedule,  // This is the only epi supporting the required
+                             // swap + transpose.
+          EVTCompute>::CollectiveOp;
+
+  // The Scale information must get paired with the operand that will be scaled.
+  // In this example, B is scaled so we make a tuple of B's information and the
+  // scale information.
+  using CollectiveMainloopShuffled =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass,
+          cute::tuple<ElementB, cutlass::Array<ElementScale, ScalePackSize>>,
+          LayoutB_Reordered, AlignmentB, ElementA, LayoutA_Transpose,
+          AlignmentA, ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,  // Indicates ProblemShape
+      CollectiveMainloopShuffled, CollectiveEpilogue>;
+  using GemmShuffled =
+      cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
+
+  using StrideC = typename GemmKernelShuffled::StrideC;
+  using StrideD = typename GemmKernelShuffled::StrideD;
+  using StrideS = typename CollectiveMainloopShuffled::StrideScale;
+
+  static torch::Tensor mm(torch::Tensor const& A,
+                          torch::Tensor const& B,             // already packed
+                          torch::Tensor const& group_scales,  // already packed
+                          int64_t group_size,
+                          torch::Tensor const& channel_scales,
+                          torch::Tensor const& token_scales,
+                          std::optional<at::ScalarType> const& maybe_out_type) {
+    // TODO: param validation
+    int m = A.size(0);
+    int k = A.size(1);
+    int n = B.size(1);
+
+    // safely cast group_size to int
+    TORCH_CHECK(group_size > 0 && group_size <= std::numeric_limits<int>::max(),
+                "group_size out of supported range for int: ", group_size);
+    int const group_size_int = static_cast<int>(group_size);
+
+    // Allocate output
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+    auto device = A.device();
+    auto stream = at::cuda::getCurrentCUDAStream(device.index());
+    torch::Tensor D =
+        torch::empty({m, n}, torch::TensorOptions()
+                                 .dtype(equivalent_scalar_type_v<ElementD>)
+                                 .device(device));
+    // prepare arg pointers
+    auto A_ptr = static_cast<MmaType const*>(A.const_data_ptr());
+    auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
+    auto D_ptr = static_cast<ElementD*>(D.data_ptr());
+    // can we avoid hardcode the 8 here
+    auto S_ptr =
+        static_cast<cutlass::Array<ElementScale, ScalePackSize> const*>(
+            group_scales.const_data_ptr());
+
+    // runtime layout for B
+    auto shape_B = cute::make_shape(n, k, 1);
+    LayoutB_Reordered layout_B_reordered =
+        cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
+
+    // strides
+    int const scale_k = cutlass::ceil_div(k, group_size_int);
+    StrideA stride_A =
+        cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+    // Reverse stride here due to swap and transpose
+    StrideD stride_D =
+        cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(n, m, 1));
+    StrideS stride_S = cutlass::make_cute_packed_stride(
+        StrideS{}, cute::make_shape(n, scale_k, 1));
+
+    // Create a structure of gemm kernel arguments suitable for invoking an
+    // instance of Gemm auto arguments =
+    // args_from_options<GemmShuffled>(options);
+    /// Populates a Gemm::Arguments structure from the given arguments
+    /// Swap the A and B tensors, as well as problem shapes here.
+    using Args = typename GemmShuffled::Arguments;
+    using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
+    using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
+
+    MainloopArguments mainloop_arguments{
+        B_ptr, layout_B_reordered, A_ptr,         stride_A,
+        S_ptr, stride_S,           group_size_int};
+
+    EpilogueArguments epilogue_arguments{
+        ChTokScalesEpilogue::prepare_args(channel_scales, token_scales),
+        nullptr,
+        {},  // no C
+        D_ptr,
+        stride_D};
+
+    Args arguments{cutlass::gemm::GemmUniversalMode::kGemm,
+                   {n, m, k, 1},  // shape
+                   mainloop_arguments,
+                   epilogue_arguments};
+
+    // Workspace
+    size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
+    torch::Tensor workspace =
+        torch::empty(workspace_size,
+                     torch::TensorOptions().dtype(torch::kU8).device(device));
+
+    // Run GEMM
+    GemmShuffled gemm;
+    CUTLASS_CHECK(gemm.can_implement(arguments));
+    CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+    CUTLASS_CHECK(gemm.run(stream));
+
+    return D;
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Kernel instantiations and dispatch logic
+// ----------------------------------------------------------------------------
+using Kernel_256x128_1x1x1 =
+    W4A8GemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>>;
+using Kernel_256x64_1x1x1 = W4A8GemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>>;
+using Kernel_256x32_1x1x1 = W4A8GemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>>;
+using Kernel_256x16_1x1x1 = W4A8GemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>>;
+using Kernel_128x256_2x1x1 =
+    W4A8GemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>>;
+using Kernel_128x256_1x1x1 =
+    W4A8GemmKernel<Shape<_128, _256>, Shape<_1, _1, _1>>;
+using Kernel_128x128_1x1x1 =
+    W4A8GemmKernel<Shape<_128, _128>, Shape<_1, _1, _1>>;
+using Kernel_128x64_1x1x1 = W4A8GemmKernel<Shape<_128, _64>, Shape<_1, _1, _1>>;
+using Kernel_128x32_1x1x1 = W4A8GemmKernel<Shape<_128, _32>, Shape<_1, _1, _1>>;
+using Kernel_128x16_1x1x1 = W4A8GemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>>;
+
+torch::Tensor mm_dispatch(torch::Tensor const& A,
+                          torch::Tensor const& B,             // already packed
+                          torch::Tensor const& group_scales,  // already packed
+                          int64_t group_size,
+                          torch::Tensor const& channel_scales,
+                          torch::Tensor const& token_scales,
+                          std::optional<at::ScalarType> const& maybe_out_type,
+                          const std::string& schedule) {
+  if (schedule == "256x128_1x1x1") {
+    return Kernel_256x128_1x1x1::mm(A, B, group_scales, group_size,
+                                    channel_scales, token_scales,
+                                    maybe_out_type);
+  } else if (schedule == "256x64_1x1x1") {
+    return Kernel_256x64_1x1x1::mm(A, B, group_scales, group_size,
+                                   channel_scales, token_scales,
+                                   maybe_out_type);
+  } else if (schedule == "256x32_1x1x1") {
+    return Kernel_256x32_1x1x1::mm(A, B, group_scales, group_size,
+                                   channel_scales, token_scales,
+                                   maybe_out_type);
+  } else if (schedule == "256x16_1x1x1") {
+    return Kernel_256x16_1x1x1::mm(A, B, group_scales, group_size,
+                                   channel_scales, token_scales,
+                                   maybe_out_type);
+  } else if (schedule == "128x256_2x1x1") {
+    return Kernel_128x256_2x1x1::mm(A, B, group_scales, group_size,
+                                    channel_scales, token_scales,
+                                    maybe_out_type);
+  } else if (schedule == "128x256_1x1x1") {
+    return Kernel_128x256_1x1x1::mm(A, B, group_scales, group_size,
+                                    channel_scales, token_scales,
+                                    maybe_out_type);
+  } else if (schedule == "128x128_1x1x1") {
+    return Kernel_128x128_1x1x1::mm(A, B, group_scales, group_size,
+                                    channel_scales, token_scales,
+                                    maybe_out_type);
+  } else if (schedule == "128x64_1x1x1") {
+    return Kernel_128x64_1x1x1::mm(A, B, group_scales, group_size,
+                                   channel_scales, token_scales,
+                                   maybe_out_type);
+  } else if (schedule == "128x32_1x1x1") {
+    return Kernel_128x32_1x1x1::mm(A, B, group_scales, group_size,
+                                   channel_scales, token_scales,
+                                   maybe_out_type);
+  } else if (schedule == "128x16_1x1x1") {
+    return Kernel_128x16_1x1x1::mm(A, B, group_scales, group_size,
+                                   channel_scales, token_scales,
+                                   maybe_out_type);
+  }
+  TORCH_CHECK(false, "Unknown W4A8 schedule: ", schedule);
+  return {};
+}
+
+torch::Tensor mm(torch::Tensor const& A,
+                 torch::Tensor const& B,             // already packed
+                 torch::Tensor const& group_scales,  // already packed
+                 int64_t group_size, torch::Tensor const& channel_scales,
+                 torch::Tensor const& token_scales,
+                 std::optional<at::ScalarType> const& maybe_out_type,
+                 std::optional<std::string> maybe_schedule) {
+  // requested a specific schedule
+  if (maybe_schedule) {
+    return mm_dispatch(A, B, group_scales, group_size, channel_scales,
+                       token_scales, maybe_out_type, *maybe_schedule);
+  }
+  std::string schedule;
+  int M = A.size(0);
+  int K = A.size(1);
+  int N = B.size(1);
+  // heuristic
+  if (M <= 16) {
+    schedule = (K == 16384 && N == 18432) ? "256x16_1x1x1" : "128x16_1x1x1";
+  } else if (M <= 32) {
+    schedule = (K == 16384 && N == 18432) ? "256x32_1x1x1" : "128x32_1x1x1";
+  } else if (M <= 64) {
+    if (K == 16384 && N == 18432)
+      schedule = "256x64_1x1x1";
+    else if (N <= 8192 && K <= 8192)
+      schedule = "128x32_1x1x1";
+    else
+      schedule = "128x64_1x1x1";
+  } else if (M <= 128) {
+    if (K == 16384 && N == 18432)
+      schedule = "256x128_1x1x1";
+    else if (N <= 8192)
+      schedule = "128x64_1x1x1";
+    else
+      schedule = "128x128_1x1x1";
+  } else if (M <= 256) {
+    if (N <= 4096)
+      schedule = "128x64_1x1x1";
+    else if (N <= 8192)
+      schedule = "128x128_1x1x1";
+    else
+      schedule = "128x256_1x1x1";
+  } else if (M <= 512 && N <= 4096) {
+    schedule = "128x128_1x1x1";
+  } else if (M <= 1024) {
+    schedule = "128x256_1x1x1";
+  } else {
+    schedule = "128x256_2x1x1";
+  }
+  return mm_dispatch(A, B, group_scales, group_size, channel_scales,
+                     token_scales, maybe_out_type, schedule);
+}
+
+// ----------------------------------------------------------------------------
+// Pre-processing utils
+// ----------------------------------------------------------------------------
+torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
+  TORCH_CHECK(scales.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(scales.is_contiguous());
+  TORCH_CHECK(scales.is_cuda());
+
+  auto packed_scales = torch::empty(
+      {scales.numel() * ScalePackSize},
+      torch::TensorOptions().dtype(scales.dtype()).device(scales.device()));
+  auto scales_ptr = static_cast<MmaType const*>(scales.const_data_ptr());
+  auto packed_scales_ptr =
+      static_cast<cutlass::Array<ElementScale, ScalePackSize>*>(
+          packed_scales.data_ptr());
+
+  cutlass::pack_scale_fp8(scales_ptr, packed_scales_ptr, scales.numel());
+
+  return packed_scales;
+}
+
+torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
+  TORCH_CHECK(B.dtype() == torch::kInt32);
+  TORCH_CHECK(B.dim() == 2);
+
+  torch::Tensor B_packed = torch::empty_like(B);
+
+  int k = B.size(0) * PackFactor;  // logical k
+  int n = B.size(1);
+  TORCH_CHECK((n * k) % 32 == 0, "need multiples of 32 int4s for 16B chunks");
+
+  auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
+  auto B_packed_ptr = static_cast<QuantType*>(B_packed.data_ptr());
+  auto shape_B = cute::make_shape(n, k, 1);
+  auto layout_B = make_layout(shape_B, LayoutRight{});  // row major
+  LayoutB_Reordered layout_B_reordered =
+      cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
+
+  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr,
+                                                           n * k);
+  TORCH_CHECK(ok, "unified_encode_int4b failed");
+  cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);
+
+  return B_packed;
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_w4a8_mm", &mm);
+  m.impl("cutlass_pack_scale_fp8", &pack_scale_fp8);
+  m.impl("cutlass_encode_and_reorder_int4b", &encode_and_reorder_int4b);
+}
+
+}  // namespace vllm::cutlass_w4a8
--- a/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
@@ -0,0 +1,90 @@
+#include "w4a8_utils.cuh"
+
+#include <array>
+#include <cuda_runtime.h>
+#include <cstdio>
+
+namespace vllm::cutlass_w4a8_utils {
+
+/*
+  GPU-accelerated implementation of cutlass::unified_encode_int4b.
+  Constructs a lookup table in constant memory to map 8 bits
+  (two 4-bit values) at a time. Assumes memory is contiguous
+  and pointers are 16-byte aligned.
+*/
+__constant__ uint8_t kNibbleLUT[256];
+
+__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
+                                            size_t nbytes) {
+  constexpr size_t V = sizeof(uint4);  // 16 bytes
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
+  const size_t nvec = nbytes / V;
+
+  // 1-D grid-stride loop over 16-byte chunks
+  for (size_t vec = tid; vec < nvec; vec += nthreads) {
+    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
+    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
+#pragma unroll
+    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
+    reinterpret_cast<uint4*>(out)[vec] = v;
+  }
+}
+
+static bool upload_lut() {
+  std::array<uint8_t, 256> lut{};
+  auto map_nib = [](uint8_t v) -> uint8_t {
+    // 1..7 -> (8 - v); keep 0 and 8..15
+    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
+  };
+  for (int b = 0; b < 256; ++b) {
+    uint8_t lo = b & 0xF;
+    uint8_t hi = (b >> 4) & 0xF;
+    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
+  }
+  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
+                                     /*offset=*/0, cudaMemcpyHostToDevice);
+
+  return (e == cudaSuccess);
+}
+
+bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
+                          size_t num_int4_elems) {
+  // Build/upload LUT
+  if (!upload_lut()) return false;
+
+  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
+                "int4 storage must be 1 byte");
+  const size_t nbytes = num_int4_elems >> 1;
+
+  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
+  auto* out_bytes = reinterpret_cast<uint8_t*>(out);
+
+  // kernel launch params
+  constexpr int block = 256;
+  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
+  int grid = int((nvec + block - 1) / block);
+  if (grid == 0) grid = 1;  // ensure we still cover the tail in the kernel
+
+  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
+
+  // launch errors
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("unified_encode_int4b_device launch error: %s (%d)\n",
+           cudaGetErrorString(err), err);
+    return false;
+  }
+
+  // runtime errors
+  err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    printf("unified_encode_int4b_device runtime error: %s (%d)\n",
+           cudaGetErrorString(err), err);
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace vllm::cutlass_w4a8_utils
--- a/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstddef>
+#include "cutlass/numeric_types.h"
+
+namespace vllm::cutlass_w4a8_utils {
+
+bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
+                          size_t num_int4_elems);
+
+}  // namespace vllm::cutlass_w4a8_utils
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cuda_fp8.h>
+#include "dispatch_utils.h"
+
+#include "cuda_utils.h"
+#include "launch_bounds_utils.h"
+#include "nvfp4_utils.cuh"
+
+namespace vllm {
+
+// silu in float32
+__device__ __forceinline__ float silu(float x) {
+  return __fdividef(x, (1.f + __expf(-x)));
+}
+
+__device__ __forceinline__ float2 silu2(float2 x) {
+  return make_float2(silu(x.x), silu(x.y));
+}
+
+template <class Type>
+__inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
+                                                       PackedVec<Type>& vec2) {
+  PackedVec<Type> result;
+  using packed_type = typename TypeConverter<Type>::Type;
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
+    // silu_mul in float32
+    if constexpr (std::is_same_v<Type, half>) {
+      float2 silu_vec = silu2(__half22float2(vec.elts[i]));
+      result.elts[i] =
+          __float22half2_rn(__fmul2_rn(silu_vec, __half22float2(vec2.elts[i])));
+    } else {
+      float2 silu_vec = silu2(__bfloat1622float2(vec.elts[i]));
+      result.elts[i] = __float22bfloat162_rn(
+          __fmul2_rn(silu_vec, __bfloat1622float2(vec2.elts[i])));
+    }
+  }
+  return result;
+}
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false>
+__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
+    silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                             float const* SFScale, uint32_t* out,
+                             uint32_t* SFout) {
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
+      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+                "Vec size is not matched.");
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is
+  // (448.f / (Alpha_A / 6.f)).
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+
+  // Input tensor row/col loops.
+  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+         colIdx += blockDim.x) {
+      int64_t inOffset =
+          rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) +
+                          numCols / CVT_FP4_ELTS_PER_THREAD + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];
+
+      // Get the output tensor offset.
+      // Same as inOffset because 8 elements are packed into one uint32_t.
+      int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      auto& out_pos = out[outOffset];
+
+      // Compute silu and mul
+      PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
+
+      auto sf_out =
+          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
+                                             CVT_FP4_NUM_THREADS_PER_SF>(
+              rowIdx, colIdx, numCols, SFout);
+
+      out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
+                                                     sf_out);
+    }
+  }
+}
+
+}  // namespace vllm
+
+void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
+                                     torch::Tensor& output_sf,
+                                     torch::Tensor& input,  // [..., 2 * d]
+                                     torch::Tensor& input_sf) {
+  int32_t m = input.size(0);
+  int32_t n = input.size(1) / 2;
+
+  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
+                  input.scalar_type() == at::ScalarType::BFloat16,
+              "Unsupported input data type for quantize_to_fp4.");
+
+  int multiProcessorCount =
+      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
+
+  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
+  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
+  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
+  int const numBlocksPerSM =
+      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
+  VLLM_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
+        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+        auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+        vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
+            m, n, input_ptr, input_sf_ptr,
+            reinterpret_cast<uint32_t*>(output_ptr),
+            reinterpret_cast<uint32_t*>(sf_out));
+      });
+}
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "core/registration.h"
+
+#include <torch/all.h>
+#include <cutlass/arch/arch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include "cutlass_extensions/common.hpp"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include <cassert>
+
+using namespace cute;
+
+template <typename ElementAB, typename ElementC, typename ElementSF,
+          typename ElementAccumulator, typename LayoutSFA, typename LayoutSFB,
+          typename ScaleConfig>
+__global__ void __get_group_gemm_starts(
+    ElementAB** a_offsets, ElementAB** b_offsets, ElementC** out_offsets,
+    ElementSF** a_scales_offsets, ElementSF** b_scales_offsets,
+    ElementAccumulator** alpha_offsets, LayoutSFA* layout_sfa_base_as_int,
+    LayoutSFB* layout_sfb_base_as_int, ElementAB* a_base_as_int,
+    ElementAB* b_base_as_int, ElementC* out_base_as_int,
+    ElementSF* a_scales_base_as_int, ElementSF* b_scales_base_as_int,
+    ElementAccumulator* alphas_base_as_int, const int32_t* expert_offsets,
+    const int32_t* sf_offsets, const int32_t* problem_sizes_as_shapes,
+    const int K, const int N) {
+  int64_t expert_id = threadIdx.x;
+  if (expert_id >= gridDim.x * blockDim.x) {
+    return;
+  }
+  // Originally int32_t but upcasting to int64_t to avoid overflow
+  // during offset calculations
+  int64_t expert_offset = static_cast<int64_t>(expert_offsets[expert_id]);
+  int64_t sf_offset = static_cast<int64_t>(sf_offsets[expert_id]);
+  // size for block in block scale.
+  int64_t group_size = 16;
+  int64_t m = static_cast<int64_t>(problem_sizes_as_shapes[expert_id * 3]);
+  int64_t n = static_cast<int64_t>(problem_sizes_as_shapes[expert_id * 3 + 1]);
+  int64_t k = static_cast<int64_t>(problem_sizes_as_shapes[expert_id * 3 + 2]);
+  assert((m >= 0 && n == N && k == K && k % 2 == 0) &&
+         "unexpected problem sizes");
+
+  int64_t half_k = static_cast<int64_t>(k / 2);
+  int64_t group_k = static_cast<int64_t>(k / group_size);
+  // Shape of A as uint8/byte = [M, K // 2]
+  // Shape of B as uint8/byte = [E, N, K // 2]
+  a_offsets[expert_id] = a_base_as_int + expert_offset * half_k;
+
+  b_offsets[expert_id] = b_base_as_int + expert_id * n * half_k;
+  // Shape of C = [M, N]
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  // Shape of a_scale = [sum(sf_sizes), K // group_size]
+  a_scales_offsets[expert_id] = a_scales_base_as_int + sf_offset * group_k;
+
+  assert((reinterpret_cast<uintptr_t>(a_scales_offsets[expert_id]) % 128) ==
+             0 &&
+         "TMA requires 128-byte alignment");
+
+  // Shape of B scale = [E, N, K // group_size]
+  b_scales_offsets[expert_id] = b_scales_base_as_int + expert_id * n * group_k;
+  assert((reinterpret_cast<uintptr_t>(b_scales_offsets[expert_id]) % 128) ==
+             0 &&
+         "TMA requires 128-byte alignment");
+  // Shape of alpha = [E]
+  alpha_offsets[expert_id] = alphas_base_as_int + expert_id;
+
+  LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
+  LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
+
+  *layout_sfa_ptr = ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(
+      static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), 1));
+  *layout_sfb_ptr = ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(
+      static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), 1));
+}
+
+#define __CALL_GET_STARTS_KERNEL_BLOCKSCALE(ELEMENT_AB_TYPE, SF_TYPE,         \
+                                            TENSOR_C_TYPE, C_TYPE, LayoutSFA, \
+                                            LayoutSFB, ScaleConfig)           \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                            \
+    __get_group_gemm_starts<ELEMENT_AB_TYPE, C_TYPE, SF_TYPE, float,          \
+                            LayoutSFA, LayoutSFB, ScaleConfig>                \
+        <<<1, num_experts, 0, stream>>>(                                      \
+            static_cast<ELEMENT_AB_TYPE**>(a_starts.data_ptr()),              \
+            static_cast<ELEMENT_AB_TYPE**>(b_starts.data_ptr()),              \
+            static_cast<C_TYPE**>(out_starts.data_ptr()),                     \
+            static_cast<SF_TYPE**>(a_scales_starts.data_ptr()),               \
+            static_cast<SF_TYPE**>(b_scales_starts.data_ptr()),               \
+            static_cast<float**>(alpha_starts.data_ptr()),                    \
+            reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),              \
+            reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()),              \
+            static_cast<ELEMENT_AB_TYPE*>(a_tensors.data_ptr()),              \
+            static_cast<ELEMENT_AB_TYPE*>(b_tensors.data_ptr()),              \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                     \
+            static_cast<SF_TYPE*>(a_scales.data_ptr()),                       \
+            static_cast<SF_TYPE*>(b_scales.data_ptr()),                       \
+            static_cast<float*>(alphas.data_ptr()),                           \
+            static_cast<int32_t*>(expert_offsets.data_ptr()),                 \
+            static_cast<int32_t*>(sf_offsets.data_ptr()),                     \
+            static_cast<int32_t*>(problem_sizes.data_ptr()), K, N);           \
+  }
+
+template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
+void run_get_group_gemm_starts(
+    const torch::Tensor& a_starts, const torch::Tensor& b_starts,
+    const torch::Tensor& out_starts, const torch::Tensor& a_scales_starts,
+    const torch::Tensor& b_scales_starts, const torch::Tensor& alpha_starts,
+    const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
+    /*these are used for their base addresses*/
+    torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
+    torch::Tensor const& out_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& alphas,
+    torch::Tensor const& expert_offsets, torch::Tensor const& sf_offsets,
+    torch::Tensor const& problem_sizes, int M, int N, int K) {
+  int num_experts = (int)expert_offsets.size(0);
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  TORCH_CHECK(out_tensors.size(1) == N,
+              "Output tensor shape doesn't match expected shape");
+  TORCH_CHECK(K / 2 == b_tensors.size(2),
+              "b_tensors(dim = 2) and a_tensors(dim = 1) trailing"
+              " dimension must match");
+  if (false) {
+  }
+  //(ELEMENT_AB_TYPE, BS_TYPE, TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB,
+  // ScaleConfig)
+  __CALL_GET_STARTS_KERNEL_BLOCKSCALE(
+      cutlass::float_e2m1_t, cutlass::float_ue4m3_t, torch::kBFloat16,
+      cutlass::bfloat16_t, LayoutSFA, LayoutSFB, ScaleConfig)
+  __CALL_GET_STARTS_KERNEL_BLOCKSCALE(cutlass::float_e2m1_t,
+                                      cutlass::float_ue4m3_t, torch::kFloat16,
+                                      half, LayoutSFA, LayoutSFB, ScaleConfig)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+
+template <typename OutType>
+void run_fp4_blockwise_scaled_group_mm_sm100(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
+    int N, int K) {
+  using ProblemShape =
+      cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
+  using ElementType = cutlass::float_e2m1_t;
+  using ElementSFType = cutlass::float_ue4m3_t;
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+
+  using ElementC = OutType;
+  using ElementD = ElementC;
+  using ElementAccumulator = float;
+  // Layout definitions
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = LayoutC;
+
+  // Alignment constraints
+  static constexpr int AlignmentA = 32;
+  static constexpr int AlignmentB = 32;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Architecture definitions
+  using ArchTag = cutlass::arch::Sm100;
+  using EpilogueOperatorClass =
+      cutlass::arch::OpClassTensorOp;  // Epilogue Operator class tag
+  using MainloopOperatorClass =
+      cutlass::arch::OpClassBlockScaledTensorOp;  // Mainloop Operator class tag
+  using StageCountType =
+      cutlass::gemm::collective::StageCountAuto;  // Stage count maximized based
+                                                  // on the tile size
+
+  using ClusterShape = Shape<_1, _1, _1>;
+  struct MMA1SMConfig {
+    using MmaTileShape = Shape<_128, _128, _128>;
+    using KernelSchedule = cutlass::gemm::
+        KernelPtrArrayTmaWarpSpecialized1SmNvf4Sm100;  // Kernel to launch
+    using EpilogueSchedule =
+        cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;  // Epilogue to launch
+  };
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, EpilogueOperatorClass, typename MMA1SMConfig::MmaTileShape,
+          ClusterShape, Shape<_128, _64>, ElementAccumulator,
+          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
+          LayoutC*, AlignmentD,
+          typename MMA1SMConfig::EpilogueSchedule>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, MainloopOperatorClass, ElementA, LayoutA*, AlignmentA,
+          ElementB, LayoutB*, AlignmentB, ElementAccumulator,
+          typename MMA1SMConfig::MmaTileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          typename MMA1SMConfig::KernelSchedule>::CollectiveOp;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
+                                           CollectiveEpilogue>;
+
+  using Gemm1SM = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using Gemm = Gemm1SM;
+  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
+  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
+  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
+  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
+
+  using LayoutSFA =
+      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
+  using LayoutSFB =
+      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
+  using ScaleConfig =
+      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
+
+  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
+  torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
+  torch::Tensor c_strides1 =
+      torch::full({num_experts}, output.stride(0), options_int);
+  torch::Tensor a_strides1 =
+      torch::full({num_experts}, a.stride(0) * 2, options_int);
+  torch::Tensor b_strides1 =
+      torch::full({num_experts}, b.stride(1) * 2, options_int);
+
+  run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
+      a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
+      layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
+      expert_offsets, sf_offsets, problem_sizes, M, N, K);
+
+  // Create an instance of the GEMM
+  Gemm gemm_op;
+
+  // Initialize problem_sizes_as_shapes correctly
+  UnderlyingProblemShape* problem_sizes_as_shapes =
+      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
+
+  // Set the Scheduler info
+  cutlass::KernelHardwareInfo hw_info;
+  using RasterOrderOptions = typename cutlass::gemm::kernel::detail::
+      PersistentTileSchedulerSm100GroupParams<
+          typename ProblemShape::UnderlyingProblemShape>::RasterOrderOptions;
+  typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
+  scheduler.raster_order = RasterOrderOptions::AlongM;
+  hw_info.device_id = a.get_device();
+  static std::unordered_map<int, int> cached_sm_counts;
+  if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
+    cached_sm_counts[hw_info.device_id] =
+        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+            hw_info.device_id);
+  }
+  hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
+
+  // Mainloop Arguments
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementType**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(a_strides1.data_ptr()),
+      static_cast<const ElementType**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(b_strides1.data_ptr()),
+      static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
+      reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
+      static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
+      reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
+
+  // Epilogue Arguments
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {},  // epilogue.thread
+      nullptr,
+      static_cast<StrideC*>(c_strides1.data_ptr()),
+      static_cast<ElementD**>(out_ptrs.data_ptr()),
+      static_cast<StrideC*>(c_strides1.data_ptr())};
+  auto& fusion_args = epilogue_args.thread;
+  fusion_args.alpha_ptr_array =
+      reinterpret_cast<float**>(alpha_ptrs.data_ptr());
+  fusion_args.dAlpha = {_0{}, _0{}, 1};
+
+  // Gemm Arguments
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, problem_sizes_as_shapes, nullptr},
+      mainloop_args,
+      epilogue_args,
+      hw_info,
+      scheduler};
+
+  size_t workspace_size = Gemm::get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  auto can_implement_status = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
+              "Failed to implement GEMM: status=", (int)can_implement_status);
+
+  // Run the GEMM
+  auto status = gemm_op.initialize(args, workspace.data_ptr());
+  TORCH_CHECK(status == cutlass::Status::kSuccess,
+              "Failed to initialize GEMM: status=", (int)status,
+              " workspace_size=", workspace_size, " num_experts=", num_experts,
+              " M=", M, " N=", N, " K=", K);
+
+  status = gemm_op.run(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
+}
+
+void run_fp4_blockwise_scaled_group_mm_sm120(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
+    int N, int K) {
+  using ProblemShape =
+      cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
+  using ElementType = cutlass::float_e2m1_t;
+  using ElementSFType = cutlass::float_ue4m3_t;
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+
+  // NOTE: For SM120 it seems templating the output type is not supported and
+  // we need to hardcode the output type to bfloat16
+  using ElementC = cutlass::bfloat16_t;
+  using ElementD = ElementC;
+  using ElementAccumulator = float;
+  // Layout definitions
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = LayoutC;
+
+  // Alignment constraints
+  static constexpr int AlignmentA = 32;
+  static constexpr int AlignmentB = 32;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Architecture definitions
+  using ArchTag = cutlass::arch::Sm120;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+
+  using ClusterShape = Shape<_1, _1, _1>;
+  using MmaTileShape = Shape<_128, _128, _128>;
+
+  using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
+      ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, MmaTileShape, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
+          LayoutD*, AlignmentD,
+          cutlass::epilogue::collective::EpilogueScheduleAuto,
+          FusionOperation>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
+          LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
+                                           CollectiveEpilogue>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
+  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
+  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
+  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
+
+  using LayoutSFA =
+      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
+  using LayoutSFB =
+      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
+  using ScaleConfig =
+      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
+
+  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
+  torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
+  torch::Tensor c_strides1 =
+      torch::full({num_experts}, output.stride(0), options_int);
+  torch::Tensor a_strides1 =
+      torch::full({num_experts}, a.stride(0) * 2, options_int);
+  torch::Tensor b_strides1 =
+      torch::full({num_experts}, b.stride(1) * 2, options_int);
+
+  run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
+      a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
+      layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
+      expert_offsets, sf_offsets, problem_sizes, M, N, K);
+
+  // Create an instance of the GEMM
+  Gemm gemm_op;
+
+  // Initialize problem_sizes_as_shapes correctly
+  UnderlyingProblemShape* problem_sizes_as_shapes =
+      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
+
+  // Set the Scheduler info
+  cutlass::KernelHardwareInfo hw_info;
+  using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
+  typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
+  scheduler.raster_order = RasterOrderOptions::AlongM;
+  hw_info.device_id = a.get_device();
+  static std::unordered_map<int, int> cached_sm_counts;
+  if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
+    cached_sm_counts[hw_info.device_id] =
+        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+            hw_info.device_id);
+  }
+  hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
+
+  // Mainloop Arguments
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementType**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(a_strides1.data_ptr()),
+      static_cast<const ElementType**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(b_strides1.data_ptr()),
+      static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
+      reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
+      static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
+      reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
+
+  // Epilogue Arguments
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {},  // epilogue.thread
+      nullptr,
+      static_cast<StrideC*>(c_strides1.data_ptr()),
+      static_cast<ElementD**>(out_ptrs.data_ptr()),
+      static_cast<StrideC*>(c_strides1.data_ptr())};
+  auto& fusion_args = epilogue_args.thread;
+  fusion_args.alpha_ptr_array =
+      reinterpret_cast<float**>(alpha_ptrs.data_ptr());
+  fusion_args.dAlpha = {_0{}, _0{}, 1};
+  fusion_args.beta = 0.0f;
+
+  // Gemm Arguments
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, problem_sizes_as_shapes, nullptr},
+      mainloop_args,
+      epilogue_args,
+      hw_info,
+      scheduler};
+
+  size_t workspace_size = Gemm::get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  auto can_implement_status = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
+              "Failed to implement GEMM: status=", (int)can_implement_status);
+
+  // Run the GEMM
+  auto status = gemm_op.initialize(args, workspace.data_ptr());
+  TORCH_CHECK(status == cutlass::Status::kSuccess,
+              "Failed to initialize GEMM: status=", (int)status,
+              " workspace_size=", workspace_size, " num_experts=", num_experts,
+              " M=", M, " N=", N, " K=", K);
+
+  status = gemm_op.run(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
+}
+
+template <typename OutType>
+void run_fp4_blockwise_scaled_group_mm(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
+    int N, int K) {
+  int32_t version_num = get_sm_version_num();
+#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
+  if (version_num >= 120 && version_num < 130) {
+    run_fp4_blockwise_scaled_group_mm_sm120(
+        output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
+        expert_offsets, sf_offsets, M, N, K);
+    return;
+  }
+#endif
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
+  if (version_num >= 100 && version_num < 120) {
+    run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
+        output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
+        expert_offsets, sf_offsets, M, N, K);
+    return;
+  }
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
+      version_num, ". Required capability: 100 or 120");
+}
+
+#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
+    (defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
+constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+#endif
+
+#define CHECK_TYPE(x, st, m) \
+  TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) \
+  TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor.")
+#define CHECK_CONTIGUOUS(x, m) \
+  TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous.")
+#define CHECK_INPUT(x, st, m) \
+  CHECK_TH_CUDA(x, m);        \
+  CHECK_CONTIGUOUS(x, m);     \
+  CHECK_TYPE(x, st, m)
+
+void cutlass_fp4_group_mm(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
+#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
+    (defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
+  // Input validation
+  CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
+  CHECK_INPUT(a_blockscale, SF_DTYPE, "a_blockscale");
+  CHECK_INPUT(b_blockscales, SF_DTYPE, "b_blockscales");
+  CHECK_INPUT(alphas, at::ScalarType::Float, "alphas");
+
+  TORCH_CHECK(a_blockscale.dim() == 2,
+              "expected a_blockscale to be of shape [num_experts, rounded_m,"
+              " k // group_size], observed rank: ",
+              a_blockscale.dim())
+  TORCH_CHECK(b_blockscales.dim() == 3,
+              "expected b_blockscale to be of shape: "
+              " [num_experts, n, k // group_size], observed rank: ",
+              b_blockscales.dim())
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be  a 2D tensor");
+  TORCH_CHECK(problem_sizes.size(1) == 3,
+              "problem_sizes must have the shape (num_experts, 3)");
+  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
+              "Number of experts in problem_sizes must match expert_offsets");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
+              "problem_sizes must be int32.");
+
+  int M = static_cast<int>(a.size(0));
+  int N = static_cast<int>(b.size(1));
+  int E = static_cast<int>(b.size(0));
+  int K = static_cast<int>(2 * b.size(2));
+
+  if (output.scalar_type() == torch::kBFloat16) {
+    run_fp4_blockwise_scaled_group_mm<cutlass::bfloat16_t>(
+        output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
+        expert_offsets, sf_offsets, M, N, K);
+  } else {
+  #if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
+    int32_t version_num = get_sm_version_num();
+    if (version_num >= 120 && version_num < 130) {
+      TORCH_CHECK_NOT_IMPLEMENTED(
+          false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
+          output.scalar_type());
+    }
+  #endif
+    run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
+        output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
+        expert_offsets, sf_offsets, M, N, K);
+  }
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled cutlass_fp4_group_mm kernel, vLLM must "
+      "be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
+      "and CUDA 12.8 or above.");
+#endif
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
+}
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cuda_fp8.h>
+#include "dispatch_utils.h"
+
+#include "nvfp4_utils.cuh"
+#include "launch_bounds_utils.h"
+
+namespace vllm {
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
+__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout,
+                    uint32_t* input_offset_by_experts,
+                    uint32_t* output_scale_offset_by_experts, int n_experts,
+                    bool low_latency) {
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
+      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+                "Vec size is not matched.");
+
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
+
+  // Each global thread processes one element
+  for (int globalIdx = tid; globalIdx < numRows * colsPerRow;
+       globalIdx += gridDim.x * blockDim.x) {
+    // Calculate which row and column this global thread should process
+    int rowIdx = globalIdx / colsPerRow;
+    int colIdx = globalIdx % colsPerRow;
+
+    int64_t inOffset = rowIdx * colsPerRow + colIdx;
+    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+    // Get the output tensor offset.
+    // Same as inOffset because 8 elements are packed into one uint32_t.
+    int64_t outOffset = inOffset;
+    auto& out_pos = out[outOffset];
+
+    // Find index within the experts using different strategies based on expert
+    // count
+    int rowIdx_in_expert = 0;
+    int expert_idx = 0;
+
+    if constexpr (SMALL_NUM_EXPERTS) {
+      for (int i = 0; i < n_experts; i++) {
+        uint32_t current_offset = __ldca(&input_offset_by_experts[i]);
+        uint32_t next_offset = __ldca(&input_offset_by_experts[i + 1]);
+        if (rowIdx >= current_offset && rowIdx < next_offset) {
+          rowIdx_in_expert = rowIdx - current_offset;
+          expert_idx = i;
+          break;
+        }
+      }
+    } else {
+      // Load input offsets into registers first, then do the computation.
+      // Local array size set to 17 because of register limit.
+      uint32_t local_offsets[17];
+      for (int chunk_start = 0; chunk_start < n_experts; chunk_start += 16) {
+        *reinterpret_cast<int4*>(local_offsets) =
+            __ldca(reinterpret_cast<const int4*>(
+                &input_offset_by_experts[chunk_start]));
+        *reinterpret_cast<int4*>(local_offsets + 4) =
+            __ldca(reinterpret_cast<const int4*>(
+                &input_offset_by_experts[chunk_start + 4]));
+        *reinterpret_cast<int4*>(local_offsets + 8) =
+            __ldca(reinterpret_cast<const int4*>(
+                &input_offset_by_experts[chunk_start + 8]));
+        *reinterpret_cast<int4*>(local_offsets + 12) =
+            __ldca(reinterpret_cast<const int4*>(
+                &input_offset_by_experts[chunk_start + 12]));
+        local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]);
+
+// Check against the 16 loaded offsets
+#pragma unroll
+        for (int i = 0; i < 16; i++) {
+          if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) {
+            rowIdx_in_expert = rowIdx - local_offsets[i];
+            expert_idx = chunk_start + i;
+            break;
+          }
+        }
+      }
+    }
+
+    // Get the global scaling factor, which will be applied to the SF.
+    // Note SFScale is the same as next GEMM's alpha, which is
+    // (448.f / (Alpha_A / 6.f)).
+    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
+
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    // The actual output_scales dim is computed from the padded numCols.
+    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
+    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
+    uint32_t* SFout_in_expert =
+        SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+
+    auto sf_out =
+        cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
+                                           CVT_FP4_NUM_THREADS_PER_SF>(
+            rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+
+    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+  }
+}
+
+// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
+template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
+__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout,
+                    uint32_t* input_offset_by_experts,
+                    uint32_t* output_scale_offset_by_experts, int n_experts) {
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
+      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+                "Vec size is not matched.");
+  extern __shared__ uint32_t shared_input_offsets[];
+
+  // Load input offsets into shared memory.
+  // If n_experts is larger than 4, use vectorized int4 to save instructions.
+  // If n_experts is smaller than 4, read directly.
+  if constexpr (SMALL_NUM_EXPERTS) {
+    for (int i = threadIdx.x; i < n_experts + 1; i += blockDim.x) {
+      shared_input_offsets[i] = input_offset_by_experts[i];
+    }
+  } else {
+    for (int i = threadIdx.x * 4; i < n_experts; i += blockDim.x * 4) {
+      *reinterpret_cast<int4*>(&shared_input_offsets[i]) =
+          *reinterpret_cast<const int4*>(&input_offset_by_experts[i]);
+    }
+    if (threadIdx.x == 0) {
+      shared_input_offsets[n_experts] = input_offset_by_experts[n_experts];
+    }
+  }
+
+  __syncthreads();
+
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
+
+  // Each global thread processes one element
+  for (int globalIdx = tid; globalIdx < numRows * colsPerRow;
+       globalIdx += gridDim.x * blockDim.x) {
+    // Calculate which row and column this global thread should process
+    int rowIdx = globalIdx / colsPerRow;
+    int colIdx = globalIdx % colsPerRow;
+
+    int64_t inOffset = rowIdx * colsPerRow + colIdx;
+    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+    int64_t outOffset = inOffset;
+    auto& out_pos = out[outOffset];
+
+    // Find expert using binary search for better performance with large m_topk
+    int rowIdx_in_expert = 0;
+    int expert_idx = 0;
+
+    // Binary search through experts using shared memory
+    int left = 0, right = n_experts - 1;
+    while (left <= right) {
+      int mid = (left + right) / 2;
+      // Get offsets: shared_input_offsets[i] corresponds to
+      // input_offset_by_experts[i]
+      uint32_t mid_offset = shared_input_offsets[mid];
+      uint32_t next_offset = shared_input_offsets[mid + 1];
+
+      if (rowIdx >= mid_offset && rowIdx < next_offset) {
+        rowIdx_in_expert = rowIdx - mid_offset;
+        expert_idx = mid;
+        break;
+      } else if (rowIdx < mid_offset) {
+        right = mid - 1;
+      } else {
+        left = mid + 1;
+      }
+    }
+
+    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
+
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
+    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
+    uint32_t* SFout_in_expert =
+        SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+
+    auto sf_out =
+        cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
+                                           CVT_FP4_NUM_THREADS_PER_SF>(
+            rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+
+    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+  }
+}
+
+template <typename T>
+void quant_impl(void* output, void* output_scale, void* input,
+                void* input_global_scale, void* input_offset_by_experts,
+                void* output_scale_offset_by_experts, int m_topk, int k,
+                int n_experts, cudaStream_t stream) {
+  // TODO: this multiProcessorCount should be cached.
+  int device;
+  cudaGetDevice(&device);
+  int multiProcessorCount;
+  cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount,
+                         device);
+
+  // Grid, Block size.
+  // Each thread converts 8 values.
+  int const workSizePerRow = k / ELTS_PER_THREAD;
+  int const totalWorkSize = m_topk * workSizePerRow;
+  dim3 block(std::min(workSizePerRow, 512));
+  // Get number of blocks per SM
+  int const numBlocksPerSM =
+      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
+                     multiProcessorCount * numBlocksPerSM));
+  while (grid.x <= multiProcessorCount && block.x > 64) {
+    grid.x *= 2;
+    block.x = (block.x + 1) / 2;
+  }
+
+  int const blockRepeat =
+      (totalWorkSize + block.x * grid.x - 1) / (block.x * grid.x);
+  if (blockRepeat > 1) {
+    size_t shared_mem_size = (n_experts + 1) * sizeof(uint32_t);
+    if (n_experts >= 4) {
+      cvt_fp16_to_fp4<T, false, false>
+          <<<grid, block, shared_mem_size, stream>>>(
+              m_topk, k, reinterpret_cast<T*>(input),
+              reinterpret_cast<float*>(input_global_scale),
+              reinterpret_cast<uint32_t*>(output),
+              reinterpret_cast<uint32_t*>(output_scale),
+              reinterpret_cast<uint32_t*>(input_offset_by_experts),
+              reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+              n_experts);
+    } else {
+      cvt_fp16_to_fp4<T, false, true><<<grid, block, shared_mem_size, stream>>>(
+          m_topk, k, reinterpret_cast<T*>(input),
+          reinterpret_cast<float*>(input_global_scale),
+          reinterpret_cast<uint32_t*>(output),
+          reinterpret_cast<uint32_t*>(output_scale),
+          reinterpret_cast<uint32_t*>(input_offset_by_experts),
+          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+          n_experts);
+    }
+  } else {
+    if (n_experts >= 16) {
+      cvt_fp16_to_fp4<T, false, false><<<grid, block, 0, stream>>>(
+          m_topk, k, reinterpret_cast<T*>(input),
+          reinterpret_cast<float*>(input_global_scale),
+          reinterpret_cast<uint32_t*>(output),
+          reinterpret_cast<uint32_t*>(output_scale),
+          reinterpret_cast<uint32_t*>(input_offset_by_experts),
+          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+          n_experts, /* bool low_latency */ true);
+    } else {
+      cvt_fp16_to_fp4<T, false, true><<<grid, block, 0, stream>>>(
+          m_topk, k, reinterpret_cast<T*>(input),
+          reinterpret_cast<float*>(input_global_scale),
+          reinterpret_cast<uint32_t*>(output),
+          reinterpret_cast<uint32_t*>(output_scale),
+          reinterpret_cast<uint32_t*>(input_offset_by_experts),
+          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+          n_experts, /* bool low_latency */ true);
+    }
+  }
+}
+
+}  // namespace vllm
+
+/*Quantization entry for fp4 experts quantization*/
+#define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) \
+  TORCH_CHECK(x.is_contiguous(), m, "must be contiguous")
+#define CHECK_INPUT(x, m) \
+  CHECK_TH_CUDA(x, m);    \
+  CHECK_CONTIGUOUS(x, m);
+
+constexpr auto HALF = at::ScalarType::Half;
+constexpr auto BF16 = at::ScalarType::BFloat16;
+constexpr auto FLOAT = at::ScalarType::Float;
+constexpr auto INT = at::ScalarType::Int;
+constexpr auto UINT8 = at::ScalarType::Byte;
+
+void scaled_fp4_experts_quant_sm1xxa(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+  CHECK_INPUT(output, "output must be a CUDA tensor");
+  CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
+  CHECK_INPUT(input, "input must be a CUDA tensor");
+  CHECK_INPUT(input_global_scale, "input_global_scale must be a CUDA tensor");
+  CHECK_INPUT(input_offset_by_experts,
+              "input_offset_by_experts must be a CUDA tensor");
+  CHECK_INPUT(output_scale_offset_by_experts,
+              "output_scale_offset_by_experts must be a CUDA tensor");
+
+  TORCH_CHECK(output.dim() == 2);
+  TORCH_CHECK(output_scale.dim() == 2);
+  TORCH_CHECK(input.dim() == 2);
+  TORCH_CHECK(input_global_scale.dim() == 1);
+  TORCH_CHECK(input_offset_by_experts.dim() == 1);
+  TORCH_CHECK(output_scale_offset_by_experts.dim() == 1);
+
+  TORCH_CHECK(input.scalar_type() == HALF || input.scalar_type() == BF16);
+  TORCH_CHECK(input_global_scale.scalar_type() == FLOAT);
+  TORCH_CHECK(input_offset_by_experts.scalar_type() == INT);
+  TORCH_CHECK(output_scale_offset_by_experts.scalar_type() == INT);
+  // output is uint8 (two nvfp4 values are packed into one uint8)
+  // output_scale is int32 (four fp8 values are packed into one int32)
+  TORCH_CHECK(output.scalar_type() == UINT8);
+  TORCH_CHECK(output_scale.scalar_type() == INT);
+
+  const int BLOCK_SIZE = 16;
+  auto m_topk = input.size(0);
+  auto k = input.size(1);
+  TORCH_CHECK(k % BLOCK_SIZE == 0, "k must be a multiple of 16");
+  auto n_experts = input_global_scale.size(0);
+  TORCH_CHECK(input_offset_by_experts.size(0) == n_experts + 1);
+  TORCH_CHECK(output_scale_offset_by_experts.size(0) == n_experts + 1);
+  TORCH_CHECK(output.size(0) == m_topk);
+  TORCH_CHECK(output.size(1) == k / 2);
+  int scales_k = k / BLOCK_SIZE;
+  // 4 means the swizzle requirement by nvidia nvfp4.
+  int padded_k = (scales_k + (4 - 1)) / 4 * 4;
+  // 4 means 4 fp8 values are packed into one int32
+  TORCH_CHECK(output_scale.size(1) * 4 == padded_k);
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream =
+      at::cuda::getCurrentCUDAStream(input.get_device());
+
+  VLLM_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "nvfp4_experts_quant_kernel", [&] {
+        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+        vllm::quant_impl<cuda_type>(
+            output.data_ptr(), output_scale.data_ptr(), input.data_ptr(),
+            input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(),
+            output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts,
+            stream);
+      });
+}
--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
+                             torch::Tensor const& input,
+                             torch::Tensor const& output_sf,
+                             torch::Tensor const& input_sf);
+#endif
+
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void scaled_fp4_experts_quant_sm1xxa(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts);
+#endif
+
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,
+                                     torch::Tensor& output_sf,
+                                     torch::Tensor& input,
+                                     torch::Tensor& input_sf);
+#endif
+
+void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
+                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
+}
+
+void scaled_fp4_experts_quant(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  return scaled_fp4_experts_quant_sm1xxa(
+      output, output_scale, input, input_global_scale, input_offset_by_experts,
+      output_scale_offset_by_experts);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "No compiled nvfp4 experts quantization kernel");
+}
+
+void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf,
+                              torch::Tensor& input, torch::Tensor& input_sf) {
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "No compiled silu_and_mul nvfp4 quantization kernel");
+}
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cuda_fp8.h>
+#include "dispatch_utils.h"
+
+#include "cuda_utils.h"
+#include "launch_bounds_utils.h"
+#include "nvfp4_utils.cuh"
+
+namespace vllm {
+
+template <typename Int>
+__host__ __device__ inline Int round_up(Int x, Int y) {
+  static_assert(std::is_integral_v<Int>,
+                "round_up argument must be integral type");
+  return (x + y - 1) / y * y;
+}
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false>
+__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout) {
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
+      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+                "Vec size is not matched.");
+
+  int sf_m = round_up<int>(numRows, 128);
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
+  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+    // Each thread writes 4 uint32_t elements.
+    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
+         col += blockDim.x * 4) {
+      SFout[row * sf_n_int + col] = 0x00;
+    }
+  }
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is
+  // (448.f / (Alpha_A / 6.f)).
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
+
+  // Input tensor row/col loops.
+  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+         colIdx += blockDim.x) {
+      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      // Get the output tensor offset.
+      // Same as inOffset because 8 elements are packed into one uint32_t.
+      int64_t outOffset = inOffset;
+      auto& out_pos = out[outOffset];
+
+      auto sf_out =
+          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
+                                             CVT_FP4_NUM_THREADS_PER_SF>(
+              rowIdx, colIdx, numCols, SFout);
+
+      out_pos =
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
+    }
+  }
+}
+
+template <typename T>
+void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
+                           int64_t* output, int32_t* SFOuput, bool useUE8M0,
+                           int multiProcessorCount, cudaStream_t stream) {
+  // Grid, Block size.
+  // Each thread converts 8 values.
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  // Get number of blocks per SM
+  int const numBlocksPerSM =
+      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
+  // Launch the cvt kernel.
+  if (useUE8M0) {
+    cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
+        reinterpret_cast<uint32_t*>(SFOuput));
+  } else {
+    cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
+        reinterpret_cast<uint32_t*>(SFOuput));
+  }
+}
+
+// Instantiate the function.
+template void invokeFP4Quantization(int m, int n, half const* input,
+                                    float const* SFScale, int64_t* output,
+                                    int32_t* SFOuput, bool useUE8M0,
+                                    int multiProcessorCount,
+                                    cudaStream_t stream);
+
+template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
+                                    float const* SFScale, int64_t* output,
+                                    int32_t* SFOuput, bool useUE8M0,
+                                    int multiProcessorCount,
+                                    cudaStream_t stream);
+
+}  // namespace vllm
+
+void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
+                             torch::Tensor const& input,
+                             torch::Tensor const& output_sf,
+                             torch::Tensor const& input_sf) {
+  int32_t m = input.size(0);
+  int32_t n = input.size(1);
+
+  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
+                  input.scalar_type() == at::ScalarType::BFloat16,
+              "Unsupported input data type for quantize_to_fp4.");
+
+  int multiProcessorCount =
+      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
+
+  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
+  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
+  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+
+  // We don't support e8m0 scales at this moment.
+  bool useUE8M0 = false;
+
+  VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
+    using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+    auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+    vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
+                                sf_out, useUE8M0, multiProcessorCount, stream);
+  });
+}
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "cutlass_extensions/common.hpp"
+
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
+void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha);
+#endif
+
+#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
+void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha);
+#endif
+
+void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A,
+                           const torch::Tensor& B, const torch::Tensor& A_sf,
+                           const torch::Tensor& B_sf,
+                           const torch::Tensor& alpha) {
+  // Make sure we’re on A’s device.
+  const c10::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  const int32_t sm = get_sm_version_num();
+
+#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
+  if (sm >= 100 && sm < 120) {
+    cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+    return;
+  }
+#endif
+
+#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
+  if (sm >= 120 && sm < 130) {
+    cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
+    return;
+  }
+#endif
+
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel for SM ", sm,
+                              ". Recompile with CUDA >= 12.8 and CC >= 100.");
+}
+
+bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
+  int runtimeVersion;
+  cudaRuntimeGetVersion(&runtimeVersion);
+  return cuda_device_capability >= 100 && runtimeVersion >= 12080;
+}
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cutlass_extensions/common.hpp"
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/packed_stride.hpp"
+
+#include "core/math.hpp"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+
+// Configuration for M in (256, inf)
+struct sm100_fp4_config_default {
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
+};
+
+// Configuration for M in (16, 256]
+struct sm100_fp4_config_M256 {
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_256, _128, _256>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _128, _256>;
+};
+
+// Configuration for M in [1, 16]
+struct sm100_fp4_config_M16 {
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _128, _256>;
+};
+
+template <typename Config, typename OutType>
+struct Fp4GemmSm100 {
+  // A matrix configuration
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutATag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 32;
+
+  // B matrix configuration
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutBTag = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 32;
+
+  // C/D matrix configuration
+  using ElementD = OutType;
+  using ElementC = OutType;
+  using LayoutCTag = cutlass::layout::RowMajor;
+  using LayoutDTag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+
+  // Kernel functional config
+  using ElementAccumulator = float;
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+
+  // Use config's tile shapes
+  using MmaTileShape = typename Config::TileShape;
+  using ClusterShape = typename Config::ClusterShape;
+  using PerSmTileShape_MNK = typename Config::PerSmTileShape_MNK;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, PerSmTileShape_MNK, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementAccumulator, ElementC, LayoutCTag, AlignmentC, ElementD,
+          LayoutDTag, AlignmentD,
+          cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA, LayoutATag, AlignmentA, ElementB,
+          LayoutBTag, AlignmentB, ElementAccumulator, MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using LayoutA = decltype(cute::make_layout(make_shape(0, 0, 0), StrideA{}));
+  using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using LayoutB = decltype(cute::make_layout(make_shape(0, 0, 0), StrideB{}));
+  using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutC = decltype(cute::make_layout(make_shape(0, 0, 0), StrideC{}));
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using LayoutD = decltype(cute::make_layout(make_shape(0, 0, 0), StrideD{}));
+};
+
+template <typename Config>
+typename Config::Gemm::Arguments args_from_options(
+    at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+    at::Tensor const& A_sf, at::Tensor const& B_sf, at::Tensor const& alpha,
+    int64_t M, int64_t N, int64_t K) {
+  using ElementA = typename Config::Gemm::ElementA;
+  using ElementB = typename Config::Gemm::ElementB;
+  using ElementSFA = cutlass::float_ue4m3_t;
+  using ElementSFB = cutlass::float_ue4m3_t;
+  using ElementD = typename Config::Gemm::ElementD;
+  using ElementCompute = float;
+  using StrideA = typename Config::StrideA;
+  using StrideB = typename Config::StrideB;
+  using StrideD = typename Config::StrideD;
+  using Sm100BlkScaledConfig = typename Config::Gemm::GemmKernel::
+      CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+  int m = static_cast<int>(M);
+  int n = static_cast<int>(N);
+  int k = static_cast<int>(K);
+  auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {m, k, 1});
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, 1});
+  auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {m, n, 1});
+
+  auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(
+      cute::make_shape(m, n, k, 1));
+  auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(
+      cute::make_shape(m, n, k, 1));
+
+  typename Config::Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      {// Mainloop arguments
+       static_cast<ElementA const*>(A.data_ptr()), stride_A,
+       static_cast<ElementB const*>(B.data_ptr()), stride_B,
+       static_cast<ElementSFA const*>(A_sf.data_ptr()), layout_SFA,
+       static_cast<ElementSFB const*>(B_sf.data_ptr()), layout_SFB},
+      {     // Epilogue arguments
+       {},  // epilogue.thread
+       static_cast<ElementD const*>(D.data_ptr()),
+       stride_D,
+       static_cast<ElementD*>(D.data_ptr()),
+       stride_D}};
+  auto& fusion_args = arguments.epilogue.thread;
+  fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr());
+  return arguments;
+}
+
+template <typename Config>
+void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+             at::Tensor const& A_sf, at::Tensor const& B_sf,
+             at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
+             cudaStream_t stream) {
+  typename Config::Gemm gemm;
+
+  auto arguments =
+      args_from_options<Config>(D, A, B, A_sf, B_sf, alpha, m, n, k);
+
+  size_t workspace_size = Config::Gemm::get_workspace_size(arguments);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  CUTLASS_CHECK(gemm.can_implement(arguments));
+
+  CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream));
+}
+
+// Dispatch function to select appropriate config based on M
+template <typename OutType>
+void cutlass_fp4_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
+                               torch::Tensor const& B,
+                               torch::Tensor const& A_sf,
+                               torch::Tensor const& B_sf,
+                               torch::Tensor const& alpha, int64_t m, int64_t n,
+                               int64_t k, cudaStream_t stream) {
+  uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m));
+
+  if (mp2 <= 16) {
+    // m in [1, 16]
+    runGemm<Fp4GemmSm100<sm100_fp4_config_M16, OutType>>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (mp2 <= 256) {
+    // m in (16, 256]
+    runGemm<Fp4GemmSm100<sm100_fp4_config_M256, OutType>>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    // m in (256, inf)
+    runGemm<Fp4GemmSm100<sm100_fp4_config_default, OutType>>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  }
+}
+
+#else
+template <typename OutType>
+void cutlass_fp4_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
+                               torch::Tensor const& B,
+                               torch::Tensor const& A_sf,
+                               torch::Tensor const& B_sf,
+                               torch::Tensor const& alpha, int64_t m, int64_t n,
+                               int64_t k, cudaStream_t stream) {
+  TORCH_CHECK(false,
+              "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
+              "a CUTLASS 3.8 source directory to enable support.");
+}
+#endif  // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+
+#define CHECK_TYPE(x, st, m) \
+  TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) \
+  TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) \
+  TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous")
+#define CHECK_INPUT(x, st, m) \
+  CHECK_TH_CUDA(x, m);        \
+  CHECK_CONTIGUOUS(x, m);     \
+  CHECK_TYPE(x, st, m)
+
+constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+
+void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha) {
+  CHECK_INPUT(A, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(B, FLOAT4_E2M1X2, "b");
+
+  CHECK_INPUT(A_sf, SF_DTYPE, "scale_a");
+  CHECK_INPUT(B_sf, SF_DTYPE, "scale_b");
+
+  CHECK_INPUT(alpha, at::ScalarType::Float, "alpha");
+
+  TORCH_CHECK(A.dim() == 2, "a must be a matrix");
+  TORCH_CHECK(B.dim() == 2, "b must be a matrix");
+  TORCH_CHECK(A.sizes()[1] == B.sizes()[1],
+              "a and b shapes cannot be multiplied (", A.sizes()[0], "x",
+              A.sizes()[1], " and ", B.sizes()[0], "x", B.sizes()[1], ")");
+
+  auto const m = A.sizes()[0];
+  auto const n = B.sizes()[0];
+  auto const k = A.sizes()[1] * 2;
+
+  constexpr int alignment = 32;
+  TORCH_CHECK(k % alignment == 0, "Expected k to be divisible by ", alignment,
+              ", but got a shape: (", A.sizes()[0], "x", A.sizes()[1],
+              "), k: ", k, ".");
+  TORCH_CHECK(n % alignment == 0, "Expected n to be divisible by ", alignment,
+              ", but got b shape: (", B.sizes()[0], "x", B.sizes()[1], ").");
+
+  auto round_up = [](int x, int y) { return (x + y - 1) / y * y; };
+  int rounded_m = round_up(m, 128);
+  int rounded_n = round_up(n, 128);
+  // Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an
+  // integer.
+  int rounded_k = round_up(k / 16, 4);
+
+  TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix");
+  TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix");
+  TORCH_CHECK(A_sf.sizes()[1] == B_sf.sizes()[1],
+              "scale_a and scale_b shapes cannot be multiplied (",
+              A_sf.sizes()[0], "x", A_sf.sizes()[1], " and ", B_sf.sizes()[0],
+              "x", B_sf.sizes()[1], ")");
+  TORCH_CHECK(A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k,
+              "scale_a must be padded and swizzled to a shape (", rounded_m,
+              "x", rounded_k, "), but got a shape (", A_sf.sizes()[0], "x",
+              A_sf.sizes()[1], ")");
+  TORCH_CHECK(B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k,
+              "scale_b must be padded and swizzled to a shape (", rounded_n,
+              "x", rounded_k, "), but got a shape (", B_sf.sizes()[0], "x",
+              B_sf.sizes()[1], ")");
+
+  auto out_dtype = D.dtype();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
+
+  if (out_dtype == at::ScalarType::Half) {
+    cutlass_fp4_gemm_dispatch<cutlass::half_t>(D, A, B, A_sf, B_sf, alpha, m, n,
+                                               k, stream);
+  } else if (out_dtype == at::ScalarType::BFloat16) {
+    cutlass_fp4_gemm_dispatch<cutlass::bfloat16_t>(D, A, B, A_sf, B_sf, alpha,
+                                                   m, n, k, stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm (", out_dtype,
+                ")");
+  }
+}
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cutlass_extensions/common.hpp"
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/packed_stride.hpp"
+
+#include "core/math.hpp"
+
+using namespace cute;
+
+#define CHECK_TYPE(x, st, m) \
+  TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) \
+  TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) \
+  TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous")
+#define CHECK_INPUT(x, st, m) \
+  CHECK_TH_CUDA(x, m);        \
+  CHECK_CONTIGUOUS(x, m);     \
+  CHECK_TYPE(x, st, m)
+
+constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+
+struct sm120_fp4_config_M256 {
+  using ClusterShape = Shape<_1, _1, _1>;
+  using MmaTileShape = Shape<_128, _128, _128>;
+  using PerSmTileShape_MNK = Shape<_128, _128, _128>;
+};
+
+struct sm120_fp4_config_default {
+  using ClusterShape = Shape<_1, _1, _1>;
+  using MmaTileShape = Shape<_256, _128, _128>;
+  using PerSmTileShape_MNK = Shape<_256, _128, _128>;
+};
+
+template <typename Config, typename OutType>
+struct Fp4GemmSm120 {
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutATag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 32;
+
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutBTag = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 32;
+
+  using ElementD = OutType;
+  using ElementC = OutType;
+  using LayoutCTag = cutlass::layout::RowMajor;
+  using LayoutDTag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+
+  using ElementAccumulator = float;
+  using ArchTag = cutlass::arch::Sm120;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+
+  using MmaTileShape = typename Config::MmaTileShape;
+  using ClusterShape = typename Config::ClusterShape;
+  using PerSmTileShape_MNK = typename Config::PerSmTileShape_MNK;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, PerSmTileShape_MNK, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementAccumulator, ElementC, LayoutCTag, AlignmentC, ElementD,
+          LayoutDTag, AlignmentD,
+          cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA, LayoutATag, AlignmentA, ElementB,
+          LayoutBTag, AlignmentB, ElementAccumulator, MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+};
+
+template <typename Gemm>
+typename Gemm::Arguments args_from_options(at::Tensor& D, at::Tensor const& A,
+                                           at::Tensor const& B,
+                                           at::Tensor const& A_sf,
+                                           at::Tensor const& B_sf,
+                                           torch::Tensor const& alpha, int M,
+                                           int N, int K) {
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementD = typename Gemm::ElementD;
+  using ElementSFA = cutlass::float_ue4m3_t;
+  using ElementSFB = cutlass::float_ue4m3_t;
+  using ElementCompute = float;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using Sm1xxBlkScaledConfig =
+      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+  auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1});
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1});
+  auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1});
+
+  auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(
+      cute::make_shape(M, N, K, 1));
+  auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(
+      cute::make_shape(M, N, K, 1));
+
+  typename Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {M, N, K, 1},
+      {static_cast<ElementA const*>(A.data_ptr()), stride_A,
+       static_cast<ElementB const*>(B.data_ptr()), stride_B,
+       static_cast<ElementSFA const*>(A_sf.data_ptr()), layout_SFA,
+       static_cast<ElementSFB const*>(B_sf.data_ptr()), layout_SFB},
+      {{},
+       static_cast<ElementD const*>(D.data_ptr()),
+       stride_D,
+       static_cast<ElementD*>(D.data_ptr()),
+       stride_D}};
+  auto& fusion_args = arguments.epilogue.thread;
+  fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr());
+
+  return arguments;
+}
+
+template <typename Gemm>
+void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+             at::Tensor const& A_sf, at::Tensor const& B_sf,
+             torch::Tensor const& alpha, int M, int N, int K,
+             cudaStream_t stream) {
+  Gemm gemm;
+
+  auto arguments = args_from_options<Gemm>(D, A, B, A_sf, B_sf, alpha, M, N, K);
+
+  size_t workspace_size = Gemm::get_workspace_size(arguments);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  CUTLASS_CHECK(gemm.can_implement(arguments));
+
+  CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream));
+}
+
+void cutlass_fp4_bf16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
+                                    torch::Tensor const& B,
+                                    torch::Tensor const& A_sf,
+                                    torch::Tensor const& B_sf,
+                                    torch::Tensor const& alpha, int m, int n,
+                                    int k, cudaStream_t stream) {
+  uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m));
+  if (mp2 <= 256) {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_M256, cutlass::bfloat16_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_default, cutlass::bfloat16_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  }
+}
+
+void cutlass_fp4_f16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
+                                   torch::Tensor const& B,
+                                   torch::Tensor const& A_sf,
+                                   torch::Tensor const& B_sf,
+                                   torch::Tensor const& alpha, int m, int n,
+                                   int k, cudaStream_t stream) {
+  uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m));
+  if (mp2 <= 256) {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_M256, cutlass::half_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_default, cutlass::half_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  }
+}
+
+void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha) {
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+  CHECK_INPUT(A, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(B, FLOAT4_E2M1X2, "b");
+
+  CHECK_INPUT(A_sf, SF_DTYPE, "scale_a");
+  CHECK_INPUT(B_sf, SF_DTYPE, "scale_b");
+
+  CHECK_INPUT(alpha, at::ScalarType::Float, "alpha");
+
+  TORCH_CHECK(A.dim() == 2, "a must be a matrix");
+  TORCH_CHECK(B.dim() == 2, "b must be a matrix");
+  TORCH_CHECK(A.sizes()[1] == B.sizes()[1],
+              "a and b shapes cannot be multiplied (", A.sizes()[0], "x",
+              A.sizes()[1], " and ", B.sizes()[0], "x", B.sizes()[1], ")");
+
+  auto const m = A.sizes()[0];
+  auto const n = B.sizes()[0];
+  auto const k = A.sizes()[1] * 2;
+
+  constexpr int alignment = 32;
+  TORCH_CHECK(k % alignment == 0, "Expected k to be divisible by ", alignment,
+              ", but got a shape: (", A.sizes()[0], "x", A.sizes()[1],
+              "), k: ", k, ".");
+  TORCH_CHECK(n % alignment == 0, "Expected n to be divisible by ", alignment,
+              ", but got b shape: (", B.sizes()[0], "x", B.sizes()[1], ").");
+
+  auto round_up = [](int x, int y) { return (x + y - 1) / y * y; };
+  int rounded_m = round_up(m, 128);
+  int rounded_n = round_up(n, 128);
+  // Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an
+  // integer.
+  int rounded_k = round_up(k / 16, 4);
+
+  TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix");
+  TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix");
+  TORCH_CHECK(A_sf.sizes()[1] == B_sf.sizes()[1],
+              "scale_a and scale_b shapes cannot be multiplied (",
+              A_sf.sizes()[0], "x", A_sf.sizes()[1], " and ", B_sf.sizes()[0],
+              "x", B_sf.sizes()[1], ")");
+  TORCH_CHECK(A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k,
+              "scale_a must be padded and swizzled to a shape (", rounded_m,
+              "x", rounded_k, "), but got a shape (", A_sf.sizes()[0], "x",
+              A_sf.sizes()[1], ")");
+  TORCH_CHECK(B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k,
+              "scale_b must be padded and swizzled to a shape (", rounded_n,
+              "x", rounded_k, "), but got a shape (", B_sf.sizes()[0], "x",
+              B_sf.sizes()[1], ")");
+
+  auto out_dtype = D.dtype();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
+
+  if (out_dtype == at::ScalarType::BFloat16) {
+    return cutlass_fp4_bf16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k,
+                                          stream);
+  } else if (out_dtype == at::ScalarType::Half) {
+    return cutlass_fp4_f16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k,
+                                         stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm sm120 (",
+                out_dtype, ")");
+  }
+#else
+  TORCH_CHECK(false,
+              "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
+              "a CUTLASS 3.8 source directory to enable support.");
+#endif  // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+}
--- a/csrc/quantization/fp4/nvfp4_utils.cuh
+++ b/csrc/quantization/fp4/nvfp4_utils.cuh
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda_fp8.h>
+
+#define ELTS_PER_THREAD 8
+
+constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
+constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+
+namespace vllm {
+
+// Convert PyTorch cpp type to CUDA type
+template <typename T>
+struct CUDATypeConverter {
+  using Type = T;
+};
+
+template <>
+struct CUDATypeConverter<at::Half> {
+  using Type = half;
+};
+
+template <>
+struct CUDATypeConverter<at::BFloat16> {
+  using Type = __nv_bfloat16;
+};
+
+// Get type2 from type or vice versa (applied to half and bfloat16)
+template <typename T>
+struct TypeConverter {
+  using Type = half2;
+};  // keep for generality
+
+template <>
+struct TypeConverter<half2> {
+  using Type = half;
+};
+
+template <>
+struct TypeConverter<half> {
+  using Type = half2;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat162> {
+  using Type = __nv_bfloat16;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat16> {
+  using Type = __nv_bfloat162;
+};
+
+// Define a 16 bytes packed data type.
+template <class Type>
+struct PackedVec {
+  typename TypeConverter<Type>::Type elts[4];
+};
+
+template <>
+struct PackedVec<__nv_fp8_e4m3> {
+  __nv_fp8x2_e4m3 elts[8];
+};
+
+// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
+        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
+  return val;
+}
+
+// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
+        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
+  return val;
+}
+
+// Fast reciprocal.
+inline __device__ float reciprocal_approximate_ftz(float a) {
+  float b;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
+  return b;
+}
+
+template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
+__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
+                                                       int numCols,
+                                                       SFType* SFout) {
+  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
+                CVT_FP4_NUM_THREADS_PER_SF == 2);
+
+  // One pair of threads write one SF to global memory.
+  // TODO: stage through smem for packed STG.32
+  // is it better than STG.8 from 4 threads ?
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
+    // SF vector index (16 elements share one SF in the K dimension).
+    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+    int32_t mIdx = rowIdx;
+
+    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
+    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
+
+    int32_t mTileIdx = mIdx / (32 * 4);
+    // SF vector size 16.
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    int32_t numKTiles = (numCols + factor - 1) / factor;
+    int64_t mTileStride = numKTiles * 32 * 4 * 4;
+
+    int32_t kTileIdx = (kIdx / 4);
+    int64_t kTileStride = 32 * 4 * 4;
+
+    // M tile layout [32, 4] is column-major.
+    int32_t outerMIdx = (mIdx % 32);
+    int64_t outerMStride = 4 * 4;
+
+    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
+    int64_t innerMStride = 4;
+
+    int32_t innerKIdx = (kIdx % 4);
+    int64_t innerKStride = 1;
+
+    // Compute the global offset.
+    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
+                       outerMIdx * outerMStride + innerMIdx * innerMStride +
+                       innerKIdx * innerKStride;
+
+    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
+  }
+  return nullptr;
+}
+
+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
+                                         uint8_t* SFout) {
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(vec.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    // Extract the 8 exponent bits from float32.
+    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
+    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
+    fp8SFVal = tmp & 0xff;
+    // Convert back to fp32.
+    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
+    // Convert back to fp32.
+    SFValue = float(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(
+                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                   : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+}
+
+}  // namespace vllm
--- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
+++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
@@ -0,0 +1,263 @@
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "../../dispatch_utils.h"
+#include "layernorm_utils.cuh"
+#include "quant_conversions.cuh"
+
+namespace vllm {
+
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
+__device__ void rms_norm_dynamic_per_token_quant_vec(
+    scalar_out_t* __restrict__ out,       // [..., hidden_size]
+    float* __restrict__ scales,           // [num_tokens]
+    scalar_t const* __restrict__ input,   // [..., hidden_size]
+    scalar_t const* __restrict__ weight,  // [hidden_size]
+    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
+    scalar_t* __restrict__ residual = nullptr) {
+  float rms = 0.0f;
+  float token_scale = 0.0f;
+
+  // Compute rms
+  vllm::vectorized::compute_rms<scalar_t, has_residual>(
+      &rms, input, hidden_size, var_epsilon, residual);
+
+  // Compute scale
+  vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
+                                                     has_residual>(
+      &token_scale, scales, input, weight, rms, scale_ub, hidden_size,
+      residual);
+
+  // RMS Norm + Quant
+  if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
+    token_scale = 1.0f / token_scale;
+    vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
+                                     has_residual>(
+        out, input, weight, rms, &token_scale, hidden_size, residual);
+  } else {
+    // FP8 - Do not invert token_scale for exact match with FBGemm
+    vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
+                                     has_residual>(
+        out, input, weight, rms, &token_scale, hidden_size, residual);
+  }
+}
+
+// RMS norm + quant kernel
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
+__global__ void rms_norm_dynamic_per_token_quant_kernel(
+    scalar_out_t* __restrict__ out,       // [..., hidden_size]
+    float* __restrict__ scales,           // [num_tokens]
+    scalar_t const* __restrict__ input,   // [..., hidden_size]
+    scalar_t const* __restrict__ weight,  // [hidden_size]
+    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
+    scalar_t* __restrict__ residual = nullptr) {
+  // For vectorization, token_input and token_output pointers need to be
+  // aligned at 8-byte and 4-byte addresses respectively.
+  bool const can_vectorize = hidden_size % 4 == 0;
+
+  if (can_vectorize) {
+    return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
+                                                has_residual>(
+        out, scales, input, weight, scale_ub, var_epsilon, hidden_size,
+        residual);
+  }
+
+  float rms = 0.0f;
+  float token_scale = 0.0f;
+
+  // Compute RMS
+  vllm::compute_rms<scalar_t, has_residual>(&rms, input, hidden_size,
+                                            var_epsilon, residual);
+  // Compute Scale
+  vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
+      &token_scale, scales, input, weight, rms, scale_ub, hidden_size,
+      residual);
+
+  // RMS Norm + Quant
+  if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
+    token_scale = 1.0f / token_scale;
+    vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
+        out, input, weight, rms, &token_scale, hidden_size, residual);
+  } else {
+    // FP8 - Do not invert s_token_scale for exact match with FBGemm
+    vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
+        out, input, weight, rms, &token_scale, hidden_size, residual);
+  }
+}
+
+// RMS norm + quant kernel
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false, int32_t group_size = 0>
+__global__ void rms_norm_per_block_quant_kernel(
+    scalar_out_t* __restrict__ out,  // [..., hidden_size]
+    float* __restrict__ scales,      // [num_tokens, hidden_size / group_size]
+                                     // or
+                                     // [hidden_size / group_size, num_tokens]
+    scalar_t const* __restrict__ input,   // [..., hidden_size]
+    scalar_t const* __restrict__ weight,  // [hidden_size]
+    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
+    scalar_t* __restrict__ residual = nullptr) {
+  float rms;
+  // Compute RMS
+  // Always able to vectorize due to constraints on hidden_size
+  vllm::vectorized::compute_rms<scalar_t, has_residual>(
+      &rms, input, hidden_size, var_epsilon, residual);
+
+  // Compute Scale
+  // Always able to vectorize due to constraints on hidden_size and group_size
+  vllm::vectorized::compute_dynamic_per_token_scales<
+      scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
+      nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual);
+
+  // RMS Norm + Quant
+  // Always able to vectorize due to constraints on hidden_size
+  // For int8, don't invert token_scale here: do it inside the norm_and_quant
+  // kernel. We do it because particular elements of token_scale can be shared
+  // between multiple threads, so this way, we avoid extra synchronization
+  // overhead.
+  vllm::vectorized::norm_and_quant<
+      scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
+      has_residual, is_scale_transposed, group_size>(
+      out, input, weight, rms, scales, hidden_size, residual);
+}
+
+}  // namespace vllm
+
+// Residual add + RMS norm + dynamic per token
+template <typename scalar_in_t>
+void rms_norm_dynamic_per_token_quant_dispatch(
+    torch::Tensor& out,           // [..., hidden_size]
+    torch::Tensor const& input,   // [..., hidden_size]
+    torch::Tensor const& weight,  // [hidden_size]
+    torch::Tensor& scales,        // [num_tokens]
+    double const var_epsilon,     // Variance epsilon used in norm calculation
+    std::optional<at::Tensor> const& scale_ub,
+    std::optional<at::Tensor>& residual) {
+  int32_t hidden_size = input.size(-1);
+  auto num_tokens = input.numel() / hidden_size;
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, 1024));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
+    VLLM_DISPATCH_QUANT_TYPES(
+        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
+          vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
+                                                        has_residual>
+              <<<grid, block, 0, stream>>>(
+                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
+                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
+                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
+                  var_epsilon, hidden_size,
+                  has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
+        });
+  });
+}
+
+void rms_norm_dynamic_per_token_quant(
+    torch::Tensor& out,           // [..., hidden_size]
+    torch::Tensor const& input,   // [..., hidden_size]
+    torch::Tensor const& weight,  // [hidden_size]
+    torch::Tensor& scales,        // [num_tokens]
+    double const var_epsilon,     // Variance epsilon used in norm calculation
+    std::optional<at::Tensor> scale_ub, std::optional<at::Tensor> residual) {
+  static c10::ScalarType kFp8Type = is_fp8_ocp()
+                                        ? c10::ScalarType::Float8_e4m3fn
+                                        : c10::ScalarType::Float8_e4m3fnuz;
+  TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
+  TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
+
+  if (scale_ub.has_value()) {
+    TORCH_CHECK(out.dtype() == kFp8Type);
+  }
+  TORCH_CHECK(weight.dtype() == input.dtype());
+  TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {
+        rms_norm_dynamic_per_token_quant_dispatch<scalar_t>(
+            out, input, weight, scales, var_epsilon, scale_ub, residual);
+      });
+}
+
+// Residual add + RMS norm + dynamic per token
+void rms_norm_per_block_quant_dispatch(
+    torch::Tensor& out,           // [..., hidden_size]
+    torch::Tensor const& input,   // [..., hidden_size]
+    torch::Tensor const& weight,  // [hidden_size]
+    torch::Tensor& scales,        // [num_tokens, hidden_size / group_size] or
+                                  // [hidden_size / group_size, num_tokens]
+    int32_t group_size,
+    double const var_epsilon,  // Variance epsilon used in norm calculation
+    std::optional<at::Tensor> const& scale_ub,
+    std::optional<at::Tensor>& residual, bool is_scale_transposed) {
+  int32_t hidden_size = input.size(-1);
+  auto num_tokens = input.numel() / hidden_size;
+
+  dim3 grid(num_tokens);
+  const int max_block_size = (num_tokens <= 256) ? 512 : 256;
+  dim3 block(std::min(hidden_size, max_block_size));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "rms_norm_per_block_quant_fp_dispatch", [&] {
+        using scalar_in_t = scalar_t;
+        VLLM_DISPATCH_GROUP_SIZE(group_size, gs, [&] {
+          VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
+            VLLM_DISPATCH_BOOL(is_scale_transposed, transpose_scale, [&] {
+              VLLM_DISPATCH_QUANT_TYPES(
+                  out.scalar_type(), "rms_norm_per_block_quant_kernel", [&] {
+                    vllm::rms_norm_per_block_quant_kernel<scalar_in_t, scalar_t,
+                                                          has_residual,
+                                                          transpose_scale, gs>
+                        <<<grid, block, 0, stream>>>(
+                            out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
+                            input.data_ptr<scalar_in_t>(),
+                            weight.data_ptr<scalar_in_t>(),
+                            scale_ub.has_value() ? scale_ub->data_ptr<float>()
+                                                 : nullptr,
+                            var_epsilon, hidden_size,
+                            has_residual ? residual->data_ptr<scalar_in_t>()
+                                         : nullptr);
+                  });
+            });
+          });
+        });
+      });
+}
+
+void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor const& weight,
+                              torch::Tensor& scales, double const var_epsilon,
+                              std::optional<torch::Tensor> scale_ub,
+                              std::optional<torch::Tensor> residual,
+                              int64_t group_size, bool is_scale_transposed) {
+  static c10::ScalarType kFp8Type = is_fp8_ocp()
+                                        ? c10::ScalarType::Float8_e4m3fn
+                                        : c10::ScalarType::Float8_e4m3fnuz;
+  TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
+  TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
+
+  if (scale_ub.has_value()) {
+    TORCH_CHECK(out.dtype() == kFp8Type);
+  }
+  TORCH_CHECK(weight.dtype() == input.dtype());
+  TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }
+
+  TORCH_CHECK(group_size == 128 || group_size == 64,
+              "Unsupported group size: ", group_size);
+
+  rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
+                                    var_epsilon, scale_ub, residual,
+                                    is_scale_transposed);
+}
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -0,0 +1,539 @@
+#pragma once
+
+/**
+ * __device__ layernorm utilities.
+ */
+
+#include "quantization/vectorization.cuh"
+#include "quantization/utils.cuh"
+#include "quant_conversions.cuh"
+
+#include "../../cub_helpers.h"
+#include "../../cuda_compat.h"
+
+namespace vllm {
+
+// has_residual must be true, if residual is not a nullptr
+template <typename scalar_t, bool has_residual = false>
+__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
+                            int32_t const hidden_size, float const epsilon,
+                            scalar_t const* __restrict__ residual = nullptr) {
+  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+  // sum of squares
+  float ss = 0.0f;
+
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+    float x = static_cast<float>(input[token_offset + i]);
+    if constexpr (has_residual) {
+      x += static_cast<float>(residual[token_offset + i]);
+    }
+
+    ss += x * x;
+  }
+
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
+
+  __shared__ float s_rms;
+  if (threadIdx.x == 0) {
+    s_rms = rsqrtf(ss / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  *rms = s_rms;
+}
+
+__device__ float warpReduceMaxSpecialized(volatile float* val, int64_t tid,
+                                          int64_t thread_in_warp,
+                                          int64_t reduced_elems) {
+  static_assert(WARP_SIZE == 32 || WARP_SIZE == 64);
+  if constexpr (WARP_SIZE == 64) {
+    if (thread_in_warp + 64 < reduced_elems)
+      val[tid] = fmaxf(val[tid], val[tid + 64]);
+  }
+  if (thread_in_warp + 32 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 32]);
+  if (thread_in_warp + 16 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 16]);
+  if (thread_in_warp + 8 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 8]);
+  if (thread_in_warp + 4 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 4]);
+  if (thread_in_warp + 2 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 2]);
+  if (thread_in_warp + 1 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 1]);
+  return val[tid];
+}
+
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false>
+__device__ void compute_dynamic_per_token_scales(
+    float* __restrict__ token_scale, float* __restrict__ all_token_scales,
+    scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
+    float const rms, float const* __restrict__ scale_ub,
+    int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
+    int32_t const group_size = 0) {
+  float block_absmax_val_maybe = 0.0f;
+  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
+  __syncthreads();
+  if (group_size > 0) {
+    __shared__ float s_max_vals[1024];
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+    int64_t num_groups = hidden_size / group_size;
+    int64_t const threads_per_group = blockDim.x / num_groups;
+    int64_t const thread_in_group = threadIdx.x % threads_per_group;
+    int64_t const group_offset = threadIdx.x / threads_per_group * group_size;
+    int64_t const thread_offset = group_offset + thread_in_group;
+    int64_t const thread_end =
+        min(group_offset + group_size, static_cast<int64_t>(hidden_size));
+    for (auto i = thread_offset; i < thread_end; i += threads_per_group) {
+      float x = static_cast<float>(input[token_offset + i]);
+      if constexpr (has_residual) {
+        x += static_cast<float>(residual[token_offset + i]);
+      }
+      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
+      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
+    }
+    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
+    __syncthreads();
+
+    int64_t const warp_size = WARP_SIZE;
+    int64_t const num_warps = blockDim.x / warp_size;
+    int64_t const warp_id = threadIdx.x / warp_size;
+    int64_t const thread_in_warp = threadIdx.x % warp_size;
+    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
+    for (auto i = 0; i < groups_per_warp; ++i) {
+      int64_t const group_id = i * num_warps + warp_id;
+      if (group_id < num_groups) {
+        int64_t warp_start = group_id * threads_per_group;
+        int64_t const start = warp_start + thread_in_warp;
+        int64_t const warp_end = min(warp_start + threads_per_group,
+                                     static_cast<int64_t>(hidden_size));
+        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
+          s_max_vals[start] =
+              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
+        }
+        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
+                                 min(warp_end - warp_start, warp_size));
+      }
+    }
+    __syncthreads();
+
+    if (thread_in_group == 0 && thread_offset < thread_end) {
+      block_absmax_val_maybe = s_max_vals[threadIdx.x];
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      // Global output store
+      if constexpr (is_scale_transposed) {
+        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
+                         blockIdx.x] = scale;
+      } else {
+        all_token_scales[blockIdx.x * num_groups +
+                         threadIdx.x / threads_per_group] = scale;
+      }
+    }
+    __syncthreads();
+  } else {
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+
+    for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+      float x = static_cast<float>(input[token_offset + i]);
+      if constexpr (has_residual) {
+        x += static_cast<float>(residual[token_offset + i]);
+      }
+
+      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
+      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
+    }
+    using BlockReduce = cub::BlockReduce<float, 1024>;
+    __shared__ typename BlockReduce::TempStorage reduceStore;
+    block_absmax_val_maybe =
+        BlockReduce(reduceStore)
+            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
+
+    __shared__ float s_token_scale;
+    if (threadIdx.x == 0) {
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      s_token_scale = scale;                 // Shared memory store
+      all_token_scales[blockIdx.x] = scale;  // Global output store
+    }
+    __syncthreads();
+
+    *token_scale = s_token_scale;
+  }
+}
+
+template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
+          bool has_residual = false, bool is_scale_transposed = false>
+__device__ void norm_and_quant(scalar_out_t* __restrict__ output,
+                               scalar_t const* __restrict__ input,
+                               scalar_t const* __restrict__ weight,
+                               float const rms, float* const scale,
+                               int32_t const hidden_size,
+                               scalar_t* __restrict__ residual = nullptr,
+                               int32_t const group_size = 0) {
+  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+    float x = static_cast<float>(input[token_offset + i]);
+    if constexpr (has_residual) {
+      x += static_cast<float>(residual[token_offset + i]);
+      residual[token_offset + i] = static_cast<scalar_t>(x);
+    }
+    // Norm
+    x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
+    // Quant
+    // If groupwise is_scale_inverted is true, so we invert the scale here.
+    int64_t scale_idx = 0;
+    if (group_size > 0) {
+      if constexpr (is_scale_transposed) {
+        scale_idx = (i / group_size) * gridDim.x + blockIdx.x;
+      } else {
+        scale_idx = blockIdx.x * (hidden_size / group_size) + i / group_size;
+      }
+    }
+    auto scale_val =
+        (group_size > 0
+             ? (is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx])
+             : *scale);
+    output[token_offset + i] =
+        ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale_val);
+  }
+}
+
+namespace vectorized {
+
+// Compute 1.0/rms(input)
+// hidden_size must be a multiple of 4
+template <typename scalar_t, bool has_residual = false>
+__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
+                            int32_t const hidden_size, float const epsilon,
+                            scalar_t const* __restrict__ residual = nullptr) {
+  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+
+  // Vectorized input/output to better utilize memory bandwidth.
+  vec4_t<scalar_t> const* vec_input =
+      reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+  vec4_t<scalar_t> const* vec_residual = nullptr;
+  if constexpr (has_residual) {
+    vec_residual =
+        reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
+  }
+
+  // sum of squares
+  float ss = 0.0f;
+
+  const int VEC_SIZE = 4;
+  int32_t const num_vec_elems = hidden_size >> 2;
+
+#pragma unroll 4
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+    vec4_t<scalar_t> in = vec_input[i];
+
+    vec4_t<float> x;
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
+    if constexpr (has_residual) {
+      vec4_t<scalar_t> r = vec_residual[i];
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
+    }
+
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      ss += x.val[j] * x.val[j];
+    }
+  }
+
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
+
+  __shared__ float s_rms;
+  if (threadIdx.x == 0) {
+    s_rms = rsqrtf(ss / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  *rms = s_rms;
+}
+
+// Vectorized version of vllm::compute_dynamic_per_token_scales
+// hidden_size must be a multiple of 4
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false, int32_t group_size = 0>
+__device__ void compute_dynamic_per_token_scales(
+    float* __restrict__ token_scale, float* __restrict__ all_token_scales,
+    scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
+    float const rms, float const* __restrict__ scale_ub,
+    int32_t const hidden_size,
+    scalar_t const* __restrict__ residual = nullptr) {
+  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
+
+  const int VEC_SIZE = 4;
+  float block_absmax_val_maybe = 0.0f;
+
+  // Vectorized input/weight/residual to better utilize memory bandwidth.
+  vec4_t<scalar_t> const* vec_input = nullptr;
+  vec4_t<scalar_t> const* vec_weight = nullptr;
+  vec4_t<scalar_t> const* vec_residual = nullptr;
+
+  if constexpr (group_size > 0) {
+    __shared__ float s_max_vals[1024];
+
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+    int64_t const num_groups = hidden_size / group_size;
+    int64_t const threads_per_group = blockDim.x / num_groups;
+    int64_t const thread_in_group = threadIdx.x % threads_per_group;
+    int64_t const group_offset =
+        threadIdx.x / threads_per_group * (group_size >> 2);
+    int64_t const thread_offset = group_offset + thread_in_group;
+    int64_t const thread_end = min(group_offset + (group_size >> 2),
+                                   static_cast<int64_t>(hidden_size >> 2));
+    vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
+    if constexpr (has_residual) {
+      vec_residual =
+          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
+    }
+    int32_t const num_vec_elems = thread_end;
+
+#pragma unroll 4
+    for (auto i = thread_offset; i < num_vec_elems; i += threads_per_group) {
+      vec4_t<scalar_t> in = vec_input[i];
+      vec4_t<scalar_t> const w = vec_weight[i];
+
+      vec4_t<float> x;
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] = static_cast<float>(in.val[j]);
+      }
+
+      if constexpr (has_residual) {
+        vec4_t<scalar_t> r = vec_residual[i];
+#pragma unroll
+        for (int j = 0; j < VEC_SIZE; ++j) {
+          x.val[j] += static_cast<float>(r.val[j]);
+        }
+      }
+
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        block_absmax_val_maybe =
+            fmaxf(block_absmax_val_maybe,
+                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+      }
+    }
+
+    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
+    __syncthreads();
+
+    int64_t const warp_size = WARP_SIZE;
+    int64_t const num_warps = blockDim.x / warp_size;
+    int64_t const warp_id = threadIdx.x / warp_size;
+    int64_t const thread_in_warp = threadIdx.x % warp_size;
+    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
+    for (auto i = 0; i < groups_per_warp; ++i) {
+      int64_t const group_id = i * num_warps + warp_id;
+      if (group_id < num_groups) {
+        int64_t warp_start = group_id * threads_per_group;
+        int64_t const start = warp_start + thread_in_warp;
+        int64_t const warp_end = min(warp_start + threads_per_group,
+                                     static_cast<int64_t>(hidden_size));
+        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
+          s_max_vals[start] =
+              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
+        }
+        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
+                                 min(warp_end - warp_start, warp_size));
+      }
+    }
+    __syncthreads();
+
+    if (thread_in_group == 0 && thread_offset < thread_end) {
+      block_absmax_val_maybe = s_max_vals[threadIdx.x];
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      // Global output store
+      if constexpr (is_scale_transposed) {
+        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
+                         blockIdx.x] = scale;
+      } else {
+        all_token_scales[blockIdx.x * num_groups +
+                         threadIdx.x / threads_per_group] = scale;
+      }
+    }
+    __syncthreads();
+
+  } else {
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+    vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
+    if constexpr (has_residual) {
+      vec_residual =
+          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
+    }
+
+    int32_t const num_vec_elems = (hidden_size >> 2);
+
+#pragma unroll 4
+    for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+      vec4_t<scalar_t> in = vec_input[i];
+      vec4_t<scalar_t> const w = vec_weight[i];
+
+      vec4_t<float> x;
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] = static_cast<float>(in.val[j]);
+      }
+
+      if constexpr (has_residual) {
+        vec4_t<scalar_t> r = vec_residual[i];
+#pragma unroll
+        for (int j = 0; j < VEC_SIZE; ++j) {
+          x.val[j] += static_cast<float>(r.val[j]);
+        }
+      }
+
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        block_absmax_val_maybe =
+            fmaxf(block_absmax_val_maybe,
+                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+      }
+    }
+
+    using BlockReduce = cub::BlockReduce<float, 1024>;
+    __shared__ typename BlockReduce::TempStorage reduceStore;
+    block_absmax_val_maybe =
+        BlockReduce(reduceStore)
+            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
+
+    __shared__ float s_token_scale;
+    if (threadIdx.x == 0) {
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      s_token_scale = scale;                 // shared memory store
+      all_token_scales[blockIdx.x] = scale;  // global output store
+    }
+    __syncthreads();
+
+    *token_scale = s_token_scale;
+  }
+}
+
+// hidden_size must be a multiple of 4
+template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
+          bool has_residual = false, bool is_scale_transposed = false,
+          int32_t group_size = 0>
+__device__ void norm_and_quant(scalar_out_t* __restrict__ output,
+                               scalar_t const* __restrict__ input,
+                               scalar_t const* __restrict__ weight,
+                               float const rms, float* const scale,
+                               int32_t const hidden_size,
+                               scalar_t* __restrict__ residual = nullptr) {
+  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+
+  // Vectorized input/output/weight/residual to better utilize memory bandwidth.
+  vec4_t<scalar_t> const* vec_input =
+      reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+  vec4_t<scalar_t> const* vec_weight =
+      reinterpret_cast<vec4_t<scalar_t> const*>(weight);
+  q8x4_t<scalar_out_t>* vec_output =
+      reinterpret_cast<q8x4_t<scalar_out_t>*>(&output[token_offset]);
+  vec4_t<scalar_t>* vec_residual = nullptr;
+  if constexpr (has_residual) {
+    vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
+  }
+
+  const int VEC_SIZE = 4;
+  int32_t const num_vec_elems = hidden_size >> 2;
+
+// TODO(luka/varun) extract into type-agnostic vectorized quant function to
+//  replace scaled_fp8_conversion_vec
+#pragma unroll 4
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+    vec4_t<scalar_t> const in = vec_input[i];
+    vec4_t<scalar_t> const w = vec_weight[i];
+
+    vec4_t<float> x;
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
+    if constexpr (has_residual) {
+      vec4_t<scalar_t> r = vec_residual[i];
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
+// Update residual
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        r.val[j] = static_cast<scalar_t>(x.val[j]);
+      }
+      vec_residual[i] = r;
+    }
+
+    q8x4_t<scalar_out_t> out;
+
+    float scale_val;
+
+    if constexpr (group_size > 0) {
+      int64_t const num_groups = hidden_size / group_size;
+      int64_t scale_idx = 0;
+      if constexpr (is_scale_transposed) {
+        scale_idx = (i * VEC_SIZE / group_size) * gridDim.x + blockIdx.x;
+      } else {
+        scale_idx = blockIdx.x * num_groups + i * VEC_SIZE / group_size;
+      }
+      scale_val =
+          is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx];
+    } else {
+      scale_val = *scale;
+    }
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
+          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale_val);
+    }
+    vec_output[i] = out;
+  }
+}
+
+}  // namespace vectorized
+
+}  // namespace vllm
--- a/csrc/quantization/fused_kernels/quant_conversions.cuh
+++ b/csrc/quantization/fused_kernels/quant_conversions.cuh
@@ -0,0 +1,90 @@
+#pragma once
+
+/**
+ * __device__ helper functions to deal with float -> quant datatype conversion
+ */
+
+#include "quantization/vectorization.cuh"
+// TODO(luka/varun):refactor common.cuh to use this file instead
+#include "quantization/w8a8/fp8/common.cuh"
+
+namespace vllm {
+
+// TODO(luka/varun): combine into common utilities for int8
+//  (with int8_quant_kernels.cu)
+static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) {
+#ifdef USE_ROCM
+  static const float i8_min =
+      static_cast<float>(std::numeric_limits<int8_t>::min());
+  static const float i8_max =
+      static_cast<float>(std::numeric_limits<int8_t>::max());
+  // round
+  float dst = std::nearbyint(x);
+  // saturate
+
+  // See https://github.com/pytorch/pytorch/issues/127666
+  // See https://github.com/llvm/llvm-project/issues/95183
+  // hip-clang std::clamp __glibcxx_assert_fail host function when building on
+  // Arch/gcc14. The following replaces std::clamp usage with similar logic
+  // dst = std::clamp(dst, i8_min, i8_max);
+  dst = (dst < i8_min) ? i8_min : (dst > i8_max) ? i8_max : dst;
+  return static_cast<int8_t>(dst);
+#else
+  // CUDA path
+  uint32_t dst;
+  asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x));
+  return reinterpret_cast<const int8_t&>(dst);
+#endif
+}
+
+template <typename fp8_type>
+static __device__ __forceinline__ fp8_type float_to_fp8(float const x) {
+  float const r =
+      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
+  return static_cast<fp8_type>(r);
+}
+
+template <typename quant_type_t, bool is_scale_inverted, typename enable = void>
+struct ScaledQuant;
+
+template <typename quant_type_t, bool is_scale_inverted>
+struct ScaledQuant<
+    quant_type_t, is_scale_inverted,
+    typename std::enable_if_t<std::is_same_v<quant_type_t, int8_t>>> {
+  static __device__ __forceinline__ quant_type_t quant_fn(float const x,
+                                                          float const scale) {
+    if constexpr (is_scale_inverted) {
+      return float_to_int8_rn(x * scale);
+    } else {
+      return float_to_int8_rn(x / scale);
+    }
+  }
+};
+
+template <typename quant_type_t, bool is_scale_inverted>
+struct ScaledQuant<quant_type_t, is_scale_inverted,
+                   typename std::enable_if_t<
+                       std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
+                       std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>>> {
+  static __device__ __forceinline__ quant_type_t quant_fn(float const x,
+                                                          float const scale) {
+    if constexpr (is_scale_inverted) {
+      return float_to_fp8<quant_type_t>(x * scale);
+    } else {
+      return float_to_fp8<quant_type_t>(x / scale);
+    }
+  }
+};
+
+template <typename scalar_t, typename quant_type_t, bool is_scale_inverted>
+__device__ void scaled_quant_conversion(quant_type_t* __restrict__ output,
+                                        scalar_t const* __restrict__ input,
+                                        float const scale, int const tid,
+                                        int const num_elements,
+                                        int const step) {
+  for (int i = tid; i < num_elements; i += step) {
+    output[i] = ScaledQuant<quant_type_t, is_scale_inverted>(input[i], scale);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@@ -0,0 +1,571 @@
+// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/convert.cu
+// Dequant functions
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = __int2half_rn(vui & 0xF);
+    v.y = __int2half_rn(vui >> 4);
+
+    v = __hsub2(v, __floats2half2_rn(8.0f, 8.0f));
+    v = __hmul2(v, {d, d});
+}
+
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = __int2half_rn(vui & 0xF);
+    v.y = __int2half_rn(vui >> 4);
+
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+}
+
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = __int2half_rn((x[ib].qs[iqs] >>  4) | xh_1);
+
+    v = __hsub2(v, __floats2half2_rn(16.0f, 16.0f));
+    v = __hmul2(v, {d, d});
+}
+
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = __int2half_rn((x[ib].qs[iqs] >>  4) | xh_1);
+
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+}
+
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x = __int2half_rn(x[ib].qs[iqs + 0]);
+    v.y = __int2half_rn(x[ib].qs[iqs + 1]);
+
+    v = __hmul2(v, {d, d});
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
+    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0]        = convert_from_half<dst_t>(v.x);
+    y[iybs + iqs + y_offset] = convert_from_half<dst_t>(v.y);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i   = blockIdx.x;
+    const block_q2_K * x = (const block_q2_K *) vx;
+
+    const auto tid = threadIdx.x;
+    const int n   = tid/32;
+    const int l   = tid - 32*n;
+    const int is  = 8*n + l/16;
+
+    const uint8_t q = x[i].qs[32*n + l];
+    dst_t * y = yy + i*QK_K + 128*n;
+
+    half dall = __low2half(x[i].dm);
+    half dmin = __high2half(x[i].dm);
+    y[l+ 0] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+0] >> 4))));
+    y[l+32] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+2] >> 4))));
+    y[l+64] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+4] >> 4))));
+    y[l+96] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+6] >> 4))));
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i = blockIdx.x;
+    const block_q3_K * x = (const block_q3_K *) vx;
+
+    const auto r = threadIdx.x/4;
+    const int tid = r/2;
+    const int is0 = r%2;
+    const int l0 = 16*is0 + 4*(threadIdx.x%4);
+    const int n = tid / 4;
+    const int j = tid - 4*n;
+
+    uint8_t m = 1 << (4*n + j);
+    int is = 8*n + 2*j + is0;
+    int shift = 2*j;
+
+    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
+    half d_all = x[i].d;
+    half dl = __hmul(d_all,  __int2half_rn(us - 32));
+
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
+    const uint8_t * q = x[i].qs + 32*n;
+    const uint8_t * hm = x[i].hmask;
+
+    for (int l = l0; l < l0+4; ++l) {
+        y[l] = convert_from_half<dst_t>(__hmul(dl,  __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))));
+    }
+}
+
+static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63; m = q[j + 4] & 63;
+    } else {
+        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const block_q4_K * x = (const block_q4_K *) vx;
+
+    const auto i = blockIdx.x;
+
+    // assume 32 threads
+    const auto tid = threadIdx.x;
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int is  = 2*il;
+    const int n   = 4;
+
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+
+    const half dall = __low2half(x[i].dm);
+    const half dmin = __high2half(x[i].dm);
+
+    const uint8_t * q = x[i].qs + 32*il + n*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const half d1 = __hmul(dall, __int2half_rn(sc));
+    const half m1 = __hmul(dmin,  __int2half_rn(m));
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const half d2 = __hmul(dall, __int2half_rn(sc));
+    const half m2 = __hmul(dmin, __int2half_rn(m));
+    for (int l = 0; l < n; ++l) {
+        y[l + 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1));
+        y[l +32] = convert_from_half<dst_t>(__hsub(__hmul(d2,  __int2half_rn(q[l] >> 4)), m2));
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const block_q5_K * x = (const block_q5_K *) vx;
+
+    const auto i = blockIdx.x;
+
+    // assume 64 threads - this is very slightly better than the one below
+    const auto tid = threadIdx.x;
+    const int il  = tid/16;   // il is in 0...3
+    const int ir  = tid%16;   // ir is in 0...15
+    const int is  = 2*il;     // is is in 0...6
+
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
+
+    const half dall = __low2half(x[i].dm);
+    const half dmin = __high2half(x[i].dm);
+
+    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
+    const uint8_t * qh = x[i].qh + 2*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const half d1 = __hmul(dall, __int2half_rn(sc)); const half m1 = __hmul(dmin, __int2half_rn(m));
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));
+
+    uint8_t   hm  = 1 << (2*il);
+    y[ 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1));
+    y[ 1] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1));
+    hm <<= 1;
+    y[32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[0] >>  4) + (qh[0] & hm ? 16 : 0))), m2));
+    y[33] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[1] >>  4) + (qh[1] & hm ? 16 : 0))), m2));
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const block_q6_K * x = (const block_q6_K *) vx;
+
+    const auto i = blockIdx.x;
+
+    // assume 64 threads - this is very slightly better than the one below
+    const auto tid = threadIdx.x;
+    const int ip  = tid/32;   // ip is 0 or 1
+    const int il  = tid - 32*ip; // 0...32
+    const int is  = 8*ip + il/16;
+
+    dst_t * y = yy + i*QK_K + 128*ip + il;
+
+    const half d = x[i].d;
+
+    const uint8_t * ql = x[i].ql + 64*ip + il;
+    const uint8_t   qh = x[i].qh[32*ip + il];
+    const int8_t  * sc = x[i].scales + is;
+
+    y[ 0] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))));
+    y[32] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))));
+    y[64] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32))));
+    y[96] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32))));
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i   = blockIdx.x;
+    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
+
+    const auto tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * aux8 = (const uint8_t *)q2;
+    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
+    const uint32_t aux32 = q2[2] | (q2[3] << 16);
+    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i   = blockIdx.x;
+    const block_iq2_xs * x = (const block_iq2_xs *) vx;
+
+    const auto tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
+    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i   = blockIdx.x;
+    const block_iq2_s * x = (const block_iq2_s *) vx;
+
+    const auto tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
+    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i   = blockIdx.x;
+    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
+
+    const auto tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t  * q3 = x[i].qs + 8*ib;
+    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
+    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
+    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i   = blockIdx.x;
+    const block_iq3_s * x = (const block_iq3_s *) vx;
+
+    const auto tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * qs = x[i].qs + 8*ib;
+    const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
+    const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
+    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
+    const uint8_t signs = x[i].signs[4*ib + il];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq1_s * x = (const block_iq1_s  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = __half2float(x[i].d) * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq1_m * x = (const block_iq1_m  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * sc = (const uint16_t *)x[i].scales;
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    const float d = __half2float(scale.f16) * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const auto i   = blockIdx.x;
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const auto tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[ib].qs + 4*il;
+    const float d = __half2float(x[ib].d);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const auto i   = blockIdx.x;
+    const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+    const auto tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
+    const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+template<typename dst_t>
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static to_cuda_ggml_t<dst_t> ggml_get_to_cuda(int64_t type) {
+    switch (type) {
+        case 2:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case 3:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case 6:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case 7:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case 8:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case 10:
+            return dequantize_row_q2_K_cuda;
+        case 11:
+            return dequantize_row_q3_K_cuda;
+        case 12:
+            return dequantize_row_q4_K_cuda;
+        case 13:
+            return dequantize_row_q5_K_cuda;
+        case 14:
+            return dequantize_row_q6_K_cuda;
+        case 16:
+            return dequantize_row_iq2_xxs_cuda;
+        case 17:
+            return dequantize_row_iq2_xs_cuda;
+        case 18:
+            return dequantize_row_iq3_xxs_cuda;
+        case 19:
+            return dequantize_row_iq1_s_cuda;
+        case 20:
+            return dequantize_row_iq4_nl_cuda;
+        case 21:
+            return dequantize_row_iq3_s_cuda;
+        case 22:
+            return dequantize_row_iq2_s_cuda;
+        case 23:
+            return dequantize_row_iq4_xs_cuda;
+        case 29:
+            return dequantize_row_iq1_m_cuda;
+        default:
+            return nullptr;
+    }
+}
--- a/csrc/quantization/gguf/ggml-common.h
+++ b/csrc/quantization/gguf/ggml-common.h
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@@ -0,0 +1,542 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "../../cuda_compat.h"
+#include "dispatch_utils.h"
+
+#include "ggml-common.h"
+#include "vecdotq.cuh"
+#include "dequantize.cuh"
+#include "mmvq.cuh"
+#include "mmq.cuh"
+#include "moe.cuh"
+#include "moe_vec.cuh"
+
+// Q8 gemv
+template <typename scalar_t>
+static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
+                                     void* __restrict__ vy, const int kx,
+                                     const int kx_padded) {
+  const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
+  if (ix >= kx_padded) {
+    return;
+  }
+  const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
+  const int i_padded = iy * kx_padded + ix;
+
+  block_q8_1* y = (block_q8_1*)vy;
+
+  const int ib = i_padded / QK8_1;   // block index
+  const int iqs = i_padded % QK8_1;  // quant index
+
+  const float xi = ix < kx ? static_cast<float>(x[iy * kx + ix]) : 0.0f;
+  float amax = fabsf(xi);
+  float sum = xi;
+
+#pragma unroll
+  for (int mask = 16; mask > 0; mask >>= 1) {
+    amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32));
+    sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32);
+  }
+
+  const float d = amax / 127;
+  const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+  y[ib].qs[iqs] = q;
+
+  if (iqs > 0) {
+    return;
+  }
+
+  y[ib].ds.x = __float2half(d);
+  y[ib].ds.y = __float2half(sum);
+}
+
+template <typename scalar_t>
+static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
+                                   const int ky, cudaStream_t stream) {
+  const int64_t kx_padded = (kx + 512 - 1) / 512 * 512;
+  const int block_num_x =
+      (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+  constexpr int MAX_BLOCK_SIZE = 65535;
+  for (int off = 0; off < ky; off += MAX_BLOCK_SIZE) {
+    const int num_blocks_y = std::min(ky, off + MAX_BLOCK_SIZE) - off;
+    const dim3 num_blocks(block_num_x, num_blocks_y, 1);
+    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(
+        &x[off * kx], (int32_t*)vy + off * (kx_padded / 32 * 9), kx, kx_padded);
+  }
+}
+
+torch::Tensor ggml_dequantize(torch::Tensor W,  // quant weight
+                              int64_t type, int64_t m, int64_t n,
+                              std::optional<at::ScalarType> const& dtype) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
+  auto dtype_ = dtype.value_or(torch::kFloat16);
+  auto options = torch::TensorOptions().dtype(dtype_).device(W.device());
+  at::Tensor DW = torch::empty({m, n}, options);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+
+  VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] {
+    auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
+    to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
+  });
+
+  return DW;
+}
+
+torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W,  // quant weight
+                                  torch::Tensor X,  // input
+                                  int64_t type, int64_t row) {
+  int col = X.sizes()[1];
+  int vecs = X.sizes()[0];
+  const int padded = (col + 512 - 1) / 512 * 512;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
+  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
+  at::Tensor Y = torch::empty({vecs, row}, options);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
+  at::Tensor quant_X = torch::empty({vecs, padded / 32 * 9}, options);
+  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_vec_a8", [&] {
+    quantize_row_q8_1_cuda<scalar_t>(
+        (scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), col, vecs, stream);
+    switch (type) {
+      case 2:
+        mul_mat_vec_q4_0_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 3:
+        mul_mat_vec_q4_1_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 6:
+        mul_mat_vec_q5_0_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 7:
+        mul_mat_vec_q5_1_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 8:
+        mul_mat_vec_q8_0_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 10:
+        mul_mat_vec_q2_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 11:
+        mul_mat_vec_q3_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 12:
+        mul_mat_vec_q4_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 13:
+        mul_mat_vec_q5_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 14:
+        mul_mat_vec_q6_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 16:
+        mul_mat_vec_iq2_xxs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 17:
+        mul_mat_vec_iq2_xs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 18:
+        mul_mat_vec_iq3_xxs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 19:
+        mul_mat_vec_iq1_s_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 20:
+        mul_mat_vec_iq4_nl_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 21:
+        mul_mat_vec_iq3_s_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 22:
+        mul_mat_vec_iq2_s_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 23:
+        mul_mat_vec_iq4_xs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+      case 29:
+        mul_mat_vec_iq1_m_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
+        break;
+    }
+  });
+  return Y;
+}
+
+torch::Tensor ggml_mul_mat_a8(torch::Tensor W,  // quant weight
+                              torch::Tensor X,  // input
+                              int64_t type, int64_t row) {
+  int col = X.sizes()[1];
+  int padded = (col + 512 - 1) / 512 * 512;
+  int batch = X.sizes()[0];
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
+  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
+  at::Tensor Y = torch::empty({batch, row}, options);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
+  at::Tensor quant_X = torch::empty({batch, padded / 32 * 9}, options);
+  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_a8", [&] {
+    quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(),
+                           col, batch, stream);
+
+    switch (type) {
+      case 2:
+        ggml_mul_mat_q4_0_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 3:
+        ggml_mul_mat_q4_1_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 6:
+        ggml_mul_mat_q5_0_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 7:
+        ggml_mul_mat_q5_1_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 8:
+        ggml_mul_mat_q8_0_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 10:
+        ggml_mul_mat_q2_K_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 11:
+        ggml_mul_mat_q3_K_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 12:
+        ggml_mul_mat_q4_K_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 13:
+        ggml_mul_mat_q5_K_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+      case 14:
+        ggml_mul_mat_q6_K_q8_1_cuda(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
+        break;
+    }
+  });
+  return Y;
+}
+
+torch::Tensor ggml_moe_a8(torch::Tensor X,  // input
+                          torch::Tensor W,  // expert weights
+                          torch::Tensor sorted_token_ids,
+                          torch::Tensor expert_ids,
+                          torch::Tensor num_tokens_post_padded, int64_t type,
+                          int64_t row, int64_t top_k, int64_t tokens) {
+  int col = X.sizes()[1];
+  int padded = (col + 512 - 1) / 512 * 512;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
+  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
+  at::Tensor Y = torch::empty({tokens * top_k, row}, options);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
+  at::Tensor quant_X = torch::empty({tokens, padded / 32 * 9}, options);
+  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_a8", [&] {
+    quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(),
+                           col, tokens, stream);
+    switch (type) {
+      case 2:
+        ggml_moe_q4_0_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 3:
+        ggml_moe_q4_1_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 6:
+        ggml_moe_q5_0_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 7:
+        ggml_moe_q5_1_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 8:
+        ggml_moe_q8_0_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 10:
+        ggml_moe_q2_K_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 11:
+        ggml_moe_q3_K_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 12:
+        ggml_moe_q4_K_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 13:
+        ggml_moe_q5_K_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+      case 14:
+        ggml_moe_q6_K_q8_1_cuda(
+            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
+            (int*)expert_ids.data_ptr(),
+            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
+            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
+        break;
+    }
+  });
+  return Y;
+}
+
+torch::Tensor ggml_moe_a8_vec(torch::Tensor X,  // input
+                              torch::Tensor W,  // expert weights
+                              torch::Tensor topk_ids, int64_t top_k,
+                              int64_t type, int64_t row, int64_t tokens) {
+  int col = X.sizes()[1];
+  const int padded = (col + 512 - 1) / 512 * 512;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
+  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
+  at::Tensor Y = torch::zeros({tokens * top_k, row}, options);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
+  at::Tensor quant_X = torch::empty({tokens, padded / 32 * 9}, options);
+  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_vec_a8", [&] {
+    quantize_row_q8_1_cuda<scalar_t>((scalar_t*)X.data_ptr(),
+                                     (void*)quant_X.data_ptr(), col, tokens,
+                                     stream);
+    switch (type) {
+      case 2:
+        moe_vec_q4_0_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 3:
+        moe_vec_q4_1_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 6:
+        moe_vec_q5_0_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 7:
+        moe_vec_q5_1_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 8:
+        moe_vec_q8_0_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 10:
+        moe_vec_q2_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 11:
+        moe_vec_q3_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 12:
+        moe_vec_q4_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 13:
+        moe_vec_q5_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 14:
+        moe_vec_q6_K_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 16:
+        moe_vec_iq2_xxs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 17:
+        moe_vec_iq2_xs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 18:
+        moe_vec_iq3_xxs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 19:
+        moe_vec_iq1_s_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 20:
+        moe_vec_iq4_nl_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 21:
+        moe_vec_iq3_s_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 22:
+        moe_vec_iq2_s_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 23:
+        moe_vec_iq4_xs_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+      case 29:
+        moe_vec_iq1_m_q8_1_cuda<scalar_t>(
+            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
+            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
+            col, row, quant_X.stride(0), stream);
+        break;
+    }
+  });
+  return Y;
+}
+
+int64_t ggml_moe_get_block_size(int64_t type) {
+  switch (type) {
+    case 2:
+      return MOE_X_Q4_0;
+    case 3:
+      return MOE_X_Q4_1;
+    case 6:
+      return MOE_X_Q5_0;
+    case 7:
+      return MOE_X_Q5_1;
+    case 8:
+      return MOE_X_Q8_0;
+    case 10:
+      return MOE_X_Q2_K;
+    case 11:
+      return MOE_X_Q3_K;
+    case 12:
+      return MOE_X_Q4_K;
+    case 13:
+      return MOE_X_Q5_K;
+    case 14:
+      return MOE_X_Q6_K;
+  }
+  return 0;
+}
--- a/csrc/quantization/gguf/mmq.cuh
+++ b/csrc/quantization/gguf/mmq.cuh
@@ -0,0 +1,610 @@
+// copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu
+template <typename scalar_t, int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+static __device__ __forceinline__ void mul_mat_q(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    const int blocks_per_row_x = ncols_x / qk;
+    const int blocks_per_col_y = nrows_y / QK8_1;
+    const int blocks_per_warp = WARP_SIZE_GGUF / qi;
+
+    const int & ncols_dst = ncols_y;
+
+    const auto row_dst_0 = blockIdx.x*mmq_y;
+    const int & row_x_0 = row_dst_0;
+
+    const auto col_dst_0 = blockIdx.y*mmq_x;
+    const int & col_y_0 = col_dst_0;
+
+    int   * tile_x_ql = nullptr;
+    half2 * tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE_GGUF];
+    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE_GGUF/QI8_1];
+
+    float sum[mmq_y/WARP_SIZE_GGUF][mmq_x/nwarps] = {{0.0f}};
+
+    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
+
+#pragma unroll
+        for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
+            const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
+            const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+                const int index_y = (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
+            }
+
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
+                const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
+                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE_GGUF/QI8_1) + kby].ds;
+                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE_GGUF/QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = __low2float(*dsi_src);
+                }
+            }
+
+            __syncthreads();
+
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE_GGUF/qr; k < (ir+1)*WARP_SIZE_GGUF/qr; k += vdr) {
+#pragma unroll
+                for (int j = 0; j < mmq_x; j += nwarps) {
+#pragma unroll
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
+                        sum[i/WARP_SIZE_GGUF][j/nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                            threadIdx.x + i, threadIdx.y + j, k);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+#pragma unroll
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const auto col_dst = col_dst_0 + j + threadIdx.y;
+        if (col_dst >= ncols_dst) {
+            return;
+        }
+
+#pragma unroll
+        for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
+            const auto row_dst = row_dst_0 + threadIdx.x + i;
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE_GGUF][j/nwarps];
+        }
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q4_0  64
+#define  MMQ_Y_Q4_0  128
+#define NWARPS_Q4_0  8
+#else
+#define  MMQ_X_Q4_0 4
+#define  MMQ_Y_Q4_0 32
+#define NWARPS_Q4_0 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_0, 2)
+#endif
+mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q4_0;
+    const int mmq_y  =  MMQ_Y_Q4_0;
+    const int nwarps = NWARPS_Q4_0;
+
+    mul_mat_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q4_0_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    int mmq_x  =  MMQ_X_Q4_0;
+    int mmq_y  =  MMQ_Y_Q4_0;
+    int nwarps = NWARPS_Q4_0;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q4_1 64
+#define  MMQ_Y_Q4_1 128
+#define NWARPS_Q4_1 8
+#else
+#define  MMQ_X_Q4_1 4
+#define  MMQ_Y_Q4_1 32
+#define NWARPS_Q4_1 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_1, 2)
+#endif
+mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q4_1;
+    const int mmq_y  =  MMQ_Y_Q4_1;
+    const int nwarps = NWARPS_Q4_1;
+
+    mul_mat_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q4_1_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    int mmq_x  =  MMQ_X_Q4_1;
+    int mmq_y  =  MMQ_Y_Q4_1;
+    int nwarps = NWARPS_Q4_1;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q5_0 64
+#define  MMQ_Y_Q5_0 128
+#define NWARPS_Q5_0 8
+#else
+#define  MMQ_X_Q5_0 4
+#define  MMQ_Y_Q5_0 32
+#define NWARPS_Q5_0 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_0, 2)
+#endif
+mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q5_0;
+    const int mmq_y  =  MMQ_Y_Q5_0;
+    const int nwarps = NWARPS_Q5_0;
+
+    mul_mat_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q5_0_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int mmq_x  =  MMQ_X_Q5_0;
+    const int mmq_y  =  MMQ_Y_Q5_0;
+    const int nwarps = NWARPS_Q5_0;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q5_1 64
+#define  MMQ_Y_Q5_1 128
+#define NWARPS_Q5_1 8
+#else
+#define  MMQ_X_Q5_1 4
+#define  MMQ_Y_Q5_1 32
+#define NWARPS_Q5_1 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_1, 2)
+#endif
+mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q5_1;
+    const int mmq_y  =  MMQ_Y_Q5_1;
+    const int nwarps = NWARPS_Q5_1;
+
+    mul_mat_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q5_1_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+    const int mmq_x  =  MMQ_X_Q5_1;
+    const int mmq_y  =  MMQ_Y_Q5_1;
+    const int nwarps = NWARPS_Q5_1;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q8_0 64
+#define  MMQ_Y_Q8_0 128
+#define NWARPS_Q8_0 8
+#else
+#define  MMQ_X_Q8_0 4
+#define  MMQ_Y_Q8_0 32
+#define NWARPS_Q8_0 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q8_0, 2)
+#endif
+mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q8_0;
+    const int mmq_y  =  MMQ_Y_Q8_0;
+    const int nwarps = NWARPS_Q8_0;
+
+    mul_mat_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q8_0_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+    const int mmq_x  =  MMQ_X_Q8_0;
+    const int mmq_y  =  MMQ_Y_Q8_0;
+    const int nwarps = NWARPS_Q8_0;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q2_K 64
+#define  MMQ_Y_Q2_K 128
+#define NWARPS_Q2_K 8
+#else
+#define  MMQ_X_Q2_K 4
+#define  MMQ_Y_Q2_K 32
+#define NWARPS_Q2_K 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q2_K, 2)
+#endif
+mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q2_K;
+    const int mmq_y  =  MMQ_Y_Q2_K;
+    const int nwarps = NWARPS_Q2_K;
+
+    mul_mat_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q2_K_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+    const int mmq_x  =  MMQ_X_Q2_K;
+    const int mmq_y  =  MMQ_Y_Q2_K;
+    const int nwarps = NWARPS_Q2_K;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q3_K 64
+#define  MMQ_Y_Q3_K 128
+#define NWARPS_Q3_K 8
+#else
+#define  MMQ_X_Q3_K 4
+#define  MMQ_Y_Q3_K 32
+#define NWARPS_Q3_K 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q3_K, 2)
+#endif
+mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+    const int mmq_x  =  MMQ_X_Q3_K;
+    const int mmq_y  =  MMQ_Y_Q3_K;
+    const int nwarps = NWARPS_Q3_K;
+
+    mul_mat_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q3_K_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int mmq_x  =  MMQ_X_Q3_K;
+    const int mmq_y  =  MMQ_Y_Q3_K;
+    const int nwarps = NWARPS_Q3_K;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q4_K 64
+#define  MMQ_Y_Q4_K 128
+#define NWARPS_Q4_K 8
+#else
+#define  MMQ_X_Q4_K 4
+#define  MMQ_Y_Q4_K 32
+#define NWARPS_Q4_K 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_K, 2)
+#endif
+mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q4_K;
+    const int mmq_y  =  MMQ_Y_Q4_K;
+    const int nwarps = NWARPS_Q4_K;
+
+    mul_mat_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q4_K_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+    const int mmq_x  =  MMQ_X_Q4_K;
+    const int mmq_y  =  MMQ_Y_Q4_K;
+    const int nwarps = NWARPS_Q4_K;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q5_K 64
+#define  MMQ_Y_Q5_K 128
+#define NWARPS_Q5_K 8
+#else
+#define  MMQ_X_Q5_K 4
+#define  MMQ_Y_Q5_K 32
+#define NWARPS_Q5_K 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_K, 2)
+#endif
+mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q5_K;
+    const int mmq_y  =  MMQ_Y_Q5_K;
+    const int nwarps = NWARPS_Q5_K;
+
+    mul_mat_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q5_K_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int mmq_x  =  MMQ_X_Q5_K;
+    const int mmq_y  =  MMQ_Y_Q5_K;
+    const int nwarps = NWARPS_Q5_K;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+#if defined(USE_ROCM)
+#define  MMQ_X_Q6_K 64
+#define  MMQ_Y_Q6_K 128
+#define NWARPS_Q6_K 8
+#else
+#define  MMQ_X_Q6_K 4
+#define  MMQ_Y_Q6_K 32
+#define NWARPS_Q6_K 4
+#endif
+
+template<typename scalar_t, bool need_check> static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q6_K, 2)
+#endif
+mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+    const int mmq_x  =  MMQ_X_Q6_K;
+    const int mmq_y  =  MMQ_Y_Q6_K;
+    const int nwarps = NWARPS_Q6_K;
+
+    mul_mat_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+}
+
+template<typename scalar_t>
+static void ggml_mul_mat_q6_K_q8_1_cuda(
+    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+    const int mmq_x  =  MMQ_X_Q6_K;
+    const int mmq_y  =  MMQ_Y_Q6_K;
+    const int nwarps = NWARPS_Q6_K;
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
--- a/csrc/quantization/gguf/mmvq.cuh
+++ b/csrc/quantization/gguf/mmvq.cuh
@@ -0,0 +1,212 @@
+// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
+template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows, const int nvecs) {
+    const auto row = blockIdx.x*blockDim.y + threadIdx.y;
+    const auto vec = blockIdx.y;
+
+    if (row >= nrows || vec >= nvecs) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+    const int nrows_y = (ncols + 512 - 1) / 512 * 512;
+
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = vec*(nrows_y/QK8_1) + i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs  = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
+        tmp += VLLM_SHFL_XOR_SYNC(tmp, mask);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[vec*nrows + row] = tmp;
+    }
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
+
+template<typename scalar_t>
+static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, nvecs, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<scalar_t, QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
+}
--- a/csrc/quantization/gguf/moe.cuh
+++ b/csrc/quantization/gguf/moe.cuh
@@ -0,0 +1,739 @@
+#include <cstdint>
+
+/* Adapted from ./csrc/quantization/gguf/mmq.cuh
+   based on ./vllm/model_executor/layers/fused_moe/fused_moe.py */
+template <typename scalar_t, int qk, int qr, int qi, bool need_sum,
+          typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles,
+          int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+static __device__ __forceinline__ void moe_q(
+    const void* __restrict__ vx, const void* __restrict__ vy,
+    scalar_t* __restrict__ dst, const int* __restrict__ sorted_token_ids,
+    const int* __restrict__ expert_ids,
+    const int* __restrict__ num_tokens_post_padded, const int exp_stride,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y,
+    const int nrows_dst, const int top_k) {
+  const int blocks_per_row_x = ncols_x / qk;
+  const int blocks_per_col_y = nrows_y / QK8_1;
+  const int blocks_per_warp = WARP_SIZE_GGUF / qi;
+
+  const int ncols_dst = ncols_y * top_k;
+
+  const auto row_dst_0 = blockIdx.x * mmq_y;
+  const int& row_x_0 = row_dst_0;
+
+  const auto col_dst_0 = blockIdx.y * mmq_x;
+
+  int token_offs[mmq_x / nwarps];
+  for (int i = 0; i < mmq_x; i += nwarps) {
+    token_offs[i / nwarps] = sorted_token_ids[col_dst_0 + threadIdx.y + i];
+  }
+
+  const int exp_idx = expert_ids[blockIdx.y];
+  if (exp_idx > 255 || exp_idx < 0) return;
+  if (blockIdx.y * mmq_x > num_tokens_post_padded[0]) return;
+
+  const block_q_t* x = (const block_q_t*)((char*)vx + exp_idx * exp_stride);
+  const block_q8_1* y = (const block_q8_1*)(vy);
+
+  int* tile_x_ql = nullptr;
+  half2* tile_x_dm = nullptr;
+  int* tile_x_qh = nullptr;
+  int* tile_x_sc = nullptr;
+
+  allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+  __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF];
+  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF / QI8_1];
+
+  float sum[mmq_y / WARP_SIZE_GGUF][mmq_x / nwarps] = {{0.0f}};
+
+  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+    load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
+               tile_x_qh, tile_x_sc, threadIdx.y, nrows_x - row_x_0 - 1,
+               threadIdx.x, blocks_per_row_x);
+
+    const int n_per_r = ((qk * blocks_per_warp) / qr);
+#pragma unroll
+    for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
+      const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
+      const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+      for (int i = 0; i < mmq_x; i += nwarps) {
+        const int col_y_eff = token_offs[i / nwarps] / top_k;
+        const int block_x = ib0 * (qk / QK8_1) + kbxd;
+        if (col_y_eff < ncols_y && block_x < blocks_per_col_y) {
+          const block_q8_1* by0 = &y[col_y_eff * blocks_per_col_y + block_x];
+          const int index_y =
+              (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF;
+          tile_y_qs[index_y] =
+              get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
+        }
+      }
+
+      if (threadIdx.x < n_per_r / QK8_1) {
+        const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
+        const int col_y_eff = token_offs[threadIdx.y] / top_k;
+        const int block_x =
+            ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
+
+        if (col_y_eff < ncols_y && block_x < blocks_per_col_y) {
+          const half2* dsi_src = &y[col_y_eff * blocks_per_col_y + block_x].ds;
+          half2* dsi_dst =
+              &tile_y_ds[threadIdx.y * (WARP_SIZE_GGUF / QI8_1) + kby];
+
+          if (need_sum) {
+            *dsi_dst = *dsi_src;
+          } else {
+            float* dfi_dst = (float*)dsi_dst;
+            *dfi_dst = __low2float(*dsi_src);
+          }
+        }
+      }
+      __syncthreads();
+
+      // #pragma unroll // unrolling this loop causes too much register pressure
+      for (int k = ir * WARP_SIZE_GGUF / qr; k < (ir + 1) * WARP_SIZE_GGUF / qr;
+           k += vdr) {
+#pragma unroll
+        for (int j = 0; j < mmq_x; j += nwarps) {
+#pragma unroll
+          for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
+            sum[i / WARP_SIZE_GGUF][j / nwarps] +=
+                vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs,
+                        tile_y_ds, threadIdx.x + i, threadIdx.y + j, k);
+          }
+        }
+      }
+      __syncthreads();
+    }
+  }
+
+#pragma unroll
+  for (int j = 0; j < mmq_x; j += nwarps) {
+    const int col_dst = token_offs[j / nwarps];
+    if (col_dst >= ncols_dst) {
+      return;
+    }
+
+#pragma unroll
+    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
+      const auto row_dst = row_dst_0 + threadIdx.x + i;
+      if (row_dst >= nrows_dst) {
+        continue;
+      }
+      dst[col_dst * nrows_dst + row_dst] = sum[i / WARP_SIZE_GGUF][j / nwarps];
+    }
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q4_0 8
+  #define MOE_Y_Q4_0 128
+  #define NWARPS_Q4_0 8
+#else
+  #define MOE_X_Q4_0 4
+  #define MOE_Y_Q4_0 32
+  #define NWARPS_Q4_0 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2)
+#endif
+    moe_q4_0(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q4_0;
+  const int mmq_y = MOE_Y_Q4_0;
+  const int nwarps = NWARPS_Q4_0;
+
+  moe_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q4_0<mmq_y>, load_tiles_q4_0<mmq_y, nwarps, need_check>,
+        VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q4_0_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  int mmq_x = MOE_X_Q4_0;
+  int mmq_y = MOE_Y_Q4_0;
+  int nwarps = NWARPS_Q4_0;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q4_1 8
+  #define MOE_Y_Q4_1 128
+  #define NWARPS_Q4_1 8
+#else
+  #define MOE_X_Q4_1 4
+  #define MOE_Y_Q4_1 32
+  #define NWARPS_Q4_1 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_1, 2)
+#endif
+    moe_q4_1(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q4_1;
+  const int mmq_y = MOE_Y_Q4_1;
+  const int nwarps = NWARPS_Q4_1;
+
+  moe_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q4_1<mmq_y>, load_tiles_q4_1<mmq_y, nwarps, need_check>,
+        VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q4_1_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  int mmq_x = MOE_X_Q4_1;
+  int mmq_y = MOE_Y_Q4_1;
+  int nwarps = NWARPS_Q4_1;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q5_0 8
+  #define MOE_Y_Q5_0 128
+  #define NWARPS_Q5_0 8
+#else
+  #define MOE_X_Q5_0 4
+  #define MOE_Y_Q5_0 32
+  #define NWARPS_Q5_0 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2)
+#endif
+    moe_q5_0(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q5_0;
+  const int mmq_y = MOE_Y_Q5_0;
+  const int nwarps = NWARPS_Q5_0;
+
+  moe_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q5_0<mmq_y>, load_tiles_q5_0<mmq_y, nwarps, need_check>,
+        VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q5_0_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q5_0;
+  const int mmq_y = MOE_Y_Q5_0;
+  const int nwarps = NWARPS_Q5_0;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q5_1 8
+  #define MOE_Y_Q5_1 128
+  #define NWARPS_Q5_1 8
+#else
+  #define MOE_X_Q5_1 4
+  #define MOE_Y_Q5_1 32
+  #define NWARPS_Q5_1 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2)
+#endif
+    moe_q5_1(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q5_1;
+  const int mmq_y = MOE_Y_Q5_1;
+  const int nwarps = NWARPS_Q5_1;
+
+  moe_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q5_1<mmq_y>, load_tiles_q5_1<mmq_y, nwarps, need_check>,
+        VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q5_1_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q5_1;
+  const int mmq_y = MOE_Y_Q5_1;
+  const int nwarps = NWARPS_Q5_1;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q8_0 8
+  #define MOE_Y_Q8_0 128
+  #define NWARPS_Q8_0 8
+#else
+  #define MOE_X_Q8_0 4
+  #define MOE_Y_Q8_0 32
+  #define NWARPS_Q8_0 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2)
+#endif
+    moe_q8_0(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q8_0;
+  const int mmq_y = MOE_Y_Q8_0;
+  const int nwarps = NWARPS_Q8_0;
+
+  moe_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q8_0<mmq_y>, load_tiles_q8_0<mmq_y, nwarps, need_check>,
+        VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q8_0_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q8_0;
+  const int mmq_y = MOE_Y_Q8_0;
+  const int nwarps = NWARPS_Q8_0;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q2_K 8
+  #define MOE_Y_Q2_K 128
+  #define NWARPS_Q2_K 8
+#else
+  #define MOE_X_Q2_K 4
+  #define MOE_Y_Q2_K 32
+  #define NWARPS_Q2_K 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2)
+#endif
+    moe_q2_K(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q2_K;
+  const int mmq_y = MOE_Y_Q2_K;
+  const int nwarps = NWARPS_Q2_K;
+
+  moe_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q2_K<mmq_y>, load_tiles_q2_K<mmq_y, nwarps, need_check>,
+        VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q2_K_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q2_K;
+  const int mmq_y = MOE_Y_Q2_K;
+  const int nwarps = NWARPS_Q2_K;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q3_K 8
+  #define MOE_Y_Q3_K 128
+  #define NWARPS_Q3_K 8
+#else
+  #define MOE_X_Q3_K 4
+  #define MOE_Y_Q3_K 32
+  #define NWARPS_Q3_K 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2)
+#endif
+    moe_q3_K(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+
+  const int mmq_x = MOE_X_Q3_K;
+  const int mmq_y = MOE_Y_Q3_K;
+  const int nwarps = NWARPS_Q3_K;
+
+  moe_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q3_K<mmq_y>, load_tiles_q3_K<mmq_y, nwarps, need_check>,
+        VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+template <typename scalar_t>
+static void ggml_moe_q3_K_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q3_K;
+  const int mmq_y = MOE_Y_Q3_K;
+  const int nwarps = NWARPS_Q3_K;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q4_K 8
+  #define MOE_Y_Q4_K 128
+  #define NWARPS_Q4_K 8
+#else
+  #define MOE_X_Q4_K 4
+  #define MOE_Y_Q4_K 32
+  #define NWARPS_Q4_K 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2)
+#endif
+    moe_q4_K(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q4_K;
+  const int mmq_y = MOE_Y_Q4_K;
+  const int nwarps = NWARPS_Q4_K;
+
+  moe_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q4_K<mmq_y>, load_tiles_q4_K<mmq_y, nwarps, need_check>,
+        VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q4_K_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q4_K;
+  const int mmq_y = MOE_Y_Q4_K;
+  const int nwarps = NWARPS_Q4_K;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q5_K 8
+  #define MOE_Y_Q5_K 128
+  #define NWARPS_Q5_K 8
+#else
+  #define MOE_X_Q5_K 4
+  #define MOE_Y_Q5_K 32
+  #define NWARPS_Q5_K 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2)
+#endif
+    moe_q5_K(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q5_K;
+  const int mmq_y = MOE_Y_Q5_K;
+  const int nwarps = NWARPS_Q5_K;
+
+  moe_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q5_K<mmq_y>, load_tiles_q5_K<mmq_y, nwarps, need_check>,
+        VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q5_K_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q5_K;
+  const int mmq_y = MOE_Y_Q5_K;
+  const int nwarps = NWARPS_Q5_K;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
+
+#if defined(USE_ROCM)
+  #define MOE_X_Q6_K 8
+  #define MOE_Y_Q6_K 128
+  #define NWARPS_Q6_K 8
+#else
+  #define MOE_X_Q6_K 4
+  #define MOE_Y_Q6_K 32
+  #define NWARPS_Q6_K 4
+#endif
+
+template <typename scalar_t, bool need_check>
+static __global__ void
+#if defined(USE_ROCM)
+__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2)
+#endif
+    moe_q6_K(const void* __restrict__ vx, const void* __restrict__ vy,
+             scalar_t* __restrict__ dst, const int* sorted_token_ids,
+             const int* expert_ids, const int* num_tokens_post_padded,
+             const int exp_stride, const int ncols_x, const int nrows_x,
+             const int ncols_y, const int nrows_y, const int nrows_dst,
+             const int top_k) {
+  const int mmq_x = MOE_X_Q6_K;
+  const int mmq_y = MOE_Y_Q6_K;
+  const int nwarps = NWARPS_Q6_K;
+
+  moe_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
+        allocate_tiles_q6_K<mmq_y>, load_tiles_q6_K<mmq_y, nwarps, need_check>,
+        VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>(
+      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+}
+
+template <typename scalar_t>
+static void ggml_moe_q6_K_q8_1_cuda(
+    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
+    const int* expert_ids, const int* num_tokens_post_padded,
+    const int exp_stride, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
+    const int tokens_post_padded, cudaStream_t stream) {
+  const int mmq_x = MOE_X_Q6_K;
+  const int mmq_y = MOE_Y_Q6_K;
+  const int nwarps = NWARPS_Q6_K;
+
+  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+  const int block_num_y = (tokens_post_padded) / mmq_x;
+  const dim3 block_nums(block_num_x, block_num_y, 1);
+  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
+
+  if (nrows_x % mmq_y == 0) {
+    constexpr bool need_check = false;
+    moe_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  } else {
+    constexpr bool need_check = true;
+    moe_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
+        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
+        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
+  }
+}
--- a/csrc/quantization/gguf/moe_vec.cuh
+++ b/csrc/quantization/gguf/moe_vec.cuh
@@ -0,0 +1,338 @@
+// copied and adapted from
+// https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
+template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr,
+          vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void moe_vec_q(const void* __restrict__ vx,
+                                 const void* __restrict__ vy,
+                                 scalar_t* __restrict__ dst,
+                                 const int* topk_ids, const int topk,
+                                 const int ncols, const int nrows,
+                                 const int token_stride) {
+  const auto row = blockIdx.x * blockDim.y + threadIdx.y;
+
+  const auto token = blockIdx.z / topk;
+  const auto expert = (topk_ids)[blockIdx.z];
+
+  if (row >= nrows) {
+    return;
+  }
+
+  const int blocks_per_row = ncols / qk;
+  const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+  // partial sum for each thread
+  float tmp = 0.0f;
+
+  const block_q_t* x = ((const block_q_t*)vx) + expert * nrows * blocks_per_row;
+  const block_q8_1* y =
+      (const block_q8_1*)(((const int*)vy) + token * token_stride);
+
+  for (auto i = threadIdx.x / (qi / vdr); i < blocks_per_row;
+       i += blocks_per_warp) {
+    const int ibx = row * blocks_per_row + i;  // x block index
+
+    const int iby = i * (qk / QK8_1);  // y block index that aligns with ibx
+
+    const int iqs =
+        vdr *
+        (threadIdx.x %
+         (qi / vdr));  // x block quant index when casting the quants to int
+
+    tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+  }
+
+  // sum up partial sums and write back result
+#pragma unroll
+  for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    tmp += VLLM_SHFL_XOR_SYNC(tmp, mask);
+  }
+
+  if (threadIdx.x == 0) {
+    dst[blockIdx.z * nrows + row] = tmp;
+  }
+}
+
+template <typename scalar_t>
+static void moe_vec_q4_0_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
+            vec_dot_q4_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q4_1_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
+            vec_dot_q4_1_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q5_0_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
+            vec_dot_q5_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q5_1_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
+            vec_dot_q5_1_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q8_0_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
+            vec_dot_q8_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q2_K_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
+            vec_dot_q2_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q3_K_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
+            vec_dot_q3_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q4_K_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
+            vec_dot_q4_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q5_K_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
+            vec_dot_q5_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_q6_K_q8_1_cuda(const void* vx, const void* vy,
+                                   scalar_t* dst, const int* topk_ids,
+                                   const int top_k, const int tokens,
+                                   const int ncols, const int nrows,
+                                   const int token_stride,
+                                   cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
+            vec_dot_q6_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq2_xxs_q8_1_cuda(const void* vx, const void* vy,
+                                      scalar_t* dst, const int* topk_ids,
+                                      const int top_k, const int tokens,
+                                      const int ncols, const int nrows,
+                                      const int token_stride,
+                                      cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq2_xs_q8_1_cuda(const void* vx, const void* vy,
+                                     scalar_t* dst, const int* topk_ids,
+                                     const int top_k, const int tokens,
+                                     const int ncols, const int nrows,
+                                     const int token_stride,
+                                     cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq2_s_q8_1_cuda(const void* vx, const void* vy,
+                                    scalar_t* dst, const int* topk_ids,
+                                    const int top_k, const int tokens,
+                                    const int ncols, const int nrows,
+                                    const int token_stride,
+                                    cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq3_xxs_q8_1_cuda(const void* vx, const void* vy,
+                                      scalar_t* dst, const int* topk_ids,
+                                      const int top_k, const int tokens,
+                                      const int ncols, const int nrows,
+                                      const int token_stride,
+                                      cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq1_s_q8_1_cuda(const void* vx, const void* vy,
+                                    scalar_t* dst, const int* topk_ids,
+                                    const int top_k, const int tokens,
+                                    const int ncols, const int nrows,
+                                    const int token_stride,
+                                    cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq1_m_q8_1_cuda(const void* vx, const void* vy,
+                                    scalar_t* dst, const int* topk_ids,
+                                    const int top_k, const int tokens,
+                                    const int ncols, const int nrows,
+                                    const int token_stride,
+                                    cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq4_nl_q8_1_cuda(const void* vx, const void* vy,
+                                     scalar_t* dst, const int* topk_ids,
+                                     const int top_k, const int tokens,
+                                     const int ncols, const int nrows,
+                                     const int token_stride,
+                                     cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ,
+            vec_dot_iq4_nl_q8_1><<<block_nums, block_dims, 0, stream>>>(
+      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq4_xs_q8_1_cuda(const void* vx, const void* vy,
+                                     scalar_t* dst, const int* topk_ids,
+                                     const int top_k, const int tokens,
+                                     const int ncols, const int nrows,
+                                     const int token_stride,
+                                     cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
+
+template <typename scalar_t>
+static void moe_vec_iq3_s_q8_1_cuda(const void* vx, const void* vy,
+                                    scalar_t* dst, const int* topk_ids,
+                                    const int top_k, const int tokens,
+                                    const int ncols, const int nrows,
+                                    const int token_stride,
+                                    cudaStream_t stream) {
+  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+  const dim3 block_nums(block_num_y, 1, tokens * top_k);
+  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+  moe_vec_q<scalar_t, QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
+      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
+                                              ncols, nrows, token_stride);
+}
--- a/csrc/quantization/gguf/vecdotq.cuh
+++ b/csrc/quantization/gguf/vecdotq.cuh
--- a/csrc/quantization/gptq/compat.cuh
+++ b/csrc/quantization/gptq/compat.cuh
@@ -0,0 +1,64 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _compat_cuh
+#define _compat_cuh
+
+namespace vllm {
+namespace gptq {
+// atomicAdd for half types, to support CC < 7.x
+
+__device__ __forceinline__ void atomicAdd_half(half* address, half val) {
+  unsigned int* address_as_ui =
+      (unsigned int*)((char*)address - ((size_t)address & 2));
+  unsigned int old = *address_as_ui;
+  unsigned int assumed;
+
+  do {
+    assumed = old;
+    __half_raw hsum;
+    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    half tmpres = __hadd(hsum, val);
+    hsum = __half_raw(tmpres);
+    old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16)
+                              : (old & 0xffff0000) | hsum.x;
+    old = atomicCAS(address_as_ui, assumed, old);
+  } while (assumed != old);
+}
+
+// atomicAdd for half2 types
+
+__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) {
+  unsigned int* address_as_ui = (unsigned int*)address;
+  unsigned int old = *address_as_ui;
+  unsigned int assumed;
+  do {
+    assumed = old;
+    half2 old_val = *((half2*)&old);
+    half2 new_val = __hadd2(old_val, val);
+    old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
+  } while (assumed != old);
+}
+
+//
+
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+  #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
+
+__device__ __forceinline__ void atomicAdd(half* address, half val) {
+  atomicAdd_half(address, val);
+}
+
+    #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
+__device__ __forceinline__ void atomicAdd(half2* address, half2 val) {
+  atomicAdd_half2(address, val);
+}
+    #endif
+
+  #endif
+#endif
+
+}  // namespace gptq
+}  // namespace vllm
+#endif
--- a/csrc/quantization/gptq/matrix_view.cuh
+++ b/csrc/quantization/gptq/matrix_view.cuh
@@ -0,0 +1,295 @@
+/*
+Adapted from https://github.com/turboderp/exllamav2 and
+https://github.com/turboderp/exllama
+*/
+
+#ifndef _matrix_view_cuh
+#define _matrix_view_cuh
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "qdq_util.cuh"
+
+namespace vllm {
+namespace gptq {
+
+class MatrixView_half {
+ public:
+  const half* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_half(const half* data, const int height,
+                                             const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ half item(int row, int column) const {
+    return data[row * width + column];
+  }
+  __device__ __forceinline__ half2 item_half2(int row, int column) const {
+    return ((half2*)data)[(row * width + column) / 2];
+  }
+  __device__ __forceinline__ half2 item_half2half2(int row, int column) const {
+    return __half2half2(data[row * width + column]);
+  }
+  __device__ __forceinline__ const half* item_ptr(int row, int column) const {
+    return &data[row * width + column];
+  }
+
+  __device__ __forceinline__ void item4(half (&items)[4], int row,
+                                        int column) const {
+    half2* ptr = (half2*)item_ptr(row, column);
+    half2 i01 = ptr[0];
+    half2 i23 = ptr[1];
+    items[0] = __low2half(i01);
+    items[1] = __high2half(i01);
+    items[2] = __low2half(i23);
+    items[3] = __high2half(i23);
+  }
+  __device__ __forceinline__ void item4_f(float (&items)[4], int row,
+                                          int column) const {
+    half2* ptr = (half2*)item_ptr(row, column);
+    half2 i01 = ptr[0];
+    half2 i23 = ptr[1];
+    items[0] = __half2float(__low2half(i01));
+    items[1] = __half2float(__high2half(i01));
+    items[2] = __half2float(__low2half(i23));
+    items[3] = __half2float(__high2half(i23));
+  }
+
+  __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row,
+                                           int column) const {
+    half2* ptr = (half2*)item_ptr(row, column);
+    half2 i01 = ptr[0];
+    half2 i23 = ptr[1];
+    items[0] = __half2half2(__low2half(i01));
+    items[1] = __half2half2(__high2half(i01));
+    items[2] = __half2half2(__low2half(i23));
+    items[3] = __half2half2(__high2half(i23));
+  }
+};
+
+class MatrixView_half_rw {
+ public:
+  half* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_half_rw(half* data, const int height,
+                                                const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ half item(int row, int column) const {
+    return data[row * width + column];
+  }
+  __device__ __forceinline__ half2 item_half2(int row, int column) const {
+    return ((half2*)data)[(row * width + column) / 2];
+  }
+  __device__ __forceinline__ half* item_ptr(int row, int column) {
+    return &data[row * width + column];
+  }
+  __device__ __forceinline__ void set(int row, int column, half value) {
+    data[row * width + column] = value;
+  }
+  __device__ __forceinline__ void set_half2(int row, int column, half2 value) {
+    ((half2*)data)[(row * width + column) / 2] = value;
+  }
+
+  __device__ __forceinline__ void set4(int row, int column, half v0, half v1,
+                                       half v2, half v3) {
+    half2 v01 = __halves2half2(v0, v1);
+    half2 v23 = __halves2half2(v2, v3);
+    half2* ptr = (half2*)item_ptr(row, column);
+    ptr[0] = v01;
+    ptr[1] = v23;
+  }
+};
+
+class MatrixView_q4_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data,
+                                               const int height,
+                                               const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (column & 0x07) * 4;
+    return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
+  }
+
+  __device__ __forceinline__ void item2(int (&items)[2], int row,
+                                        int column) const {
+    int shift = (column & 0x07) * 4;
+    uint32_t d = data[row * width / 8 + column / 8] >> shift;
+    items[0] = d & 0x0f;
+    items[1] = (d >> 4) & 0x0f;
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row,
+                                        int column) const {
+    int shift = (column & 0x07) * 4;
+    uint32_t d = data[row * width / 8 + column / 8] >> shift;
+    items[0] = d & 0x0f;
+    items[1] = (d >> 4) & 0x0f;
+    items[2] = (d >> 8) & 0x0f;
+    items[3] = (d >> 12) & 0x0f;
+  }
+};
+
+class MatrixView_q4_column {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data,
+                                                  const int height,
+                                                  const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (row & 0x07) * 4;
+    return (data[row / 8 * width + column] >> shift) & 0x0f;
+  }
+
+  __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) {
+    return data[row / 8 * width + column];
+  }
+  __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row,
+                                                             int column) {
+    return &data[row / 8 * width + column];
+  }
+};
+
+class MatrixView_q2_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q2_row(const uint32_t* data,
+                                               const int height,
+                                               const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (column & 0x0f) * 2;
+    return (data[row * width / 16 + column / 16] >> shift) & 0x03;
+  }
+
+  __device__ __forceinline__ void item2(int (&items)[2], int row,
+                                        int column) const {
+    int shift = (column & 0x0f) * 2;
+    uint32_t d = data[row * width / 16 + column / 16] >> shift;
+    items[0] = d & 0x03;
+    items[1] = (d >> 2) & 0x03;
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row,
+                                        int column) const {
+    int shift = (column & 0x0f) * 2;
+    uint32_t d = data[row * width / 16 + column / 16] >> shift;
+    items[0] = d & 0x03;
+    items[1] = (d >> 2) & 0x03;
+    items[2] = (d >> 4) & 0x03;
+    items[3] = (d >> 6) & 0x03;
+  }
+};
+
+class MatrixView_q3_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q3_row(const uint32_t* data,
+                                               const int height,
+                                               const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int z_w = column * 3 / 32;
+    int z_mod = column & 0x1f;
+
+    if (z_mod == 10) {
+      return (data[row * width * 3 / 32 + z_w] >> 30) |
+             ((data[row * width * 3 / 32 + (z_w + 1)] << 2) & 0x4);
+    } else if (z_mod == 21) {
+      return (data[row * width * 3 / 32 + z_w] >> 31) |
+             ((data[row * width * 3 / 32 + (z_w + 1)] << 1) & 0x6);
+    } else if (z_mod < 10) {
+      return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3)) & 0x07;
+    } else if (z_mod < 21) {
+      return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 32)) & 0x07;
+    } else {
+      return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 64)) & 0x07;
+    }
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row,
+                                        int column) const {
+    int shift = (column & 0x1f);
+    uint32_t d;
+    if (shift <= 4) {
+      d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3);
+    } else if (shift == 8) {
+      d = (data[row * width / 32 * 3 + column * 3 / 32] >> 24) |
+          ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0x0f) << 8);
+    } else if (shift <= 16) {
+      d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 32);
+    } else if (shift == 20) {
+      d = (data[row * width / 32 * 3 + column * 3 / 32] >> 28) |
+          ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0xff) << 4);
+    } else {
+      d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 64);
+    }
+    items[0] = d & 0x07;
+    items[1] = (d >> 3) & 0x07;
+    items[2] = (d >> 6) & 0x07;
+    items[3] = (d >> 9) & 0x07;
+  }
+};
+
+class MatrixView_q8_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q8_row(const uint32_t* data,
+                                               const int height,
+                                               const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (column & 0x03) * 8;
+    return (data[row * width / 4 + column / 4] >> shift) & 0xff;
+  }
+
+  __device__ __forceinline__ void item2(int (&items)[2], int row,
+                                        int column) const {
+    int shift = (column & 0x03) * 8;
+    uint32_t d = data[row * width / 4 + column / 4] >> shift;
+    items[0] = d & 0xff;
+    items[1] = (d >> 8) & 0xff;
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row,
+                                        int column) const {
+    int shift = (column & 0x03) * 2;
+    uint32_t d = data[row * width / 4 + column / 4] >> shift;
+    items[0] = d & 0xff;
+    items[1] = (d >> 8) & 0xff;
+    items[2] = (d >> 16) & 0xff;
+    items[3] = (d >> 24) & 0xff;
+  }
+};
+
+}  // namespace gptq
+}  // namespace vllm
+#endif
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
--- a/csrc/quantization/gptq/qdq_2.cuh
+++ b/csrc/quantization/gptq/qdq_2.cuh
@@ -0,0 +1,76 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_2_cuh
+#define _qdq_2_cuh
+
+#include "qdq_util.cuh"
+
+namespace vllm {
+namespace gptq {
+
+// Permutation:
+//
+// ffddbb99 77553311  eeccaa88 66442200
+
+__forceinline__ __device__ void shuffle_2bit_16(uint32_t* q, int stride) {
+  uint32_t qa = q[0];
+  uint32_t qb = 0;
+
+#pragma unroll
+  for (int i = 0; i < 8; i++) {
+    uint32_t qa0 = qa & 0x03;
+    uint32_t qa1 = (qa & 0x0c) >> 2;
+    qa >>= 4;
+    qb |= (qa1 << (i * 2 + 16));
+    qb |= (qa0 << (i * 2));
+  }
+  q[0] = qb;
+}
+
+__forceinline__ __device__ void dequant_2bit_16(const uint32_t q_0,
+                                                half2 (&dq)[8], int stride,
+                                                const uint32_t zero) {
+  const uint32_t c0 = 0x64006400;
+  const half y4_ = __float2half_rn(1.0f / 4.0f);
+  const half y16_ = __float2half_rn(1.0f / 16.0f);
+  const half y64_ = __float2half_rn(1.0f / 64.0f);
+  const half2 y4 = __halves2half2(y4_, y4_);
+  const half2 y16 = __halves2half2(y16_, y16_);
+  const half2 y64 = __halves2half2(y64_, y64_);
+
+  const half_uint16 z1_(0xe400 | zero);  // half(-1024.0f - zero);
+  const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero));
+  const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+  const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero));
+  const half2 z1 = __half2half2(z1_.as_half);
+  const half2 z4 = __half2half2(z4_);
+  const half2 z16 = __half2half2(z16_);
+  const half2 z64 = __half2half2(z64_);
+
+  uint32_t qa = q_0;
+  half2_uint32 q0((qa & 0x00030003) | c0);  // half2(q[ 0], q[ 1])      + 1024
+  half2_uint32 q1((qa & 0x000c000c) | c0);  // half2(q[ 2], q[ 3]) *  4 + 1024
+  half2_uint32 q2((qa & 0x00300030) | c0);  // half2(q[ 4], q[ 5]) * 16 + 1024
+  half2_uint32 q3((qa & 0x00c000c0) | c0);  // half2(q[ 6], q[ 7]) * 64 + 1024
+  qa >>= 8;
+  half2_uint32 q4((qa & 0x00030003) | c0);  // half2(q[ 8], q[ 8])      + 1024
+  half2_uint32 q5((qa & 0x000c000c) | c0);  // half2(q[10], q[11]) *  4 + 1024
+  half2_uint32 q6((qa & 0x00300030) | c0);  // half2(q[12], q[13]) * 16 + 1024
+  half2_uint32 q7((qa & 0x00c000c0) | c0);  // half2(q[14], q[15]) * 64 + 1024
+
+  dq[0] = __hadd2(q0.as_half2, z1);
+  dq[1] = __hfma2(q1.as_half2, y4, z4);
+  dq[2] = __hfma2(q2.as_half2, y16, z16);
+  dq[3] = __hfma2(q3.as_half2, y64, z64);
+  dq[4] = __hadd2(q4.as_half2, z1);
+  dq[5] = __hfma2(q5.as_half2, y4, z4);
+  dq[6] = __hfma2(q6.as_half2, y16, z16);
+  dq[7] = __hfma2(q7.as_half2, y64, z64);
+}
+
+}  // namespace gptq
+}  // namespace vllm
+
+#endif
--- a/csrc/quantization/gptq/qdq_3.cuh
+++ b/csrc/quantization/gptq/qdq_3.cuh
@@ -0,0 +1,149 @@
+#ifndef _qdq_3_cuh
+#define _qdq_3_cuh
+
+#include "qdq_util.cuh"
+
+namespace vllm {
+namespace gptq {
+// Permutation:
+//
+// v9997775 55333111  u8886664 44222000  (u, v lsb)
+// vjjjhhhf ffdddbbb  uiiiggge eecccaaa
+// vtttrrrp ppnnnlll  usssqqqo oommmkkk
+
+__forceinline__ __device__ void shuffle_3bit_32(uint32_t* q, int stride) {
+  uint32_t qa = q[0 * stride];
+  uint32_t qb = q[1 * stride];
+  uint32_t qc = q[2 * stride];
+
+  // qa: aa999888 77766655  54443332 22111000
+  // qb: lkkkjjji iihhhggg  fffeeedd dcccbbba
+  // qc: vvvuuutt tsssrrrq  qqpppooo nnnmmmll
+
+  uint32_t qd = qc >> 26;
+  qc <<= 4;
+  qc |= qb >> 28;
+  qb <<= 2;
+  qb |= qa >> 30;
+
+  // qa: ..999888 77766655  54443332 22111000
+  // qb: ..jjjiii hhhgggff  feeedddc ccbbbaaa
+  // qc: ..tttsss rrrqqqpp  pooonnnm mmlllkkk
+  // qd:                               vvvuuu
+
+  uint32_t za = 0;
+  uint32_t zb = 0;
+  uint32_t zc = 0;
+
+  for (int i = 0; i < 5; i++) {
+    uint32_t t0 = qa & 0x07;
+    uint32_t t1 = (qa & 0x38) >> 3;
+    qa >>= 6;
+    za |= (t0 << (i * 3));
+    za |= (t1 << (i * 3 + 16));
+  }
+  for (int i = 0; i < 5; i++) {
+    uint32_t t0 = qb & 0x07;
+    uint32_t t1 = (qb & 0x38) >> 3;
+    qb >>= 6;
+    zb |= (t0 << (i * 3));
+    zb |= (t1 << (i * 3 + 16));
+  }
+  for (int i = 0; i < 5; i++) {
+    uint32_t t0 = qc & 0x07;
+    uint32_t t1 = (qc & 0x38) >> 3;
+    qc >>= 6;
+    zc |= (t0 << (i * 3));
+    zc |= (t1 << (i * 3 + 16));
+  }
+
+  // za:  9997775 55333111   8886664 44222000
+  // zb:  jjjhhhf ffdddbbb   iiiggge eecccaaa
+  // zc:  tttrrrp ppnnnlll   sssqqqo oommmkkk
+  // qd:                               vvvuuu
+
+  za |= ((qd & 0x01) >> 0) << 15;
+  zb |= ((qd & 0x02) >> 1) << 15;
+  zc |= ((qd & 0x04) >> 2) << 15;
+  za |= ((qd & 0x08) >> 3) << 31;
+  zb |= ((qd & 0x10) >> 4) << 31;
+  zc |= ((qd & 0x20) >> 5) << 31;
+
+  // za: v9997775 55333111  u8886664 44222000  (u, v lsb)
+  // zb: vjjjhhhf ffdddbbb  uiiiggge eecccaaa
+  // zc: vtttrrrp ppnnnlll  usssqqqo oommmkkk
+
+  q[0 * stride] = za;
+  q[1 * stride] = zb;
+  q[2 * stride] = zc;
+}
+
+__forceinline__ __device__ void dequant_3bit_32(const uint32_t q_0,
+                                                const uint32_t q_1,
+                                                const uint32_t q_2,
+                                                half2 (&dq)[16], int stride,
+                                                const uint32_t zero) {
+  const uint32_t c0 = 0x64006400;
+  const half y8_ = __float2half_rn(1.0f / 8.0f);
+  const half y64_ = __float2half_rn(1.0f / 64.0f);
+  const half2 y8 = __halves2half2(y8_, y8_);
+  const half2 y64 = __halves2half2(y64_, y64_);
+  const half_uint16 z1_(0xe400 | zero);  // half(-1024.0f - zero);
+  const half z8_ = __hsub(__int2half_rn(-128), __int2half_rn(zero));
+  const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero));
+  const half2 z1 = __halves2half2(z1_.as_half, z1_.as_half);
+  const half2 z8 = __halves2half2(z8_, z8_);
+  const half2 z64 = __halves2half2(z64_, z64_);
+
+  uint32_t qa = q_0;
+  uint32_t qb = q_1;
+  uint32_t qc = q_2;
+
+  half2_uint32 q0((qa & 0x00070007) | c0);  // half2(q[ 0], q[ 1])      + 1024
+  half2_uint32 q1((qa & 0x00380038) | c0);  // half2(q[ 2], q[ 3]) *  8 + 1024
+  qa >>= 6;
+  half2_uint32 q2((qa & 0x00070007) | c0);  // half2(q[ 4], q[ 5])      + 1024
+  half2_uint32 q3((qa & 0x00380038) | c0);  // half2(q[ 6], q[ 7]) *  8 + 1024
+  half2_uint32 q4((qa & 0x01c001c0) | c0);  // half2(q[ 8], q[ 9]) * 64 + 1024
+  qa >>= 9;
+  qa &= 0x00010001;
+  half2_uint32 q5((qb & 0x00070007) | c0);  // half2(q[10], q[11])      + 1024
+  half2_uint32 q6((qb & 0x00380038) | c0);  // half2(q[12], q[13]) *  8 + 1024
+  qb >>= 6;
+  half2_uint32 q7((qb & 0x00070007) | c0);  // half2(q[14], q[15])      + 1024
+  half2_uint32 q8((qb & 0x00380038) | c0);  // half2(q[16], q[17]) *  8 + 1024
+  half2_uint32 q9((qb & 0x01c001c0) | c0);  // half2(q[18], q[19]) * 64 + 1024
+  qb >>= 8;
+  qb &= 0x00020002;
+  half2_uint32 q10((qc & 0x00070007) | c0);  // half2(q[20], q[21])      + 1024
+  half2_uint32 q11((qc & 0x00380038) | c0);  // half2(q[22], q[23]) *  8 + 1024
+  qc >>= 6;
+  half2_uint32 q12((qc & 0x00070007) | c0);  // half2(q[24], q[25])      + 1024
+  half2_uint32 q13((qc & 0x00380038) | c0);  // half2(q[26], q[27]) *  8 + 1024
+  half2_uint32 q14((qc & 0x01c001c0) | c0);  // half2(q[28], q[29]) * 64 + 1024
+  qc >>= 7;
+  qc &= 0x00040004;
+  half2_uint32 q15((qa | qb | qc) | c0);
+
+  dq[0] = __hadd2(q0.as_half2, z1);
+  dq[1] = __hfma2(q1.as_half2, y8, z8);
+  dq[2] = __hadd2(q2.as_half2, z1);
+  dq[3] = __hfma2(q3.as_half2, y8, z8);
+  dq[4] = __hfma2(q4.as_half2, y64, z64);
+  dq[5] = __hadd2(q5.as_half2, z1);
+  dq[6] = __hfma2(q6.as_half2, y8, z8);
+  dq[7] = __hadd2(q7.as_half2, z1);
+  dq[8] = __hfma2(q8.as_half2, y8, z8);
+  dq[9] = __hfma2(q9.as_half2, y64, z64);
+  dq[10] = __hadd2(q10.as_half2, z1);
+  dq[11] = __hfma2(q11.as_half2, y8, z8);
+  dq[12] = __hadd2(q12.as_half2, z1);
+  dq[13] = __hfma2(q13.as_half2, y8, z8);
+  dq[14] = __hfma2(q14.as_half2, y64, z64);
+  dq[15] = __hadd2(q15.as_half2, z1);
+}
+
+}  // namespace gptq
+}  // namespace vllm
+
+#endif
--- a/csrc/quantization/gptq/qdq_4.cuh
+++ b/csrc/quantization/gptq/qdq_4.cuh
@@ -0,0 +1,126 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_4_cuh
+#define _qdq_4_cuh
+
+#include "qdq_util.cuh"
+
+namespace vllm {
+namespace gptq {
+// Permutation:
+//
+// 77775555 33331111  66664444 22220000
+
+__forceinline__ __device__ void shuffle_4bit_8(uint32_t* q, int stride) {
+  uint32_t qa = q[0];
+  uint32_t qb = 0;
+
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    uint32_t qa0 = qa & 0x0f;
+    uint32_t qa1 = (qa & 0xf0) >> 4;
+    qa >>= 8;
+    qb |= (qa1 << (i * 4 + 16));
+    qb |= (qa0 << (i * 4));
+  }
+  q[0] = qb;
+}
+
+__forceinline__ __device__ void dequant_4bit_8(const uint32_t q_0,
+                                               half2 (&dq)[4], int stride,
+                                               const uint32_t zero) {
+  const uint32_t c0 = 0x64006400;
+  const half y16_ = __float2half_rn(1.0f / 16.0f);
+  const half2 y16 = __halves2half2(y16_, y16_);
+  const half_uint16 z1_(0xe400 | zero);  // half(-1024.0f - zero);
+  const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+  const half2 z1 = __half2half2(z1_.as_half);
+  const half2 z16 = __half2half2(z16_);
+
+  uint32_t qa = q_0;
+  half2_uint32 q0((qa & 0x000f000f) | c0);  // half2(q[ 0], q[ 1])      + 1024
+  half2_uint32 q1((qa & 0x00f000f0) | c0);  // half2(q[ 2], q[ 3]) * 16 + 1024
+  qa >>= 8;
+  half2_uint32 q2((qa & 0x000f000f) | c0);  // half2(q[ 4], q[ 5])      + 1024
+  half2_uint32 q3((qa & 0x00f000f0) | c0);  // half2(q[ 6], q[ 7]) * 16 + 1024
+
+  dq[0] = __hadd2(q0.as_half2, z1);
+  dq[1] = __hfma2(q1.as_half2, y16, z16);
+  dq[2] = __hadd2(q2.as_half2, z1);
+  dq[3] = __hfma2(q3.as_half2, y16, z16);
+}
+
+__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale(
+    const uint32_t zero, const half scale, half2 (&z1z16)[2],
+    half2 (&y1y16)[2]) {
+  half_uint16 z1(0xe400 | zero);  // half(-1024.0f - zero);
+  half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+
+  half2 scale2 = __half2half2(scale);
+
+  z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half));
+  z1z16[1] = __hmul2(scale2, __half2half2(z16));
+
+  const half y1 = __float2half_rn(1.0f);
+  const half y16 = __float2half_rn(1.0f / 16.0f);
+
+  y1y16[0] = __hmul2(scale2, __half2half2(y1));
+  y1y16[1] = __hmul2(scale2, __half2half2(y16));
+}
+
+__forceinline__ __device__ void dequant_4bit_8_prep_zero(const uint32_t zero,
+                                                         half2 (&z1z16)[2],
+                                                         half2 (&y1y16)[2]) {
+  half_uint16 z1(0xe400 | zero);  // half(-1024.0f - zero);
+  half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+
+  z1z16[0] = __half2half2(z1.as_half);
+  z1z16[1] = __half2half2(z16);
+
+  const half y1 = __float2half_rn(1.0f);
+  const half y16 = __float2half_rn(1.0f / 16.0f);
+
+  y1y16[0] = __half2half2(y1);
+  y1y16[1] = __half2half2(y16);
+}
+
+__forceinline__ __device__ void dequant_4bit_8_gptq(const uint32_t q_0,
+                                                    half2 (&dq)[4],
+                                                    half2 (&z1z16)[2],
+                                                    half2 (&y1y16)[2],
+                                                    int stride, bool scaled) {
+  const uint32_t c0 = 0x64006400;
+
+  uint32_t qa = q_0;
+  half2_uint32 q0((qa & 0x000f000f) |
+                  c0);  // half2( q[0]      + 1024, q[1]      + 1024 )
+  half2_uint32 q1((qa & 0x00f000f0) |
+                  c0);  // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 )
+  qa >>= 8;
+  half2_uint32 q2((qa & 0x000f000f) |
+                  c0);  // half2( q[4]      + 1024, q[5]      + 1024 )
+  half2_uint32 q3((qa & 0x00f000f0) |
+                  c0);  // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 )
+
+  if (scaled) {
+    dq[0] = __hfma2(q0.as_half2, y1y16[0],
+                    z1z16[0]);  // half2( q[0] * s - z * s, q[1] * s - z * s)
+    dq[1] = __hfma2(q1.as_half2, y1y16[1],
+                    z1z16[1]);  // half2( q[2] * s - z * s, q[3] * s - z * s)
+    dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]);
+    dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);
+  } else {
+    dq[0] = __hadd2(q0.as_half2, z1z16[0]);  // half2( q[0] - z, q[1] - z )
+    dq[1] = __hfma2(q1.as_half2, y1y16[1],
+                    z1z16[1]);               // half2( q[2] - z, q[3] - z )
+    dq[2] = __hadd2(q2.as_half2, z1z16[0]);  // half2( q[4] - z, q[5] - z )
+    dq[3] = __hfma2(q3.as_half2, y1y16[1],
+                    z1z16[1]);  // half2( q[6] - z, q[7] - z )
+  }
+}
+}  // namespace gptq
+}  // namespace vllm
+
+#endif
--- a/csrc/quantization/gptq/qdq_8.cuh
+++ b/csrc/quantization/gptq/qdq_8.cuh
@@ -0,0 +1,30 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_8_cuh
+#define _qdq_8_cuh
+
+#include "qdq_util.cuh"
+
+namespace vllm {
+namespace gptq {
+
+__forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
+
+__forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
+                                               const uint32_t q_1,
+                                               half2 (&dq)[4], int stride,
+                                               const uint32_t zero) {
+  half dqh[8];
+  for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero);
+  for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
+
+  for (int i = 0; i < 4; i++)
+    dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
+}
+
+}  // namespace gptq
+}  // namespace vllm
+
+#endif
--- a/csrc/quantization/gptq/qdq_util.cuh
+++ b/csrc/quantization/gptq/qdq_util.cuh
@@ -0,0 +1,56 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_util_cuh
+#define _qdq_util_cuh
+
+namespace vllm {
+namespace gptq {
+
+union half2_uint32 {
+  uint32_t as_uint32;
+  half2 as_half2;
+  __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
+  __device__ half2_uint32(half2 val) : as_half2(val) {}
+};
+
+union half_uint16 {
+  uint16_t as_uint16;
+  half as_half;
+  __device__ half_uint16(uint16_t val) : as_uint16(val) {}
+  __device__ half_uint16(half val) : as_half(val) {}
+};
+
+// Max_scale premultiplied by 1/256
+
+__forceinline__ __device__ half dq_scale(const int qs, const half max_scale) {
+  int qs_i = qs + 1;
+  half qs_h = __int2half_rn(qs_i * qs_i);
+  qs_h = __hmul(qs_h, max_scale);
+  return qs_h;
+}
+
+__forceinline__ __device__ half dq(const int q, const int qzero,
+                                   const half scale) {
+  return __hmul(__int2half_rn(q - qzero), scale);
+}
+
+__forceinline__ __device__ half dq_ns(const int q, const int qzero) {
+  // return __hsub(__int2half_rn(q), __int2half_rn(qzero));
+  return __int2half_rn(q - qzero);
+}
+
+__forceinline__ __device__ int exb(const uint32_t q, const int shift,
+                                   const int mask) {
+  return (int)((q >> shift) & mask);
+}
+
+__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0,
+                                   const int shift, const int mask) {
+  return (int)(__funnelshift_rc(q0, q1, shift) & mask);
+}
+
+}  // namespace gptq
+}  // namespace vllm
+#endif
--- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
+++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
--- a/csrc/quantization/gptq_allspark/allspark_repack.cu
+++ b/csrc/quantization/gptq_allspark/allspark_repack.cu
@@ -0,0 +1,163 @@
+#include "allspark_utils.cuh"
+#include <torch/all.h>
+#include "core/registration.h"
+
+namespace allspark {
+
+// Rearrange B to facilitate Ampere Tensor Core load data
+// reorder B from (K, N) to (N_32align / 4, K * 4)
+// K % 16 == 0, N % 16 == 0, N_32align % 32 == 0
+template <typename FType>
+__global__ void __launch_bounds__(128)
+    rearrange_kn_weight_as_n32k16_order_ldg16_kernel(
+        const uint8_t* B, const FType* B_scale, const FType* B_zero,
+        uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
+        const int K, const int N, const int N_32align) {
+  const auto lane_id = threadIdx.x % 32;
+  const auto warp_id = threadIdx.x / 32;
+
+  if (blockIdx.x != gridDim.x - 1) {
+    // Load B
+    // per block process 64(k) * 128(n) B elements
+    // per warp process 16(k) * 128 B elements
+    const int src_row_base_idx =
+        blockIdx.x * 64 + warp_id * 16 + ((lane_id % 8) / 2) * 2;
+    const int src_col_idx =
+        blockIdx.y * 128 + (lane_id / 8) * 32 + (lane_id % 2) * 16;
+    uint8_t B_frag[4][16];
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+      int src_row_idx = src_row_base_idx + (i / 2) * 8 + (i % 2);
+      int src_offset = src_row_idx * N + src_col_idx;
+      bool guard = src_row_idx < K && src_col_idx < N;
+      ldg128_cg_0(*reinterpret_cast<uint32_t*>(B_frag[i]),
+                  *(reinterpret_cast<uint32_t*>(B_frag[i]) + 1),
+                  *(reinterpret_cast<uint32_t*>(B_frag[i]) + 2),
+                  *(reinterpret_cast<uint32_t*>(B_frag[i]) + 3), B + src_offset,
+                  guard);
+    }
+
+    // reorder B
+    uint8_t B_reorder_frag[8][8];
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+#pragma unroll
+      for (int j = 0; j < 16; ++j) {
+        int dst_i = j % 8;
+        int dst_j = i + (j / 8) * 4;
+        B_reorder_frag[dst_i][dst_j] = B_frag[i][j];
+      }
+    }
+
+    // Store B
+    const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
+    const int dst_col_idx =
+        blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8;
+    for (int i = 0; i < 8; ++i) {
+      int dst_row_idx = dst_row_base_idx + i;
+      int dst_offset = dst_row_idx * K * 4 + dst_col_idx;
+      bool guard = (dst_row_base_idx < N_32align / 4) && (dst_col_idx < K * 4);
+      if (guard) {
+        *reinterpret_cast<int2*>(B_result + dst_offset) =
+            *reinterpret_cast<int2*>(B_reorder_frag[i]);
+      }
+    }
+  } else {
+    // Load B_scale and B_zero
+    FType b_scale_reg, b_zero_reg;
+    auto src_offset = blockIdx.y * 128 + threadIdx.x;
+    ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N);
+    if (B_zero != nullptr)
+      ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N);
+    int dst_offset =
+        blockIdx.y * 128 + warp_id * 32 + (lane_id % 8) * 4 + lane_id / 8;
+    if (dst_offset < N_32align) {
+      B_scale_result[dst_offset] = b_scale_reg;
+      if (B_zero != nullptr) B_zero_result[dst_offset] = b_zero_reg;
+    }
+  }
+}
+
+template <typename FType>
+void rearrange_kn_weight_as_n32k16_order_ldg16(
+    const uint8_t* B, const FType* B_scale, const FType* B_zero,
+    uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
+    const int64_t K, const int64_t N, const int64_t N_32align,
+    cudaStream_t stream) {
+  if (N % 16 != 0 || K % 16 != 0) {
+    std::cerr << "Now only support N and K is multiples of 16" << std::endl;
+  }
+  const int BLOCK = 128;
+  int grid_x = (K + 64 - 1) / 64 + 1;
+  int grid_y = (N + 128 - 1) / 128;
+  dim3 grid(grid_x, grid_y);
+
+  rearrange_kn_weight_as_n32k16_order_ldg16_kernel<FType>
+      <<<grid, BLOCK, 0, stream>>>(B, B_scale, B_zero, B_result, B_scale_result,
+                                   B_zero_result, K, N, N_32align);
+}
+}  // namespace allspark
+
+void rearrange_kn_weight_as_n32k16_order(
+    torch::Tensor const& b_qweight, torch::Tensor const& b_scales,
+    std::optional<torch::Tensor> const& b_zeros, bool has_zp,
+    torch::Tensor& b_qweight_reorder, torch::Tensor& b_scales_reorder,
+    std::optional<torch::Tensor> const& b_zeros_reorder, const int64_t K,
+    const int64_t N, const int64_t N_32align) {
+  // Verify device and strides
+  TORCH_CHECK(b_qweight.device().is_cuda(), "b_qweight is not on GPU");
+  TORCH_CHECK(b_qweight.is_contiguous(), "b_qweight is not contiguous");
+
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  TORCH_CHECK(b_qweight_reorder.device().is_cuda(),
+              "b_qweight_reorder is not on GPU");
+  TORCH_CHECK(b_qweight_reorder.is_contiguous(),
+              "b_qweight_reorder is not contiguous");
+
+  TORCH_CHECK(b_scales_reorder.device().is_cuda(),
+              "b_scales_reorder is not on GPU");
+  TORCH_CHECK(b_scales_reorder.is_contiguous(),
+              "b_scales_reorder is not contiguous");
+
+  if (has_zp) {
+    TORCH_CHECK(b_zeros.value().device().is_cuda(), "b_zeros is not on GPU");
+    TORCH_CHECK(b_zeros.value().is_contiguous(), "b_zeros is not contiguous");
+
+    TORCH_CHECK(b_zeros_reorder.value().device().is_cuda(),
+                "b_zeros_reorder is not on GPU");
+    TORCH_CHECK(b_zeros_reorder.value().is_contiguous(),
+                "b_zeros_reorder is not contiguous");
+  }
+
+  const uint8_t* matB = reinterpret_cast<const uint8_t*>(b_qweight.data_ptr());
+  const void* b_scale = b_scales.data_ptr();
+  const void* b_zero = has_zp ? b_zeros.value().data_ptr() : nullptr;
+
+  uint8_t* matB_reorder =
+      reinterpret_cast<uint8_t*>(b_qweight_reorder.data_ptr());
+  void* b_scale_reorder = b_scales_reorder.data_ptr();
+  void* b_zero_reorder = has_zp ? b_zeros_reorder.value().data_ptr() : nullptr;
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (b_scales.dtype() == at::ScalarType::Half) {
+    allspark::rearrange_kn_weight_as_n32k16_order_ldg16<__half>(
+        matB, reinterpret_cast<const __half*>(b_scale),
+        reinterpret_cast<const __half*>(b_zero), matB_reorder,
+        reinterpret_cast<__half*>(b_scale_reorder),
+        reinterpret_cast<__half*>(b_zero_reorder), K, N, N_32align, stream);
+  } else if (b_scales.dtype() == at::ScalarType::BFloat16) {
+    allspark::rearrange_kn_weight_as_n32k16_order_ldg16<__nv_bfloat16>(
+        matB, reinterpret_cast<const __nv_bfloat16*>(b_scale),
+        reinterpret_cast<const __nv_bfloat16*>(b_zero), matB_reorder,
+        reinterpret_cast<__nv_bfloat16*>(b_scale_reorder),
+        reinterpret_cast<__nv_bfloat16*>(b_zero_reorder), K, N, N_32align,
+        stream);
+  }
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("rearrange_kn_weight_as_n32k16_order",
+         &rearrange_kn_weight_as_n32k16_order);
+}
--- a/csrc/quantization/gptq_allspark/allspark_utils.cuh
+++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh
@@ -0,0 +1,410 @@
+#pragma once
+
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <iostream>
+#include "../gptq_marlin/marlin_dtypes.cuh"
+using marlin::MarlinScalarType2;
+
+namespace allspark {
+
+#define CHECK_CUDA(cmd)                                             \
+  do {                                                              \
+    cudaError_t cuda_status = cmd;                                  \
+    if (cuda_status != cudaSuccess) {                               \
+      std::string err_str = cudaGetErrorString(cuda_status);        \
+      std::cerr << "Failed: " << __FILE__ << ":" << __LINE__ << " " \
+                << err_str;                                         \
+      exit(-1);                                                     \
+    }                                                               \
+  } while (0)
+
+#define CHECK_CUBLAS(cmd)                                            \
+  do {                                                               \
+    cublasStatus_t cublas_status = cmd;                              \
+    if (cublas_status != CUBLAS_STATUS_SUCCESS) {                    \
+      std::cerr << "Failed:  " << __FILE__ << ":" << __LINE__ << " " \
+                << cublas_status << std::endl;                       \
+      exit(-1);                                                      \
+    }                                                                \
+  } while (0)
+
+template <typename FType, typename QType>
+struct SM8x_GEMM_W8A16_Splitk_Params {
+  const FType* A_ptr;
+  const QType* B_ptr;
+  const FType* B_scale_ptr;
+  const FType* B_zero_ptr;
+  FType* C_ptr;
+  int M;
+  int N;
+  int K;
+  int SplitK;
+  int GroupCnt;
+  int GroupSize;
+  FType* C_split_ptr;       // for non-fused splitk reduce
+  float* C_tmp_ptr;         // for fused splitk reduce
+  uint32_t* red_count_ptr;  // for fused splitk reduce
+};
+
+struct alignas(16) BlockTileSplitkParams {
+  int Mtile;
+  int Ntile;
+  int SplitK;
+  bool EnableFuse;
+};
+
+template <typename FType, int BLOCK, int N_MATRIX>
+__global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
+                                              uint32_t n, uint32_t n_matrix,
+                                              uint32_t matrix_size) {
+  auto idx = blockIdx.x * BLOCK + threadIdx.x;
+
+  if (idx >= matrix_size) {
+    return;
+  }
+
+  float sum = 0.f;
+
+  int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
+  for (int i = 0; i < n_mat; ++i) {
+    sum += MarlinScalarType2<FType>::num2float(C_split[idx + i * matrix_size]);
+  }
+
+  C[idx] = MarlinScalarType2<FType>::float2num(sum);
+}
+
+template <typename FType>
+void f16_gemm_splitk_reduce(const FType* C_split, FType* C, const uint32_t m,
+                            const uint32_t n, const uint32_t n_matrix,
+                            cudaStream_t stream) {
+  const int BLOCK = 128;
+  uint32_t matrix_size = m * n;
+  int grid = (matrix_size + BLOCK - 1) / BLOCK;
+
+  void (*kernel)(const FType*, FType*, uint32_t, uint32_t, uint32_t) = nullptr;
+
+  switch (n_matrix) {
+    case 4:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 4>;
+      break;
+    case 5:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 5>;
+      break;
+    case 6:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 6>;
+      break;
+    case 7:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 7>;
+      break;
+    case 8:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 8>;
+      break;
+    case 9:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 9>;
+      break;
+    case 10:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 10>;
+      break;
+    case 11:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 11>;
+      break;
+    case 12:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, 12>;
+      break;
+    default:
+      kernel = f16_gemm_splitk_reduce_kernel<FType, BLOCK, -1>;
+      break;
+  }
+
+  kernel<<<grid, BLOCK, 0, stream>>>(C_split, C, n, n_matrix, matrix_size);
+}
+
+template <typename T>
+struct HalfType;
+template <>
+struct HalfType<half> {
+  using T1 = __half;
+  using T2 = __half2;
+};
+template <>
+struct HalfType<__nv_bfloat16> {
+  using T1 = __nv_bfloat16;
+  using T2 = __nv_bfloat162;
+};
+
+// convert 64-bit pointer to 32-bit smem addr
+__device__ __forceinline__ uint32_t smem_u32addr(const void* smem_ptr) {
+  uint32_t addr;
+  asm("{.reg .u64 u64addr;\n"
+      " cvta.to.shared.u64 u64addr, %1;\n"
+      " cvt.u32.u64 %0, u64addr;}\n"
+      : "=r"(addr)
+      : "l"(smem_ptr));
+
+  return addr;
+}
+
+template <typename T>
+__device__ __forceinline__ void ldg16_cg_0(T& r0, const void* ptr, bool guard) {
+  static_assert(sizeof(T) == 2, "ldg16_cg_0: invalid T");
+
+  asm volatile(
+      "{.reg .pred p;\n"
+      " setp.ne.b32 p, %2, 0;\n"
+      " @!p mov.b16 %0, 0;\n"
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4 && \
+    __CUDA_ARCH__ >= 750
+      " @p ld.global.cg.L2::128B.b16 {%0}, [%1];}\n"
+#else
+      " @p ld.global.ca.b16 {%0}, [%1];}\n"
+#endif
+      : "=h"(reinterpret_cast<uint16_t&>(r0))
+      : "l"(ptr), "r"((int)guard));
+}
+
+template <typename T>
+__device__ __forceinline__ void ldg64_ca(T& r0, T& r1, const void* ptr,
+                                         bool guard) {
+  static_assert(sizeof(T) == 4, "ldg64_ca: invalid T");
+
+  asm volatile(
+      "{.reg .pred p;\n"
+      " setp.ne.b32 p, %3, 0;\n"
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4 && \
+    __CUDA_ARCH__ >= 750
+      " @p ld.global.ca.L2::128B.v2.b32 {%0, %1}, [%2];}\n"
+#else
+      " @p ld.global.ca.v2.b32 {%0, %1}, [%2];}\n"
+#endif
+      : "=r"(reinterpret_cast<uint32_t&>(r0)),
+        "=r"(reinterpret_cast<uint32_t&>(r1))
+      : "l"(ptr), "r"((int)guard));
+}
+
+template <typename T>
+__device__ __forceinline__ void ldg128_cg_0(T& r0, T& r1, T& r2, T& r3,
+                                            const void* ptr, bool guard) {
+  static_assert(sizeof(T) == 4, "ldg128_cg_0: invalid T");
+
+  asm volatile(
+      "{.reg .pred p;\n"
+      " setp.ne.b32 p, %5, 0;\n"
+      " @!p mov.b32 %0, 0;\n"
+      " @!p mov.b32 %1, 0;\n"
+      " @!p mov.b32 %2, 0;\n"
+      " @!p mov.b32 %3, 0;\n"
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4 && \
+    __CUDA_ARCH__ >= 750
+      " @p ld.global.cg.L2::128B.v4.b32 {%0, %1, %2, %3}, [%4];}\n"
+#else
+      " @p ld.global.cg.v4.b32 {%0, %1, %2, %3}, [%4];}\n"
+#endif
+      : "=r"(reinterpret_cast<uint32_t&>(r0)),
+        "=r"(reinterpret_cast<uint32_t&>(r1)),
+        "=r"(reinterpret_cast<uint32_t&>(r2)),
+        "=r"(reinterpret_cast<uint32_t&>(r3))
+      : "l"(ptr), "r"((int)guard));
+}
+
+template <typename T>
+__device__ __forceinline__ void lds128(T& reg0, T& reg1, T& reg2, T& reg3,
+                                       const uint32_t addr) {
+  static_assert(sizeof(T) == 4, "lds128: invalid T");
+
+  asm volatile("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];\n"
+               : "=r"(reinterpret_cast<uint32_t&>(reg0)),
+                 "=r"(reinterpret_cast<uint32_t&>(reg1)),
+                 "=r"(reinterpret_cast<uint32_t&>(reg2)),
+                 "=r"(reinterpret_cast<uint32_t&>(reg3))
+               : "r"(addr));
+}
+
+template <typename T>
+__device__ __forceinline__ void stg128(const T& r0, const T& r1, const T& r2,
+                                       const T& r3, const void* ptr,
+                                       bool guard) {
+  static_assert(sizeof(T) == 4, "stg128: invalid T");
+
+  asm volatile(
+      "{.reg .pred p;\n"
+      " setp.ne.b32 p, %1, 0;\n"
+      " @p st.global.v4.b32 [%0], {%2, %3, %4, %5};}\n"
+      :
+      : "l"(ptr), "r"((int)guard), "r"(reinterpret_cast<const uint32_t&>(r0)),
+        "r"(reinterpret_cast<const uint32_t&>(r1)),
+        "r"(reinterpret_cast<const uint32_t&>(r2)),
+        "r"(reinterpret_cast<const uint32_t&>(r3)));
+}
+
+template <typename T>
+__device__ __forceinline__ void ldsm_4(T& r0, T& r1, T& r2, T& r3,
+                                       const uint32_t& addr) {
+  static_assert(sizeof(T) == 4, "ldsm_4: invalid T");
+#if (__CUDA_ARCH__ >= 750) && (__CUDACC_VER_MAJOR__ >= 11)
+  asm volatile(
+      "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+      : "=r"(reinterpret_cast<uint32_t&>(r0)),
+        "=r"(reinterpret_cast<uint32_t&>(r1)),
+        "=r"(reinterpret_cast<uint32_t&>(r2)),
+        "=r"(reinterpret_cast<uint32_t&>(r3))
+      : "r"(addr));
+#endif
+}
+
+template <typename FType>
+__device__ __forceinline__ void hmma16816_f32(float (&d)[4],
+                                              const uint32_t (&a)[4],
+                                              const uint32_t (&b)[2]);
+
+template <>
+__device__ __forceinline__ void hmma16816_f32<__half>(float (&d)[4],
+                                                      const uint32_t (&a)[4],
+                                                      const uint32_t (&b)[2]) {
+#if (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0, %1, %2, %3}, "
+      "{%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};\n"
+      : "+f"(d[0]), "+f"(d[1]), "+f"(d[2]), "+f"(d[3])
+      : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]));
+#endif
+}
+
+template <>
+__device__ __forceinline__ void hmma16816_f32<__nv_bfloat16>(
+    float (&d)[4], const uint32_t (&a)[4], const uint32_t (&b)[2]) {
+#if (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 {%0, %1, %2, %3}, "
+      "{%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};\n"
+      : "+f"(d[0]), "+f"(d[1]), "+f"(d[2]), "+f"(d[3])
+      : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]));
+#endif
+}
+
+template <int SIZE_IN_BYTES>
+__device__ __forceinline__ void cp_async(const uint32_t smem_addr,
+                                         const void* gmem_ptr,
+                                         const int src_in_bytes, bool guard) {
+  static_assert(
+      (SIZE_IN_BYTES == 4 || SIZE_IN_BYTES == 8 || SIZE_IN_BYTES == 16),
+      "Size is not supported");
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile(
+      "{.reg.pred p;\n"
+      " setp.ne.b32 p, %4, 0;\n"
+  #if __CUDACC_VER_MINOR__ >= 4
+      " @p cp.async.cg.shared.global.L2::256B [%0], [%1], %2, %3;}\n"
+  #else
+      " @p cp.async.cg.shared.global [%0], [%1], %2, %3;}\n"
+  #endif
+      ::"r"(smem_addr),
+      "l"(gmem_ptr), "n"(SIZE_IN_BYTES), "r"(src_in_bytes), "r"((int)guard));
+#endif
+}
+
+template <int SIZE_IN_BYTES>
+__device__ __forceinline__ void cp_async_ca(const uint32_t smem_addr,
+                                            const void* gmem_ptr,
+                                            const int src_in_bytes,
+                                            bool guard) {
+  static_assert(
+      (SIZE_IN_BYTES == 4 || SIZE_IN_BYTES == 8 || SIZE_IN_BYTES == 16),
+      "Size is not supported");
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile(
+      "{.reg.pred p;\n"
+      " setp.ne.b32 p, %4, 0;\n"
+  #if __CUDACC_VER_MINOR__ >= 4
+      " @p cp.async.ca.shared.global.L2::256B [%0], [%1], %2, %3;}\n"
+  #else
+      " @p cp.async.ca.shared.global [%0], [%1], %2, %3;}\n"
+  #endif
+      ::"r"(smem_addr),
+      "l"(gmem_ptr), "n"(SIZE_IN_BYTES), "r"(src_in_bytes), "r"((int)guard));
+#endif
+}
+
+__device__ __forceinline__ void cp_async_commit_group() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.commit_group;\n");
+#endif
+}
+
+template <int N>
+__device__ __forceinline__ void cp_asyc_wait_group() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_group %0;\n" : : "n"(N));
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ void cvt_8bx4_to_16bx4_bias128(const uint32_t& idata,
+                                                          T* fdata);
+
+template <>
+// fast conversion: 4xuint8 to 4xhalf, subtracting bias = 128
+__device__ __forceinline__ void cvt_8bx4_to_16bx4_bias128<__half2>(
+    const uint32_t& idata, __half2* fdata) {
+  uint32_t i10, i32;
+  asm volatile(
+      "prmt.b32 %0, %2, 0x64, 0x4140;"
+      "prmt.b32 %1, %2, 0x64, 0x4342;"
+      : "=r"(i10), "=r"(i32)
+      : "r"(idata));
+
+  static constexpr uint32_t MAGIC_NUM = 0x64806480;
+  fdata[0] = __hsub2(reinterpret_cast<const __half2&>(i10),
+                     reinterpret_cast<const __half2&>(MAGIC_NUM));
+  fdata[1] = __hsub2(reinterpret_cast<const __half2&>(i32),
+                     reinterpret_cast<const __half2&>(MAGIC_NUM));
+}
+
+template <>
+// fast conversion: 4xuint8 to 4xbfloat16, subtracting bias = 128
+// reference from marlin fast implementation
+__device__ __forceinline__ void cvt_8bx4_to_16bx4_bias128<__nv_bfloat162>(
+    const uint32_t& idata, __nv_bfloat162* fdata) {
+  float fp32_imd[4];
+  uint32_t* fp32_imd_casted = reinterpret_cast<uint32_t*>(fp32_imd);
+  asm volatile(
+      "prmt.b32 %0, %4, 0x4B000000, 0x7650;"
+      "prmt.b32 %1, %4, 0x4B000000, 0x7651;"
+      "prmt.b32 %2, %4, 0x4B000000, 0x7652;"
+      "prmt.b32 %3, %4, 0x4B000000, 0x7653;"
+      : "=r"(fp32_imd_casted[0]), "=r"(fp32_imd_casted[1]),
+        "=r"(fp32_imd_casted[2]), "=r"(fp32_imd_casted[3])
+      : "r"(idata));
+
+  fp32_imd[0] -= 8388736.f;
+  fp32_imd[1] -= 8388736.f;
+  fp32_imd[2] -= 8388736.f;
+  fp32_imd[3] -= 8388736.f;
+
+  uint32_t* bf16_res = reinterpret_cast<uint32_t*>(fdata);
+  asm volatile(
+      "prmt.b32 %0, %2, %3, 0x7632;"
+      "prmt.b32 %1, %4, %5, 0x7632;"
+      : "=r"(bf16_res[0]), "=r"(bf16_res[1])
+      : "r"(fp32_imd_casted[0]), "r"(fp32_imd_casted[1]),
+        "r"(fp32_imd_casted[2]), "r"(fp32_imd_casted[3]));
+}
+
+static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+  assert(false);
+#else
+  return __bfloat162bfloat162(x);
+#endif
+  __builtin_unreachable();  // Suppress missing return statement warning
+}
+
+static __device__ half2 inline num2num2(const half x) {
+  return __half2half2(x);
+}
+
+}  // namespace allspark
--- a/csrc/quantization/gptq_marlin/.gitignore
+++ b/csrc/quantization/gptq_marlin/.gitignore
@@ -0,0 +1,2 @@
+sm*_kernel_*.cu
+kernel_selector.h
--- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
@@ -0,0 +1,288 @@
+#include "marlin.cuh"
+
+#include "core/registration.h"
+
+namespace marlin {
+
+template <int const num_threads, int const num_bits, bool is_a_8bit>
+__global__ void awq_marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr,
+    int size_k, int size_n) {
+  constexpr int pack_factor = 32 / num_bits;
+
+  constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
+  constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
+  int k_tiles = size_k / target_tile_k_size;
+  int n_tiles = size_n / target_tile_n_size;
+  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
+
+  auto start_k_tile = blockIdx.x * block_k_tiles;
+  if (start_k_tile >= k_tiles) {
+    return;
+  }
+
+  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<repack_stages - 2>();
+    __syncthreads();
+  };
+
+  extern __shared__ int4 sh[];
+
+  constexpr int tile_n_ints = target_tile_n_size / pack_factor;
+
+  constexpr int stage_n_threads = tile_n_ints / 4;
+  constexpr int stage_k_threads = target_tile_k_size;
+  constexpr int stage_size = stage_k_threads * stage_n_threads;
+
+  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      cp_async_fence();
+      return;
+    }
+
+    int first_n = n_tile_id * target_tile_n_size;
+    int first_n_packed = first_n / pack_factor;
+
+    int4* sh_ptr = sh + stage_size * pipe;
+
+    if (threadIdx.x < stage_size) {
+      auto k_id = threadIdx.x / stage_n_threads;
+      auto n_id = threadIdx.x % stage_n_threads;
+
+      int first_k = k_tile_id * target_tile_k_size;
+
+      cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
+                reinterpret_cast<int4 const*>(
+                    &(b_q_weight_ptr[(first_k + k_id) * (size_n / pack_factor) +
+                                     first_n_packed + (n_id * 4)])));
+    }
+
+    cp_async_fence();
+  };
+
+  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      return;
+    }
+
+    auto warp_id = threadIdx.x / 32;
+    auto th_id = threadIdx.x % 32;
+
+    if (warp_id >= 4) {
+      return;
+    }
+
+    int tc_col = th_id / 4;
+    int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);
+
+    constexpr int tc_offsets[4] = {0, 1, 8, 9};
+
+    int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
+    int cur_n_packed = cur_n / pack_factor;
+    int cur_n_pos = cur_n % pack_factor;
+
+    constexpr int sh_stride = tile_n_ints;
+    constexpr uint32_t mask = (1 << num_bits) - 1;
+
+    int4* sh_stage_ptr = sh + stage_size * pipe;
+    uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);
+
+    // Undo interleaving
+    int cur_n_pos_unpacked;
+    if constexpr (num_bits == 4) {
+      constexpr int undo_pack[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+      cur_n_pos_unpacked = undo_pack[cur_n_pos];
+    } else {
+      constexpr int undo_pack[4] = {0, 2, 1, 3};
+      cur_n_pos_unpacked = undo_pack[cur_n_pos];
+    }
+
+    uint32_t vals[8];
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+      if constexpr (is_a_8bit) {
+        int cur_elem = tc_row + i;
+
+        int packed_src_0 =
+            sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) +
+                             sh_stride * cur_elem];
+        int packed_src_1 =
+            sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) +
+                             sh_stride * (cur_elem + 16)];
+
+        vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
+        vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
+      } else {
+        int cur_elem = tc_row + tc_offsets[i];
+
+        int packed_src_0 =
+            sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem];
+        int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) +
+                                            sh_stride * cur_elem];
+
+        vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
+        vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
+      }
+    }
+
+    constexpr int tile_size =
+        target_tile_k_size * target_tile_n_size / pack_factor;
+    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+
+    // Result of:
+    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+    if constexpr (!is_a_8bit && num_bits == 4) {
+      int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+
+      uint32_t res = 0;
+#pragma unroll
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
+
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+    } else if constexpr (is_a_8bit && num_bits == 4) {
+      int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+
+      uint32_t res = 0;
+#pragma unroll
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
+
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+    } else {
+      constexpr int pack_idx[4] = {0, 2, 1, 3};
+
+      uint32_t res1 = 0;
+      uint32_t res2 = 0;
+#pragma unroll
+      for (int i = 0; i < 4; i++) {
+        const int ii = is_a_8bit ? i : pack_idx[i];
+        res1 |= vals[ii] << (i * 8);
+        res2 |= vals[4 + ii] << (i * 8);
+      }
+
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
+    }
+  };
+
+  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
+#pragma unroll
+    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
+      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
+    }
+
+    wait_for_stage();
+  };
+#pragma unroll
+  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
+    int n_tile_id = 0;
+
+    start_pipes(k_tile_id, n_tile_id);
+
+    while (n_tile_id < n_tiles) {
+#pragma unroll
+      for (int pipe = 0; pipe < repack_stages; pipe++) {
+        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
+                        n_tile_id + pipe + repack_stages - 1);
+        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
+        wait_for_stage();
+      }
+      n_tile_id += repack_stages;
+    }
+  }
+}
+
+}  // namespace marlin
+
+#define CALL_IF(NUM_BITS, IS_A_8BIT)                                       \
+  else if (num_bits == NUM_BITS && is_a_8bit == IS_A_8BIT) {               \
+    cudaFuncSetAttribute(                                                  \
+        marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
+                                         IS_A_8BIT>,                       \
+        cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);      \
+    marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS,     \
+                                     IS_A_8BIT>                            \
+        <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>(      \
+            b_q_weight_ptr, out_ptr, size_k, size_n);                      \
+  }
+
+torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
+                                int64_t size_n, int64_t num_bits,
+                                bool is_a_8bit) {
+  // Verify compatibility with marlin tile of 16x64
+  TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
+              " is not divisible by tile_k_size = ", marlin::tile_k_size);
+  TORCH_CHECK(size_n % marlin::tile_n_size == 0, "size_n = ", size_n,
+              " is not divisible by tile_n_size = ", marlin::tile_n_size);
+
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  int const pack_factor = 32 / num_bits;
+
+  // Verify B
+  TORCH_CHECK(b_q_weight.size(0) == size_k,
+              "b_q_weight.size(0) = ", b_q_weight.size(0),
+              " is not size_k = ", size_k);
+  TORCH_CHECK((size_n / pack_factor) == b_q_weight.size(1),
+              "Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1),
+              ", size_n = ", size_n, ", pack_factor = ", pack_factor);
+
+  // Verify device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  torch::Tensor out = torch::empty(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+
+  // Get ptrs
+  uint32_t const* b_q_weight_ptr =
+      reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
+  uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());
+
+  // Get dev info
+  int dev = b_q_weight.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+  int blocks;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  if (false) {
+  }
+  CALL_IF(4, false)
+  CALL_IF(8, false)
+  CALL_IF(4, true)
+  CALL_IF(8, true)
+  else {
+    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
+                ", is_a_8bit = ", is_a_8bit);
+  }
+
+  return out;
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("awq_marlin_repack", &awq_marlin_repack);
+}
--- a/csrc/quantization/gptq_marlin/dequant.h
+++ b/csrc/quantization/gptq_marlin/dequant.h
@@ -0,0 +1,609 @@
+/*
+Fast Dequantization (Converting INT4/INT8/FP4/FP8 to FP16/BF16)
+
+The process of fast dequantization can be summarized as a combination
+of bitwise operations and floating-point computations:
+
+weight =>(bit_op / bitwise operations)=>
+f16_value =>(flop / floating-point computation)=>
+dequantized_weight
+
+Since the dequantized weights typically require subtracting the zero point and
+applying a scale factor, the floating-point computation step can be fused with
+the zero-point subtraction and scaling operations.
+
+The following are the parts that need to be modified for the fused operation
+of zero-point subtraction and scaling.
+
+## INT4 => FP16/BF16 or INT8 => FP16
+
+The floating-point computation is `__hsub2`
+
+If has zero points:
+
+    flop(bit_op(weight)) - flop(bit_op(zp))
+  = sub(bit_op(weight), bias) - sub(bit_op(zp), bias)
+  = bit_op(weight) - bit_op(zp)
+
+so we don't need additional modification.
+
+If has float zero points:
+
+    flop(bit_op(weight)) - fzp
+  = sub(bit_op(weight), bias) - fzp
+  = bit_op(weight) - (fzp + bias)
+
+where the `fzp + bias` can be computed at weight loading. But this
+may have accuracy issue, so we should not use this in most cases.
+
+If has not zero points:
+
+    scale(flop(bit_op(weight)))
+  = scale(sub(bit_op(weight), bias))
+  = scale(bit_op(weight)) - scale(bias)
+  = fma(bit_op(weight), scale_factor, scale(bias))
+
+where the `scale(bias)` can be cached. But this may have accuracy issue,
+so we should not use this in most cases.
+
+
+## INT8 => BF16
+
+INT8 => BF16 is a special case, it use byte_perm instead of flop.
+We cannot fused byte_perm with scaling.
+
+
+## FP4/FP8 => FP16/BF16
+
+    scale(flop(bit_op(weight)))
+  = scale(mul(bit_op(weight), multiplier))
+  = mul(bit_op(weight), scale_factor * multiplier)
+
+where `scale_factor * multiplier` can be computed at weight loading.
+
+*/
+
+#include "marlin_dtypes.cuh"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+// Lookup-table based 3-input logical operation; explicitly used for
+// dequantization as the compiler does not seem to automatically recognize it in
+// all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+// Constructs destination register by taking bytes from 2 sources (based on
+// mask)
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+template <typename scalar_t2, vllm::ScalarTypeId w_type_id,
+          bool skip_flop = false>
+__device__ inline void dequant(int q, scalar_t2* frag_b);
+
+//
+// Efficiently dequantize 4bit values packed in an int32 value into a full
+// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
+// with some small changes:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
+//
+template <>
+__device__ inline void dequant<half2, vllm::kU4B8.id(), true>(int q,
+                                                              half2* frag_b) {
+  const int MASK = 0x000f000f;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  q >>= 4;
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+
+  frag_b[0] = *reinterpret_cast<half2*>(&lo);
+  frag_b[1] = *reinterpret_cast<half2*>(&hi);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kU4B8.id(), false>(int q,
+                                                               half2* frag_b) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // clang-format on
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64086408;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd480d480;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kU4.id(), true>(int q,
+                                                            half2* frag_b) {
+  dequant<half2, vllm::kU4B8.id(), true>(q, frag_b);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kU4.id(), false>(int q,
+                                                             half2* frag_b) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // clang-format on
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64006400;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd400d400;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU4B8.id(), true>(
+    int q, nv_bfloat162* frag_b) {
+  static constexpr uint32_t MASK = 0x000f000f;
+  static constexpr uint32_t EX = 0x43004300;
+
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  q >>= 4;
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  // clang-format on
+
+  frag_b[0] = *reinterpret_cast<nv_bfloat162*>(&lo);
+  frag_b[1] = *reinterpret_cast<nv_bfloat162*>(&hi);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU4B8.id(), false>(
+    int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, vllm::kU4B8.id(), true>(q, frag_b);
+
+  static constexpr uint32_t SUB = 0x43084308;
+
+  frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+  frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU4.id(), true>(
+    int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, vllm::kU4B8.id(), true>(q, frag_b);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU4.id(), false>(
+    int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, vllm::kU4.id(), true>(q, frag_b);
+
+  static constexpr uint32_t SUB = 0x43004300;
+
+  frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+  frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+}
+
+//
+// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
+// bf16 Reference:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
+//
+template <>
+__device__ inline void dequant<half2, vllm::kU8B128.id(), true>(int q,
+                                                                half2* frag_b) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  frag_b[0] = *reinterpret_cast<half2*>(&lo);
+  frag_b[1] = *reinterpret_cast<half2*>(&hi);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kU8B128.id(), false>(
+    int q, half2* frag_b) {
+  dequant<half2, vllm::kU8B128.id(), true>(q, frag_b);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+  frag_b[0] = __hsub2(frag_b[0],
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(frag_b[1],
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kU8.id(), true>(int q,
+                                                            half2* frag_b) {
+  dequant<half2, vllm::kU8B128.id(), true>(q, frag_b);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kU8.id(), false>(int q,
+                                                             half2* frag_b) {
+  dequant<half2, vllm::kU8.id(), true>(q, frag_b);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;
+  frag_b[0] = __hsub2(frag_b[0],
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(frag_b[1],
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU8B128.id(), false>(
+    int q, nv_bfloat162* frag_b) {
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted =
+      reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
+
+  fp32_intermediates[0] -= 8388736.f;
+  fp32_intermediates[1] -= 8388736.f;
+  fp32_intermediates[2] -= 8388736.f;
+  fp32_intermediates[3] -= 8388736.f;
+
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
+                                   fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
+                                   fp32_intermediates_casted[3], 0x7632);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU8.id(), false>(
+    int q, nv_bfloat162* frag_b) {
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted =
+      reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
+
+  fp32_intermediates[0] -= 8388608.f;
+  fp32_intermediates[1] -= 8388608.f;
+  fp32_intermediates[2] -= 8388608.f;
+  fp32_intermediates[3] -= 8388608.f;
+
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
+                                   fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
+                                   fp32_intermediates_casted[3], 0x7632);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kFE4M3fn.id(), true>(
+    int q, half2* frag_b) {
+  // Constants for FP8 (E4M3) and FP16 formats
+  constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5;
+  constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT;
+  constexpr int MASK = 0x7F007F00;
+
+  // Extract and shift FP8 values to FP16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 8;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kFE4M3fn.id(), false>(
+    int q, half2* frag_b) {
+  dequant<half2, vllm::kFE4M3fn.id(), true>(q, frag_b);
+
+  // Constants for FP8 (E4M3) and FP16 formats
+  constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET =
+      (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
+
+  // Convert to half2 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kFE4M3fn.id(), true>(
+    int q, nv_bfloat162* frag_b) {
+  // Constants for FP8 (E4M3) and BF16 formats
+  constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
+
+  constexpr int MASK = 0x7F007F00;
+
+  // Extract and shift FP8 values to BF16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 8;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kFE4M3fn.id(), false>(
+    int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, vllm::kFE4M3fn.id(), true>(q, frag_b);
+
+  // Constants for FP8 (E4M3) and BF16 formats
+  constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET =
+      (1 << (BF16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
+  // position
+  constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
+  const nv_bfloat162 bias_reg =
+      __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
+
+  // Convert to bfloat162 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kFE2M1f.id(), true>(int q,
+                                                                half2* frag_b) {
+  // Constants for FP4 (E2M1) and FP16 formats
+  constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5;
+  constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP4_EXPONENT;
+  constexpr int MASK = 0x70007000;
+
+  // Extract and shift FP4 values to FP16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 4;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<half2, vllm::kFE2M1f.id(), false>(
+    int q, half2* frag_b) {
+  dequant<half2, vllm::kFE2M1f.id(), true>(q, frag_b);
+
+  // Constants for FP4 (E2M1) and FP16 formats
+  constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET =
+      (1 << (FP16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1));
+  const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
+
+  // Convert to half2 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kFE2M1f.id(), true>(
+    int q, nv_bfloat162* frag_b) {
+  // Constants for FP4 (E2M1) and FP16 formats
+  constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP4_EXPONENT;
+  constexpr int MASK = 0x70007000;
+
+  // Extract and shift FP4 values to FP16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 4;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kFE2M1f.id(), false>(
+    int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, vllm::kFE2M1f.id(), true>(q, frag_b);
+
+  // Constants for FP4 (E2M1) and BF16 formats
+  constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET =
+      (1 << (BF16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1));
+  // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
+  // position
+  constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
+  const nv_bfloat162 bias_reg =
+      __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
+
+  // Convert to half2 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <>
+__device__ inline void dequant<__nv_fp8x4_e4m3, vllm::kFE2M1f.id(), true>(
+    int q, __nv_fp8x4_e4m3* frag_b) {
+  // Constants for FP4 (E2M1) and FP16 formats
+  constexpr int FP4_EXPONENT = 2, FP8_EXPONENT = 4;
+  constexpr int RIGHT_SHIFT = FP8_EXPONENT - FP4_EXPONENT;
+  constexpr int MASK = 0x70707070;
+
+  // Extract and shift FP4 values to FP16 format
+  int Out1 = (q & 0x80808080) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 4;
+  int Out2 = (q & 0x80808080) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note1: reverse indexing is intentional because weights are permuted
+  // Note2: when dequant to 8bit type, we write to `frag_b[2]` instead of
+  //        `frag_b[1]` to fit the layout of tensorcore
+  frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<int32_t, vllm::kU4B8.id(), true>(
+    int q, int32_t* frag_b) {
+  constexpr int repeated_zp = 0x08080808;
+  constexpr int MASK = 0x80808080;
+
+  frag_b[0] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
+  q >>= 4;
+  frag_b[1] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
+}
+
+template <>
+__device__ inline void dequant<__nv_fp8x4_e4m3, vllm::kU4B8.id(), true>(
+    int q, __nv_fp8x4_e4m3* frag_b) {
+  int s = q & 0x08080808;
+  int Out1 = ((q & 0x07070707) | (s << 4)) + (s >> 3);
+  q >>= 4;
+  s = q & 0x08080808;
+  int Out2 = ((q & 0x07070707) | (s << 4)) + (s >> 3);
+
+  frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
+  frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
+}
+
+template <typename scalar_t2, vllm::ScalarTypeId s_type_id>
+__device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b);
+
+template <>
+__device__ inline void dequant_fp8_scales<half2, vllm::kFE4M3fn.id()>(
+    int q, half2* frag_b) {
+  int Out1 = (q & 0xFF00FF00) >> 1;
+  ;
+  q <<= 8;
+  int Out2 = (q & 0xFF00FF00) >> 1;
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
+};
+
+template <>
+__device__ inline void dequant_fp8_scales<nv_bfloat162, vllm::kFE4M3fn.id()>(
+    int q, nv_bfloat162* frag_b) {
+  constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
+  constexpr int MASK = 0x7F007F00;
+
+  // Extract and shift FP8 values to BF16 format
+  int Out1 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 8;
+  int Out2 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant_fp8_scales<nv_bfloat162, vllm::kFE8M0fnu.id()>(
+    int q, nv_bfloat162* frag_b) {
+  // In this conversion, 2 ** -127 in FP8E8M0 would become 0 in BF16,
+  // but we assume that such a extreme value would not occur in real models.
+  int Out1 = (q & 0xFF00FF00) >> 1;
+  q <<= 7;
+  int Out2 = q & 0x7F807F80;
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+};
+
+// subtract zero point in quanted format and then dequant
+template <typename scalar_t2, vllm::ScalarTypeId w_type_id,
+          bool skip_flop = false>
+__device__ inline void sub_zp_and_dequant(int q, scalar_t2* frag_b, int zp);
+
+template <>
+__device__ inline void sub_zp_and_dequant<int32_t, vllm::kU4.id(), true>(
+    int q, int32_t* frag_b, int zp) {
+  // INT4 with zp -> INT8
+  // see https://github.com/vllm-project/vllm/pull/24722
+  int repeated_zp = 0x01010101 * zp;
+  int MASK = 0x80808080;
+
+  frag_b[0] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
+  q >>= 4;
+  frag_b[1] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
+}
+
+template <>
+__device__ inline void sub_zp_and_dequant<__nv_fp8x4_e4m3, vllm::kU4.id(),
+                                          true>(int q, __nv_fp8x4_e4m3* frag_b,
+                                                int zp) {
+  // INT4 with zp -> FP8
+  // see https://github.com/vllm-project/vllm/pull/24722
+  uint32_t u_q = *reinterpret_cast<uint32_t*>(&q);
+  uint32_t u_zp = *reinterpret_cast<uint32_t*>(&zp);
+  uint32_t u_zp1 = u_zp + 1;
+  uint32_t repeated_zp = 0x01010101 * u_zp;
+
+  uint32_t q0, s;
+  q0 = (u_q & 0x0F0F0F0F) | 0x70707070;
+  s = (q0 + repeated_zp) & 0x80808080;
+  uint32_t Out1 = (q0 + (s >> 7) * u_zp1) & 0x0F0F0F0F | s;
+
+  u_q >>= 4;
+  q0 = (u_q & 0x0F0F0F0F) | 0x70707070;
+  s = (q0 + repeated_zp) & 0x80808080;
+  uint32_t Out2 = (q0 + (s >> 7) * u_zp1) & 0x0F0F0F0F | s;
+
+  frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
+  frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
+}
+
+#endif
+
+}  // namespace MARLIN_NAMESPACE_NAME
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@@ -0,0 +1,296 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import glob
+import itertools
+import os
+import subprocess
+import sys
+
+import jinja2
+
+ARCHS = []
+SUPPORT_FP8 = False
+for arch in sys.argv[1].split(","):
+    arch = arch[: arch.index(".") + 2].replace(".", "")
+    arch = int(arch)
+    # only SM89 and SM120 fully support
+    # mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
+    # SM90 and SM100 can use this PTX, but it’s simulated
+    # with FP16 MMA, so it cannot achieve any acceleration.
+    if arch in [89, 120]:
+        SUPPORT_FP8 = True
+
+FILE_HEAD_COMMENT = """
+// auto generated by generate_kernels.py
+// clang-format off
+""".lstrip()
+
+FILE_HEAD = (
+    FILE_HEAD_COMMENT
+    + """
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+"""
+)
+
+TEMPLATE = (
+    "template __global__ void Marlin<"
+    "{{a_type_id}}, "
+    "{{b_type_id}}, "
+    "{{c_type_id}}, "
+    "{{s_type_id}}, "
+    "{{threads}}, "
+    "{{thread_m_blocks}}, "
+    "{{thread_n_blocks}}, "
+    "{{thread_k_blocks}}, "
+    "{{m_block_size_8}}, "
+    "{{stages}}, "
+    "{{group_blocks}}, "
+    "{{is_zp_float}}>"
+    "( MARLIN_KERNEL_PARAMS );"
+)
+
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
+
+THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
+
+QUANT_CONFIGS = [
+    # AWQ-INT4
+    {
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # HQQ
+    {
+        "a_type": ["kFloat16"],
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [4],
+        "is_zp_float": True,
+    },
+    # GPTQ-INT4
+    {
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 0, 2, 4, 8],
+    },
+    # GPTQ-INT8
+    {
+        "b_type": "kU8B128",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 0, 2, 4, 8],
+    },
+    # FP8
+    {
+        "b_type": "kFE4M3fn",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 8],
+    },
+    # NVFP4
+    {
+        "b_type": "kFE2M1f",
+        "s_type": "kFE4M3fn",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [1],
+    },
+    # MXFP4
+    {
+        "a_type": ["kBFloat16"],
+        "b_type": "kFE2M1f",
+        "s_type": "kFE8M0fnu",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [2],
+    },
+    # AWQ-INT4 with INT8 activation
+    {
+        "a_type": ["kS8"],
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4 with INT8 activation
+    {
+        "a_type": ["kS8"],
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # AWQ-INT4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # MXFP4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kFE2M1f",
+        "c_type": ["kBFloat16"],
+        "s_type": "kFE8M0fnu",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [2],
+    },
+]
+
+
+def remove_old_kernels():
+    for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
+        subprocess.call(["rm", "-f", filename])
+
+    filename = os.path.dirname(__file__) + "/kernel_selector.h"
+    subprocess.call(["rm", "-f", filename])
+
+
+def generate_new_kernels():
+    result_dict = {}
+
+    for quant_config in QUANT_CONFIGS:
+        c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
+        a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
+        b_type = quant_config["b_type"]
+        is_zp_float = quant_config.get("is_zp_float", False)
+        all_group_blocks = quant_config["group_blocks"]
+        all_m_blocks = quant_config["thread_m_blocks"]
+        all_thread_configs = quant_config["thread_configs"]
+
+        for a_type, c_type in itertools.product(a_types, c_types):
+            if not SUPPORT_FP8 and a_type == "kFE4M3fn":
+                continue
+            if "16" in a_type and "16" in c_type and a_type != c_type:
+                continue
+            s_type = quant_config.get("s_type", c_type)
+            if (a_type, b_type, c_type) not in result_dict:
+                result_dict[(a_type, b_type, c_type)] = []
+
+            for group_blocks, m_blocks, thread_configs in itertools.product(
+                all_group_blocks, all_m_blocks, all_thread_configs
+            ):
+                thread_k, thread_n, threads = thread_configs
+
+                if threads == 256:
+                    # for small batch (m_blocks == 1),
+                    #     we only need (128, 128, 256)
+                    # for large batch (m_blocks > 1),
+                    #     we only need (64, 256, 256)
+                    if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
+                        continue
+                    if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
+                        continue
+
+                config = {
+                    "threads": threads,
+                    "s_type": s_type,
+                    "thread_m_blocks": max(m_blocks, 1),
+                    "thread_k_blocks": thread_k // 16,
+                    "thread_n_blocks": thread_n // 16,
+                    "m_block_size_8": "true" if m_blocks == 0.5 else "false",
+                    "stages": "pipe_stages",
+                    "group_blocks": group_blocks,
+                    "is_zp_float": "true" if is_zp_float else "false",
+                }
+
+                result_dict[(a_type, b_type, c_type)].append(config)
+
+    kernel_selector_str = FILE_HEAD_COMMENT
+
+    for (a_type, b_type, c_type), config_list in result_dict.items():
+        all_template_str_list = []
+        for config in config_list:
+            s_type = config["s_type"]
+            template_str = jinja2.Template(TEMPLATE).render(
+                a_type_id=f"vllm::{a_type}.id()",
+                b_type_id=f"vllm::{b_type}.id()",
+                c_type_id=f"vllm::{c_type}.id()",
+                s_type_id=f"vllm::{s_type}.id()",
+                **config,
+            )
+            all_template_str_list.append(template_str)
+
+            conditions = [
+                f"a_type == vllm::{a_type}",
+                f"b_type == vllm::{b_type}",
+                f"c_type == vllm::{c_type}",
+                f"s_type == vllm::{s_type}",
+                f"threads == {config['threads']}",
+                f"thread_m_blocks == {config['thread_m_blocks']}",
+                f"thread_n_blocks == {config['thread_n_blocks']}",
+                f"thread_k_blocks == {config['thread_k_blocks']}",
+                f"m_block_size_8 == {config['m_block_size_8']}",
+                f"group_blocks == {config['group_blocks']}",
+                f"is_zp_float == {config['is_zp_float']}",
+            ]
+            conditions = " && ".join(conditions)
+
+            if kernel_selector_str == FILE_HEAD_COMMENT:
+                kernel_selector_str += f"if ({conditions})\n  kernel = "
+            else:
+                kernel_selector_str += f"else if ({conditions})\n  kernel = "
+
+            kernel_template2 = (
+                "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+                "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+                "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+                "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+                "{{is_zp_float}}>;"
+            )
+
+            kernel_selector_str += (
+                jinja2.Template(kernel_template2).render(
+                    a_type_id=f"vllm::{a_type}.id()",
+                    b_type_id=f"vllm::{b_type}.id()",
+                    c_type_id=f"vllm::{c_type}.id()",
+                    s_type_id=f"vllm::{s_type}.id()",
+                    **config,
+                )
+                + "\n"
+            )
+
+        file_content = FILE_HEAD + "\n\n"
+        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+        if a_type == "kFE4M3fn":
+            filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+        else:
+            filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+
+        filename = filename.lower()
+
+        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+            f.write(file_content)
+
+    if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
+        kernel_selector_str += (
+            "else if (a_type == vllm::kFE4M3fn)\n"
+            "  TORCH_CHECK(false, "
+            '"marlin kernel with fp8 activation is not built.");'
+        )
+
+    with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
+        f.write(kernel_selector_str)
+
+
+if __name__ == "__main__":
+    remove_old_kernels()
+    generate_new_kernels()
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -0,0 +1,850 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+#include "kernel.h"
+#include "core/registration.h"
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
+  static_assert(std::is_same<scalar_t, half>::value ||          \
+                    std::is_same<scalar_t, nv_bfloat16>::value, \
+                "only float16 and bfloat16 is supported");
+
+namespace marlin {
+
+__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
+
+using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
+                                    int const* __restrict__ perm_int_ptr,
+                                    int4* __restrict__ out_int4_ptr, int size_m,
+                                    int size_k, int lda, int block_rows) {}
+
+}  // namespace marlin
+
+torch::Tensor gptq_marlin_gemm(
+    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
+    torch::Tensor& b_q_weight,
+    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
+    vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
+    int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
+    bool is_zp_float) {
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});
+}
+
+#else
+
+// For a given "a" of size [M,K] performs a permutation of the K columns based
+// on the given "perm" indices.
+__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
+                                    int const* __restrict__ perm_int_ptr,
+                                    int4* __restrict__ out_int4_ptr, int size_m,
+                                    int size_k, int lda, int block_rows) {
+  auto start_row = block_rows * blockIdx.x;
+  int finish_row = start_row + block_rows;
+  if (finish_row > size_m) {
+    finish_row = size_m;
+  }
+  int cur_block_rows = finish_row - start_row;
+
+  int input_row_stride = lda * sizeof(half) / 16;
+  int output_row_stride = size_k * sizeof(half) / 16;
+
+  auto permute_row = [&](int row) {
+    int iters = size_k / default_threads;
+    int rest = size_k % default_threads;
+
+    int input_offset = row * input_row_stride;
+    int output_offset = row * output_row_stride;
+
+    half const* a_row_half =
+        reinterpret_cast<half const*>(a_int4_ptr + input_offset);
+    half* out_half = reinterpret_cast<half*>(out_int4_ptr + output_offset);
+
+    int base_k = 0;
+
+    for (int i = 0; i < iters; i++) {
+      auto cur_k = base_k + threadIdx.x;
+      int src_pos = perm_int_ptr[cur_k];
+
+      out_half[cur_k] = a_row_half[src_pos];
+
+      base_k += default_threads;
+    }
+
+    if (rest) {
+      if (threadIdx.x < rest) {
+        auto cur_k = base_k + threadIdx.x;
+        int src_pos = perm_int_ptr[cur_k];
+
+        out_half[cur_k] = a_row_half[src_pos];
+      }
+    }
+  };
+
+  for (int i = 0; i < cur_block_rows; i++) {
+    int cur_row = start_row + i;
+    if (cur_row < size_m) {
+      permute_row(cur_row);
+    }
+  }
+}
+
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+
+thread_config_t small_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {128, 128, 256},
+    {64, 128, 128},
+    {128, 64, 128}};
+
+thread_config_t large_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},
+    {64, 128, 128},
+    {128, 64, 128}};
+
+typedef struct {
+  int blocks_per_sm;
+  thread_config_t tb_cfg;
+} exec_config_t;
+
+int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
+                          int prob_n, int prob_k, int num_bits, int group_size,
+                          bool has_act_order, bool is_k_full) {
+  bool cache_scales_chunk = has_act_order && !is_k_full;
+
+  int tb_n = th_config.thread_n;
+  int tb_k = th_config.thread_k;
+
+  // Get max scale groups per thread-block
+  int tb_groups;
+  if (group_size == -1) {
+    tb_groups = 1;
+  } else if (group_size == 0) {
+    tb_groups = div_ceil(tb_k, 32);  // Worst case is 32 group size
+  } else {
+    tb_groups = div_ceil(tb_k, group_size);
+  }
+
+  if (cache_scales_chunk) {
+    int load_groups =
+        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
+    load_groups = max(load_groups, 32);  // We load at least 32 scale groups
+    return load_groups * tb_n * 2;
+  } else {
+    int tb_scales = tb_groups * tb_n * 2;
+
+    return tb_scales * pipe_stages;
+  }
+}
+
+int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
+                          int prob_m, int prob_n, int prob_k, int num_bits,
+                          int group_size, bool has_act_order, bool is_k_full,
+                          int has_zp, int is_zp_float) {
+  int pack_factor = 32 / num_bits;
+
+  // Get B size
+  int tb_k = th_config.thread_k;
+  int tb_n = th_config.thread_n;
+  int tb_m = thread_m_blocks * 16;
+  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
+  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_red_size = tb_m * (tb_n + 8) * 2;
+  int sh_bias_size = tb_n * 2;
+  int tmp_size =
+      (sh_b_size > sh_red_size ? sh_red_size : sh_b_size) + sh_bias_size;
+  tmp_size = max(max(sh_b_size, sh_red_size), tmp_size);
+
+  int sh_s_size =
+      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
+                            group_size, has_act_order, is_k_full);
+  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+  int sh_zp_size = 0;
+  if (has_zp) {
+    if (is_zp_float)
+      sh_zp_size = sh_s_size;
+    else if (num_bits == 4)
+      sh_zp_size = sh_s_size / 4;
+    else if (num_bits == 8)
+      sh_zp_size = sh_s_size / 2;
+  }
+
+  int total_size =
+      tmp_size + sh_a_size + sh_s_size + sh_zp_size + sh_g_idx_size;
+
+  return total_size;
+}
+
+bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
+                     int prob_m, int prob_n, int prob_k, int num_bits,
+                     int group_size, bool has_act_order, bool is_k_full,
+                     int has_zp, int is_zp_float, int max_shared_mem) {
+  // Sanity
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
+      th_config.num_threads == -1) {
+    return false;
+  }
+
+  // Verify K/N are divisible by thread K/N
+  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
+    return false;
+  }
+
+  // Verify min for thread K/N
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+
+  // Check that pipeline fits into cache
+  int cache_size = get_kernel_cache_size(
+      th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
+      has_act_order, is_k_full, has_zp, is_zp_float);
+  return cache_size <= max_shared_mem;
+}
+
+MarlinFuncPtr get_marlin_kernel(
+    const vllm::ScalarType a_type, const vllm::ScalarType b_type,
+    const vllm::ScalarType c_type, const vllm::ScalarType s_type,
+    int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
+    bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
+    int threads, bool is_zp_float) {
+  int num_bits = b_type.size_bits();
+  auto kernel = MarlinDefault;
+
+  #include "kernel_selector.h"
+
+  return kernel;
+}
+
+exec_config_t determine_exec_config(
+    const vllm::ScalarType& a_type, const vllm::ScalarType& b_type,
+    const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
+    int prob_n, int prob_k, int thread_m_blocks, bool m_block_size_8,
+    int num_bits, int group_size, bool has_act_order, bool is_k_full,
+    bool has_zp, bool is_zp_float, int max_shared_mem, int sms) {
+  exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
+  thread_config_t* thread_configs = thread_m_blocks > 1
+                                        ? large_batch_thread_configs
+                                        : small_batch_thread_configs;
+  int thread_configs_size =
+      thread_m_blocks > 1
+          ? sizeof(large_batch_thread_configs) / sizeof(thread_config_t)
+          : sizeof(small_batch_thread_configs) / sizeof(thread_config_t);
+
+  for (int i = 0; i < thread_configs_size; i++) {
+    thread_config_t th_config = thread_configs[i];
+
+    if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
+                         num_bits, group_size, has_act_order, is_k_full, has_zp,
+                         is_zp_float, max_shared_mem - 512)) {
+      continue;
+    }
+
+    int cache_size = get_kernel_cache_size(
+        th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits,
+        group_size, has_act_order, is_k_full, has_zp, is_zp_float);
+
+    int group_blocks = 0;
+    if (!has_act_order) {
+      group_blocks = group_size == -1 ? -1 : group_size / 16;
+    }
+
+    auto kernel =
+        get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
+                          th_config.thread_n / 16, th_config.thread_k / 16,
+                          m_block_size_8, has_act_order, has_zp, group_blocks,
+                          th_config.num_threads, is_zp_float);
+
+    if (kernel == MarlinDefault) continue;
+
+    // int m_tiles = div_ceil(prob_m, thread_m_blocks * 16);
+    // int n_tiles = prob_n / th_config.thread_n;
+    // int k_tiles = prob_k / th_config.thread_k;
+
+    return {1, th_config};
+  }
+
+  return exec_cfg;
+}
+
+void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
+               void* a_s, void* b_s, void* g_s, void* zp, void* g_idx,
+               void* perm, void* a_tmp, int prob_m, int prob_n, int prob_k,
+               int lda, void* workspace, vllm::ScalarType const& a_type,
+               vllm::ScalarType const& b_type, vllm::ScalarType const& c_type,
+               vllm::ScalarType const& s_type, bool has_bias,
+               bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
+               int group_size, int dev, cudaStream_t stream, int thread_k_init,
+               int thread_n_init, int sms, bool use_atomic_add,
+               bool use_fp32_reduce, bool is_zp_float) {
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
+              ", ", prob_n, ", ", prob_k, "]");
+
+  int group_blocks = 0;
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(group_size != -1);
+      group_blocks = group_size / 16;
+      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
+                  " is not divisible by group_blocks = ", group_blocks);
+    } else {
+      TORCH_CHECK(group_size == 0);
+      group_blocks = 0;
+    }
+  } else {
+    if (group_size == -1) {
+      group_blocks = -1;
+    } else {
+      group_blocks = group_size / 16;
+      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
+                  " is not divisible by group_blocks = ", group_blocks);
+    }
+  }
+
+  int num_bits = b_type.size_bits();
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  int4* C_ptr = (int4*)C;
+  int4* C_tmp_ptr = (int4*)C_tmp;
+
+  const int4* bias_ptr = (const int4*)b_bias;
+  const float* a_s_ptr = (const float*)a_s;
+  const int4* b_s_ptr = (const int4*)b_s;
+  const uint16_t* g_s_ptr = (const uint16_t*)g_s;
+
+  const int4* zp_ptr = (const int4*)zp;
+  const int* g_idx_ptr = (const int*)g_idx;
+  const int* perm_ptr = (const int*)perm;
+  int4* a_tmp_ptr = (int4*)a_tmp;
+  int* locks = (int*)workspace;
+
+  if (has_act_order) {
+    // Permute A columns
+    int block_rows = div_ceil(prob_m, sms);
+    // avoid ">>>" being formatted to "> > >"
+    // clang-format off
+    permute_cols_kernel<<<sms, default_threads, 0, stream>>>(
+        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows);
+    // clang-format on
+    A_ptr = a_tmp_ptr;
+    lda = prob_k;
+
+    // If we have a full K, then we can run the non-act-order version of Marlin
+    // (since the weight rows are reordered by increasing group ids, and by
+    // having a full K, we have full original groups)
+    if (is_k_full) has_act_order = false;
+  }
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  int major_capability, minor_capability;
+  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
+                         dev);
+  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
+                         dev);
+  TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
+              "marlin kernel only support Ampere or newer GPUs.");
+  if (a_type == vllm::kFE4M3fn) {
+    TORCH_CHECK(
+        major_capability * 10 + minor_capability == 89 ||
+            major_capability * 10 + minor_capability == 120,
+        "Marlin W4A8-FP8 only support SM89 or SM120 device (It is slower than "
+        "Marlin W4A16 on other devices).");
+  }
+
+  int max_par = 16;
+  if (prob_n <= 4096) max_par = 16 * 8;
+  int max_shared_mem_new = max_shared_mem;
+  int rest_m = prob_m;
+  int max_thread_m_blocks = 4;
+  while (rest_m) {
+    int par_count = rest_m / (max_thread_m_blocks * 16);
+    if (par_count > max_par) par_count = max_par;
+    int prob_m_split =
+        par_count > 0 ? (par_count * (max_thread_m_blocks * 16)) : rest_m;
+
+    int thread_k = thread_k_init;
+    int thread_n = thread_n_init;
+
+    int thread_m_blocks = min(div_ceil(prob_m_split, 16), max_thread_m_blocks);
+    int m_block_size_8 = prob_m_split <= 8 && a_type.size_bits() == 16;
+
+    // Set thread config
+    exec_config_t exec_cfg;
+    thread_config_t thread_tfg;
+    if (thread_k != -1 && thread_n != -1) {
+      thread_tfg = thread_config_t{thread_k, thread_n, default_threads};
+      exec_cfg = exec_config_t{1, thread_tfg};
+      TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
+                  " is not divisible by thread_n = ", thread_n);
+      TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
+                  " is not divisible by thread_k = ", thread_k);
+    } else {
+      // Auto config
+      exec_cfg = determine_exec_config(
+          a_type, b_type, c_type, s_type, prob_m_split, prob_n, prob_k,
+          thread_m_blocks, m_block_size_8, num_bits, group_size, has_act_order,
+          is_k_full, has_zp, is_zp_float, max_shared_mem, sms);
+      thread_tfg = exec_cfg.tb_cfg;
+      if (thread_tfg.thread_n != -1) {
+        if (prob_n / thread_tfg.thread_n *
+                div_ceil(prob_m_split, thread_m_blocks * 16) * 4 <=
+            sms) {
+          if (is_valid_config({128, 64, 128}, thread_m_blocks, prob_m_split,
+                              prob_n, prob_k, num_bits, group_size,
+                              has_act_order, is_k_full, has_zp, is_zp_float,
+                              max_shared_mem_new)) {
+            thread_tfg = {128, 64, 128};
+            exec_cfg = {1, thread_tfg};
+          }
+        }
+      }
+
+      if (thread_tfg.thread_k == -1 && max_thread_m_blocks > 1) {
+        max_thread_m_blocks--;
+        continue;
+      }
+    }
+
+    int num_threads = thread_tfg.num_threads;
+    thread_k = thread_tfg.thread_k;
+    thread_n = thread_tfg.thread_n;
+    int blocks = sms * exec_cfg.blocks_per_sm;
+    if (exec_cfg.blocks_per_sm > 1)
+      max_shared_mem_new = max_shared_mem / exec_cfg.blocks_per_sm - 1024;
+
+    int thread_k_blocks = thread_k / 16;
+    int thread_n_blocks = thread_n / 16;
+
+    TORCH_CHECK(
+        is_valid_config(thread_tfg, thread_m_blocks, prob_m_split, prob_n,
+                        prob_k, num_bits, group_size, has_act_order, is_k_full,
+                        has_zp, is_zp_float, max_shared_mem_new),
+        "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
+        ", thread_k = ", thread_tfg.thread_k,
+        ", thread_n = ", thread_tfg.thread_n,
+        ", num_threads = ", thread_tfg.num_threads, " for MKN = [", prob_m,
+        ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
+        ", prob_m_split = ", prob_m_split, ", group_size = ", group_size,
+        ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
+        ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
+        ", max_shared_mem_new = ", max_shared_mem_new);
+
+    auto kernel = get_marlin_kernel(
+        a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
+        thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
+        num_threads, is_zp_float);
+
+    if (kernel == MarlinDefault) {
+      TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
+                  ", ", prob_k, "]", ", has_act_order = ", has_act_order,
+                  ", num_groups = ", num_groups, ", group_size = ", group_size,
+                  ", prob_m_split = ", prob_m_split,
+                  ", thread_m_blocks = ", thread_m_blocks,
+                  ", thread_n_blocks = ", thread_n_blocks,
+                  ", thread_k_blocks = ", thread_k_blocks,
+                  ", num_threads = ", num_threads, ", num_bits = ", num_bits);
+    }
+
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                         max_shared_mem_new);
+
+    bool part_use_atomic_add =
+        use_atomic_add && div_ceil(prob_m_split, 64) * prob_n <= 2048;
+
+    // avoid ">>>" being formatted to "> > >"
+    // clang-format off
+    kernel<<<blocks, num_threads, max_shared_mem_new, stream>>>(
+        A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr,
+        g_idx_ptr, num_groups,
+        prob_m_split, prob_n, prob_k, lda, locks, has_bias, part_use_atomic_add,
+        use_fp32_reduce, max_shared_mem_new);
+    // clang-format on
+
+    bool is_a_8bit = a_type.size_bits() == 8;
+    A_ptr += prob_m_split * (lda / (is_a_8bit ? 16 : 8));
+    a_s_ptr += prob_m_split;
+    C_ptr += prob_m_split * (prob_n / 8);
+    rest_m -= prob_m_split;
+  }
+}
+
+}  // namespace marlin
+
+torch::Tensor gptq_marlin_gemm(
+    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
+    torch::Tensor& b_q_weight,
+    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& a_scales_or_none,
+    std::optional<torch::Tensor> const& global_scale_or_none,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
+    vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
+    int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
+    bool is_zp_float) {
+  vllm::ScalarTypeId a_type_id, c_type_id, s_type_id;
+
+  auto c_dtype = a.dtype();
+  if (a.scalar_type() == at::ScalarType::Half) {
+    a_type_id = vllm::kFloat16.id();
+    c_type_id = vllm::kFloat16.id();
+  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
+    a_type_id = vllm::kBFloat16.id();
+    c_type_id = vllm::kBFloat16.id();
+  } else {
+    c_dtype = b_scales.dtype();
+    if (b_scales.scalar_type() == at::ScalarType::Half) {
+      c_type_id = vllm::kFloat16.id();
+    } else if (b_scales.scalar_type() == at::ScalarType::BFloat16) {
+      c_type_id = vllm::kBFloat16.id();
+    } else {
+      c_type_id = vllm::kBFloat16.id();
+
+      TORCH_CHECK(c_or_none.has_value(), "c must be passed for W4A8-FP4");
+      torch::Tensor c = c_or_none.value();
+      c_dtype = c.dtype();
+
+      if (c.scalar_type() == at::ScalarType::Half) {
+        c_type_id = vllm::kFloat16.id();
+      } else if (c.scalar_type() == at::ScalarType::BFloat16) {
+        c_type_id = vllm::kBFloat16.id();
+      } else {
+        TORCH_CHECK(false, "unsupported c dtype");
+      }
+    }
+
+    if (a.scalar_type() == at::ScalarType::Float8_e4m3fn) {
+      a_type_id = vllm::kFE4M3fn.id();
+    } else if (a.scalar_type() == at::ScalarType::Char) {
+      a_type_id = vllm::kS8.id();
+    } else {
+      TORCH_CHECK(false, "unsupported `a` scalar_type");
+    }
+  }
+
+  s_type_id = c_type_id;
+  if (b_type_id == vllm::kFE2M1f.id()) {
+    if (b_scales.scalar_type() == at::ScalarType::Float8_e4m3fn) {
+      s_type_id = vllm::kFE4M3fn.id();
+    } else if (b_scales.scalar_type() == at::ScalarType::Float8_e8m0fnu) {
+      s_type_id = vllm::kFE8M0fnu.id();
+    } else {
+      TORCH_CHECK(false,
+                  "When b_type = float4_e2m1f, b_scale scalar type must be",
+                  "float8_e4m3fn (for NVFP4) or float8_e8m0fnu (for MXFP4).");
+    }
+  }
+
+  vllm::ScalarType a_type = vllm::ScalarType::from_id(a_type_id);
+  vllm::ScalarType b_type = vllm::ScalarType::from_id(b_type_id);
+  vllm::ScalarType c_type = vllm::ScalarType::from_id(c_type_id);
+  vllm::ScalarType s_type = vllm::ScalarType::from_id(s_type_id);
+
+  int pack_factor = 32 / b_type.size_bits();
+
+  // Verify A
+  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
+              ", size_m = ", size_m);
+  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
+              ", size_k = ", size_k);
+
+  // Verify B
+  TORCH_CHECK(
+      size_k % MARLIN_NAMESPACE_NAME::tile_size == 0, "size_k = ", size_k,
+      " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK((size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
+              ", size_k = ", size_k,
+              ", tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK(
+      b_q_weight.size(1) % MARLIN_NAMESPACE_NAME::tile_size == 0,
+      "b_q_weight.size(1) = ", b_q_weight.size(1),
+      " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
+  int actual_size_n =
+      (b_q_weight.size(1) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor;
+  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
+              ", actual_size_n = ", actual_size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.stride(1) == 1, "A.stride(1) is not 1");
+  // We use int4 (16 bytes) to load A, so A must aligned to 16 bytes
+  TORCH_CHECK(a.stride(0) % 8 == 0, "A.stride(0) must divisible by 8");
+  TORCH_CHECK(((uint64_t)a.data_ptr()) % 16 == 0, "A must aligned to 16 bytes");
+
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  torch::Tensor a_scales;
+  auto options = torch::TensorOptions().dtype(c_dtype).device(a.device());
+  auto options_fp32 =
+      torch::TensorOptions().dtype(at::kFloat).device(a.device());
+
+  if (a_scales_or_none.has_value()) {
+    a_scales = a_scales_or_none.value();
+    TORCH_CHECK(a_type.size_bits() == 8,
+                "a_scales can only be used for 8bit activation.");
+  } else {
+    a_scales = torch::empty({0}, options_fp32);
+    TORCH_CHECK(a_type.size_bits() != 8,
+                "the a_scales parameter must be passed for 8bit activation.");
+  }
+
+  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_k = -1;
+  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_n = -1;
+  // sms: number of SMs to use for the kernel
+  int sms = -1;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  torch::Tensor c;
+  if (c_or_none.has_value()) {
+    c = c_or_none.value();
+    TORCH_CHECK(c.device().is_cuda(), "c is not on GPU");
+    TORCH_CHECK(c.is_contiguous(), "c is not contiguous");
+    TORCH_CHECK(c.size(0) == size_m, "Shape mismatch: c.size(0) = ", c.size(0),
+                ", size_m = ", size_m);
+    TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1),
+                ", size_n = ", size_n);
+  } else {
+    c = torch::empty({size_m, size_n}, options);
+  }
+  if (size_m == 0) return c;
+
+  // Alloc C tmp buffer that is going to be used for the global reduce
+  torch::Tensor c_tmp;
+  if (use_fp32_reduce) {
+    int max_m_block_size = (size_m + 16 - 1) / 16 * 16;
+    max_m_block_size = min(max_m_block_size, 64);
+    int max_c_tmp_size =
+        sms * max_m_block_size * MARLIN_NAMESPACE_NAME::max_thread_n;
+    c_tmp = torch::empty({max_c_tmp_size}, options_fp32);
+  } else {
+    c_tmp = torch::empty({0}, options_fp32);
+  }
+
+  // Detect groupsize and act_order
+  int num_groups = -1;
+  int group_size = -1;
+
+  int rank = b_scales.sizes().size();
+  TORCH_CHECK(rank == 2, "b_scales rank = ", rank, " is not 2");
+  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
+              " is not size_n = ", size_n);
+  num_groups = b_scales.size(0);
+
+  torch::Tensor g_idx, perm, a_tmp;
+  if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
+    g_idx = g_idx_or_none.value();
+    perm = perm_or_none.value();
+
+    TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
+    TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
+    TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+    TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+
+    // Verify g_idx and perm
+    TORCH_CHECK((g_idx.size(-1) == 0 && perm.size(-1) == 0) ||
+                    (g_idx.size(-1) == size_k && perm.size(-1) == size_k),
+                "Unexpected g_idx.size(-1) = ", g_idx.size(-1),
+                " and perm.size(-1) = ", perm.size(-1),
+                ", where size_k = ", size_k);
+  } else {
+    g_idx = torch::empty({0}, options);
+    perm = torch::empty({0}, options);
+    a_tmp = torch::empty({0}, options);
+  }
+  bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0;
+
+  if (has_act_order) {
+    a_tmp = torch::empty({size_m, size_k}, options);
+    if (is_k_full) {
+      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
+      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
+                  ", is not divisible by num_groups = ", num_groups);
+      group_size = size_k / num_groups;
+    } else {
+      group_size = 0;
+    }
+
+  } else {
+    a_tmp = torch::empty({0}, options);
+    if (num_groups > 1) {
+      TORCH_CHECK(
+          size_k % num_groups == 0, "size_k = ", size_k,
+          ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
+      group_size = size_k / num_groups;
+    } else {
+      group_size = -1;
+    }
+  }
+
+  torch::Tensor global_scale;
+  if (global_scale_or_none.has_value()) {
+    global_scale = global_scale_or_none.value();
+    TORCH_CHECK(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn,
+                "global_scale can only be used for nvfp4 format.");
+  } else {
+    global_scale = torch::empty({0}, options);
+    TORCH_CHECK(!(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn),
+                "the global_scale parameter must be passed for nvfp4 format.");
+  }
+
+  bool has_bias = b_bias_or_none.has_value();
+  torch::Tensor b_bias;
+  if (has_bias) {
+    b_bias = b_bias_or_none.value();
+    TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
+    TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
+    TORCH_CHECK(b_bias.size(0) == size_n, "b_bias.size(0) != size_n");
+    TORCH_CHECK(b_bias.stride(0) == 1, "b_bias.stride(0) != 1");
+  } else {
+    b_bias = torch::empty({0}, options);
+  }
+
+  torch::Tensor b_zeros;
+  if (b_zeros_or_none.has_value()) {
+    b_zeros = b_zeros_or_none.value();
+    TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU");
+    TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous");
+  } else {
+    b_zeros = torch::empty({0}, options);
+  }
+  bool has_zp = b_zeros.size(-1) > 0;
+  if (has_zp) {
+    TORCH_CHECK(
+        b_type == vllm::kU4 || b_type == vllm::kU8,
+        "b_type must be u4 or u8 when has_zp = True. Got = ", b_type.str());
+  } else {
+    TORCH_CHECK(b_type == vllm::kU4B8 || b_type == vllm::kU8B128 ||
+                    b_type == vllm::kS4 || b_type == vllm::kS8 ||
+                    b_type == vllm::kFE4M3fn || b_type == vllm::kFE2M1f,
+                "b_type must be uint4b8, uint8b128, int4, int8, "
+                "float8_e4m3fn or float4_e2m1f when has_zp = False. Got = ",
+                b_type.str());
+  }
+
+  if (has_zp && is_zp_float) {
+    TORCH_CHECK(a.scalar_type() == at::ScalarType::Half,
+                "Computation type must be float16 (half) when using float zero "
+                "points.");
+  }
+
+  // Verify b_zeros
+  if (has_zp) {
+    int rank = b_zeros.sizes().size();
+    TORCH_CHECK(rank == 2, "b_zeros rank = ", rank, " is not 2");
+    if (is_zp_float) {
+      TORCH_CHECK(b_zeros.size(1) == size_n,
+                  "b_zeros dim 1 = ", b_zeros.size(1),
+                  " is not size_n = ", size_n);
+      TORCH_CHECK(num_groups == b_zeros.size(0),
+                  "b_zeros dim 0 = ", b_zeros.size(0),
+                  " is not num_groups = ", num_groups);
+      TORCH_CHECK(num_groups != -1, "num_groups must be != -1");
+    } else {
+      TORCH_CHECK(b_zeros.size(0) == num_groups,
+                  "b_zeros dim 0 = ", b_zeros.size(0),
+                  " is not num_groups = ", num_groups);
+      TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor,
+                  "b_zeros dim 1 = ", b_zeros.size(1),
+                  " is not size_n / pack_factor = ", size_n / pack_factor);
+    }
+  }
+
+  // Verify workspace size
+  TORCH_CHECK(size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0,
+              "size_n = ", size_n, ", is not divisible by min_thread_n = ",
+              MARLIN_NAMESPACE_NAME::min_thread_n);
+
+  int min_workspace_size = sms;
+  TORCH_CHECK(workspace.numel() >= min_workspace_size,
+              "workspace.numel = ", workspace.numel(),
+              " is below min_workspace_size = ", min_workspace_size);
+
+  int dev = a.get_device();
+
+  TORCH_CHECK(a_scales.scalar_type() == at::ScalarType::Float,
+              "scalar type of a_scales must be float");
+  TORCH_CHECK(global_scale.scalar_type() == c.scalar_type(),
+              "scalar type of global_scale must be the same with c");
+  if (a_type.size_bits() == 16) {
+    TORCH_CHECK(
+        a.scalar_type() == c.scalar_type(),
+        "scalar type of a must be the same with c for 16 bit activation");
+  }
+
+  marlin::marlin_mm(
+      a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), c_tmp.data_ptr(),
+      b_bias.data_ptr(), a_scales.data_ptr(), b_scales.data_ptr(),
+      global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(),
+      perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, a.stride(0),
+      workspace.data_ptr(), a_type, b_type, c_type, s_type, has_bias,
+      has_act_order, is_k_full, has_zp, num_groups, group_size, dev,
+      at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+      use_atomic_add, use_fp32_reduce, is_zp_float);
+
+  return c;
+}
+
+#endif
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("gptq_marlin_gemm", &gptq_marlin_gemm);
+}
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@@ -0,0 +1,357 @@
+#include "marlin.cuh"
+
+#include "core/registration.h"
+
+namespace marlin {
+
+template <int const num_threads, int const num_bits, bool const has_perm,
+          bool is_a_8bit>
+__global__ void gptq_marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr,
+    uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
+    int size_k, int size_n) {
+  constexpr int pack_factor = 32 / num_bits;
+
+  constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
+  constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
+  int k_tiles = size_k / target_tile_k_size;
+  int n_tiles = size_n / target_tile_n_size;
+  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
+
+  auto start_k_tile = blockIdx.x * block_k_tiles;
+  if (start_k_tile >= k_tiles) {
+    return;
+  }
+
+  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<repack_stages - 2>();
+    __syncthreads();
+  };
+
+  extern __shared__ int4 sh[];
+
+  constexpr int perm_size = target_tile_k_size / 4;
+
+  int4* sh_perm_ptr = sh;
+  int4* sh_pipe_ptr = sh_perm_ptr;
+  if constexpr (has_perm) {
+    sh_pipe_ptr += perm_size;
+  }
+
+  constexpr int tile_ints = target_tile_k_size / pack_factor;
+
+  constexpr int stage_n_threads = target_tile_n_size / 4;
+  constexpr int stage_k_threads = has_perm ? target_tile_k_size : tile_ints;
+  constexpr int stage_size = stage_k_threads * stage_n_threads;
+
+  auto load_perm_to_shared = [&](int k_tile_id) {
+    int first_k_int4 = (k_tile_id * target_tile_k_size) / 4;
+
+    int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr);
+
+    if (threadIdx.x < perm_size) {
+      sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
+    }
+    __syncthreads();
+  };
+
+  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      cp_async_fence();
+      return;
+    }
+
+    int first_n = n_tile_id * target_tile_n_size;
+
+    int4* sh_ptr = sh_pipe_ptr + stage_size * pipe;
+
+    if constexpr (has_perm) {
+      if (threadIdx.x < stage_size) {
+        auto k_id = threadIdx.x / stage_n_threads;
+        auto n_id = threadIdx.x % stage_n_threads;
+
+        uint32_t const* sh_perm_int_ptr =
+            reinterpret_cast<uint32_t const*>(sh_perm_ptr);
+
+        int src_k = sh_perm_int_ptr[k_id];
+        int src_k_packed = src_k / pack_factor;
+
+        cp_async4(
+            &sh_ptr[k_id * stage_n_threads + n_id],
+            reinterpret_cast<int4 const*>(&(
+                b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
+      }
+
+    } else {
+      if (threadIdx.x < stage_size) {
+        auto k_id = threadIdx.x / stage_n_threads;
+        auto n_id = threadIdx.x % stage_n_threads;
+
+        int first_k = k_tile_id * target_tile_k_size;
+        int first_k_packed = first_k / pack_factor;
+
+        cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
+                  reinterpret_cast<int4 const*>(
+                      &(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
+                                       first_n + (n_id * 4)])));
+      }
+    }
+
+    cp_async_fence();
+  };
+
+  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      return;
+    }
+
+    auto warp_id = threadIdx.x / 32;
+    auto th_id = threadIdx.x % 32;
+
+    if (warp_id >= 4) {
+      return;
+    }
+
+    int tc_col = th_id / 4;
+    int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);
+
+    constexpr int tc_offsets[4] = {0, 1, 8, 9};
+
+    int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
+
+    constexpr int sh_stride = target_tile_n_size;
+    constexpr uint32_t mask = (1 << num_bits) - 1;
+
+    int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
+    uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);
+
+    uint32_t* sh_perm_int_ptr = reinterpret_cast<uint32_t*>(sh_perm_ptr);
+
+    uint32_t vals[8];
+
+    if constexpr (has_perm) {
+      static_assert(!is_a_8bit);
+      for (int i = 0; i < 4; i++) {
+        int k_idx = tc_row + tc_offsets[i];
+
+        uint32_t src_k = sh_perm_int_ptr[k_idx];
+        uint32_t src_k_pos = src_k % pack_factor;
+
+        uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
+        uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;
+
+        uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
+        uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;
+
+        vals[i] = b1_cur_val;
+        vals[4 + i] = b2_cur_val;
+      }
+
+    } else {
+      uint32_t b1_vals[tile_ints];
+      uint32_t b2_vals[tile_ints];
+
+#pragma unroll
+      for (int i = 0; i < tile_ints; i++) {
+        if constexpr (is_a_8bit) {
+          b1_vals[i] =
+              sh_stage_int_ptr[cur_n + sh_stride * i + (warp_id % 2) * 8];
+        } else {
+          b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
+          b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
+        }
+      }
+
+#pragma unroll
+      for (int i = 0; i < 4; i++) {
+        int cur_elem = tc_row + (is_a_8bit ? i : tc_offsets[i]);
+        int cur_int = cur_elem / pack_factor;
+        int cur_pos = cur_elem % pack_factor;
+
+        vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
+        if constexpr (is_a_8bit)
+          vals[4 + i] =
+              (b1_vals[cur_int + tile_ints / 2] >> (cur_pos * num_bits)) & mask;
+        else
+          vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
+      }
+    }
+
+    constexpr int tile_size =
+        target_tile_k_size * target_tile_n_size / pack_factor;
+    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+
+    // Result of:
+    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+    if constexpr (!is_a_8bit && num_bits == 4) {
+      int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+
+      uint32_t res = 0;
+#pragma unroll
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
+
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+    } else if constexpr (is_a_8bit && num_bits == 4) {
+      int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+
+      uint32_t res = 0;
+#pragma unroll
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
+
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+    } else {
+      constexpr int pack_idx[4] = {0, 2, 1, 3};
+
+      uint32_t res1 = 0;
+      uint32_t res2 = 0;
+#pragma unroll
+      for (int i = 0; i < 4; i++) {
+        const int ii = is_a_8bit ? i : pack_idx[i];
+        res1 |= vals[ii] << (i * 8);
+        res2 |= vals[4 + ii] << (i * 8);
+      }
+
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
+    }
+  };
+
+  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
+#pragma unroll
+    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
+      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
+    }
+
+    wait_for_stage();
+  };
+#pragma unroll
+  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
+    int n_tile_id = 0;
+
+    if constexpr (has_perm) {
+      load_perm_to_shared(k_tile_id);
+    }
+
+    start_pipes(k_tile_id, n_tile_id);
+
+    while (n_tile_id < n_tiles) {
+#pragma unroll
+      for (int pipe = 0; pipe < repack_stages; pipe++) {
+        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
+                        n_tile_id + pipe + repack_stages - 1);
+        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
+        wait_for_stage();
+      }
+      n_tile_id += repack_stages;
+    }
+  }
+}
+
+}  // namespace marlin
+
+#define CALL_IF(NUM_BITS, HAS_PERM, IS_A_8BIT)                              \
+  else if (num_bits == NUM_BITS && has_perm == HAS_PERM &&                  \
+           is_a_8bit == IS_A_8BIT) {                                        \
+    cudaFuncSetAttribute(                                                   \
+        marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
+                                          HAS_PERM, IS_A_8BIT>,             \
+        cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);       \
+    marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS,     \
+                                      HAS_PERM, IS_A_8BIT>                  \
+        <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>(       \
+            b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);             \
+  }
+
+torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
+                                 int64_t size_k, int64_t size_n,
+                                 int64_t num_bits, bool is_a_8bit) {
+  // Verify compatibility with marlin tile of 16x64
+  TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
+              " is not divisible by tile_k_size = ", marlin::tile_k_size);
+  TORCH_CHECK(size_n % marlin::tile_n_size == 0, "size_n = ", size_n,
+              " is not divisible by tile_n_size = ", marlin::tile_n_size);
+
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  int const pack_factor = 32 / num_bits;
+
+  // Verify B
+  TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
+              ", size_k = ", size_k, ", pack_factor = ", pack_factor);
+  TORCH_CHECK(b_q_weight.size(1) == size_n,
+              "b_q_weight.size(1) = ", b_q_weight.size(1),
+              " is not size_n = ", size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
+
+  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+  TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  torch::Tensor out = torch::empty(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+
+  // Detect if there is act_order
+  bool has_perm = perm.size(0) != 0;
+
+  // Get ptrs
+  uint32_t const* b_q_weight_ptr =
+      reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
+  uint32_t const* perm_ptr = reinterpret_cast<uint32_t const*>(perm.data_ptr());
+  uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());
+
+  // Get dev info
+  int dev = b_q_weight.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+  int blocks;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  if (false) {
+  }
+  CALL_IF(4, false, false)
+  CALL_IF(4, true, false)
+  CALL_IF(8, false, false)
+  CALL_IF(8, true, false)
+
+  CALL_IF(4, false, true)
+  CALL_IF(8, false, true)
+
+  else {
+    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
+                ", has_perm = ", has_perm, ", is_a_8bit = ", is_a_8bit);
+  }
+
+  return out;
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("gptq_marlin_repack", &gptq_marlin_repack);
+}
--- a/csrc/quantization/gptq_marlin/kernel.h
+++ b/csrc/quantization/gptq_marlin/kernel.h
@@ -0,0 +1,43 @@
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+#include "marlin.cuh"
+#include "marlin_dtypes.cuh"
+#include "core/scalar_type.hpp"
+
+#define MARLIN_KERNEL_PARAMS                                                   \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,                      \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                          \
+      const int4 *__restrict__ b_bias_ptr,                                     \
+      const float *__restrict__ a_scales_ptr,                                  \
+      const int4 *__restrict__ scales_ptr,                                     \
+      const uint16_t *__restrict__ global_scale_ptr,                           \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,          \
+      int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \
+      bool has_bias, bool use_atomic_add, bool use_fp32_reduce,                \
+      int max_shared_mem
+
+namespace MARLIN_NAMESPACE_NAME {
+template <const vllm::ScalarTypeId a_type_id,  // A ScalarType id
+          const vllm::ScalarTypeId b_type_id,  // B ScalarType id
+          const vllm::ScalarTypeId c_type_id,  // C ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // B_SCALE ScalarType id
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const bool m_block_size_8,  // whether m_block_size == 8
+                                      // only works when thread_m_blocks == 1
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks,  // number of consecutive 16x16 blocks
+                                   // with a separate quantization scale
+          const bool is_zp_float   // is zero point of float16 type?
+          >
+__global__ void Marlin(MARLIN_KERNEL_PARAMS);
+
+}
--- a/csrc/quantization/gptq_marlin/marlin.cuh
+++ b/csrc/quantization/gptq_marlin/marlin.cuh
@@ -0,0 +1,131 @@
+#pragma once
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+namespace MARLIN_NAMESPACE_NAME {
+
+// Marlin params
+
+// 8 warps are a good choice since every SM has 4 schedulers and having more
+// than 1 warp per schedule allows some more latency hiding. At the same time,
+// we want relatively few warps to have many registers per warp and small tiles.
+static constexpr int default_threads = 256;
+
+static constexpr int pipe_stages =
+    4;  // 4 pipeline stages fit into shared memory
+
+static constexpr int min_thread_n = 64;
+static constexpr int min_thread_k = 64;
+static constexpr int max_thread_n = 256;
+
+static constexpr int tile_size = 16;
+static constexpr int max_par = 16;
+
+// Repack params
+static constexpr int repack_stages = 8;
+
+static constexpr int repack_threads = 256;
+
+static constexpr int tile_k_size = tile_size;
+static constexpr int tile_n_size = tile_k_size * 4;
+
+// Helpers
+template <typename T, int n>
+struct Vec {
+  T elems[n];
+  __device__ T& operator[](int i) { return elems[i]; }
+};
+
+using I4 = Vec<int, 4>;
+
+constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+// No support for async
+#else
+
+__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  const int BYTES = 4;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.ca.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+__device__ inline void cp_async2_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  const int BYTES = 8;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.ca.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+__device__ inline void cp_async4_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.ca.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+}
+
+__device__ inline void cp_async_fence() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+
+#endif
+
+}  // namespace MARLIN_NAMESPACE_NAME
--- a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh
+++ b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh
@@ -0,0 +1,149 @@
+
+#ifndef _data_types_cuh
+#define _data_types_cuh
+#include "marlin.cuh"
+#include "core/scalar_type.hpp"
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda_fp8.h>
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template <long scalar_type_id>
+class MarlinScalarType {};
+
+template <>
+class MarlinScalarType<vllm::kFloat16.id()> {
+ public:
+  using scalar_t = half;
+  using scalar_t2 = half2;
+  using scalar_t4 = half2;
+  using scalar_32bit_t = half2;
+
+  // Matrix fragments for tensor core instructions; their precise layout is
+  // documented here:
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+  using FragA = Vec<half2, 4>;
+  using FragB = Vec<half2, 2>;
+  using FragC = Vec<float, 4>;
+  using FragS = Vec<half2, 1>;
+  using FragS0 = Vec<__nv_fp8x2_e4m3, 1>;
+  using FragZP = Vec<half2, 4>;
+
+  static __device__ float inline num2float(const half x) {
+    return __half2float(x);
+  }
+
+  static __device__ half2 inline num2num2(const half x) {
+    return __half2half2(x);
+  }
+
+  static __device__ half2 inline nums2num2(const half x1, const half x2) {
+    return __halves2half2(x1, x2);
+  }
+
+  static __host__ __device__ half inline float2num(const float x) {
+    return __float2half(x);
+  }
+
+  static __host__ __device__ float2 inline num22float2(const half2 x) {
+    return __half22float2(x);
+  }
+};
+
+template <>
+class MarlinScalarType<vllm::kBFloat16.id()> {
+ public:
+  using scalar_t = nv_bfloat16;
+  using scalar_t2 = nv_bfloat162;
+  using scalar_t4 = nv_bfloat162;
+  using scalar_32bit_t = nv_bfloat162;
+
+  using FragA = Vec<nv_bfloat162, 4>;
+  using FragB = Vec<nv_bfloat162, 2>;
+  using FragC = Vec<float, 4>;
+  using FragS = Vec<nv_bfloat162, 1>;
+  using FragS0 = Vec<__nv_fp8x2_e4m3, 1>;
+  using FragZP = Vec<nv_bfloat162, 4>;
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+  static __device__ float inline num2float(const nv_bfloat16 x) {
+    return __bfloat162float(x);
+  }
+
+  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
+    return __bfloat162bfloat162(x);
+  }
+
+  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
+                                                  const nv_bfloat16 x2) {
+    return __halves2bfloat162(x1, x2);
+  }
+
+  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
+    return __float2bfloat16(x);
+  }
+
+  static __host__ __device__ float2 inline num22float2(const nv_bfloat162 x) {
+    return __bfloat1622float2(x);
+  }
+#endif
+};
+
+template <>
+class MarlinScalarType<vllm::kFE4M3fn.id()> {
+ public:
+  using scalar_t = __nv_fp8_e4m3;
+  using scalar_t2 = __nv_fp8x2_e4m3;
+  using scalar_t4 = __nv_fp8x4_e4m3;
+  using scalar_32bit_t = __nv_fp8x4_e4m3;
+
+  using FragA = Vec<__nv_fp8x4_e4m3, 4>;
+  using FragB = Vec<__nv_fp8x4_e4m3, 2>;
+  using FragC = Vec<float, 4>;
+  using FragZP = Vec<__nv_fp8x2_e4m3, 4>;
+
+  static __host__ __device__
+      float2 inline num22float2(const __nv_fp8x2_e4m3 x) {
+    return (float2)x;
+  }
+};
+
+template <>
+class MarlinScalarType<vllm::kS8.id()> {
+ public:
+  using scalar_t = int8_t;
+  using scalar_t2 = int16_t;
+  using scalar_t4 = int32_t;
+  using scalar_32bit_t = int32_t;
+
+  using FragA = Vec<int32_t, 4>;
+  using FragB = Vec<int32_t, 2>;
+  using FragC = Vec<float, 4>;
+  using FragZP = Vec<int16_t, 4>;
+};
+
+template <typename scalar_t>
+class MarlinScalarType2 {};
+
+template <>
+class MarlinScalarType2<half> : public MarlinScalarType<vllm::kFloat16.id()> {};
+
+template <>
+class MarlinScalarType2<nv_bfloat16>
+    : public MarlinScalarType<vllm::kBFloat16.id()> {};
+
+template <>
+class MarlinScalarType2<__nv_fp8_e4m3>
+    : public MarlinScalarType<vllm::kFE4M3fn.id()> {};
+
+template <>
+class MarlinScalarType2<int8_t> : public MarlinScalarType<vllm::kS8.id()> {};
+
+}  // namespace MARLIN_NAMESPACE_NAME
+
+#endif
--- a/csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu
+++ b/csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu
@@ -0,0 +1,106 @@
+
+
+#include "marlin.cuh"
+
+#include "core/registration.h"
+
+// for only non-zp format (like gptq)
+__global__ void marlin_int4_fp8_preprocess_kernel_without_zp(
+    // qweight: (size_k * size_n // 8,)
+    const int32_t* __restrict__ qweight,
+    // output: same shape with qweight
+    int32_t* __restrict__ output) {
+  int32_t val = qweight[blockIdx.x * 32 + threadIdx.x];
+  int32_t new_val = 0;
+
+#pragma unroll
+  for (int32_t i = 0; i < 8; i++) {
+    int32_t single_val = val & 0xF;
+    single_val = single_val >= 8 ? single_val - 8 : 15 - single_val;
+    new_val |= single_val << (i * 4);
+    val >>= 4;
+  }
+
+  output[blockIdx.x * 32 + threadIdx.x] = new_val;
+}
+
+// for awq format only (with zp and with awq weight layout)
+__global__ void marlin_int4_fp8_preprocess_kernel_awq(
+    // AWQ qweight: (size_k, size_n // 8)
+    const int32_t* __restrict__ qweight,
+    // output: same shape with qweight
+    int32_t* __restrict__ output,
+    // AWQ zeros: (size_k // group_size, size_n // 8)
+    const int32_t* __restrict__ qzeros, int32_t size_n, int32_t size_k,
+    int32_t group_size) {
+  int32_t val =
+      qweight[(blockIdx.x * 32 + threadIdx.x) * size_n / 8 + blockIdx.y];
+  int32_t zero =
+      qzeros[(blockIdx.x * 32 + threadIdx.x) / group_size * size_n / 8 +
+             blockIdx.y];
+  int32_t new_val = 0;
+
+#pragma unroll
+  for (int32_t i = 0; i < 8; i++) {
+    int32_t single_val = val & 0xF;
+    int32_t single_zero = zero & 0xF;
+
+    single_val =
+        single_val >= single_zero ? single_val - single_zero : 15 - single_val;
+    new_val |= single_val << (i * 4);
+    val >>= 4;
+    zero >>= 4;
+  }
+
+  output[(blockIdx.x * 32 + threadIdx.x) * size_n / 8 + blockIdx.y] = new_val;
+}
+
+torch::Tensor marlin_int4_fp8_preprocess(
+    torch::Tensor& qweight, std::optional<torch::Tensor> qzeros_or_none,
+    bool inplace) {
+  TORCH_CHECK(qweight.device().is_cuda(), "qweight is not on GPU");
+  TORCH_CHECK(qweight.scalar_type() == at::ScalarType::Int,
+              "qweight.dtype != torch.int32");
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(qweight));
+
+  torch::Tensor output = inplace ? qweight : torch::empty_like(qweight);
+
+  if (!qzeros_or_none.has_value()) {
+    TORCH_CHECK(qweight.numel() * 8 % 256 == 0,
+                "qweight.numel() * 8 % 256 != 0");
+
+    int blocks = qweight.numel() * 8 / 256;
+    marlin_int4_fp8_preprocess_kernel_without_zp<<<blocks, 32>>>(
+        (const int32_t*)qweight.data_ptr(), (int32_t*)output.data_ptr());
+  } else {
+    int32_t size_k = qweight.size(0);
+    int32_t size_n = qweight.size(1) * 8;
+    torch::Tensor qzeros = qzeros_or_none.value();
+
+    TORCH_CHECK(size_k % 32 == 0, "size_k % 32 != 0");
+    TORCH_CHECK(qzeros.device().is_cuda(), "qzeros is not on GPU");
+    TORCH_CHECK(qzeros.scalar_type() == at::ScalarType::Int,
+                "qweight.dtype != torch.int32");
+    TORCH_CHECK(device_of(qweight) == device_of(qzeros),
+                "qzeros is not on the same device with qweight");
+
+    int32_t group_size = qweight.size(0) / qzeros.size(0);
+    TORCH_CHECK(qweight.size(1) == qzeros.size(1),
+                "qweight.size(1) != qzeros.size(1)");
+    TORCH_CHECK(qweight.size(0) % qzeros.size(0) == 0,
+                "qweight.size(0) % qzeros.size(0) != 0");
+    TORCH_CHECK(group_size % 8 == 0, "group_size % 8 != 0");
+
+    dim3 blocks(size_k / 32, size_n / 8);
+    marlin_int4_fp8_preprocess_kernel_awq<<<blocks, 32>>>(
+        (const int32_t*)qweight.data_ptr(), (int32_t*)output.data_ptr(),
+        (const int32_t*)qzeros.data_ptr(), size_n, size_k, group_size);
+  }
+
+  return output;
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("marlin_int4_fp8_preprocess", &marlin_int4_fp8_preprocess);
+}
--- a/csrc/quantization/gptq_marlin/marlin_template.h
+++ b/csrc/quantization/gptq_marlin/marlin_template.h
--- a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
+++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
@@ -0,0 +1,817 @@
+// clang-format off
+// Adapted from: https://github.com/meta-pytorch/applied-ai/blob/main/kernels/cuda/inference/hadamard_transform/hadamard_transform_cuda.cu
+
+/***********
+Copyright 2024 Meta
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+***********/
+
+#include <torch/all.h>
+#include <stdint.h>
+#include <cuda_runtime.h>
+#include <mma.h>
+#include <cuda/annotated_ptr>
+#include <c10/cuda/CUDAException.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "core/registration.h"
+#include "dispatch_utils.h"
+
+namespace hadacore {
+
+#ifndef __CUDACC__
+#define __launch_bounds__(x,y)
+#endif
+
+#define MAX_WARPS_PER_SM 48
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+using b16 = uint16_t;
+using b32 = uint32_t;
+
+constexpr int launch_configs_big[7][3] = {
+    // default
+    {2, 1, 24},
+    {2, 2, 16}, 
+    {2, 4, 8}, 
+    {2, 8, 4}, 
+    {2, 16, 3},
+    {4, 16, 2},
+    {8, 16, 1}
+    // // extra coalescing
+    // {2, 1, 24},
+    // {2, 2, 16}, 
+    // {2, 4, 8}, 
+    // {2, 8, 4}, 
+    // {4, 8, 3},
+    // {8, 8, 2},
+    // {16, 8, 1}
+    // // less coalescing
+    // {2, 1, 24},
+    // {2, 2, 16}, 
+    // {2, 4, 8}, 
+    // {2, 8, 4}, 
+    // {1, 32, 1},
+    // {2, 32, 1},
+    // {4, 32, 1}
+};
+
+// a 4x2, b 2x2, c 2x2
+template <torch::ScalarType dtype>
+__device__ __forceinline__ void mma_m16_n8_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32& c0, b32& c1){
+    static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16);
+    // d, a, b, c
+    b32 zero = 0;
+    if constexpr(dtype == torch::ScalarType::Half) {
+        asm (
+            "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+            "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n\t"
+            : "=r"(c0), "=r"(c1) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero)
+        );
+    } else {
+        b32 temp0, temp1, temp2, temp3;
+        asm (
+            "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n\t"
+            : "=r"(temp0), "=r"(temp1), "=r"(temp2), "=r"(temp3) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero), "r"(zero), "r"(zero)
+        );
+        asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c0) : "r"(temp1), "r"(temp0));
+        asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c1) : "r"(temp3), "r"(temp2));
+    }
+}
+
+// a 4x2, b 4x2, c 4x2
+template <torch::ScalarType dtype>
+__device__ __forceinline__ void mma_m16_n16_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32 b2, b32 b3, b32& c0, b32& c1, b32& c2, b32& c3){
+    mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b0, b1, c0, c1);
+    mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b2, b3, c2, c3);
+}
+
+__device__ __forceinline__ void matrix_transpose_m8_n8_b16_inplace(b32& a0) {
+    asm (
+        "movmatrix.sync.aligned.m8n8.trans.b16 "
+        "%0, %1;\n\t"
+        : "=r"(a0) : "r"(a0)
+    );
+}
+
+#define p_p(i) ((val_1p[i] & 0x0000FFFF) | val_1p[i] << 16)
+#define p_n(i) ((val_1p[i] & 0x0000FFFF) | val_1n[i] << 16)
+#define n_p(i) ((val_1n[i] & 0x0000FFFF) | val_1p[i] << 16)
+#define n_n(i) ((val_1n[i] & 0x0000FFFF) | val_1n[i] << 16)
+
+template<int64_t num_chunks, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool enable_mask, torch::ScalarType dtype>
+__global__ void __launch_bounds__(32 * warps_per_block, blocks_per_sm)
+// a is column major, b is row major
+hadamard_transform_kernel(b16* a, b16* out, int total_num_chunks) {
+    static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently");
+
+    b32 b_frag_all[num_chunks][4]; // for all chunks, holds matrix fragment (which takes 4 regs of b16x2 * 32 threads)
+
+    int64_t blockid = blockIdx.x * warps_per_block + threadIdx.x / 32;
+    int64_t threadid = threadIdx.x % 32;
+    extern __shared__ b32 bfrag_arr[]; // num_chunks * warps_per_block * 128
+    int64_t real_num_chunks = ((blockid + 1) * num_chunks) > total_num_chunks ? (total_num_chunks - (blockid * num_chunks)) : num_chunks;
+    int64_t diff_num_chunks = real_num_chunks - num_chunks;
+
+    b32* a_start_ptr = (b32*) (a + blockid * num_chunks * 256); // offset a to where this warp starts
+    b32* out_start_ptr = (b32*) (out + blockid * num_chunks * 256);
+    b32* a_ptr = a_start_ptr + threadid * 4;
+    b32* b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128 + threadid * 4;
+
+    #if (__CUDA_ARCH__ < 900) // SM80, SM89
+    uint64_t cache_policy;
+    asm volatile(
+        "createpolicy.fractional.L2::evict_first.b64 %0, 1.0;\n"
+        : "=l"(cache_policy)
+    );
+    #endif
+
+    #pragma unroll
+    for (int64_t k = 0; k < num_chunks; k++) {
+        size_t shared_ptr = __cvta_generic_to_shared(b_frag_ptr);
+        #if (__CUDA_ARCH__ >= 900) // SM90
+            asm volatile(
+                "cp.async.cg.shared.global [%0], [%1], 16;\n"
+                "cp.async.commit_group;\n"
+                :: "l"(shared_ptr), "l"(a_ptr)
+            );
+        #else // SM80, SM89
+            asm volatile(
+                "cp.async.cg.shared.global.L2::cache_hint.L2::256B [%0], [%1], 16, %2;\n"
+                "cp.async.commit_group;\n"
+                :: "l"(shared_ptr), "l"(a_ptr), "l"(cache_policy)
+            );
+        #endif
+
+        a_ptr += 128;
+        b_frag_ptr += 128;
+    }
+
+    // generate hadamard 16x16 (up to 2 of them)
+    constexpr b16 fp16_1p[4] = {0b0011100110101000, 0b0011100000000000, 0b0011010110101000, 0b0011010000000000};
+    constexpr b16 fp16_1n[4] = {0b1011100110101000, 0b1011100000000000, 0b1011010110101000, 0b1011010000000000};
+    constexpr b16 bf16_1p[4] = {0b0011111100110101, 0b0011111100000000, 0b0011111010110101, 0b0011111010000000};
+    constexpr b16 bf16_1n[4] = {0b1011111100110101, 0b1011111100000000, 0b1011111010110101, 0b1011111010000000};
+
+    #define val_type_1p(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1p[i]) : (bf16_1p[i]))
+    #define val_type_1n(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1n[i]) : (bf16_1n[i]))
+    constexpr b16 val_1p[4] = {val_type_1p(0), val_type_1p(1), val_type_1p(2), val_type_1p(3)};
+    constexpr b16 val_1n[4] = {val_type_1n(0), val_type_1n(1), val_type_1n(2), val_type_1n(3)};
+
+    constexpr b32 p_p[4] = {p_p(0), p_p(1), p_p(2), p_p(3)};
+    constexpr b32 p_n[4] = {p_n(0), p_n(1), p_n(2), p_n(3)};
+    constexpr b32 n_p[4] = {n_p(0), n_p(1), n_p(2), n_p(3)};
+    constexpr b32 n_n[4] = {n_n(0), n_n(1), n_n(2), n_n(3)};
+    const b32 had_16_p1[4][4] = {
+        {
+            0b10001000010001000010001000010001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b10001000010001000010001000010001
+        },
+        {
+            0b11001100100010000011001100100010,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11001100100010000011001100100010
+        },
+        {
+            0b11111111101010101100110010011001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11111111101010101100110010011001
+        },
+        {
+            0b11111111101010101100110010011001,
+            0b11111111101010101100110010011001,
+            0b11111111101010101100110010011001,
+            0b00000000010101010011001101100110
+        }
+    };
+    const b32 had_16_p2[4][4] = {
+        {
+            0b10000000010000000010000000010000,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b10000000010000000010000000010000
+        },
+        {
+            0b11000000100001000011000000100001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11000000100001000011000000100001
+        },
+        {
+            0b11110000101001011100001110010110,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11110000101001011100001110010110
+        },
+        {
+            0b11110000101001011100001110010110,
+            0b11110000101001011100001110010110,
+            0b11110000101001011100001110010110,
+            0b00001111010110100011110001101001
+        }
+    };
+    const b32 had_16_mask[3][4] = {
+        {
+            0b10001000010001000010001000010001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b10001000010001000010001000010001
+        },
+        {
+            0b11001100110011000011001100110011,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11001100110011000011001100110011
+        },
+        {
+            0b11111111111111111111111111111111,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11111111111111111111111111111111
+        }
+    };
+    b32 had_frag[8];
+    #pragma unroll
+    for (int64_t i = 0; i < 2; i++) {
+        int64_t c_log_h = (i == 0) ? MIN(4, log_had_size) : log_had_size % 4;
+        #pragma unroll
+        for (int64_t j = 0; j < 4; j++) {
+            if (c_log_h < 4) {
+                bool mask = had_16_mask[c_log_h - 1][j] & (1 << (31 - threadid));
+                if (!mask) {
+                    had_frag[i * 4 + j] = 0;
+                    continue;
+                }
+            }
+            bool pred1 = had_16_p1[c_log_h - 1][j] & (1 << (31 - threadid));
+            bool pred2 = had_16_p2[c_log_h - 1][j] & (1 << (31 - threadid));
+            b32 val = pred1 ? (pred2 ? p_p[c_log_h - 1] : p_n[c_log_h - 1]) : (pred2 ? n_p[c_log_h - 1] : n_n[c_log_h - 1]);
+            had_frag[i * 4 + j] = val;
+        }
+        if constexpr(log_had_size <= 4 || log_had_size % 4 == 0) break;
+    }
+
+    // log had size above 8, only used for above 2^8 = 256 size
+    constexpr int64_t part8_log_had_size = log_had_size - 8;
+
+    b32* a_chunk_ptr = a_start_ptr; // first chunk starts at this warp's data starts
+    b32* out_chunk_ptr = out_start_ptr;
+
+    #pragma unroll
+    for (int64_t l = 0; l < 2; l++) {
+        if constexpr(log_had_size <= 8) { // l == 0 guaranteed, redundant simplified version of else body, to help compiler warnings
+            b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128;
+        } else {
+            b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * (l == 0 ? 128 : (128 >> part8_log_had_size));
+        }
+
+        if (l == 1) {
+            if constexpr(log_had_size > 8) {
+                __syncthreads(); // sync between first and second iterations if above size 256
+
+                if constexpr(log_had_size >= 12) {
+                    // sizes 4k and above
+
+                    // a + threadblock offset + warp offset
+                    // can then index into all chunks owned by this warp
+                    b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block));
+
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        #pragma unroll
+                        for (int64_t k = 0; k < num_chunks; k++) {
+                            // here, j represents register, and k represents 8-offset/chunk
+                            uint64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data
+                            
+                            int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread #
+                            int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data)
+                            int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads)
+                            int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads
+                            int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register
+                            int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index
+
+                            // fix idx for majorness
+                            int64_t rowidx = idx % (1 << part8_log_had_size);
+                            int64_t colidx = idx >> part8_log_had_size;
+
+                            // store[rowidx * 128 + colidx] = data;
+                            b32 data = store[rowidx * 128 + colidx];
+
+                            // compiler generates excessive instructions, so we manually do the if statement
+                            #pragma unroll
+                            for (uint64_t i = 0; i < num_chunks; i++) {
+                                asm volatile (
+                                    "{\n\t"
+                                    "  .reg .pred p0;\n\t"
+                                    "  setp.eq.s64 p0, %1, %2;\n\t"
+                                    "  @p0 mov.b32 %0, %3;\n\t"
+                                    "}\n\t"
+                                    : "+r"(b_frag_all[i][j]) // Output operand %0
+                                    : "l"(real_chunk_num), "l"(i), "r"(data) // Input operands %1, %2, %3
+                                );
+                            }
+                        }
+                    }
+
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        #pragma unroll
+                        for (int64_t k = 1; k < num_chunks; k++) {
+                            int64_t threadid_contig = threadid % num_chunks;
+                            int64_t threadid_mul = threadid / num_chunks;
+                            int64_t threadid2 = (threadid_contig + num_chunks - k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to
+                            b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2);
+                        }
+                    }
+                }
+            }
+        }
+
+        #pragma unroll
+        for (int64_t k = 0; k < num_chunks; k++) {
+            if constexpr(enable_mask) {
+                if (k >= real_num_chunks)
+                    break;
+            }
+            if (l == 0) {
+                // bad fix for k not being recognized as a constexpr by compiler
+                // asm("cp.async.wait_group %0;\n" :: "n"(num_chunks - k - 1));
+                #define SWITCH_WAIT_ASYNC_LOAD_GROUP(i) case i: asm volatile("cp.async.wait_group %0;\n" :: "n"(num_chunks - i - 1)); break;
+                if constexpr(enable_mask) {
+                    switch(k + diff_num_chunks) {
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(0)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(1)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(2)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(3)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(4)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(5)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(6)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(7)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(8)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(9)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(10)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(11)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(12)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(13)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(14)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(15)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(16)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(17)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(18)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(19)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(20)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(21)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(22)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(23)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(24)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(25)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(26)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(27)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(28)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(29)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(30)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(31)
+                    }
+                } else {
+                    switch(k) {
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(0)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(1)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(2)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(3)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(4)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(5)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(6)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(7)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(8)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(9)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(10)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(11)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(12)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(13)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(14)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(15)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(16)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(17)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(18)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(19)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(20)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(21)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(22)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(23)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(24)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(25)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(26)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(27)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(28)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(29)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(30)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(31)
+                    }
+                }
+            }
+
+            if (l == 0) {
+                // loading for the first iteration
+
+                // thread 0 loads  [t0r0, t16r1, t0r2, t16r3]
+                // thread 16 loads [t0r1, t16r0, t0r3, t16r2]
+                // allows full coalescing, same for t1/t17, t2/t18, etc.
+                #pragma unroll
+                for (int64_t j = 0; j < 4; j++) {
+                    int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2));
+                    int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16);
+                    int64_t real_row = real_thread_id % 4;
+                    int64_t real_col = real_thread_id / 4;
+                    b_frag_all[k][j] = b_frag_ptr[(real_row + (reg % 2) * 4) + (real_col + (j / 2) * 8) * 8];
+                }
+
+                // for t16 swap r0/r1 and r2/r3 to have [t16r0, t0r1, t16r2, t0r3]
+                // so registers are in right order, same for t17, t18, etc.
+                if ((threadid & 16) != 0) {
+                    b32 temp = b_frag_all[k][0];
+                    b_frag_all[k][0] = b_frag_all[k][1];
+                    b_frag_all[k][1] = temp;
+
+                    temp = b_frag_all[k][2];
+                    b_frag_all[k][2] = b_frag_all[k][3];
+                    b_frag_all[k][3] = temp;
+                }
+
+                // t0 and t16 swap r1 and r3 to have their own data,
+                // same for t1/t17, t2/18, etc.
+                #pragma unroll
+                for (int64_t j = 1; j < 4; j += 2) {
+                    b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16);
+                }
+            } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings
+                if constexpr(log_had_size < 12) {
+                    // sizes 512, 1k, and 2k
+
+                    // for 512:
+                    //     thread 0 loads  [t0r0, t0r1, t16r2, t16r3]
+                    //     thread 16 loads [t0r2, t0r3, t16r0, t16r1]
+                    //     same for t1/t17, t2/t18, etc.
+                    // for 1k and 2k:
+                    //     thread 0 loads [t0r0, t0r1, t1r2, t1r3]
+                    //     thread 1 loads [t0r2, t0r3, t1r0, t1r1]
+                    //     same for t2/t3, t4/t5, etc.
+                    // allows full coalescing for 512 and 1k, 16x coalescing for 2k
+                    constexpr int64_t xor_val = log_had_size == 9 ? 16 : 1;
+
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4;
+                        int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val);
+                        int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2);
+                        int64_t rowidx = idx % (1 << part8_log_had_size);
+                        int64_t colidx = idx >> part8_log_had_size;
+                        b_frag_all[k][j] = b_frag_ptr[rowidx * 128 + colidx];
+                    }
+
+                    if ((threadid & xor_val) != 0) {
+                        b32 temp = b_frag_all[k][0];
+                        b_frag_all[k][0] = b_frag_all[k][2];
+                        b_frag_all[k][2] = temp;
+
+                        temp = b_frag_all[k][1];
+                        b_frag_all[k][1] = b_frag_all[k][3];
+                        b_frag_all[k][3] = temp;
+                    }
+
+                    #pragma unroll
+                    for (int64_t j = 2; j < 4; j++) {
+                        b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val);
+                    }
+                }
+            }
+
+            if (l == 1) {
+                // for second iteration, we load 2 consecutive b16s (1 b32) per register,
+                // but tensor core register layout requires 2 b16s that are in the
+                // same column/consecutive rows to be in the same register, so do the swap
+                b32 f0 = ((b_frag_all[k][1] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF);
+                b32 f1 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][2] & 0xFFFF);
+                b32 f2 = (b_frag_all[k][1] & 0xFFFF0000) | (b_frag_all[k][0] >> 16);
+                b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][2] >> 16);
+                b_frag_all[k][0] = f0;
+                b_frag_all[k][1] = f1;
+                b_frag_all[k][2] = f2;
+                b_frag_all[k][3] = f3;
+            }
+
+            #pragma unroll
+            for(int64_t i = 0, remaining_log_had_size = log_had_size - l * 8; i < 2 && remaining_log_had_size > 0; i++) {
+                int64_t had_off = ((remaining_log_had_size < 4) && !(log_had_size <= 4 || log_had_size % 4 == 0)) ? 4 : 0;
+                mma_m16_n16_k16_b16_b16_b16_noacc<dtype>(had_frag[had_off + 0], had_frag[had_off + 1], had_frag[had_off + 2], had_frag[had_off + 3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3]);
+
+                remaining_log_had_size -= 4;
+                if (remaining_log_had_size <= 0 && i == 0) {
+                    // TODO: consider different storing so no need for transpose
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][0]);
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][1]);
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][2]);
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][3]);
+                } else {
+                    // swap and use output directly as b_frag for next iteration as an actually free transpose
+                    b32 temp = b_frag_all[k][1];
+                    b_frag_all[k][1] = b_frag_all[k][2];
+                    b_frag_all[k][2] = temp;
+                }
+            }
+
+            if (l == 1) {
+                // invert swap from above for second iteration
+                b32 f0 = ((b_frag_all[k][2] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF);
+                b32 f1 = (b_frag_all[k][2] & 0xFFFF0000) | (b_frag_all[k][0] >> 16);
+                b32 f2 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][1] & 0xFFFF);
+                b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][1] >> 16);
+                b_frag_all[k][0] = f0;
+                b_frag_all[k][1] = f1;
+                b_frag_all[k][2] = f2;
+                b_frag_all[k][3] = f3;
+            }
+
+            if (l == 0) {
+                // inverse of coalesced load for first iteration to store result
+                #pragma unroll
+                for (int64_t j = 1; j < 4; j += 2) {
+                    b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16);
+                }
+
+                if ((threadid & 16) != 0) {
+                    b32 temp = b_frag_all[k][0];
+                    b_frag_all[k][0] = b_frag_all[k][1];
+                    b_frag_all[k][1] = temp;
+
+                    temp = b_frag_all[k][2];
+                    b_frag_all[k][2] = b_frag_all[k][3];
+                    b_frag_all[k][3] = temp;
+                }
+
+                // if only going up to 256 size, store directly back to global memory,
+                // otherwise store back to shared memory for next iteration
+                b32* store = (log_had_size <= 8) ? out_chunk_ptr : b_frag_ptr;
+
+                #pragma unroll
+                for (int64_t j = 0; j < 4; j++) {
+                    int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2));
+                    int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16);
+                    int64_t real_row = real_thread_id % 4;
+                    int64_t real_col = real_thread_id / 4;
+                    store[(real_row + (reg % 2) * 4) + (real_col + (reg / 2) * 8) * 8] = b_frag_all[k][j];
+                }
+            } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings
+                if (log_had_size < 12) {
+                    // inverse of coalesced load for sizes 512, 1k and 2k to store result
+                    constexpr int xor_val = log_had_size == 9 ? 16 : 1;
+                    #pragma unroll
+                    for (int64_t j = 2; j < 4; j++) {
+                        b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val);
+                    }
+
+                    if ((threadid & xor_val) != 0) {
+                        b32 temp = b_frag_all[k][0];
+                        b_frag_all[k][0] = b_frag_all[k][2];
+                        b_frag_all[k][2] = temp;
+
+                        temp = b_frag_all[k][1];
+                        b_frag_all[k][1] = b_frag_all[k][3];
+                        b_frag_all[k][3] = temp;
+                    }
+
+                    b32* store = (b32*)(out + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 256 + (256 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block) + k));
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4;
+                        b32 data = b_frag_all[k][j];
+                        int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val);
+                        int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2);
+                        int64_t rowidx = idx % (1 << part8_log_had_size);
+                        int64_t colidx = idx >> part8_log_had_size;
+                        store[rowidx * 128 + colidx] = data;
+                    }
+                }
+                // for size 4k and above, wait to process all chunks so a final store can be performed coalesced
+            }
+
+            a_chunk_ptr += 128; // (only affects first 256 size) move on to next chunk by skipping 256 elements in b16 (= 128 in b32)
+            out_chunk_ptr += 128;
+            if constexpr(log_had_size > 8) {
+                b_frag_ptr += (l == 0 ? 128 : (128 >> part8_log_had_size));
+            } else { // else is redundant, simplified version of if body, to help compiler warnings
+                b_frag_ptr += 128;
+            }
+        }
+        if (log_had_size <= 8)
+            break;
+    }
+
+    if constexpr(log_had_size >= 12) {
+        // for sizes 4k and above, perform final coalesced store after processing all chunks
+        #pragma unroll
+        for (int64_t j = 0; j < 4; j++) {
+            #pragma unroll
+            for (int64_t k = 1; k < num_chunks; k++) {
+                int64_t threadid_contig = threadid % num_chunks;
+                int64_t threadid_mul = threadid / num_chunks;
+                int64_t threadid2 = (threadid_contig + k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to
+                b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2);
+            }
+        }
+
+        // a + threadblock offset + warp offset
+        // can then index into all chunks owned by this warp
+        b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block));
+
+        #pragma unroll
+        for (int64_t j = 0; j < 4; j++) {
+            #pragma unroll
+            for (int64_t k = 0; k < num_chunks; k++) {
+                // here, j represents register, and k represents 8-offset/chunk
+                int64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data
+
+                // b32 data = b_frag_all[real_chunk_num][j]; // target thread data
+                b32 data;
+                #pragma unroll
+                for (int64_t i = 0; i < num_chunks; i++) {
+                    if (real_chunk_num == i) data = b_frag_all[i][j];
+                }
+                
+                int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread #
+                int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data)
+                int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads)
+                int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads
+                int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register
+                int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index
+
+                // fix idx for majorness
+                int64_t rowidx = idx % (1 << part8_log_had_size);
+                int64_t colidx = idx >> part8_log_had_size;
+
+                store[rowidx * 128 + colidx] = data;
+            }
+        }
+
+        __syncthreads();
+        store = ((b32*) out) + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 128;
+        int4* store4 = (int4*) store;
+        int4* bfrag_arr4 = (int4*) bfrag_arr;
+        // flush smem, simply linearly write to store
+        // always divisible by 128*32b, so (32*4)*32b is ok
+        #pragma unroll
+        for (int64_t warp_off = 0; warp_off < (num_chunks * warps_per_block * 128 / 4); warp_off += 32 * warps_per_block) {
+            int64_t total_off = warp_off + threadid + (blockid % warps_per_block) * 32;
+            store4[total_off] = bfrag_arr4[total_off];
+        }
+    }
+
+}
+
+constexpr int64_t ceil_div(int64_t a, int64_t b) {
+    return (a + b - 1) / b;
+}
+
+template <torch::ScalarType dtype, int64_t chunks_per_warp, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool check_masking = false>
+void __forceinline__ run_kernel(b16* a_mat, b16* out, int64_t num_chunks, cudaStream_t stream) {
+    int64_t shared_size = chunks_per_warp * warps_per_block * 128 * 4;
+    dim3 block_size = 32 * warps_per_block;
+
+    #define CHECK_SHARED_LIM() {                                                                              \
+        if (shared_size > 48 * 1024) {                                                                        \
+            C10_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536)); \
+        }                                                                                                     \
+    }                                                                                                         \
+
+    if constexpr(check_masking) {
+        if (num_chunks % (chunks_per_warp * warps_per_block) != 0) {
+            dim3 grid_size = ceil_div(ceil_div(num_chunks, chunks_per_warp), warps_per_block);
+            auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, true, dtype>;
+            CHECK_SHARED_LIM();
+            kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks);
+        } else {
+            dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block;
+            auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, false, dtype>;
+            CHECK_SHARED_LIM();
+            kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks);
+        }
+    } else {
+        dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block;
+        auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, false, dtype>;
+        CHECK_SHARED_LIM();
+        kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks);
+    }
+    
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <torch::ScalarType dtype>
+void run_fht(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream) {
+    int64_t num_chunks = numel / 256; // caller required to ensure divisible by 256
+    // for size 256, use (2, 1)
+    // for size 32k use (8, 16)
+    constexpr int64_t chunks_per_warp_small = 1;// 8;
+    constexpr int64_t warps_per_block_small = 1;//2;//16;
+    constexpr int64_t blocks_per_sm_small = 24;
+    constexpr int64_t chunks_per_warp_large = 2;
+    constexpr int64_t warps_per_block_large = 1;
+    constexpr int64_t blocks_per_sm_large = 24;
+
+    b16* a_mat = (b16*) a_mat_ptr;
+    b16* out = (b16*) out_ptr;
+
+    if (numel <= 256) {
+        switch (had_size) {
+            case (1<<1): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 1, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<2): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 2, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<3): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 3, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<4): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 4, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<5): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 5, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<6): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 6, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<7): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 7, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<8): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 8, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+        }
+    } else {
+        switch (had_size) {
+            case (1<<1):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 1, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<2):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 2, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<3):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 3, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<4):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 4, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<5):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 5, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<6):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 6, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<7):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 7, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<8):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 8, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<9):  run_kernel<dtype, launch_configs_big[0][0], launch_configs_big[0][1], 9 , launch_configs_big[0][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<10): run_kernel<dtype, launch_configs_big[1][0], launch_configs_big[1][1], 10, launch_configs_big[1][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<11): run_kernel<dtype, launch_configs_big[2][0], launch_configs_big[2][1], 11, launch_configs_big[2][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<12): run_kernel<dtype, launch_configs_big[3][0], launch_configs_big[3][1], 12, launch_configs_big[3][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<13): run_kernel<dtype, launch_configs_big[4][0], launch_configs_big[4][1], 13, launch_configs_big[4][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<14): run_kernel<dtype, launch_configs_big[5][0], launch_configs_big[5][1], 14, launch_configs_big[5][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<15): run_kernel<dtype, launch_configs_big[6][0], launch_configs_big[6][1], 15, launch_configs_big[6][2]>(a_mat, out, num_chunks, stream); break;
+        }
+    }
+}
+
+template void run_fht<torch::ScalarType::Half>(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream);
+template void run_fht<torch::ScalarType::BFloat16>(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream);
+
+}  // namespace hadacore
+
+constexpr bool is_power_of_two(int x) { return x && !(x & (x - 1)); }
+
+torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
+    auto dtype = x.scalar_type();
+    TORCH_CHECK(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently");
+    TORCH_CHECK(x.is_cuda());
+    
+    const int had_size = x.size(-1);
+    TORCH_CHECK(is_power_of_two(had_size) && (had_size <= (1U << 15)),
+        "Only power of two Hadamard sizes up to 2^15 are supported, got ", had_size);
+    
+    const auto res_shape = x.sizes();
+    x = x.reshape({-1, had_size});
+    
+    auto numel = x.numel();
+    if (numel % 256 != 0) {
+        x = torch::nn::functional::pad(x, torch::nn::functional::PadFuncOptions({0, 0, 0, (256 - numel % 256) / had_size}));
+    }
+    
+    if (x.stride(-1) != 1) {
+        x = x.contiguous();
+    }
+    torch::Tensor out = inplace ? x : torch::empty_like(x);
+
+    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+    VLLM_DISPATCH_HALF_TYPES(x.scalar_type(), "hadacore_transform_runfht", [&] {
+      auto constexpr SCALAR_TYPE = c10::CppTypeToScalarType<scalar_t>::value;
+      hadacore::run_fht<SCALAR_TYPE>(x.data_ptr(), x.data_ptr(), x.numel(), had_size, stream);
+    });
+
+    if (numel % 256 != 0) {
+        out = out.narrow(0, 0, numel / had_size);
+    }
+
+    if (inplace && out.data_ptr() != x.data_ptr()) {
+        x.copy_(out.view(res_shape));
+        return x;
+    }
+    return out.reshape(res_shape);
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+    m.impl("hadacore_transform", &hadacore_transform);
+}
--- a/csrc/quantization/machete/Readme.md
+++ b/csrc/quantization/machete/Readme.md
@@ -0,0 +1,45 @@
+# Machete (Mixed Precision Cutlass-Based GEMM)
+
+Machete is a spiritual successor to the Marlin kernel but optimized for Hopper architectures and based on Cutlass. Being based on Cutlass, new type pairs and epilogues are easier to add compared to Marlin.
+
+## Overview
+
+Machete effectively performs
+
+```python
+scale_type = w_s.dtype
+compute_type = a.dtype
+out = (w_q.to(scale_type) * w_s - w_z.to(scale_type)) @ a
+```
+
+Where `w_q` is a quantized weight matrix, `w_s` is the quantization scales, and
+`w_z` is the quantization zeropoints.
+
+> **_NOTE:_**  `w_z` is added after the scales so we can
+use FMA operations, but this means they must have the scales pre-applied if the
+supplied zeropoints assume that they will be subtracted before the scales are
+applied.
+
+## API
+
+The main optimization within Machete is prepacking the weight matrix to more closely match the tensor core layouts, allowing for wider shared memory loads when loading the weight matrix. This means that the weight matrix must be prepacked before calling `machete_gemm`. The flow looks something like:
+
+```python
+from vllm import _custom_ops as ops
+
+...
+W_q_packed = ops.machete_prepack_B(w_q, wtype)
+output = ops.machete_gemm(
+    a,
+    b_q=W_q_packed,
+    b_type=wtype,
+    b_scales=w_s,
+    b_group_size=group_size
+)
+```
+
+## Code Generation
+
+Since Machete is based on Cutlass, we can generate multiple type pairs and different tile shapes using the same kernel template. We generate multiple instantiations of this template using `generate.py`.
+
+New type pairs (`TypeConfig`s) can be appended to `impl_configs` (in `generate()`), and these will get automatically generated (assuming they can be supported without issues). For each `TypeConfig`, you must also provide an `ImplConfig`, which bundles a `TypeConfig` with a list of `ScheduleConfig`s, `Specialization`s, and a default heuristic. The `ScheduleConfig`s (which contain info on tile shapes, tile scheduler, etc.) can perform differently for different problem shapes, and there is almost never one `ScheduleConfig` that works well for all problem shapes, so it is generally beneficial to generate different `ScheduleConfig`s for different potential problem shapes. This is where the heuristic comes in. For each `TypeConfig`, a default heuristic should be provided. This maps different problem shapes to different `ScheduleConfig`s and is used when the user does not provide the `schedule` parameter to `machete_gemm`. The `Specialization`s define what feature combinations to generate, i.e., `with_zeropoints`, `with_scales`, etc. We can reduce compile times and the final binary size by limiting the set of feature combinations we generate.
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -0,0 +1,694 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+import math
+import os
+import shutil
+from collections.abc import Iterable
+from copy import deepcopy
+from dataclasses import dataclass, fields
+from functools import reduce
+
+import jinja2
+from vllm_cutlass_library_extension import (
+    DataType,
+    EpilogueScheduleTag,
+    EpilogueScheduleType,
+    MixedInputKernelScheduleType,
+    TileSchedulerTag,
+    TileSchedulerType,
+    VLLMDataType,
+    VLLMDataTypeNames,
+    VLLMDataTypeSize,
+    VLLMDataTypeTag,
+    VLLMDataTypeTorchDataTypeTag,
+    VLLMDataTypeVLLMScalarTypeTag,
+    VLLMKernelScheduleTag,
+)
+
+#
+#   Generator templating
+#
+
+DISPATCH_TEMPLATE = """
+#include "../machete_mm_launcher.cuh"
+
+namespace machete {
+
+{% for impl_config in impl_configs %}
+{% set type_sig = gen_type_sig(impl_config.types) -%}
+{% for s in impl_config.schedules %}
+extern torch::Tensor impl_{{type_sig}}_sch_{{gen_sch_sig(s)}}(MMArgs);
+{%- endfor %}
+
+torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) {
+  [[maybe_unused]] auto M = args.A.size(0);
+  [[maybe_unused]] auto N = args.B.size(1);
+  [[maybe_unused]] auto K = args.A.size(1);
+    
+  if (!args.maybe_schedule) {
+    {%- for cond, s in impl_config.heuristic %}
+    {%if cond is not none%}if ({{cond}})
+    {%- else %}else
+    {%- endif %}
+        return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args);{% endfor %}
+  }
+
+  {%- for s in impl_config.schedules %}
+  if (*args.maybe_schedule == "{{ gen_sch_sig(s) }}")
+    return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args);
+  {%- endfor %}
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "machete_gemm(..) is not implemented for "
+                                     "schedule = ", *args.maybe_schedule);
+}
+{%- endfor %}
+
+
+static inline std::optional<at::ScalarType> maybe_scalartype(
+    std::optional<at::Tensor> const& t) {
+    if (!t) {
+      return std::nullopt;
+    } else {
+      return t->scalar_type();
+    };
+}
+
+torch::Tensor mm_dispatch(MMArgs args) {
+  auto out_type = args.maybe_out_type.value_or(args.A.scalar_type());
+  auto a_type = args.A.scalar_type();
+  auto maybe_g_scales_type = maybe_scalartype(args.maybe_group_scales);
+  auto maybe_g_zeros_type = maybe_scalartype(args.maybe_group_zeros);
+  auto maybe_ch_scales_type = maybe_scalartype(args.maybe_channel_scales);
+  auto maybe_tok_scales_type = maybe_scalartype(args.maybe_token_scales);
+
+  {% for impl_config in impl_configs %}
+  {% set t = impl_config.types -%}
+  {% set type_sig = gen_type_sig(t) -%}
+  if (args.b_type == {{VLLMScalarTypeTag[t.b]}}
+      && a_type == {{TorchTypeTag[t.a]}}
+      && out_type == {{TorchTypeTag[t.out]}}
+      && {%if t.b_group_scale != void -%}
+      maybe_g_scales_type == {{TorchTypeTag[t.b_group_scale]}}
+      {%- else %}!maybe_g_scales_type{%endif%}
+      && {%if t.b_group_zeropoint != void -%}
+      maybe_g_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}}
+      {%- else %}!maybe_g_zeros_type{%endif%}
+      && {%if t.b_channel_scale != void -%}
+      maybe_ch_scales_type == {{TorchTypeTag[t.b_channel_scale]}}
+      {%- else %}!maybe_ch_scales_type{%endif%}
+      && {%if t.a_token_scale != void -%}
+      maybe_tok_scales_type == {{TorchTypeTag[t.a_token_scale]}}
+      {%- else %}!maybe_tok_scales_type{%endif%}
+  ) {
+      return mm_dispatch_{{type_sig}}(args);
+  }
+  {%- endfor %}
+  
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    false, "machete_mm(..) is not implemented for "
+    "a_type=", args.A.scalar_type(),
+    ", b_type=", args.b_type.str(),
+    ", out_type=", out_type,
+    ", with_group_scale_type=", maybe_g_scales_type
+        ? toString(*maybe_g_scales_type) : "None",
+    ", with_group_zeropoint_type=", maybe_g_zeros_type
+        ? toString(*maybe_g_zeros_type) : "None",
+    ", with_channel_scale_type=", maybe_ch_scales_type
+        ? toString(*maybe_ch_scales_type) : "None",
+    ", with_token_scale_type=", maybe_tok_scales_type
+        ? toString(*maybe_tok_scales_type) : "None",
+    "; implemented types are: \\n",
+    {%- for impl_config in impl_configs %}
+    {% set t = impl_config.types -%}
+    "\\t{{gen_type_option_name(t)}}\\n",
+    {%- endfor %}
+    "");
+}
+
+std::vector<std::string> supported_schedules_dispatch(
+    SupportedSchedulesArgs args) {
+    auto out_type = args.maybe_out_type.value_or(args.a_type);
+    
+    {% for impl_config in impl_configs %}
+    {% set t = impl_config.types -%}
+    {% set schs = impl_config.schedules -%}
+    if (args.b_type == {{VLLMScalarTypeTag[t.b]}}
+        && args.a_type == {{TorchTypeTag[t.a]}}
+        && out_type == {{TorchTypeTag[t.out]}}
+        && {%if t.b_group_scale != void -%}
+        args.maybe_group_scales_type == {{TorchTypeTag[t.b_group_scale]}}
+        {%- else %}!args.maybe_group_scales_type{%endif%}
+        && {%if t.b_group_zeropoint != void-%}
+        args.maybe_group_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}}
+        {%- else %}!args.maybe_group_zeros_type{%endif%}
+    ) {
+        return {
+            {%- for s in impl_config.schedules %}
+            "{{gen_sch_sig(s)}}"{% if not loop.last %},{% endif %}
+            {%- endfor %}
+        };
+    }
+    {%- endfor %}
+    
+    return {};
+};
+
+}; // namespace machete
+"""
+
+IMPL_TEMPLATE = """
+#include "../machete_mm_launcher.cuh"
+
+namespace machete {
+    
+{% for sch in unique_schedules(impl_configs) %}
+{% set sch_sig = gen_sch_sig(sch) -%}
+struct sch_{{sch_sig}} {
+  using TileShapeNM = Shape<{{
+      to_cute_constant(sch.tile_shape_mn)|join(', ')}}>;
+  using ClusterShape = Shape<{{
+      to_cute_constant(sch.cluster_shape_mnk)|join(', ')}}>;
+  // TODO: Reimplement
+  // using KernelSchedule   = {{KernelScheduleTag[sch.kernel_schedule]}};
+  using EpilogueSchedule = {{EpilogueScheduleTag[sch.epilogue_schedule]}};
+  using TileScheduler    = {{TileSchedulerTag[sch.tile_scheduler]}};
+  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
+};
+{% endfor %}
+    
+{% for impl_config in impl_configs %}
+{% set t = impl_config.types -%}
+{% set schs = impl_config.schedules -%}
+{% set type_sig = gen_type_sig(t) -%}
+
+template<typename Sch>
+using Kernel_{{type_sig}} = MacheteKernelTemplate<
+  {{DataTypeTag[t.a]}},  // ElementA
+  {{DataTypeTag[t.b]}},  // ElementB
+  {{DataTypeTag[t.out]}},  // ElementD
+  {{DataTypeTag[t.accumulator]}}, // Accumulator
+  {{DataTypeTag[t.b_group_scale]}}, // GroupScaleT
+  {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT
+  {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT
+  {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT
+  cutlass::gemm::KernelTmaWarpSpecializedCooperative,
+  Sch>;
+
+{% for sch in schs %}
+{% set sch_sig = gen_sch_sig(sch) -%}
+torch::Tensor 
+impl_{{type_sig}}_sch_{{sch_sig}}(MMArgs args) {
+  return run_impl<Kernel_{{type_sig}}<sch_{{sch_sig}}>>(args);
+}
+{%- endfor %}
+{%- endfor %}
+
+}; // namespace machete
+"""
+
+PREPACK_TEMPLATE = """
+#include "../machete_prepack_launcher.cuh"
+
+namespace machete {
+
+torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
+  auto convert_type = args.maybe_group_scales_type.value_or(args.a_type);
+  {%- for t in types %}
+  {% set b_type = unsigned_type_with_bitwidth(t.b_num_bits) %}
+  if (args.a_type == {{TorchTypeTag[t.a]}}
+      && args.b_type.size_bits() == {{t.b_num_bits}} 
+      && convert_type == {{TorchTypeTag[t.convert]}}) {
+    return prepack_impl<
+      PrepackedLayoutBTemplate<
+        {{DataTypeTag[t.a]}}, // ElementA
+        {{DataTypeTag[b_type]}}, // ElementB
+        {{DataTypeTag[t.convert]}}, // ElementConvert
+        {{DataTypeTag[t.accumulator]}}, // Accumulator
+        cutlass::layout::ColumnMajor,
+        cutlass::gemm::KernelTmaWarpSpecializedCooperative>
+    >(args.B); 
+  }
+  {%- endfor %}
+  
+  TORCH_CHECK_NOT_IMPLEMENTED(false, 
+    "prepack_B_dispatch(..) is not implemented for "
+    "atype = ", args.a_type,
+    ", b_type = ", args.b_type.str(),
+    ", with_group_scales_type= ", args.maybe_group_scales_type ? 
+        toString(*args.maybe_group_scales_type) : "None");
+}
+
+}; // namespace machete
+"""
+
+TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative
+TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
+
+
+@dataclass(frozen=True)
+class ScheduleConfig:
+    tile_shape_mn: tuple[int, int]
+    cluster_shape_mnk: tuple[int, int, int]
+    kernel_schedule: MixedInputKernelScheduleType
+    epilogue_schedule: EpilogueScheduleType
+    tile_scheduler: TileSchedulerType
+
+
+@dataclass(frozen=True)
+class TypeConfig:
+    a: DataType
+    b: DataType | VLLMDataType
+    b_group_scale: DataType
+    b_group_zeropoint: DataType
+    b_channel_scale: DataType
+    a_token_scale: DataType
+    out: DataType
+    accumulator: DataType
+
+
+@dataclass(frozen=True)
+class PrepackTypeConfig:
+    a: DataType
+    b_num_bits: int
+    convert: DataType
+    accumulator: DataType
+
+
+@dataclass
+class ImplConfig:
+    types: TypeConfig
+    schedules: list[ScheduleConfig]
+    heuristic: list[tuple[str | None, ScheduleConfig]]
+
+
+def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
+    tile_shape = (
+        f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
+    )
+    cluster_shape = (
+        f"{schedule_config.cluster_shape_mnk[0]}"
+        + f"x{schedule_config.cluster_shape_mnk[1]}"
+        + f"x{schedule_config.cluster_shape_mnk[2]}"
+    )
+    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
+        "::"
+    )[-1]
+    epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
+        "::"
+    )[-1]
+    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
+
+    return (
+        f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
+        + f"_{epilogue_schedule}_{tile_scheduler}"
+    )
+
+
+# mostly unique shorter sch_sig
+def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
+    kernel_terse_names_replace = {
+        "KernelTmaWarpSpecializedCooperative": "TmaMI_",
+        "TmaWarpSpecializedCooperative_": "TmaCoop_",
+        "StreamKScheduler": "streamK",
+    }
+
+    sch_sig = generate_sch_sig(schedule_config)
+    for orig, terse in kernel_terse_names_replace.items():
+        sch_sig = sch_sig.replace(orig, terse)
+    return sch_sig
+
+
+# unique type_name
+def generate_type_signature(kernel_types: TypeConfig):
+    return str(
+        "".join(
+            [
+                VLLMDataTypeNames[getattr(kernel_types, field.name)]
+                for field in fields(TypeConfig)
+            ]
+        )
+    )
+
+
+def generate_type_option_name(kernel_types: TypeConfig):
+    return ", ".join(
+        [
+            f"{field.name.replace('b_', 'with_') + '_type'}="
+            + VLLMDataTypeNames[getattr(kernel_types, field.name)]
+            for field in fields(TypeConfig)
+        ]
+    )
+
+
+def is_power_of_two(n):
+    return (n != 0) and (n & (n - 1) == 0)
+
+
+def to_cute_constant(value: list[int]):
+    def _to_cute_constant(value: int):
+        if is_power_of_two(value):
+            return f"_{value}"
+        else:
+            return f"Int<{value}>"
+
+    if isinstance(value, Iterable):
+        return [_to_cute_constant(value) for value in value]
+    else:
+        return _to_cute_constant(value)
+
+
+def unique_schedules(impl_configs: list[ImplConfig]):
+    # Use dict over set for deterministic ordering
+    return list(
+        {
+            sch: None for impl_config in impl_configs for sch in impl_config.schedules
+        }.keys()
+    )
+
+
+def unsigned_type_with_bitwidth(num_bits):
+    return {
+        4: DataType.u4,
+        8: DataType.u8,
+        16: DataType.u16,
+        32: DataType.u32,
+        64: DataType.u64,
+    }[num_bits]
+
+
+template_globals = {
+    "void": DataType.void,
+    "DataTypeTag": VLLMDataTypeTag,
+    "VLLMScalarTypeTag": VLLMDataTypeVLLMScalarTypeTag,
+    "TorchTypeTag": VLLMDataTypeTorchDataTypeTag,
+    "KernelScheduleTag": VLLMKernelScheduleTag,
+    "EpilogueScheduleTag": EpilogueScheduleTag,
+    "TileSchedulerTag": TileSchedulerTag,
+    "to_cute_constant": to_cute_constant,
+    "gen_sch_sig": generate_terse_sch_sig,
+    "gen_type_sig": generate_type_signature,
+    "unique_schedules": unique_schedules,
+    "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
+    "gen_type_option_name": generate_type_option_name,
+}
+
+
+def create_template(template_str):
+    template = jinja2.Template(template_str)
+    template.globals.update(template_globals)
+    return template
+
+
+mm_dispatch_template = create_template(DISPATCH_TEMPLATE)
+mm_impl_template = create_template(IMPL_TEMPLATE)
+prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
+
+
+def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
+    sources = []
+
+    sources.append(
+        (
+            "machete_mm_dispatch",
+            mm_dispatch_template.render(impl_configs=impl_configs),
+        )
+    )
+
+    prepack_types = []
+    for impl_config in impl_configs:
+        convert_type = (
+            impl_config.types.a
+            if impl_config.types.b_group_scale == DataType.void
+            else impl_config.types.b_group_scale
+        )
+        prepack_types.append(
+            PrepackTypeConfig(
+                a=impl_config.types.a,
+                b_num_bits=VLLMDataTypeSize[impl_config.types.b],
+                convert=convert_type,
+                accumulator=impl_config.types.accumulator,
+            )
+        )
+
+    def prepacked_type_key(prepack_type: PrepackTypeConfig):
+        # For now, we can just use the first accumulator type seen since
+        # the tensor core shapes/layouts don't vary based on accumulator
+        # type so we can generate less code this way
+        return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
+
+    unique_prepack_types = []
+    prepack_types_seen = set()
+    for prepack_type in prepack_types:
+        key = prepacked_type_key(prepack_type)
+        if key not in prepack_types_seen:
+            unique_prepack_types.append(prepack_type)
+            prepack_types_seen.add(key)
+
+    sources.append(
+        (
+            "machete_prepack",
+            prepack_dispatch_template.render(
+                types=unique_prepack_types,
+            ),
+        )
+    )
+
+    # Split up impls across files
+    num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
+    num_impls_per_file = math.ceil(num_impls / num_impl_files)
+
+    files_impls: list[list[ImplConfig]] = [[]]
+
+    curr_num_impls_assigned = 0
+    curr_impl_in_file = 0
+    curr_impl_configs = deepcopy(list(reversed(impl_configs)))
+
+    while curr_num_impls_assigned < num_impls:
+        room_left_in_file = num_impls_per_file - curr_impl_in_file
+        if room_left_in_file == 0:
+            files_impls.append([])
+            room_left_in_file = num_impls_per_file
+            curr_impl_in_file = 0
+
+        curr_ic = curr_impl_configs[-1]
+        if len(curr_ic.schedules) >= room_left_in_file:
+            # Break apart the current impl config
+            tmp_ic = deepcopy(curr_ic)
+            tmp_ic.schedules = curr_ic.schedules[:room_left_in_file]
+            curr_ic.schedules = curr_ic.schedules[room_left_in_file:]
+            files_impls[-1].append(tmp_ic)
+        else:
+            files_impls[-1].append(curr_ic)
+            curr_impl_configs.pop()
+        curr_num_impls_assigned += len(files_impls[-1][-1].schedules)
+        curr_impl_in_file += len(files_impls[-1][-1].schedules)
+
+    for part, file_impls in enumerate(files_impls):
+        sources.append(
+            (
+                f"machete_mm_impl_part{part + 1}",
+                mm_impl_template.render(impl_configs=file_impls),
+            )
+        )
+
+    return sources
+
+
+def generate():
+    # See csrc/quantization/machete/Readme.md, the Codegeneration for more info
+    # about how this works
+    SCRIPT_DIR = os.path.dirname(__file__)
+
+    sch_common_params = dict(
+        kernel_schedule=TmaMI,
+        epilogue_schedule=TmaCoop,
+        tile_scheduler=TileSchedulerType.StreamK,
+    )
+
+    # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
+    default_tile_heuristic_config = {
+        #### M = 257+
+        "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
+        "M > 256": ((128, 256), (2, 1, 1)),
+        #### M = 129-256
+        "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)),
+        "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)),
+        "M > 128": ((128, 256), (2, 1, 1)),
+        #### M = 65-128
+        "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)),
+        "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)),
+        "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)),
+        "M > 64": ((128, 128), (2, 1, 1)),
+        #### M = 33-64
+        "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)),
+        "M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
+        "M > 32": ((128, 64), (2, 1, 1)),
+        #### M = 17-32
+        "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)),
+        "M > 16": ((256, 32), (2, 1, 1)),
+        #### M = 1-16
+        "N >= 26624": ((256, 16), (1, 1, 1)),
+        None: ((128, 16), (1, 1, 1)),
+    }
+
+    # For now we use the same heuristic for all types
+    # Heuristic is currently tuned for H100s
+    default_heuristic = [
+        (cond, ScheduleConfig(*tile_config, **sch_common_params))  # type: ignore
+        for cond, tile_config in default_tile_heuristic_config.items()
+    ]
+
+    def get_unique_schedules(heuristic: dict[str, ScheduleConfig]):
+        # Do not use schedules = list(set(...)) because we need to make sure
+        # the output list is deterministic; otherwise the generated kernel file
+        # will be non-deterministic and causes ccache miss.
+        schedules = []
+        for _, schedule_config in heuristic:
+            if schedule_config not in schedules:
+                schedules.append(schedule_config)
+        return schedules
+
+    impl_configs = []
+
+    GPTQ_kernel_type_configs = list(
+        TypeConfig(
+            a=a,
+            b=b,
+            b_group_scale=a,
+            b_group_zeropoint=DataType.void,
+            b_channel_scale=DataType.void,
+            a_token_scale=DataType.void,
+            out=a,
+            accumulator=DataType.f32,
+        )
+        for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
+        for a in (DataType.f16, DataType.bf16)
+    )
+
+    impl_configs += [
+        ImplConfig(x[0], x[1], x[2])
+        for x in zip(
+            GPTQ_kernel_type_configs,
+            itertools.repeat(get_unique_schedules(default_heuristic)),
+            itertools.repeat(default_heuristic),
+        )
+    ]
+
+    AWQ_kernel_type_configs = list(
+        TypeConfig(
+            a=a,
+            b=b,
+            b_group_scale=a,
+            b_group_zeropoint=a,
+            b_channel_scale=DataType.void,
+            a_token_scale=DataType.void,
+            out=a,
+            accumulator=DataType.f32,
+        )
+        for b in (DataType.u4, DataType.u8)
+        for a in (DataType.f16, DataType.bf16)
+    )
+
+    impl_configs += [
+        ImplConfig(x[0], x[1], x[2])
+        for x in zip(
+            AWQ_kernel_type_configs,
+            itertools.repeat(get_unique_schedules(default_heuristic)),
+            itertools.repeat(default_heuristic),
+        )
+    ]
+
+    # TODO: Support W4A8 when ready
+    # # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
+    # # TODO (LucasWilkinson): Further tuning required
+    # qqq_tile_heuristic_config = {
+    #     #### M = 257+
+    #     # ((128, 256), (2, 1, 1)) Broken for QQQ types
+    #     # TODO (LucasWilkinson): Investigate further
+    #     # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
+    #     # "M > 256": ((128, 256), (2, 1, 1)),
+    #     "M > 256": ((128, 128), (2, 1, 1)),
+    #     #### M = 129-256
+    #     "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)),
+    #     "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)),
+    #     # ((128, 256), (2, 1, 1)) Broken for QQQ types
+    #     # TODO (LucasWilkinson): Investigate further
+    #     # "M > 128": ((128, 256), (2, 1, 1)),
+    #     "M > 128": ((128, 128), (2, 1, 1)),
+    #     #### M = 65-128
+    #     "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)),
+    #     "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)),
+    #     "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)),
+    #     "M > 64": ((128, 128), (2, 1, 1)),
+    #     #### M = 33-64
+    #     "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)),
+    #     # Broken for QQQ types
+    #     # TODO (LucasWilkinson): Investigate further
+    #     #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
+    #     "M > 32": ((128, 64), (2, 1, 1)),
+    #     #### M = 17-32
+    #     "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)),
+    #     "M > 16": ((256, 32), (2, 1, 1)),
+    #     #### M = 1-16
+    #     "N >= 26624": ((256, 16), (1, 1, 1)),
+    #     None: ((128, 16), (1, 1, 1)),
+    # }
+
+    # # For now we use the same heuristic for all types
+    # # Heuristic is currently tuned for H100s
+    # qqq_heuristic = [
+    #     (cond, ScheduleConfig(*tile_config,
+    #                           **sch_common_params))  # type: ignore
+    #     for cond, tile_config in qqq_tile_heuristic_config.items()
+    # ]
+
+    # QQQ_kernel_types = [
+    #     *(TypeConfig(
+    #         a=DataType.s8,
+    #         b=VLLMDataType.u4b8,
+    #         b_group_scale=b_group_scale,
+    #         b_group_zeropoint=DataType.void,
+    #         b_channel_scale=DataType.f32,
+    #         a_token_scale=DataType.f32,
+    #         out=DataType.f16,
+    #         accumulator=DataType.s32,
+    #     ) for b_group_scale in (DataType.f16, DataType.void)),
+    #     *(TypeConfig(
+    #         a=DataType.e4m3,
+    #         b=VLLMDataType.u4b8,
+    #         b_group_scale=b_group_scale,
+    #         b_group_zeropoint=DataType.void,
+    #         b_channel_scale=DataType.f32,
+    #         a_token_scale=DataType.f32,
+    #         out=DataType.f16,
+    #         accumulator=DataType.f32,
+    #     ) for b_group_scale in (DataType.f16, DataType.void)),
+    # ]
+
+    # impl_configs += [
+    #     ImplConfig(x[0], x[1], x[2])
+    #     for x in zip(QQQ_kernel_types,
+    #                  itertools.repeat(get_unique_schedules(qqq_heuristic)),
+    #                  itertools.repeat(qqq_heuristic))
+    # ]
+
+    output_dir = os.path.join(SCRIPT_DIR, "generated")
+
+    # Delete the "generated" directory if it exists
+    if os.path.exists(output_dir):
+        shutil.rmtree(output_dir)
+
+    # Create the "generated" directory
+    os.makedirs(output_dir)
+
+    # Render each group of configurations into separate files
+    for filename, code in create_sources(impl_configs):
+        filepath = os.path.join(output_dir, f"{filename}.cu")
+        with open(filepath, "w") as output_file:
+            output_file.write(code)
+        print(f"Rendered template to {filepath}")
+
+
+if __name__ == "__main__":
+    generate()
--- a/csrc/quantization/machete/machete_collective_builder.cuh
+++ b/csrc/quantization/machete/machete_collective_builder.cuh
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "cutlass_extensions/vllm_collective_builder.cuh"
+#include "machete_mainloop.cuh"
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+struct MacheteKernelTag {};
+
+template <class ElementPairA_, class GmemLayoutA_, int AlignmentA,
+          class ElementPairB_, class GmemLayoutB_, int AlignmentB,
+          class ElementAccumulator, class TileShape_MNK, class ClusterShape_MNK,
+          class StageCountType, class KernelScheduleType>
+struct VLLMCollectiveBuilder<
+    MacheteKernelTag, arch::Sm90, arch::OpClassTensorOp, ElementPairA_,
+    GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_, AlignmentB,
+    ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<(
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+        cute::is_same_v<KernelScheduleType,
+                        KernelTmaWarpSpecializedCooperative>)>> {
+  using CollectiveOp = machete::MacheteCollectiveMma<
+      ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_,
+      AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
+      StageCountType, KernelScheduleType>;
+};
+
+};  // namespace cutlass::gemm::collective
--- a/csrc/quantization/machete/machete_interleaving_utils.cuh
+++ b/csrc/quantization/machete/machete_interleaving_utils.cuh
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cute/layout.hpp"
+
+namespace machete {
+
+using namespace cute;
+
+// get an interleaved block layout where each element consecutive element has a
+// stride of bit_stride and the block width is blk_bit_width,
+// examples:
+//  size_bits<T> = 8, bit_stride = 8,  blk_bit_width = 32 -> 4:1
+//  size_bits<T> = 8, bit_stride = 16, blk_bit_width = 32 -> (2, 2):(2, 1)
+//  size_bits<T> = 4, bit_stride = 8,  blk_bit_width = 32 -> (4, 2):(2, 1)
+//  size_bits<T> = 4, bit_stride = 16, blk_bit_width = 32 -> (2, 4):(4, 1)
+template <typename T, int bit_stride, int blk_bit_width>
+CUTE_HOST_DEVICE static constexpr auto get_interleaved_blk_layout() {
+  static_assert(blk_bit_width % bit_stride == 0);
+  static_assert(bit_stride % cute::sizeof_bits_v<T> == 0);
+
+  constexpr auto elems_per_blk = blk_bit_width / cute::sizeof_bits_v<T>;
+
+  if constexpr (cute::sizeof_bits_v<T> == bit_stride) {
+    // identity layout
+    return Layout<Shape<Int<elems_per_blk>>>{};
+  } else {
+    constexpr auto elems_per_stride = bit_stride / cute::sizeof_bits_v<T>;
+    constexpr auto num_strides = elems_per_blk / elems_per_stride;
+    return Layout<Shape<Int<num_strides>, Int<elems_per_stride>>,
+                  Stride<Int<elems_per_stride>, Int<1>>>{};
+  }
+}
+
+};  // namespace machete
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
--- a/csrc/quantization/machete/machete_mm_kernel.cuh
+++ b/csrc/quantization/machete/machete_mm_kernel.cuh
@@ -0,0 +1,309 @@
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+// clang-format off
+// The cutlass include order matters (annoyingly)
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+// clang-format on
+
+#include "cutlass_extensions/cute_utils.cuh"
+#include "cutlass_extensions/vllm_numeric_conversion.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+#include "cutlass_extensions/torch_utils.hpp"
+#include "machete_collective_builder.cuh"
+#include "machete_prepacked_layout.cuh"
+#include "machete_interleaving_utils.cuh"
+
+namespace machete {
+
+using namespace cute;
+
+// NOTE This kernel computes D = alpha * A * B + beta * C by computing
+//   D^t = alpha * B^t * A^t + beta * C^t, this is because the wgmma
+//   instructions only support sourcing from registers for the left-hand
+//   operand, we want to upconvert/decompress the quantized operand in
+//   register. Since the primary use case we want to support is Y = XW^t where
+//   W is quantized, in this situation or right-hand operand is quantized so
+//   we compute the transpose to move it to the left-hand side.
+template <typename ElementA_, typename ElementB_, typename ElementD_,
+          typename AccumulatorT, typename GroupScaleT, typename GroupZeroT,
+          typename ChannelScaleT, typename TokenScaleT, class KernelSchedule,
+          typename ScheduleConfig>
+struct MacheteKernelTemplate {
+  static constexpr bool with_C = false;  // not ever used
+  static constexpr bool with_group_scales = !std::is_same_v<GroupScaleT, void>;
+  static constexpr bool with_group_zeropoints =
+      !std::is_same_v<GroupZeroT, void>;
+  static constexpr bool with_channel_scales =
+      !std::is_same_v<ChannelScaleT, void>;
+  static constexpr bool with_token_scales = !std::is_same_v<TokenScaleT, void>;
+
+  using MmaType = ElementA_;
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementD = ElementD_;
+  using ElementC = cute::conditional_t<with_C, ElementD, void>;
+  using ElementAccumulator = AccumulatorT;
+  using ElementCompute = AccumulatorT;  // For Epilogue
+  // Use dummy values when we don't have scales or zeropoints
+  using ElementZGroup =
+      cute::conditional_t<with_group_zeropoints, GroupZeroT, MmaType>;
+  using ElementSGroup =
+      cute::conditional_t<with_group_scales, GroupScaleT, MmaType>;
+  using ElementConvertGroup =
+      cute::conditional_t<with_group_scales, GroupScaleT, MmaType>;
+  using ElementSChannel =
+      cute::conditional_t<with_channel_scales, ChannelScaleT, AccumulatorT>;
+  using ElementSToken =
+      cute::conditional_t<with_token_scales, TokenScaleT, AccumulatorT>;
+
+  using BTypeTuple = cute::conditional_t<
+      with_group_scales,
+      cute::conditional_t<with_group_zeropoints,
+                          cute::tuple<ElementB, ElementSGroup, ElementZGroup>,
+                          cute::tuple<ElementB, ElementSGroup>>,
+      ElementB>;
+
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = LayoutC;
+  using LayoutScale = cutlass::layout::RowMajor;
+  // not actually used since B has the prepacked layout, but required by cutlass
+  using _LayoutB = cutlass::layout::ColumnMajor;
+
+  // Interface strides expected by create_arguments (will get transposed)
+  using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
+  using StrideC = cutlass::detail::TagToStrideA_t<LayoutC>;
+  using StrideD = cutlass::detail::TagToStrideA_t<LayoutD>;
+  using StrideSGroup = cutlass::detail::TagToStrideA_t<LayoutScale>;
+  using StrideZGroup = StrideSGroup;
+
+  using LayoutA_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+  using LayoutC_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutC>::type;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using ArchTag = cutlass::arch::Sm90;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  using PrepackedLayoutB =
+      PrepackedLayoutBTemplate<ElementA_, ElementB_, ElementConvertGroup,
+                               AccumulatorT, LayoutA_Transpose, KernelSchedule>;
+
+  static int constexpr TileShapeK =
+      128 * 8 / cutlass::sizeof_bits<MmaType>::value;
+  static int constexpr AlignmentA = 128 / cutlass::sizeof_bits_v<ElementA>;
+  static int constexpr AlignmentB = 128 / cutlass::sizeof_bits_v<ElementB>;
+  static int constexpr AlignmentC =
+      (with_C) ? 128 / cutlass::sizeof_bits_v<ElementC> : 0;
+  static int constexpr AlignmentD = 128 / cutlass::sizeof_bits_v<ElementD>;
+
+  using TileShape = decltype(append(typename ScheduleConfig::TileShapeNM{},
+                                    cute::Int<TileShapeK>{}));
+  using ClusterShape = typename ScheduleConfig::ClusterShape;
+  using EpilogueSchedule = typename ScheduleConfig::EpilogueSchedule;
+  using EpilogueTileType = typename ScheduleConfig::EpilogueTileType;
+  using TileScheduler = typename ScheduleConfig::TileScheduler;
+
+  static_assert(
+      (!with_channel_scales && !with_token_scales) ||
+          ((with_channel_scales && with_token_scales) &&
+           std::is_same_v<ElementSChannel, ElementSToken>),
+      "Currently token and channel scales (if present) must be the same type");
+
+  // Currently only supports float scales
+  using ChTokScalesEpilogue =
+      typename vllm::c3x::ScaledEpilogue<ElementAccumulator, ElementD,
+                                         TileShape>;
+  static_assert((with_channel_scales || with_token_scales) ||
+                    (std::is_same_v<ElementSChannel, float> &&
+                     std::is_same_v<ElementSToken, float>),
+                "Currently token and channel scales (if present) must be float "
+                "(and if one is present the other must be too)");
+
+  using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT<
+      cutlass::epilogue::fusion::Sm90AccFetch>;
+
+  using EVTCompute =
+      std::conditional_t<with_channel_scales || with_token_scales,
+                         typename ChTokScalesEpilogue::EVTCompute,
+                         StoreEpilogueCompute>;
+
+  // EVTCompute
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType,
+          ElementAccumulator, ElementSChannel, ElementC, LayoutC_Transpose,
+          AlignmentC, ElementD, LayoutD_Transpose, AlignmentD, EpilogueSchedule,
+          EVTCompute>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::VLLMCollectiveBuilder<
+          cutlass::gemm::collective::MacheteKernelTag, ArchTag, OperatorClass,
+          BTypeTuple, PrepackedLayoutB, AlignmentB, ElementA, LayoutA_Transpose,
+          AlignmentA, ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,  // Indicates ProblemShape
+      CollectiveMainloop, CollectiveEpilogue, TileScheduler>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  // stride_B is unused (since B is prepacked), but still required by cutlass
+  using _StrideB = cutlass::detail::TagToStrideB_t<_LayoutB>;
+
+  using Arguments = typename Gemm::Arguments;
+  using MainloopArguments = typename GemmKernel::MainloopArguments;
+  using EpilogueArguments = typename GemmKernel::EpilogueArguments;
+
+  static Arguments create_arguments(
+      cudaStream_t stream,
+      torch::Tensor const& A,  // MxK matrix
+      torch::Tensor const& B,  // KxN prepacked matrix
+      torch::Tensor& D,        // MxN matrix
+      std::optional<torch::Tensor> const& maybe_g_scales,  // scale_KxN matrix
+      std::optional<torch::Tensor> const& maybe_g_zeros,   // scale_KxN matrix
+      std::optional<int64_t> maybe_group_size,
+      std::optional<torch::Tensor> const& maybe_ch_scales,   // len N vector
+      std::optional<torch::Tensor> const& maybe_tok_scales)  // len M vector
+  {
+    static_assert(!with_group_zeropoints || with_group_scales);
+
+    int M = A.size(0), N = B.size(1), K = A.size(1);
+    TORCH_CHECK(D.size(0) == M && D.size(1) == N);
+
+    auto layout_A = make_cute_layout<StrideA>(A, "A");
+    auto layout_D = make_cute_layout<StrideD>(D, "D");
+    auto layout_S_group =
+        maybe_make_cute_layout<StrideSGroup>(maybe_g_scales, "group_scales");
+    auto layout_Z_group =
+        maybe_make_cute_layout<StrideZGroup>(maybe_g_zeros, "group_zeros");
+    int64_t numel_S_channel = maybe_ch_scales ? maybe_ch_scales->numel() : 0;
+    int64_t numel_S_token = maybe_tok_scales ? maybe_tok_scales->numel() : 0;
+
+    auto unwrap = [](auto const& t) {
+      return t ? t->const_data_ptr() : nullptr;
+    };
+    auto A_ptr = static_cast<ElementA const*>(A.const_data_ptr());
+    auto B_ptr = static_cast<ElementB const*>(B.const_data_ptr());
+    auto D_ptr = static_cast<ElementD*>(D.mutable_data_ptr());
+    auto S_group_ptr =
+        static_cast<ElementSGroup const*>(unwrap(maybe_g_scales));
+    auto Z_group_ptr = static_cast<ElementZGroup const*>(unwrap(maybe_g_zeros));
+    auto S_channel_ptr =
+        static_cast<ElementSChannel const*>(unwrap(maybe_ch_scales));
+    auto S_token_ptr =
+        static_cast<ElementSToken const*>(unwrap(maybe_tok_scales));
+
+    int const group_size =
+        maybe_group_size == -1 ? K : maybe_group_size.value_or(K);
+    int const scale_k = (K + group_size - 1) / group_size;
+
+    TORCH_CHECK(size<0>(layout_A) == M && size<1>(layout_A) == K);
+    TORCH_CHECK(size<0>(layout_D) == M && size<1>(layout_D) == N);
+
+    if constexpr (with_group_scales) {
+      TORCH_CHECK(S_group_ptr && layout_S_group);
+      TORCH_CHECK((size<0>(*layout_S_group) == scale_k &&
+                   size<1>(*layout_S_group) == N));
+    } else {
+      TORCH_CHECK(!S_group_ptr, "Scales not supported");
+    }
+
+    if constexpr (with_group_zeropoints) {
+      TORCH_CHECK(Z_group_ptr && layout_Z_group);
+      TORCH_CHECK((size<0>(*layout_Z_group) == scale_k &&
+                   size<1>(*layout_Z_group) == N));
+      TORCH_CHECK(layout_S_group && *layout_Z_group == *layout_S_group,
+                  "Scales and zeros must have the same layout");
+    } else {
+      TORCH_CHECK(!Z_group_ptr, "Zeropoints not supported");
+    }
+
+    if constexpr (with_channel_scales || with_token_scales) {
+      TORCH_CHECK(
+          (maybe_ch_scales->numel() == N || maybe_ch_scales->numel() == 1) &&
+          (maybe_tok_scales->numel() == M || maybe_tok_scales->numel() == 1));
+    }
+
+    // Transpose A and D
+    // A doesn't need to be transposed since cutlass expects a NxK matrix
+    //  for B (which is At)
+    auto stride_At = layout_A.stride();
+    auto stride_Dt = permute_layout<1, 0, 2>(layout_D).stride();
+
+    MainloopArguments mainloop_arguments{};
+    // {Accum, C, C_layout, D, D}
+    EpilogueArguments epilogue_arguments{};
+
+    if constexpr (with_channel_scales || with_token_scales) {
+      epilogue_arguments =
+          EpilogueArguments{ChTokScalesEpilogue::prepare_args(
+                                *maybe_ch_scales, *maybe_tok_scales),
+                            nullptr,
+                            {},
+                            D_ptr,
+                            stride_Dt};
+    } else {
+      epilogue_arguments = EpilogueArguments{{}, nullptr, {}, D_ptr, stride_Dt};
+    }
+
+    if constexpr (with_group_scales && with_group_zeropoints) {
+      auto stride_S_group = permute_layout<1, 0, 2>(*layout_S_group).stride();
+      mainloop_arguments = MainloopArguments{
+          B_ptr,       _StrideB{},     A_ptr,      stride_At,
+          S_group_ptr, stride_S_group, group_size, Z_group_ptr};
+    } else if constexpr (with_group_scales) {
+      auto stride_S_group = permute_layout<1, 0, 2>(*layout_S_group).stride();
+      mainloop_arguments =
+          MainloopArguments{B_ptr,       _StrideB{},     A_ptr,     stride_At,
+                            S_group_ptr, stride_S_group, group_size};
+    } else {
+      mainloop_arguments =
+          MainloopArguments{B_ptr, _StrideB{}, A_ptr, stride_At};
+    }
+
+    return Arguments{cutlass::gemm::GemmUniversalMode::kGemm,
+                     {N, M, K, 1},
+                     mainloop_arguments,
+                     epilogue_arguments};
+  };
+
+  static size_t get_workspace_size(Arguments const& args) {
+    return Gemm::get_workspace_size(args);
+  }
+
+  static bool can_implement(Arguments const& args) {
+    return Gemm::can_implement(args) == cutlass::Status::kSuccess;
+  }
+
+  static void run(Arguments const& args, void* workspace, cudaStream_t stream) {
+    Gemm gemm_op;
+
+    cutlass::Status status = gemm_op.initialize(args, workspace, stream);
+    TORCH_CHECK(status == cutlass::Status::kSuccess,
+                "Machete kernel failed to initialize workspace");
+
+    status = gemm_op.run(stream);
+    TORCH_CHECK(status == cutlass::Status::kSuccess, "Machete kernel failed");
+  }
+};
+
+};  // namespace machete
--- a/csrc/quantization/machete/machete_mm_launcher.cuh
+++ b/csrc/quantization/machete/machete_mm_launcher.cuh
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <torch/all.h>
+#include <Python.h>
+
+#include "machete_mm_kernel.cuh"
+#include "cutlass_extensions/torch_utils.hpp"
+#include "core/scalar_type.hpp"
+
+namespace machete {
+
+struct MMArgs {
+  torch::Tensor const& A;
+  torch::Tensor const& B;
+  vllm::ScalarType const& b_type;
+  std::optional<at::ScalarType> const& maybe_out_type;
+  std::optional<torch::Tensor> const& maybe_group_scales;
+  std::optional<torch::Tensor> const& maybe_group_zeros;
+  std::optional<int64_t> maybe_group_size;
+  std::optional<torch::Tensor> const& maybe_channel_scales;
+  std::optional<torch::Tensor> const& maybe_token_scales;
+  std::optional<std::string> maybe_schedule;
+};
+
+struct SupportedSchedulesArgs {
+  at::ScalarType a_type;
+  vllm::ScalarType b_type;
+  std::optional<at::ScalarType> maybe_group_scales_type;
+  std::optional<at::ScalarType> maybe_group_zeros_type;
+  std::optional<at::ScalarType> maybe_channel_scales_type;
+  std::optional<at::ScalarType> maybe_token_scales_type;
+  std::optional<at::ScalarType> maybe_out_type;
+};
+
+torch::Tensor mm_dispatch(MMArgs args);
+
+std::vector<std::string> supported_schedules_dispatch(
+    SupportedSchedulesArgs args);
+
+template <typename MacheteKernel>
+torch::Tensor run_impl(MMArgs args) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(args.A));
+
+  auto device = args.A.device();
+  auto stream = at::cuda::getCurrentCUDAStream(device.index());
+
+  int M = args.A.size(0);
+  int N = args.B.size(1);
+  int K = args.A.size(1);
+
+  // Allocate output
+  torch::Tensor D = torch::empty(
+      {M, N},
+      torch::TensorOptions()
+          .dtype(equivalent_scalar_type_v<typename MacheteKernel::ElementD>)
+          .device(device));
+
+  auto arguments = MacheteKernel::create_arguments(
+      stream,  //
+      args.A, args.B, D, args.maybe_group_scales, args.maybe_group_zeros,
+      args.maybe_group_size, args.maybe_channel_scales,
+      args.maybe_token_scales);
+  TORCH_CHECK(MacheteKernel::can_implement(arguments),
+              "Machete kernel cannot be run with these arguments");
+
+  size_t workspace_size = MacheteKernel::get_workspace_size(arguments);
+  torch::Tensor workspace = torch::empty(
+      workspace_size, torch::TensorOptions().dtype(torch::kU8).device(device));
+
+  MacheteKernel::run(arguments, workspace.mutable_data_ptr(), stream);
+
+  return D;
+};
+
+};  // namespace machete
--- a/csrc/quantization/machete/machete_prepack_kernel.cuh
+++ b/csrc/quantization/machete/machete_prepack_kernel.cuh
@@ -0,0 +1,76 @@
+#pragma once
+
+#include "machete_mm_kernel.cuh"
+#include "cutlass_extensions/cute_utils.cuh"
+#include "cutlass_extensions/torch_utils.hpp"
+
+namespace machete {
+
+template <int threads, typename PrepackedLayoutB, typename BInTensor,
+          typename ElementB>
+static __global__ void prepack_B_kernel(BInTensor B_in, ElementB* B_out_ptr) {
+  auto constexpr block_size =
+      Int<size(typename PrepackedLayoutB::PPBlockShape_NK{})>{};
+  auto constexpr eles_per_thread = Int<block_size / threads>{};
+  static_assert(block_size % threads == 0,
+                "block_size must be divisible by the number of threads");
+
+  // Which pre-packed are we responsible for
+  auto blk_coord = make_coord(blockIdx.x, blockIdx.y, blockIdx.z);
+  auto tB_in = local_tile(
+      B_in, append(typename PrepackedLayoutB::PPBlockShape_NK{}, _1{}),
+      blk_coord);
+
+  // Find the start offset in the output for this pre-packed block
+  auto bNbKL_to_offset = PrepackedLayoutB::bNbKL_to_offset(shape(B_in));
+
+  // Tensor representing a 1:1 mapping to the output space in 1D
+  auto tB_out_linear =
+      make_tensor(get_logical_ptr(B_out_ptr) + bNbKL_to_offset(blk_coord),
+                  make_layout(make_shape(block_size)));
+  // Mapping from output space (1D) to input space
+  auto tB_in_linear = make_tensor(
+      tB_in.data(),
+      tB_in.layout()
+          .compose(right_inverse(PrepackedLayoutB::ppblock_ilvd_NK_to_offset()))
+          .with_shape(make_shape(block_size)));
+
+  // Tile for this specific thread (could have used a TiledCopy but these work
+  // best with 2d layouts, this is a simple 1d layout so local_tile is enough,
+  // we are also not that concerned with performance for this kernel)
+  auto thr_tB_in_linear =
+      local_tile(tB_in_linear, make_shape(eles_per_thread), threadIdx.x);
+  auto thr_tB_out_linear =
+      local_tile(tB_out_linear, make_shape(eles_per_thread), threadIdx.x);
+
+  // Construct a register-backed Tensor with the same shape as each thread's
+  // partition
+  auto fragment = make_tensor<ElementB>(shape(thr_tB_in_linear));
+
+  copy(thr_tB_in_linear, fragment);
+  copy(Copy_Atom<DefaultCopy, uint8_t>{}, fragment, thr_tB_out_linear);
+}
+
+template <typename PrepackedLayoutB, typename InLayout>
+static void prepack_B_template(
+    cudaStream_t stream, typename PrepackedLayoutB::ElementB const* B_in_ptr,
+    InLayout B_layout, typename PrepackedLayoutB::ElementB* B_out_ptr) {
+  using TileShapeNKL =
+      decltype(append(typename PrepackedLayoutB::PPBlockShape_NK{}, _1{}));
+  auto ilvd_NKbNbKL_to_offset =
+      PrepackedLayoutB::ilvd_NKbNbKL_to_offset(shape(B_layout));
+
+  TORCH_CHECK(size<0>(B_layout) % size<0>(TileShapeNKL{}) == 0);
+  TORCH_CHECK(size<1>(B_layout) % size<1>(TileShapeNKL{}) == 0);
+
+  auto N_tiles = size<0>(B_layout) / size<0>(TileShapeNKL{});
+  auto K_tiles = size<1>(B_layout) / size<1>(TileShapeNKL{});
+  auto L_tiles = size<2>(B_layout);
+
+  auto B_in = make_tensor(get_logical_ptr(B_in_ptr), B_layout);
+
+  prepack_B_kernel<128, PrepackedLayoutB>
+      <<<dim3(N_tiles, K_tiles, L_tiles), 128, 0, stream>>>(B_in, B_out_ptr);
+}
+
+};  // namespace machete
--- a/csrc/quantization/machete/machete_prepack_launcher.cuh
+++ b/csrc/quantization/machete/machete_prepack_launcher.cuh
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "machete_prepack_kernel.cuh"
+#include "cutlass_extensions/torch_utils.hpp"
+#include "core/scalar_type.hpp"
+
+namespace machete {
+
+struct PrepackBArgs {
+  torch::Tensor const& B;
+  at::ScalarType a_type;
+  vllm::ScalarType b_type;
+  std::optional<at::ScalarType> maybe_group_scales_type;
+};
+
+template <typename PrepackedLayoutB>
+torch::Tensor prepack_impl(torch::Tensor const B) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(B));
+  using ElementB = typename PrepackedLayoutB::ElementB;
+  using PPBlockShape_NK = typename PrepackedLayoutB::PPBlockShape_NK;
+
+  auto device = B.device();
+  auto stream = at::cuda::getCurrentCUDAStream(device.index());
+  auto B_ptr = static_cast<ElementB const*>(B.const_data_ptr());
+  // elements per storage item for B
+  auto eles_per_storage =
+      (B.dtype().itemsize() * 8) / cute::sizeof_bits_v<ElementB>;
+
+  // torch B passed in is/should be (packed_K,N), the kernel expects (N,K,L) (to
+  // match cutlass using (N,K,L) for B), so we transpose B to (N,packed_K,L)
+  auto Bt_packed = B.t();
+
+  TORCH_CHECK(
+      (B.size(0) * eles_per_storage) % size<1>(PPBlockShape_NK{}) == 0,
+      "B.shape[0] (in terms of unpacked elements) must be a multiple of ",
+      size<1>(PPBlockShape_NK{}));
+  TORCH_CHECK(B.size(1) % size<0>(PPBlockShape_NK{}) == 0,
+              "B.shape[1] must be a multiple of ", size<0>(PPBlockShape_NK{}));
+
+  using StrideB = cutlass::detail::TagToStrideB_t<cutlass::layout::ColumnMajor>;
+  auto const l_Bt_packed = make_cute_layout<StrideB>(Bt_packed, "B");
+
+  // convert (N,packed_K,L) layout to (N,K,L) layout
+  //  in effect we want to do: blocked_product(layout_Bt_packed,
+  //      make_ordered_layout(make_shape(_1{}, eles_per_storage, _1{}),
+  //                          Step<_1, _0, _2>{}));
+  // but blocked_product does not support dynamic strides so we implement the
+  // equivalent manually,
+  //   new_shape = (N, packed_K, L) * (1, eles_per_storage, 1) -> (N, K, L)
+  //   new_stride = (s0, s1, s2) * (eles_per_storage, 1, eles_per_storage)
+  //                 when s1 == 1
+  TORCH_CHECK(stride<1>(l_Bt_packed) == 1);
+  // clang-format off
+  auto const layout_Bt = make_layout(
+      transform_with_idx(l_Bt_packed.shape(), [&](auto ele, auto idx) {
+        return idx == 1 ? ele * eles_per_storage : ele;
+      }), 
+      transform_with_idx(l_Bt_packed.stride(), [&](auto ele, auto idx) {
+        return idx != 1 ? ele * eles_per_storage : ele;
+      }));
+  // clang-format on
+
+  // Allocate output
+  torch::Tensor D = torch::empty_like(B, {}, at::MemoryFormat::Contiguous);
+
+  prepack_B_template<PrepackedLayoutB>(
+      stream, B_ptr, layout_Bt, static_cast<ElementB*>(D.mutable_data_ptr()));
+
+  return D;
+};
+
+torch::Tensor prepack_B_dispatch(PrepackBArgs args);
+
+};  // namespace machete
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -0,0 +1,253 @@
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+// clang-format off
+// The cutlass include order matters (annoyingly)
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+// clang-format on
+
+#include "cutlass_extensions/cute_utils.cuh"
+#include "machete_collective_builder.cuh"
+#include "machete_interleaving_utils.cuh"
+
+namespace machete {
+
+using namespace cute;
+
+struct IlvBlkLayoutAuto {};
+
+// This defines a prepacked layout for the B matrix, where the matrix is broken
+// up into PPBlockShape_NK blocks. The data within each block is then compactly
+// stored in memory such that when performing a TiledMMA operation with the same
+// shape as prepacked block, all the data for a given thread is contiguous in
+// memory. This allows us to use wider shared memory loads when loading B from
+// shared memory. The values within a thread are also potentially interlaeved
+// inorder to allow for more efficient upconverting.
+//
+// The contract here is that the `TiledMma` determined below matches the one
+// ultimately used in the kernel. (this is also why the other element types are
+// required along with the kernel schedule)
+template <typename ElementA_, typename ElementB_, typename ElementConvert_,
+          typename AccumulatorT, class LayoutB, class KernelSchedule,
+          typename IlvBlkLayout_ = IlvBlkLayoutAuto>
+// clang-format on
+struct PrepackedLayoutBTemplate {
+  using MmaType = ElementA_;
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementAccumulator = AccumulatorT;
+  using ElementMma = MmaType;
+
+  // Interleave for 4bit bit types when we are not upconverting to fp8 or int8,
+  // in those cases case we use a LUT using prmt instructions to upconvert and
+  // is more efficient if the data is not interleaved For 8bit+ prmt
+  // instructions makes non-interleaved layouts efficient enough we don't need
+  // iterleaved layouts (and can reuse more of the existing cutlass converts)
+  static constexpr bool should_interleave =
+      sizeof_bits_v<ElementB> <= 4 &&
+      !std::is_same_v<ElementConvert_, cutlass::float_e4m3_t> &&
+      !std::is_same_v<ElementConvert_, int8_t>;
+
+  // Only use interleaved layouts for subbyte weights,
+  using IlvdBlkLayout = std::conditional_t<
+      std::is_same_v<IlvBlkLayout_, IlvBlkLayoutAuto>,
+      std::conditional_t<
+          should_interleave,
+          decltype(get_interleaved_blk_layout<
+                   ElementB, sizeof_bits_v<ElementConvert_>, 32>()),
+          void>,
+      IlvBlkLayout_>;
+
+  // TODO (LucasWilkinson): compare the performance for other sizes
+  // Prepacked block shape, smallest layout atom for loading into registers
+  //   (can contain multiple wgmma instructions worth of data in one block)
+  // We ideally want this to be configured such that a thread can perform 128bit
+  // loads, i.e. we amount of data associated with each thread within a
+  // prepacked block is a multiple of 128bits, when using a cooperative sechdule
+  // we have 256 threads working a single block at a time, this means each
+  // thread works on `sizeof_bits_v<ElementB> * (128*64) / 256` bits of data,
+  // for a 4bit type this would be 128bits
+  using PPBlockShape_NK = Shape<_128, _64>;
+
+  // Create the shape of the tile anticipated to be used by the GEMM kernel,
+  //  when the kernel executes we will compute `Ct = Bt * At` since the
+  //  quantized weights (B), must be the lhs operand so the flow through
+  //  registers.
+  // The _128 here doesn't actually impact the shape of the stored tile directly
+  //  but may impact the op selected by rs_op_selector
+  using GemmTileShape = decltype(make_shape(size<0>(PPBlockShape_NK{}), _128{},
+                                            size<1>(PPBlockShape_NK{})));
+
+  static constexpr cute::GMMA::Major GmmaMajorB =
+      gmma_rs_tag_to_major_B<LayoutB>();
+
+  // For coop schedules we have two warp groups cooperatively issuing wgmma
+  // instructions so we use 2 atoms along the M dim (one for each warpgroup)
+  using AtomLayoutMNK = cute::conditional_t<
+      cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
+      Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(
+      cute::GMMA::rs_op_selector<ElementMma, ElementMma, ElementAccumulator,
+                                 GemmTileShape, GMMA::Major::K, GmmaMajorB>(),
+      AtomLayoutMNK{}));
+
+  // Prepacked block, (athrid, val) -> (N,K)
+  // i.e. ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK,...))) -> (N,K)
+  CUTE_HOST_DEVICE static constexpr auto ppblock_TV_to_NK() {
+    return TiledMma{}.thrfrg_A(make_layout(PPBlockShape_NK{}));
+  }
+
+  // Prepacked block, (N,K) -> (athrid, val)
+  // i.e. (N,K) -> ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK,...)))
+  CUTE_HOST_DEVICE static constexpr auto ppblock_NK_to_TV() {
+    return right_inverse(ppblock_TV_to_NK()).with_shape(PPBlockShape_NK{});
+  }
+
+  // Prepacked block, (athrid, val) -> (storage_offset)
+  // i.e. ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK,...))) -> (storage_idx)
+  CUTE_HOST_DEVICE static constexpr auto ppblock_TV_to_offset() {
+    // Return iterleaved layout
+    return make_ordered_layout(shape(ppblock_TV_to_NK()), Step<_1, _0>{});
+  }
+
+  // Prepacked block, (athrid, val) -> (storage_offset)
+  // i.e. ((ThrV,(ThrM,ThrK)),(IlvdFrgV,(RestM,RestK,...))) -> (storage_idx)
+  CUTE_HOST_DEVICE static constexpr auto ppblock_ilvd_TV_to_offset() {
+    auto layout_no_interleave =
+        make_ordered_layout(shape(ppblock_TV_to_NK()), Step<_1, _0>{});
+
+    if constexpr (std::is_same_v<IlvdBlkLayout, void>) {
+      return layout_no_interleave;
+    } else {
+      // interleave by transforming FrgV into interleaved blocks where each
+      // block has the layout IlvdBlkLayout, for example if IlvdBlkLayout is
+      // (2, 2) : (2, 1) then we get: ((2, 2), size(FrgV) / 4) : ((2, 1), 4)
+      //   if FrgV is {A, B, C, D, E, F, G, H}
+      //   then ((IlvBlk), FrgB) is {A, C, B, D, C, G, D, H}
+      auto frgV = get<1, 0>(layout_no_interleave);
+      auto ilvdBlk = IlvdBlkLayout{};
+      static_assert(size(frgV) % size(ilvdBlk) == 0,
+                    "FrgV must be divisible by size(ilvdBlk)");
+      auto ilvd_FrgV = make_layout(
+          make_shape(shape(ilvdBlk), Int<size(frgV) / size(ilvdBlk)>{}),
+          make_stride(stride(ilvdBlk), size(ilvdBlk)));
+
+      // Return iterleaved layout
+      return make_layout(
+          get<0>(layout_no_interleave),
+          make_layout(ilvd_FrgV, get<1, 1>(layout_no_interleave)));
+    }
+  }
+
+  // Prepacked block, (M,K) -> (storage_offset)
+  CUTE_HOST_DEVICE static constexpr auto ppblock_ilvd_NK_to_offset() {
+    // do (M,K) -> (athrid, val) -> (storage_idx)
+    return ppblock_ilvd_TV_to_offset().compose(ppblock_NK_to_TV());
+  }
+
+  // ((athrid, val), (BlocksN, BlocksK), L) -> (storage_idx)
+  template <typename Shape_NKL>
+  CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset(
+      Shape_NKL shape_mkl) {
+    constexpr auto block_layout = ppblock_TV_to_offset();
+
+    // (BlocksN, BlocksK, L)
+    auto blocks_shape =
+        cute::transform(shape_mkl, append(PPBlockShape_NK{}, _1{}),
+                        [](auto x, auto y) { return x / y; });
+
+    // ((athrid, val), (BlocksN, BlocksK, L)) -> (storage_idx)
+    auto result = make_layout(
+        block_layout,
+        make_layout(blocks_shape,
+                    compact_col_major(blocks_shape, size(block_layout))));
+
+    // ((athrid, val), (BlocksN, BlocksK, L))
+    //   => ((athrid, val), (BlocksN, BlocksK), L)
+    return group<1, 3>(result(_, repeat<rank<1>(result)>(_)));
+  }
+
+  // ((athrid_val), (BlocksN, BlocksK, L)) -> (N, K, L)
+  template <typename Shape_NKL>
+  CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
+      Shape_NKL shape_mkl) {
+    auto layout = TVbNbKL_to_offset(shape_mkl);
+    // for 4-bit elements, having >= 64 values per column
+    // allows TMA to load full 32-byte sectors
+    auto inner_layout =
+        make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
+
+    return make_layout(inner_layout, get<1>(layout), get<2>(layout));
+  }
+
+  // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
+  template <typename Shape_NKL>
+  CUTE_HOST_DEVICE static constexpr auto ilvd_NKbNbKL_to_offset(
+      Shape_NKL shape_mkl) {
+    constexpr auto block_layout = ppblock_ilvd_NK_to_offset();
+
+    // (BlocksN, BlocksK, L)
+    auto blocks_shape =
+        cute::transform(shape_mkl, append(PPBlockShape_NK{}, _1{}),
+                        [](auto x, auto y) { return x / y; });
+
+    // ((athrid, val), (BlocksN, BlocksK, L)) -> (storage_idx)
+    auto result = make_layout(
+        block_layout,
+        make_layout(blocks_shape,
+                    compact_col_major(blocks_shape, size(block_layout))));
+
+    // ((athrid, val), (BlocksN, BlocksK, L)) => ((athrid, val), (BlocksN,
+    // BlocksK), L)
+    return group<1, 3>(result(_, repeat<rank<1>(result)>(_)));
+  }
+
+  // (BlocksN, BlocksK, L) -> (storage_idx)
+  template <typename Shape_NKL>
+  CUTE_HOST_DEVICE static constexpr auto bNbKL_to_offset(Shape_NKL shape_mkl) {
+    // (BlocksN, BlocksK, L)
+    auto blocks_shape =
+        cute::transform(shape_mkl, append(PPBlockShape_NK{}, _1{}),
+                        [](auto x, auto y) { return x / y; });
+    auto stride = size(PPBlockShape_NK{});
+
+    // (BlocksN, BlocksK, L) -> (storage_idx)
+    return make_layout(blocks_shape, compact_col_major(blocks_shape, stride));
+  }
+
+  // ((athrid, val), (BlocksN, BlocksK, L)) -> (N, K, L)
+  template <class Shape_NKL>
+  CUTE_HOST_DEVICE static auto TVbNbK_to_NKL(Shape_NKL shape_mkl) {
+    auto tile = make_tile(make_layout(size<0>(PPBlockShape_NK{})),
+                          make_layout(size<1>(PPBlockShape_NK{})));
+
+    // ((BlockN, BlockK), (BlocksN, BlocksK, L)) -> (N, K, L)
+    auto tiled_A = zipped_divide(make_layout(shape_mkl), tile);
+    return tiled_A.compose(ppblock_TV_to_NK(), _);
+  }
+
+  // (N, K, L) -> ((athrid, val), (BlocksN, BlocksK), L)
+  template <class Shape_NKL>
+  CUTE_HOST_DEVICE static auto NKL_to_TVbNbK(Shape_NKL shape_mkl) {
+    auto TVbNbK_to_NKL_layout = TVbNbK_to_NKL(shape_mkl);
+    return blocked_product(ppblock_NK_to_TV(),
+                           make_layout(shape<1>(TVbNbK_to_NKL_layout)));
+  }
+};
+
+};  // namespace machete
--- a/csrc/quantization/machete/machete_pytorch.cu
+++ b/csrc/quantization/machete/machete_pytorch.cu
@@ -0,0 +1,73 @@
+#include "machete_mm_launcher.cuh"
+#include "machete_prepack_launcher.cuh"
+#include "core/scalar_type.hpp"
+
+#include "core/registration.h"
+
+namespace machete {
+
+using namespace vllm;
+
+std::vector<std::string> supported_schedules(
+    at::ScalarType a_type, int64_t b_type_id,
+    std::optional<at::ScalarType> maybe_group_scales_type,
+    std::optional<at::ScalarType> maybe_group_zeros_type,
+    std::optional<at::ScalarType> maybe_channel_scales_type,
+    std::optional<at::ScalarType> maybe_token_scales_type,
+    std::optional<at::ScalarType> maybe_out_type) {
+  ScalarType const b_type = ScalarType::from_id(b_type_id);
+  return supported_schedules_dispatch({
+      .a_type = a_type,
+      .b_type = b_type,
+      .maybe_group_scales_type = maybe_group_scales_type,
+      .maybe_group_zeros_type = maybe_group_zeros_type,
+      .maybe_channel_scales_type = maybe_channel_scales_type,
+      .maybe_token_scales_type = maybe_token_scales_type,
+      .maybe_out_type = maybe_out_type,
+  });
+}
+
+torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
+                 int64_t b_type_id,
+                 std::optional<at::ScalarType> const& maybe_out_type,
+                 std::optional<torch::Tensor> const& maybe_group_scales,
+                 std::optional<torch::Tensor> const& maybe_group_zeros,
+                 std::optional<int64_t> maybe_group_size,
+                 std::optional<torch::Tensor> const& maybe_channel_scales,
+                 std::optional<torch::Tensor> const& maybe_token_scales,
+                 std::optional<std::string> maybe_schedule) {
+  ScalarType const b_type = ScalarType::from_id(b_type_id);
+  return mm_dispatch({.A = A,
+                      .B = B,
+                      .b_type = b_type,
+                      .maybe_out_type = maybe_out_type,
+                      .maybe_group_scales = maybe_group_scales,
+                      .maybe_group_zeros = maybe_group_zeros,
+                      .maybe_group_size = maybe_group_size,
+                      .maybe_channel_scales = maybe_channel_scales,
+                      .maybe_token_scales = maybe_token_scales,
+                      .maybe_schedule = maybe_schedule});
+}
+
+torch::Tensor prepack_B(
+    torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,
+    std::optional<at::ScalarType> const& maybe_group_scales_type) {
+  ScalarType const b_type = ScalarType::from_id(b_type_id);
+  return prepack_B_dispatch(
+      {.B = B,
+       .a_type = a_type,
+       .b_type = b_type,
+       .maybe_group_scales_type = maybe_group_scales_type});
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("machete_prepack_B", &prepack_B);
+  m.impl("machete_mm", &mm);
+}
+
+// use CatchAll since supported_schedules has no tensor arguments
+TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) {
+  m.impl("machete_supported_schedules", &supported_schedules);
+}
+
+};  // namespace machete
--- a/csrc/quantization/marlin/sparse/LICENSE
+++ b/csrc/quantization/marlin/sparse/LICENSE
@@ -0,0 +1,203 @@
+Contains code from https://github.com/IST-DASLab/Sparse-Marlin/
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/csrc/quantization/marlin/sparse/common/base.h
+++ b/csrc/quantization/marlin/sparse/common/base.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace marlin_24 {
+
+constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
+
+// Instances of `Vec` are used to organize groups of >>registers<<, as needed
+// for instance as inputs to tensor core operations. Consequently, all
+// corresponding index accesses must be compile-time constants, which is why we
+// extensively use `#pragma unroll` throughout the kernel code to guarantee
+// this.
+template <typename T, int n>
+struct Vec {
+  T elems[n];
+  __device__ T& operator[](int i) { return elems[i]; }
+};
+
+template <int M_, int N_, int K_>
+struct ShapeBase {
+  static constexpr int M = M_, N = N_, K = K_;
+};
+
+using I4 = Vec<int, 4>;
+
+// Matrix fragments for tensor core instructions; their precise layout is
+// documented here:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+using FragA = Vec<half2, 4>;
+using FragB = Vec<half2, 2>;
+using FragM = Vec<uint, 1>;
+using FragC = Vec<float, 4>;
+using FragS = Vec<half2, 1>;  // quantization scales
+
+}  // namespace marlin_24
--- a/csrc/quantization/marlin/sparse/common/mem.h
+++ b/csrc/quantization/marlin/sparse/common/mem.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "base.h"
+
+namespace marlin_24 {
+// Predicated asynchronous global->shared copy; used for inputs A where we apply
+// predication to handle batchsizes that are not multiples of 16.
+__device__ inline void cp_async4_pred_zfill(void* smem_ptr,
+                                            const void* glob_ptr,
+                                            bool pred = true,
+                                            const bool zfill = false) {
+  const int BYTES = 16;
+  int src_in_bytes = (zfill ? 0 : BYTES);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES), "r"(src_in_bytes));
+}
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+// Asynchronous global->shared copy
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+}
+
+// Async copy fence.
+__device__ inline void cp_async_fence() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+// Wait until at most `n` async copy stages are still pending.
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));
+}
+
+__device__ inline void ldsm4_m(FragM& frag_m, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_m);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n"
+               : "=r"(a[0]), "=r"(a[1])
+               : "r"(smem));
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+__device__ inline void ldsm4_t(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+      : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+      : "r"(smem));
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
+                   : "=r"(state)
+                   : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();
+}
+
+// Release barrier and increment visitation count.
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
+                 :
+                 : "l"(lock), "r"(val));
+  }
+}
+}  // namespace marlin_24
--- a/csrc/quantization/marlin/sparse/common/mma.h
+++ b/csrc/quantization/marlin/sparse/common/mma.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "base.h"
+#include <cudaTypedefs.h>
+
+namespace marlin_24 {
+
+// On CUDA earlier than 12.5, the ordered_metadata version of this instruction
+// is not supported. On later versions of CUDA the version without ordered
+// metadata results in the following warning:
+//  | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction
+//  | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially
+//  | reduced performance on some future architectures
+#if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  #define MMA_SP_INST \
+    "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#else
+  #define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#endif
+
+// m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+__device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
+                              const FragA& frag_b, FragC& frag_c, FragM& frag_m,
+                              const int psel) {
+  const uint32_t* a0 = reinterpret_cast<const uint32_t*>(&a_frag0);
+  const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
+
+  float* c = reinterpret_cast<float*>(&frag_c);
+  if (psel == 0) {
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
+  } else {
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
+  }
+}
+
+// Lookup-table based 3-input logical operation; explicitly used for
+// dequantization as the compiler does not seem to automatically recognize it in
+// all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+__device__ __forceinline__ uint2 to_half4(float c0, float c1, float c2,
+                                          float c3) {
+  uint2 r;
+  asm("{\n\t"
+      ".reg .f16 a, b, c, d; \n\t"
+      "cvt.rn.f16.f32 a, %2; \n\t"
+      "cvt.rn.f16.f32 b, %3; \n\t"
+      "cvt.rn.f16.f32 c, %4; \n\t"
+      "cvt.rn.f16.f32 d, %5; \n\t"
+      "mov.b32 %0, {a, b};   \n\t"
+      "mov.b32 %1, {c, d};   \n\t"
+      "}"
+      : "=r"(r.x), "=r"(r.y)
+      : "f"(c0), "f"(c1), "f"(c2), "f"(c3));
+  return r;
+}
+
+// Constructs destination register by taking bytes from 2 sources (based on
+// mask)
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
+// values. We mostly follow the strategy in the link below, with some small
+// changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant_4bit(int q) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64086408;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd480d480;
+
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));
+  return frag_b;
+}
+
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
+// values. We mostly follow the strategy in the link below, with some small
+// changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant_8bit(int q) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  return frag_b;
+}
+
+// Multiply dequantized values by the corresponding quantization scale; used
+// only for grouped quantization.
+__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
+  half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], s);
+  frag_b[1] = __hmul2(frag_b[1], s);
+}
+
+__device__ inline void scale_floats(float* c0, float* c1, float* c2, float* c3,
+                                    FragS& s0, float* c4, float* c5, float* c6,
+                                    float* c7, FragS& s1) {
+  *c0 = __fmul_rn(*c0, __half2float(s0[0].x));
+  *c1 = __fmul_rn(*c1, __half2float(s0[0].y));
+  *c2 = __fmul_rn(*c2, __half2float(s0[1].x));
+  *c3 = __fmul_rn(*c3, __half2float(s0[1].y));
+
+  *c4 = __fmul_rn(*c4, __half2float(s1[0].x));
+  *c5 = __fmul_rn(*c5, __half2float(s1[0].y));
+  *c6 = __fmul_rn(*c6, __half2float(s1[1].x));
+  *c7 = __fmul_rn(*c7, __half2float(s1[1].y));
+}
+
+}  // namespace marlin_24
--- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
+++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
--- a/csrc/quantization/utils.cuh
+++ b/csrc/quantization/utils.cuh
@@ -0,0 +1,59 @@
+#pragma once
+
+/**
+ * Quantization utilities including:
+ *   Adjusted maximum values for qtypes.
+ *   Minimum scaling factors for qtypes.
+ */
+
+#include <cmath>
+#include <torch/types.h>
+
+#ifndef USE_ROCM
+  #include <c10/util/Float8_e4m3fn.h>
+  #define MAYBE_HOST_DEVICE C10_HOST_DEVICE
+#else
+  #include <ATen/hip/HIPContext.h>
+  #include <c10/util/Float8_e4m3fn.h>
+  #include <c10/util/Float8_e4m3fnuz.h>
+  // ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr
+  #define MAYBE_HOST_DEVICE
+#endif
+
+template <typename T,
+          typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
+                                      std::is_same_v<T, c10::Float8_e4m3fnuz> ||
+                                      std::is_same_v<T, int8_t>>>
+struct quant_type_max {
+  static constexpr T val() { return std::numeric_limits<T>::max(); }
+};
+
+// Using the default max value from pytorch (240.0 0x7F) will cause accuracy
+// issues when running dynamic quantization. Here use 224.0 0x7E for rocm.
+template <>
+struct quant_type_max<c10::Float8_e4m3fnuz> {
+  static constexpr c10::Float8_e4m3fnuz val() {
+    return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits());
+  }
+};
+
+template <typename T>
+MAYBE_HOST_DEVICE static constexpr T quant_type_max_v =
+    quant_type_max<T>::val();
+
+template <typename T,
+          typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
+                                      std::is_same_v<T, c10::Float8_e4m3fnuz> ||
+                                      std::is_same_v<T, int8_t>>>
+struct min_scaling_factor {
+  C10_DEVICE C10_ALWAYS_INLINE static float val() {
+    return 1.0f / (quant_type_max_v<T> * 512.0f);
+  }
+};
+
+template <>
+struct min_scaling_factor<int8_t> {
+  C10_DEVICE C10_ALWAYS_INLINE static float val() {
+    return std::numeric_limits<float>::epsilon();
+  }
+};
--- a/csrc/quantization/vectorization.cuh
+++ b/csrc/quantization/vectorization.cuh
@@ -0,0 +1,31 @@
+#pragma once
+/**
+ * __device__ datatypes vectorized by 4
+ */
+
+// Include both AMD and NVIDIA fp8 types to avoid circular import
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e4m3fn.h>
+
+namespace vllm {
+
+// Vectorization containers
+template <typename scalar_t, size_t vec_size>
+struct __align__(vec_size * sizeof(scalar_t)) vec_n_t {
+  scalar_t val[vec_size];
+};
+
+template <typename quant_type_t, size_t vec_size>
+struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t {
+  static_assert(std::is_same_v<quant_type_t, int8_t> ||
+                std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
+                std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
+  quant_type_t val[vec_size];
+};
+
+template <typename scalar_t>
+using vec4_t = vec_n_t<scalar_t, 4>;
+template <typename quant_type_t>
+using q8x4_t = q8_n_t<quant_type_t, 4>;
+
+}  // namespace vllm
--- a/csrc/quantization/vectorization_utils.cuh
+++ b/csrc/quantization/vectorization_utils.cuh
@@ -0,0 +1,177 @@
+#pragma once
+#include "vectorization.cuh"
+
+namespace vllm {
+
+template <int VEC_SIZE, typename InT, typename OutT, typename ScaOp>
+struct DefaultVecOp {
+  ScaOp scalar_op;
+
+  __device__ __forceinline__ void operator()(
+      vec_n_t<OutT, VEC_SIZE>& dst, const vec_n_t<InT, VEC_SIZE>& src) const {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      scalar_op(dst.val[i], src.val[i]);
+    }
+  }
+};
+
+template <int VEC_SIZE, typename InT, typename OutT, typename VecOp,
+          typename ScaOp>
+__device__ inline void vectorize_with_alignment(
+    const InT* in, OutT* out, int len, int tid, int stride,
+    VecOp&& vec_op,       // vec_n_t<InT,16> -> vec_n_t<OutT,16>
+    ScaOp&& scalar_op) {  // InT -> OutT
+  static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
+                "VEC_SIZE must be a positive power-of-two");
+  constexpr int WIDTH = VEC_SIZE * sizeof(InT);  // eg: 64 B
+  uintptr_t addr = reinterpret_cast<uintptr_t>(in);
+
+  // fast path when the whole region is already aligned
+  // Note: currently the output is guaranteed to be same as the input, so we
+  // don't check it here, comments here just for future reference.
+  bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
+  if (can_vec) {
+    int num_vec = len / VEC_SIZE;
+
+    using vin_t = vec_n_t<InT, VEC_SIZE>;
+    using vout_t = vec_n_t<OutT, VEC_SIZE>;
+    auto* v_in = reinterpret_cast<const vin_t*>(in);
+    auto* v_out = reinterpret_cast<vout_t*>(out);
+
+    for (int i = tid; i < num_vec; i += stride) {
+      vout_t tmp;
+      // Make a local copy of the entire pack
+      vin_t src = v_in[i];  // <- encourages a single vector ld
+      vec_op(tmp, src);
+      v_out[i] = tmp;  // <- encourages a single vector st
+    }
+    return;
+  }
+
+  int misalignment_offset = addr & (WIDTH - 1);       // addr % 64
+  int alignment_bytes = WIDTH - misalignment_offset;  // 64 - (addr % 64)
+  int prefix_elems = alignment_bytes & (WIDTH - 1);   // handle 64
+  prefix_elems /= sizeof(InT);
+  prefix_elems = min(prefix_elems, len);  // 0 ≤ prefix < 16
+
+  // 1. prefill the when it is unsafe to vectorize
+  for (int i = tid; i < prefix_elems; i += stride) {
+    scalar_op(out[i], in[i]);
+  }
+
+  in += prefix_elems;
+  out += prefix_elems;
+  len -= prefix_elems;
+
+  int num_vec = len / VEC_SIZE;
+  using vin_t = vec_n_t<InT, VEC_SIZE>;
+  using vout_t = vec_n_t<OutT, VEC_SIZE>;
+  auto* v_in = reinterpret_cast<const vin_t*>(in);
+  auto* v_out = reinterpret_cast<vout_t*>(out);
+
+  // 2. vectorize the main part
+  for (int i = tid; i < num_vec; i += stride) {
+    vout_t tmp;
+    // Make a local copy of the entire pack
+    vin_t src = v_in[i];  // <- encourages a single vector ld
+    vec_op(tmp, src);
+    v_out[i] = tmp;  // <- encourages a single vector st
+  }
+
+  // 3. handle the tail
+  int tail_start = num_vec * VEC_SIZE;
+  for (int i = tid + tail_start; i < len; i += stride) {
+    scalar_op(out[i], in[i]);
+  }
+}
+
+template <int VEC_SIZE, typename InT, typename OutT, typename ScaOp>
+__device__ __forceinline__ void vectorize_with_alignment(const InT* in,
+                                                         OutT* out, int len,
+                                                         int tid, int stride,
+                                                         ScaOp&& scalar_op) {
+  using Vec = DefaultVecOp<VEC_SIZE, InT, OutT, std::decay_t<ScaOp>>;
+  vectorize_with_alignment<VEC_SIZE>(in, out, len, tid, stride, Vec{scalar_op},
+                                     std::forward<ScaOp>(scalar_op));
+}
+
+template <int VEC_SIZE, typename InT, typename ScaOp>
+struct DefaultReadVecOp {
+  ScaOp scalar_op;
+
+  __device__ __forceinline__ void operator()(
+      const vec_n_t<InT, VEC_SIZE>& src) const {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      scalar_op(src.val[i]);
+    }
+  }
+};
+
+// read-only version: iterate over the input with alignment guarantees
+template <int VEC_SIZE, typename InT, typename VecOp, typename ScaOp>
+__device__ inline void vectorize_read_with_alignment(const InT* in, int len,
+                                                     int tid, int stride,
+                                                     VecOp&& vec_op,
+                                                     ScaOp&& scalar_op) {
+  static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
+                "VEC_SIZE must be a positive power-of-two");
+  constexpr int WIDTH = VEC_SIZE * sizeof(InT);
+  uintptr_t addr = reinterpret_cast<uintptr_t>(in);
+
+  // fast path when the whole region is already aligned
+  bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
+  if (can_vec) {
+    int num_vec = len / VEC_SIZE;
+
+    using vin_t = vec_n_t<InT, VEC_SIZE>;
+    auto* v_in = reinterpret_cast<const vin_t*>(in);
+
+    for (int i = tid; i < num_vec; i += stride) {
+      vin_t tmp = v_in[i];
+      vec_op(tmp);
+    }
+    return;
+  }
+
+  int misalignment_offset = addr & (WIDTH - 1);
+  int alignment_bytes = WIDTH - misalignment_offset;
+  int prefix_elems = alignment_bytes & (WIDTH - 1);
+  prefix_elems /= sizeof(InT);
+  prefix_elems = min(prefix_elems, len);
+
+  // 1. handle the possibly unaligned prefix with scalar access.
+  for (int i = tid; i < prefix_elems; i += stride) {
+    scalar_op(in[i]);
+  }
+
+  in += prefix_elems;
+  len -= prefix_elems;
+
+  int num_vec = len / VEC_SIZE;
+  using vin_t = vec_n_t<InT, VEC_SIZE>;
+  auto* v_in = reinterpret_cast<const vin_t*>(in);
+
+  // 2. vectorized traversal of the main aligned region.
+  for (int i = tid; i < num_vec; i += stride) {
+    vec_op(v_in[i]);
+  }
+
+  // 3. handle remaining tail elements.
+  int tail_start = num_vec * VEC_SIZE;
+  for (int i = tid + tail_start; i < len; i += stride) {
+    scalar_op(in[i]);
+  }
+}
+
+// overload that requires only a scalar_op
+template <int VEC_SIZE, typename InT, typename ScaOp>
+__device__ __forceinline__ void vectorize_read_with_alignment(
+    const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) {
+  using Vec = DefaultReadVecOp<VEC_SIZE, InT, std::decay_t<ScaOp>>;
+  vectorize_read_with_alignment<VEC_SIZE>(in, len, tid, stride, Vec{scalar_op},
+                                          std::forward<ScaOp>(scalar_op));
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/Epilogues.md
+++ b/csrc/quantization/w8a8/cutlass/Epilogues.md
@@ -0,0 +1,168 @@
+# CUTLASS Epilogues
+
+## Introduction
+
+This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs.
+
+Currently, we only support symmetric quantization for weights,
+and symmetric and asymmetric quantization for activations.
+Both can be quantized per-tensor or per-channel (weights) / per-token (activations).
+
+There are 4 epilogues:
+
+1. `ScaledEpilogue`: symmetric quantization for activations, no bias.
+1. `ScaledEpilogueBias`: symmetric quantization for activations, supports bias.
+1. `ScaledEpilogueAzp`: asymmetric per-tensor quantization for activations, supports bias.
+1. `ScaledEpilogueAzpPerToken`: asymmetric per-token quantization for activations, supports bias.
+
+We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size.
+Instead, if no bias is passed, the epilogue will use 0 as the bias.
+That induces a redundant addition operation (and runtime check), but the performance impact is minor.
+
+## Underlying Linear Algebra
+
+More details available in the [Activation Quantization RFC](https://github.com/vllm-project/vllm/issues/3975).
+
+If $` \widehat X `$ is the quantized $` X `$, our matrices become the following
+
+```math
+A = s_a (\widehat A - J_a z_a)
+```
+
+```math
+B = s_b \widehat B
+```
+
+```math
+D = A B + C
+```
+
+```math
+D = s_a s_b \widehat D + C
+```
+
+Here, D is the output of the GEMM, and C is the bias.
+A is the activations and supports asymmetric quantization,
+and B is the weights and only supports symmetric quantization.
+$ s_a $ and $s_b$ are the scales for activations and weights, respectively.
+$ z_a $ is the zero-point for activations, and $ J_a $ is the matrix of all ones with dimensions of A.
+Additional epilogues would be required to support asymmetric quantization for weights.
+
+Expanding further, we can calculate $` \widehat D `$ as follows:
+
+```math
+A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B
+```
+
+```math
+A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right)
+```
+
+```math
+\widehat D = \widehat A \widehat B - z_a J_a \widehat B
+```
+
+Note that $` \widehat A \widehat B `$ is the raw output of the GEMM,
+and $` J_a \widehat B `$ is known ahead of time.
+Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of column sums of $` \widehat B `$.
+
+## Epilogues
+
+### `ScaledEpilogue`
+
+This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B
+```
+
+```math
+D = s_a s_b \widehat D
+```
+
+```math
+D = s_a s_b \widehat A \widehat B
+```
+
+Epilogue parameters:
+
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+
+### `ScaledEpilogueBias`
+
+This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B
+```
+
+```math
+D = s_a s_b \widehat D + C 
+```
+
+```math
+D = s_a s_b \widehat A \widehat B + C
+```
+
+Epilogue parameters:
+
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+- `bias` is the bias, is always per-channel (row-vector).
+
+### `ScaledEpilogueAzp`
+
+This epilogue computes the asymmetric per-tensor quantization for activations with bias.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B - z_a J_a \widehat B
+```
+
+```math
+D = s_a s_b \widehat D + C 
+```
+
+```math
+D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C
+```
+
+Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$.
+That is precomputed and stored in `azp_with_adj` as a row-vector.
+
+Epilogue parameters:
+
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+    - Generally this will be per-tensor as the zero-points are per-tensor.
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
+- `bias` is the bias, is always per-channel (row-vector).
+
+To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel.
+
+### `ScaledEpilogueAzpPerToken`
+
+This epilogue computes the asymmetric per-token quantization for activations with bias.
+
+The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector.
+That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$.
+
+Epilogue parameters:
+
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+    - Generally this will be per-token as the zero-points are per-token.
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
+- `azp` is the zero-point (`z_a`), is per-token (column-vector).
+- `bias` is the bias, is always per-channel (row-vector).
+
+To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel.
+
+The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM):
+
+```math
+out = scale_a * scale_b * (Dq - azp_adj * azp) + bias
+```
--- a/csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
@@ -0,0 +1,107 @@
+#pragma once
+
+// clang-format will break include orders
+// clang-format off
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/util/packed_stride.hpp"
+
+#include "core/math.hpp"
+#include "cutlass_extensions/common.hpp"
+// clang-format on
+
+namespace vllm::c3x {
+
+static inline cute::Shape<int, int, int, int> get_problem_shape(
+    torch::Tensor const& a, torch::Tensor const& b) {
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  return {m, n, k, 1};
+}
+
+template <typename GemmKernel>
+void cutlass_gemm_caller(
+    torch::Device device, cute::Shape<int, int, int, int> prob_shape,
+    typename GemmKernel::MainloopArguments mainloop_args,
+    typename GemmKernel::EpilogueArguments epilogue_args,
+    typename GemmKernel::TileSchedulerArguments scheduler = {}) {
+  cutlass::KernelHardwareInfo hw_info;
+  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
+                                      prob_shape,
+                                      mainloop_args,
+                                      epilogue_args,
+                                      hw_info,
+                                      scheduler};
+
+  // Launch the CUTLASS GEMM kernel.
+  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  GemmOp gemm_op;
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(device);
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto stream = at::cuda::getCurrentCUDAStream(device.index());
+
+  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b,
+                         EpilogueArgs&&... epilogue_params) {
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementC = typename Gemm::ElementC;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = StrideC;
+  using StrideAux = StrideC;
+
+  typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b);
+  auto [M, N, K, L] = prob_shape;
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+  StrideC c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+  StrideD d_stride =
+      cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+  StrideAux aux_stride = d_stride;
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  // auto d_ptr = static_cast<ElementC*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, d_stride};
+
+  cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                  epilogue_args);
+}
+
+}  // namespace vllm::c3x
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
@@ -0,0 +1,209 @@
+#pragma once
+
+// clang-format will break include orders
+// clang-format off
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "core/math.hpp"
+#include "cutlass_extensions/common.hpp"
+// clang-format on
+
+/*
+  Epilogues defined in,
+  csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp,
+  must contain a public type named EVTCompute of type Sm90EVT, as well as a
+  static prepare_args function that constructs an EVTCompute::Arguments struct.
+*/
+
+using namespace cute;
+
+namespace vllm {
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule>
+struct cutlass_3x_gemm {
+  using ElementAB = ElementAB_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
+  using ElementC = void;
+  using StrideC = StrideD;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  // These are the minimum alignments needed for the kernels to compile
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC, StrideC, AlignmentCD, ElementD, StrideD,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // clang-format off
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, 
+          ElementAB, cutlass::layout::RowMajor, AlignmentAB, 
+          ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, 
+          ElementAcc, TileShape, ClusterShape,
+          Stages,
+          KernelSchedule>::CollectiveOp;
+  // clang-format on
+
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule>
+struct cutlass_3x_gemm_sm100 {
+  using ElementAB = ElementAB_;
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+
+  using ElementC = void;
+  using LayoutC = cutlass::layout::RowMajor;
+  static constexpr int AlignmentC =
+      128 / cutlass::sizeof_bits<ElementD_>::value;
+
+  using ElementD = ElementD_;
+  using LayoutD = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = AlignmentC;
+
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  // MMA type
+  using ElementAccumulator = float;
+
+  // Epilogue types
+  using ElementBias = cutlass::half_t;
+  using ElementCompute = float;
+  using ElementAux = ElementD;
+  using LayoutAux = LayoutD;
+  using ElementAmax = float;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC,
+          ElementD, LayoutD, AlignmentD, EpilogueSchedule,
+          EVTCompute>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB,
+          ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+};
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule>
+struct cutlass_3x_gemm_sm120 {
+  using ElementAB = ElementAB_;
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+
+  using ElementC = void;
+  using LayoutC = cutlass::layout::RowMajor;
+  static constexpr int AlignmentC =
+      128 / cutlass::sizeof_bits<ElementD_>::value;
+
+  using ElementD = ElementD_;
+  using LayoutD = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = AlignmentC;
+
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  // MMA type
+  using ElementAccumulator = float;
+
+  // Epilogue types
+  using ElementBias = cutlass::half_t;
+  using ElementCompute = float;
+  using ElementAux = ElementD;
+  using LayoutAux = LayoutD;
+  using ElementAmax = float;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC,
+          ElementD, LayoutD, AlignmentD, EpilogueSchedule,
+          EVTCompute>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB,
+          ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+};
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu
@@ -0,0 +1,24 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm90_int8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     torch::Tensor const& a_scales,
+                                     torch::Tensor const& b_scales,
+                                     torch::Tensor const& azp_adj,
+                                     std::optional<torch::Tensor> const& azp,
+                                     std::optional<torch::Tensor> const& bias) {
+  if (azp) {
+    return cutlass_scaled_mm_sm90_int8_epilogue<
+        c3x::ScaledEpilogueBiasAzpToken>(out, a, b, a_scales, b_scales, azp_adj,
+                                         *azp, bias);
+  } else {
+    return cutlass_scaled_mm_sm90_int8_epilogue<c3x::ScaledEpilogueBiasAzp>(
+        out, a, b, a_scales, b_scales, azp_adj, bias);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu
@@ -0,0 +1,23 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_blockwise_sm100_fp8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
+                                           torch::Tensor const& a,
+                                           torch::Tensor const& b,
+                                           torch::Tensor const& a_scales,
+                                           torch::Tensor const& b_scales) {
+  if (out.dtype() == torch::kBFloat16) {
+    cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
+        out, a, b, a_scales, b_scales);
+
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::half_t>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -0,0 +1,280 @@
+#pragma once
+
+#include "cuda_utils.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
+#include "cutlass_gemm_caller.cuh"
+
+namespace vllm {
+
+using namespace cute;
+
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler,
+          bool swap_ab_ = false>
+struct cutlass_3x_gemm_fp8_blockwise {
+  static constexpr bool swap_ab = swap_ab_;
+  using ElementAB = cutlass::float_e4m3_t;
+
+  using ElementA = ElementAB;
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+
+  using ElementB = ElementAB;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+
+  using ElementD = OutType;
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using ElementC = void; // TODO: support bias
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+  static constexpr int AlignmentC = AlignmentD;
+
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+  using ElementBlockScale = float;
+
+  using ScaleConfig = conditional_t<swap_ab,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::K, cute::UMMA::Major::MN>,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
+
+  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using ElementScalar = float;
+  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
+      AlignmentC,
+      ElementD,
+      conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+      AlignmentD,
+      EpilogueScheduler,
+      DefaultOperation
+  >::CollectiveOp;
+ 
+  using StageCountType = cutlass::gemm::collective::StageCountAuto; 
+  using CollectiveMainloop = conditional_t<swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementB,
+          cute::tuple<LayoutB_Transpose, LayoutSFA>,
+          AlignmentB,
+          ElementA,
+          cute::tuple<LayoutA_Transpose, LayoutSFB>,
+          AlignmentA,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          MainloopScheduler
+      >::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementA,
+          cute::tuple<LayoutA, LayoutSFA>,
+          AlignmentA,
+          ElementB,
+          cute::tuple<LayoutB, LayoutSFB>,
+          AlignmentB,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          MainloopScheduler
+      >::CollectiveOp>;
+
+  using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename Gemm>
+void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using GemmKernel = typename Gemm::GemmKernel;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutSFA = typename Gemm::LayoutSFA;
+  using LayoutSFB = typename Gemm::LayoutSFB;
+  using ScaleConfig = typename Gemm::ScaleConfig;
+
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = typename Gemm::ElementBlockScale;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+
+  StrideA a_stride;
+  StrideB b_stride;
+  StrideC c_stride;
+  a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  LayoutSFA layout_SFA = swap_ab ? 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = swap_ab ?
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args{};
+  mainloop_args.layout_SFA = layout_SFA;
+  mainloop_args.layout_SFB = layout_SFB;
+  if (swap_ab) {
+    mainloop_args.ptr_A = b_ptr;
+    mainloop_args.dA = b_stride;
+    mainloop_args.ptr_B = a_ptr;
+    mainloop_args.dB = a_stride;
+    mainloop_args.ptr_SFA = b_scales_ptr;
+    mainloop_args.ptr_SFB = a_scales_ptr;
+  } else {
+    mainloop_args.ptr_A = a_ptr;
+    mainloop_args.dA = a_stride;
+    mainloop_args.ptr_B = b_ptr;
+    mainloop_args.dB = b_stride;
+    mainloop_args.ptr_SFA = a_scales_ptr;
+    mainloop_args.ptr_SFB = b_scales_ptr;
+  }
+  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {}, c_ptr, c_stride, c_ptr, c_stride};
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename OutType>
+void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
+                                               torch::Tensor const& a,
+                                               torch::Tensor const& b,
+                                               torch::Tensor const& a_scales,
+                                               torch::Tensor const& b_scales) {
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
+
+  constexpr int TILE_K = 128;
+  // TODO: better heuristics
+  bool swap_ab = (m < 16) || (m % 4 != 0);
+  bool use_tma_epilogue = (m * n) % 4 == 0;
+  if (!swap_ab) {
+    constexpr int TILE_N = 128;
+    int tile_m = 256;
+    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
+      tile_m = 64;
+    }
+    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
+      tile_m = 128;
+    }
+    if (tile_m == 64) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else if (tile_m == 128) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else { // tile_m == 256
+      if (use_tma_epilogue) {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    }
+  } else {
+    // TODO: Test more tile N configs
+    constexpr int TILE_M = 128;
+    constexpr int TILE_N = 16;
+    // TMA epilogue isn't compatible with Swap A/B
+    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
+        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu
@@ -0,0 +1,23 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_blockwise_sm120_fp8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
+                                           torch::Tensor const& a,
+                                           torch::Tensor const& b,
+                                           torch::Tensor const& a_scales,
+                                           torch::Tensor const& b_scales) {
+  if (out.dtype() == torch::kBFloat16) {
+    cutlass_gemm_blockwise_sm120_fp8_dispatch<cutlass::bfloat16_t>(
+        out, a, b, a_scales, b_scales);
+
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    cutlass_gemm_blockwise_sm120_fp8_dispatch<cutlass::half_t>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
@@ -0,0 +1,184 @@
+#pragma once
+
+#include "cuda_utils.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
+#include "cutlass_gemm_caller.cuh"
+
+namespace vllm {
+
+using namespace cute;
+
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler>
+struct cutlass_3x_gemm_fp8_blockwise {
+  using ElementAB = cutlass::float_e4m3_t;
+
+  using ElementA = ElementAB;
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+
+  using ElementB = ElementAB;
+  // ColumnMajor is used for B to match the CUTLASS convention.
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+
+  using ElementD = OutType;
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using ElementC = void; // TODO: support bias
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+  static constexpr int AlignmentC = AlignmentD;
+
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+  using ElementBlockScale = float; 
+
+  using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::MN, cute::UMMA::Major::K>;
+
+  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
+  using ArchTag = cutlass::arch::Sm120;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using ElementScalar = float;
+  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementD,
+      LayoutD,
+      AlignmentD,
+      EpilogueScheduler,
+      DefaultOperation
+  >::CollectiveOp;
+ 
+  using StageCountType = cutlass::gemm::collective::StageCountAuto; 
+  using CollectiveMainloop = 
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementA,
+          cute::tuple<LayoutA, LayoutSFA>,
+          AlignmentA,
+          ElementB,
+          cute::tuple<LayoutB, LayoutSFB>,
+          AlignmentB,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          MainloopScheduler
+      >::CollectiveOp;
+
+  using KernelType = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename Gemm>
+void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+  using GemmKernel = typename Gemm::GemmKernel;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutSFA = typename Gemm::LayoutSFA;
+  using LayoutSFB = typename Gemm::LayoutSFB;
+  using ScaleConfig = typename Gemm::ScaleConfig;
+
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = typename Gemm::ElementBlockScale;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+
+  StrideA a_stride;
+  StrideB b_stride;
+  StrideC c_stride;
+  a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+
+  LayoutSFA layout_SFA = 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = 
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args{};
+  mainloop_args.ptr_A = a_ptr;
+  mainloop_args.dA = a_stride;
+  mainloop_args.ptr_B = b_ptr;
+  mainloop_args.dB = b_stride;
+  mainloop_args.ptr_SFA = a_scales_ptr;
+  mainloop_args.layout_SFA = layout_SFA;
+  mainloop_args.ptr_SFB = b_scales_ptr;
+  mainloop_args.layout_SFB = layout_SFB;
+  auto prob_shape = cute::make_shape(m, n, k, 1);
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {}, c_ptr, c_stride, c_ptr, c_stride};
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename OutType>
+void cutlass_gemm_blockwise_sm120_fp8_dispatch(torch::Tensor& out,
+                                               torch::Tensor const& a,
+                                               torch::Tensor const& b,
+                                               torch::Tensor const& a_scales,
+                                               torch::Tensor const& b_scales) {
+  // TODO: better heuristics
+  cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+      OutType, 1, 128, 128, Shape<_128, _128, _128>,
+      Shape<_1, _1, _1>, cutlass::epilogue::collective::EpilogueScheduleAuto,
+      cutlass::gemm::collective::KernelScheduleAuto>>(
+      out, a, b, a_scales, b_scales);
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu
@@ -0,0 +1,24 @@
+
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales) {
+  if (out.dtype() == torch::kBFloat16) {
+    cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::bfloat16_t>(
+        out, a, b, a_scales, b_scales);
+
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::half_t>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@@ -0,0 +1,177 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
+#include "cutlass_gemm_caller.cuh"
+
+namespace vllm {
+
+using namespace cute;
+
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler>
+struct cutlass_3x_gemm_fp8_blockwise {
+  using ElementAB = cutlass::float_e4m3_t;
+
+  using ElementA = ElementAB;
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+
+  using ElementB = ElementAB;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+
+  using ElementD = OutType;
+  using LayoutD = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using ElementC = void; // TODO: support bias
+  using LayoutC = LayoutD;
+  static constexpr int AlignmentC = AlignmentD;
+
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+  using ElementBlockScale = float;
+
+  using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::GMMA::Major::MN, cute::GMMA::Major::K>;
+
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
+  using ArchTag = cutlass::arch::Sm90;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using ElementScalar = float;
+  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementD,
+      LayoutD,
+      AlignmentD,
+      EpilogueScheduler,
+      DefaultOperation
+  >::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduler
+  >::CollectiveOp;
+
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename Gemm>
+void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+  using GemmKernel = typename Gemm::GemmKernel;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutSFA = typename Gemm::LayoutSFA;
+  using LayoutSFB = typename Gemm::LayoutSFB;
+  using ScaleConfig = typename Gemm::ScaleConfig;
+
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = typename Gemm::ElementBlockScale;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+
+  TORCH_CHECK(m % 4 == 0, "m must be divisible by 4");
+
+  StrideA a_stride;
+  StrideB b_stride;
+  StrideC c_stride;
+  a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+
+  LayoutSFA layout_SFA = 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = 
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args{};
+  mainloop_args.ptr_A = a_ptr;
+  mainloop_args.dA = a_stride;
+  mainloop_args.ptr_B = b_ptr;
+  mainloop_args.dB = b_stride;
+  mainloop_args.ptr_SFA = a_scales_ptr;
+  mainloop_args.layout_SFA = layout_SFA;
+  mainloop_args.ptr_SFB = b_scales_ptr;
+  mainloop_args.layout_SFB = layout_SFB;
+  auto prob_shape = cute::make_shape(m, n, k, 1);
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {}, c_ptr, c_stride, c_ptr, c_stride};
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename OutType>
+void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out,
+                                              torch::Tensor const& a,
+                                              torch::Tensor const& b,
+                                              torch::Tensor const& a_scales,
+                                              torch::Tensor const& b_scales) {
+  // TODO: better heuristics
+  cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+      OutType, 1, 128, 128, Shape<_128, _128, _128>,
+      Shape<_1, _2, _1>, cutlass::epilogue::TmaWarpSpecializedCooperative,
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum>>(
+      out, a, b, a_scales, b_scales);
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
@@ -0,0 +1,52 @@
+#include <torch/all.h>
+#include "cuda_utils.h"
+#include "cutlass_extensions/common.hpp"
+
+template <typename Fp8Func, typename Int8Func, typename BlockwiseFunc>
+void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
+                        torch::Tensor const& b, torch::Tensor const& a_scales,
+                        torch::Tensor const& b_scales,
+                        std::optional<torch::Tensor> const& bias,
+                        Fp8Func fp8_func, Int8Func int8_func,
+                        BlockwiseFunc blockwise_func) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+
+  int M = a.size(0), N = b.size(1), K = a.size(1);
+
+  if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
+      (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) {
+    // Standard per-tensor/per-token/per-channel scaling
+    TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+    if (a.dtype() == torch::kFloat8_e4m3fn) {
+      fp8_func(c, a, b, a_scales, b_scales, bias);
+    } else {
+      TORCH_CHECK(a.dtype() == torch::kInt8);
+      if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
+        int8_func(c, a, b, a_scales, b_scales, bias);
+      } else {
+        int32_t version_num = get_sm_version_num();
+        TORCH_CHECK(
+            false, "Int8 not supported on SM", version_num,
+            ". Use FP8 quantization instead, or run on older arch (SM < 100).");
+      }
+    }
+  } else {
+    TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor.");
+    TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor.");
+    int32_t version_num = get_sm_version_num();
+    if (version_num >= 90) {
+      TORCH_CHECK(
+          a.size(0) == a_scales.size(0) &&
+              cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1),
+          "a_scale_group_shape must be [1, 128].");
+      TORCH_CHECK(
+          cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) &&
+              cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1),
+          "b_scale_group_shape must be [128, 128].");
+    }
+
+    TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
+    blockwise_func(c, a, b, a_scales, b_scales);
+  }
+}
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <torch/all.h>
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                torch::Tensor const& a_scales,
+                                torch::Tensor const& b_scales,
+                                std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     torch::Tensor const& a_scales,
+                                     torch::Tensor const& b_scales,
+                                     torch::Tensor const& azp_adj,
+                                     std::optional<torch::Tensor> const& azp,
+                                     std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales);
+
+void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_sm120_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
+                                           torch::Tensor const& a,
+                                           torch::Tensor const& b,
+                                           torch::Tensor const& a_scales,
+                                           torch::Tensor const& b_scales);
+
+void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
+                                           torch::Tensor const& a,
+                                           torch::Tensor const& b,
+                                           torch::Tensor const& a_scales,
+                                           torch::Tensor const& b_scales);
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
@@ -0,0 +1,23 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm100_fp8_dispatch.cuh"
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm100_fp8_epilogue<true>(out, a, b, a_scales,
+                                                      b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm100_fp8_epilogue<false>(out, a, b, a_scales,
+                                                       b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -0,0 +1,318 @@
+#pragma once
+
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+/**
+ * This file defines Gemm kernel configurations for SM100 (fp8) based on the
+ * Gemm shape.
+ */
+
+namespace vllm {
+
+using c3x::cutlass_gemm_caller;
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_gemm_sm100_fp8 {
+  using ElementAB = ElementAB_;
+  using ElementC = ElementD_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Compile-time swap_ab flag
+  static constexpr bool swap_ab = swap_ab_;
+
+  // -----------------------------------------------------------
+  // Layout definitions
+  // -----------------------------------------------------------
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+
+  // -----------------------------------------------------------
+  // Collective epilogue (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // -----------------------------------------------------------
+  // Collective mainloop (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutB_T, AlignmentAB,             // Swapped B (as A)
+          ElementAB, LayoutA_T, AlignmentAB,  // Swapped A (as B)
+          ElementAcc, TileShape, ClusterShape, Stages,
+          KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
+
+  // -----------------------------------------------------------
+  // Kernel definition
+  // -----------------------------------------------------------
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_default {
+  // M in (256, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_256, _128, _128>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M256 {
+  // M in (64, 256]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M64_swap_ab {
+  // This config is for M in (16, 64] and K >= 4096
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _64, _256>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M64 {
+  // This config is for M = 64 and K < 4096 (do not enable swap AB in such case)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  using Cutlass3xGemm =
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M16_swap_ab {
+  // M in [1, 16]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _32, _128>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
+};
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   EpilogueArgs&&... epilogue_params) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape =
+      swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(
+      StrideC{},
+      swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args =
+      swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
+                                                       a_stride}
+              : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename InType, typename OutType, bool EnableBias,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& b,
+                                            torch::Tensor const& a_scales,
+                                            torch::Tensor const& b_scales,
+                                            EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  using Cutlass3xGemmDefault =
+      typename sm100_fp8_config_default<InType, OutType,
+                                        EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16SwapAB =
+      typename sm100_fp8_config_M16_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64SwapAB =
+      typename sm100_fp8_config_M64_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm100_fp8_config_M64<InType, OutType, EnableBias>::Cutlass3xGemm;
+
+  using Cutlass3xGemmM256 =
+      typename sm100_fp8_config_M256<InType, OutType,
+                                     EnableBias>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const k = a.size(1);
+
+  if (m <= 16) {
+    // m in [1, 16]
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM16SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 64) {
+    // m in (16, 64]
+    if (m == 64 && k < 4096) {
+      // do not enable swap AB
+      return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64>(
+          out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+
+  } else if (m <= 256) {
+    // m in (64, 256]
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM256>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (256, inf)
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+template <bool EnableBias, typename... EpilogueArgs>
+void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales,
+                                          EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
+                                           cutlass::bfloat16_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
+                                           cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
@@ -0,0 +1,24 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm120_fp8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm120_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm120_fp8_epilogue<c3x::ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm120_fp8_epilogue<c3x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM120 (fp8) based on the
+ * Gemm shape.
+ */
+
+namespace vllm {
+
+using c3x::cutlass_gemm_caller;
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm120_fp8_config_default {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_1, _1, _1>;  // Only work with Shape<_1, _1, _1>
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm120<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm120_fp8_dispatch(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& b,
+                                            EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  using Cutlass3xGemmDefault =
+      typename sm120_fp8_config_default<InType, OutType,
+                                        Epilogue>::Cutlass3xGemm;
+  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+      out, a, b, std::forward<EpilogueArgs>(args)...);
+}
+
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm120_fp8_epilogue(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm120_fp8_dispatch<cutlass::float_e4m3_t,
+                                           cutlass::bfloat16_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm120_fp8_dispatch<cutlass::float_e4m3_t,
+                                           cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
@@ -0,0 +1,23 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm90_fp8_dispatch.cuh"
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                torch::Tensor const& a_scales,
+                                torch::Tensor const& b_scales,
+                                std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm90_fp8_epilogue<true>(out, a, b, a_scales,
+                                                     b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm90_fp8_epilogue<false>(out, a, b, a_scales,
+                                                      b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
@@ -0,0 +1,373 @@
+#pragma once
+
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+/**
+ * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
+ * shape.
+ */
+
+namespace vllm {
+
+using c3x::cutlass_gemm_caller;
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_gemm_sm90_fp8 {
+  using ElementAB = ElementAB_;
+  using ElementC = ElementD_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Compile-time swap_ab flag
+  static constexpr bool swap_ab = swap_ab_;
+
+  // -----------------------------------------------------------
+  // Layout definitions
+  // -----------------------------------------------------------
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+
+  // -----------------------------------------------------------
+  // Collective epilogue (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // -----------------------------------------------------------
+  // Collective mainloop (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutB_T, AlignmentAB,             // Swapped B (as A)
+          ElementAB, LayoutA_T, AlignmentAB,  // Swapped A (as B)
+          ElementAcc, TileShape, ClusterShape, Stages,
+          KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
+
+  // -----------------------------------------------------------
+  // Kernel definition
+  // -----------------------------------------------------------
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_default {
+  // M in (128, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M8192_K6144 {
+  // M >= 8192, K >= 6144
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
+  using EpilogueSchedule =
+      typename cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using TileShape = Shape<_256, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M64_N1280 {
+  // M in (16, 64], N in [1 1280]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _4, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M64_N8192 {
+  // M in (16, 64], N > 1280
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M16_N1280 {
+  // M in [1, 16], N in [1, 1280]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _2, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M16_N8192 {
+  // M in [1, 16], N > 1280
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                  torch::Tensor const& b,
+                                  EpilogueArgs&&... epilogue_params) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape =
+      swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(
+      StrideC{},
+      swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args =
+      swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
+                                                       a_stride}
+              : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename InType, typename OutType, bool EnableBias,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
+                                           torch::Tensor const& a,
+                                           torch::Tensor const& b,
+                                           torch::Tensor const& a_scales,
+                                           torch::Tensor const& b_scales,
+                                           EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_fp8_config_default<InType, OutType,
+                                       EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM8192_K6144 =
+      typename sm90_fp8_config_M8192_K6144<InType, OutType,
+                                           EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
+
+  using Cutlass3xGemmM64_N1280 =
+      typename sm90_fp8_config_M64_N1280<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64_N8192 =
+      typename sm90_fp8_config_M64_N8192<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16_N1280 =
+      typename sm90_fp8_config_M16_N1280<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16_N8192 =
+      typename sm90_fp8_config_M16_N8192<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const n = b.size(1);
+  uint32_t const k = a.size(1);
+
+  if (m <= 16) {
+    // m in [1, 16]
+    if (n <= 1280) {
+      return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N1280>(
+          out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N8192>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 64) {
+    // m in (16, 64]
+    if (n <= 1280) {
+      return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N1280>(
+          out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N8192>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m >= 8192 && k >= 6144) {
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM8192_K6144>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+template <bool EnableBias, typename... EpilogueArgs>
+void cutlass_scaled_mm_sm90_fp8_epilogue(torch::Tensor& out,
+                                         torch::Tensor const& a,
+                                         torch::Tensor const& b,
+                                         torch::Tensor const& a_scales,
+                                         torch::Tensor const& b_scales,
+                                         EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                          cutlass::bfloat16_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                          cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu
@@ -0,0 +1,24 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm90_int8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm90_int8_epilogue<c3x::ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm90_int8_epilogue<c3x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8_dispatch.cuh
@@ -0,0 +1,163 @@
+#pragma once
+
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM90 (int8) based on the
+ * Gemm shape.
+ */
+
+namespace vllm {
+
+using c3x::cutlass_gemm_caller;
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_default {
+  // For M > 128 and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule =
+      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M128 {
+  // For M in (64, 128] and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule =
+      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M64 {
+  // For M in (32, 64] and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NBig {
+  // For M in [1, 32] and N >= 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _256>;
+  using ClusterShape = Shape<_1, _4, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NSmall {
+  // For M in [1, 32] and N < 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& b,
+                                            EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_int8_config_default<InType, OutType,
+                                        Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NBig =
+      typename sm90_int8_config_M32_NBig<InType, OutType,
+                                         Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NSmall =
+      typename sm90_int8_config_M32_NSmall<InType, OutType,
+                                           Epilogue>::Cutlass3xGemm;
+
+  uint32_t const n = out.size(1);
+  bool const is_small_n = n < 8192;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 32) {
+    // m in [1, 32]
+    if (is_small_n) {
+      return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    } else {
+      return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    }
+  } else if (mp2 <= 64) {
+    // m in (32, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm90_int8_epilogue(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
+                                           Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
@@ -0,0 +1,373 @@
+#include "core/registration.h"
+
+#include <torch/all.h>
+#include <cutlass/arch/arch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include <cassert>
+
+using namespace cute;
+
+template <typename ElementAB, typename ElementC, typename ElementAccumulator,
+          typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
+__global__ void get_ggemm_starts(
+    int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
+    ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
+    ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
+    ElementAB* b_base_as_int, ElementC* out_base_as_int,
+    ElementAccumulator* a_scale_base_as_int,
+    ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
+    LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
+  int expert_id = threadIdx.x;
+
+  if (expert_id >= gridDim.x * blockDim.x) {
+    return;
+  }
+
+  int m = problem_sizes[expert_id * 3];
+  int n = problem_sizes[expert_id * 3 + 1];
+  int k = problem_sizes[expert_id * 3 + 2];
+
+  int32_t expert_offset = expert_offsets[expert_id];
+  int a_stride = expert_offset * k;
+  int b_stride = expert_id * k * n;
+  int a_scale_stride = expert_offset * k / 128;
+  int b_scale_stride = expert_id * k * n / 128 / 128;
+
+  a_offsets[expert_id] = a_base_as_int + a_stride;
+  b_offsets[expert_id] = b_base_as_int + b_stride;
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
+  b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
+
+  LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
+  LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
+
+  *layout_sfa_ptr =
+      ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
+  *layout_sfb_ptr =
+      ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
+}
+
+#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
+                                 ScaleConfig)                                 \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                            \
+    get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA,         \
+                     LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>(  \
+        static_cast<int32_t*>(expert_offsets.data_ptr()),                     \
+        static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),              \
+        static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()),              \
+        static_cast<C_TYPE**>(out_ptrs.data_ptr()),                           \
+        static_cast<float**>(a_scales_ptrs.data_ptr()),                       \
+        static_cast<float**>(b_scales_ptrs.data_ptr()),                       \
+        static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),            \
+        static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()),            \
+        static_cast<C_TYPE*>(out_tensors.data_ptr()),                         \
+        static_cast<float*>(a_scales.data_ptr()),                             \
+        static_cast<float*>(b_scales.data_ptr()),                             \
+        reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),                  \
+        reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()),                  \
+        static_cast<int*>(problem_sizes.data_ptr()));                         \
+  }
+
+template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
+void run_get_ggemm_starts(
+    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
+    torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
+    torch::Tensor out_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
+    torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
+  TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
+
+  int num_experts = (int)expert_offsets.size(0);
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  if (false) {
+  }
+  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
+                           LayoutSFB, ScaleConfig)
+  __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
+                           LayoutSFB, ScaleConfig)
+  else {
+    TORCH_CHECK(false, "Unsupported output tensor type");
+  }
+}
+
+template <typename OutType, typename ScheduleConfig, typename LayoutD>
+void run_blockwise_scaled_group_mm(
+    torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
+    const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
+    const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b, const torch::Tensor& stride_c,
+    const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
+  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
+
+  // Types
+  using ElementA = cutlass::float_e4m3_t;
+  using ElementB = cutlass::float_e4m3_t;
+  using ElementC = OutType;
+  using ElementD = ElementC;
+  using ElementAccumulator = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = LayoutD;
+
+  // Alignments
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
+          typename ScheduleConfig::ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
+          AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA,
+          cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
+          AlignmentA, ElementB,
+          cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
+          AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
+          typename ScheduleConfig::ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          typename ScheduleConfig::KernelSchedule>::CollectiveOp;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
+                                           CollectiveEpilogue, void>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
+  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
+  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
+  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
+
+  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
+  int num_experts = (int)expert_offsets.size(0);
+
+  Gemm gemm_op;
+
+  // Mainloop Arguments
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementA**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(stride_a.data_ptr()),
+      static_cast<const ElementB**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(stride_b.data_ptr()),
+      static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
+      reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
+          layout_sfa.data_ptr()),
+      static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
+      reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
+          layout_sfb.data_ptr())};
+
+  int device_id = a_ptrs.device().index();
+  static const cutlass::KernelHardwareInfo hw_info{
+      device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+                     device_id)};
+
+  // Epilogue Arguments
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {},  // epilogue.thread
+      nullptr,
+      static_cast<StrideC*>(stride_c.data_ptr()),
+      static_cast<ElementD**>(out_ptrs.data_ptr()),
+      static_cast<StrideC*>(stride_c.data_ptr())};
+
+  UnderlyingProblemShape* problem_sizes_as_shapes =
+      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
+
+  // Gemm Arguments
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, problem_sizes_as_shapes, nullptr},
+      mainloop_args,
+      epilogue_args,
+      hw_info};
+
+  at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
+  const cudaStream_t stream =
+      at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
+
+  auto can_implement_status = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
+              "Failed to implement GEMM");
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
+
+  status = gemm_op.run(stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
+}
+
+template <typename OutType>
+void blockwise_scaled_group_mm_dispatch_shape(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
+    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
+  struct MmaConfig {
+    using ElementA = cutlass::float_e4m3_t;
+    using KernelSchedule =
+        cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
+    using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
+        1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+    using LayoutC = cutlass::layout::RowMajor;
+    using MmaTileShape = Shape<_128, _128, _128>;
+    using ClusterShape = Shape<_1, _1, _1>;
+  };
+
+  int num_experts = (int)expert_offsets.size(0);
+
+  auto a_ptrs = torch::empty(
+      {num_experts},
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+  auto b_ptrs = torch::empty(
+      {num_experts},
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+  auto out_ptrs = torch::empty(
+      {num_experts},
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+  auto a_scales_ptrs = torch::empty(
+      {num_experts},
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+  auto b_scales_ptrs = torch::empty(
+      {num_experts},
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+
+  auto layout_sfa = torch::empty(
+      {num_experts, 5},
+      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
+  auto layout_sfb = torch::empty(
+      {num_experts, 5},
+      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
+
+  auto stride_a = torch::full(
+      {num_experts}, a.size(1),
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+  auto stride_b = torch::full(
+      {num_experts}, a.size(1),
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+  auto stride_c = torch::full(
+      {num_experts}, output.size(1),
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
+
+  torch::TensorOptions options_int =
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
+
+  run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
+                       typename MmaConfig::LayoutSFB,
+                       typename MmaConfig::ScaleConfig>(
+      expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
+      b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
+
+  run_blockwise_scaled_group_mm<OutType, MmaConfig,
+                                typename MmaConfig::LayoutC>(
+      out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
+      stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
+      expert_offsets);
+}
+
+void cutlass_blockwise_scaled_grouped_mm(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
+    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
+  TORCH_CHECK(problem_sizes.size(1) == 3,
+              "problem_sizes must have shape (num_experts, 3)");
+  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
+              "Number of experts in problem_sizes must match expert_offsets");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
+              "problem_sizes must be int32");
+  TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
+              "a must be kFloat8_e4m3fn");
+  TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
+              "b must be kFloat8_e4m3fn");
+  TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
+                  output.scalar_type() == torch::kHalf,
+              "output must be bfloat16 or half");
+  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
+              "scales_a must be float32");
+  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
+              "scales_b must be float32");
+  TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
+              "expert_offsets must be int32");
+
+  TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
+  TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
+  TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
+  TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
+  TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
+  TORCH_CHECK(problem_sizes.size(1) == 3,
+              "problem_sizes must have shape (num_experts, 3)");
+  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
+              "Number of experts in problem_sizes must match expert_offsets");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
+              "problem_sizes must be int32");
+  TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
+
+#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
+  if (output.scalar_type() == torch::kBFloat16) {
+    blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
+        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
+  } else if (output.scalar_type() == torch::kFloat16) {
+    blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
+        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
+  } else {
+    TORCH_CHECK(false, "Unsupported output tensor type");
+  }
+#endif
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_blockwise_scaled_grouped_mm",
+         &cutlass_blockwise_scaled_grouped_mm);
+}
--- a/csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
+++ b/csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <cuda.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include "core/scalar_type.hpp"
+#include "cutlass/bfloat16.h"
+#include "cutlass/float8.h"
+
+template <typename ElementAB, typename ElementC, typename ElementAccumulator>
+__global__ void get_group_gemm_starts(
+    int64_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
+    ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
+    ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int,
+    ElementAB* b_base_as_int, ElementC* out_base_as_int,
+    ElementAccumulator* a_scales_base_as_int,
+    ElementAccumulator* b_scales_base_as_int, int64_t n, int64_t k,
+    bool per_act_token, bool per_out_ch) {
+  int expert_id = threadIdx.x;
+
+  int64_t expert_offset = expert_offsets[expert_id];
+
+  a_offsets[expert_id] = a_base_as_int + expert_offset * k;
+  b_offsets[expert_id] = b_base_as_int + expert_id * k * n;
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  a_scales_offsets[expert_id] =
+      a_scales_base_as_int + (per_act_token ? expert_offset : 0);
+  b_scales_offsets[expert_id] =
+      b_scales_base_as_int + (per_out_ch ? n * expert_id : expert_id);
+}
+
+#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                    \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                         \
+    get_group_gemm_starts<cutlass::float_e4m3_t, C_TYPE, float>            \
+        <<<1, num_experts, 0, stream>>>(                                   \
+            static_cast<int64_t*>(expert_offsets.data_ptr()),              \
+            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),       \
+            static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()),       \
+            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                    \
+            static_cast<float**>(a_scales_ptrs.data_ptr()),                \
+            static_cast<float**>(b_scales_ptrs.data_ptr()),                \
+            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),     \
+            static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()),     \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                  \
+            static_cast<float*>(a_scales.data_ptr()),                      \
+            static_cast<float*>(b_scales.data_ptr()), out_tensors.size(1), \
+            a_tensors.size(1), per_act_token, per_out_ch);                 \
+  }
+
+namespace {
+
+void run_get_group_gemm_starts(
+    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
+    torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
+    torch::Tensor& out_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales) {
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  // expect int64_t to avoid overflow during offset calculations
+  TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
+
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  bool per_act_token = a_scales.numel() != 1;
+  bool per_out_ch = b_scales.numel() != num_experts;
+
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  if (false) {
+  }
+  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
+  __CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+
+}  // namespace
--- a/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
@@ -0,0 +1,181 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+#include "cutlass_extensions/common.hpp"
+#include "get_group_starts.cuh"
+
+using namespace cute;
+
+namespace {
+
+using ProblemShape =
+    cutlass::gemm::GroupProblemShape<cute::Shape<int, int, int>>;
+
+using ElementAccumulator = float;
+using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutA_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+using LayoutB = cutlass::layout::ColumnMajor;
+using LayoutB_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+using LayoutD = cutlass::layout::RowMajor;
+using LayoutD_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+using LayoutC = LayoutD;
+using LayoutC_Transpose = LayoutD_Transpose;
+
+template <typename ElementAB_, typename ElementC_, typename ArchTag_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_group_gemm {
+  static constexpr bool swap_ab = swap_ab_;
+  using ElementAB = ElementAB_;
+  using ElementC = void;
+  using ElementD = ElementC_;
+  using ElementAccumulator = float;
+  using ArchTag = ArchTag_;
+
+  using Epilogue = Epilogue_<ElementAccumulator, ElementD, TileShape>;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, TileShape, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementAccumulator, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose*, LayoutC*>, AlignmentC,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose*, LayoutD*>,
+          AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementAB, LayoutB_Transpose*, AlignmentAB,
+          ElementAB, LayoutA_Transpose*, AlignmentAB, ElementAccumulator,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementAB, LayoutA*, AlignmentAB, ElementAB,
+          LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape,
+          Stages, KernelSchedule>::CollectiveOp>;
+
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      ProblemShape, CollectiveMainloop, CollectiveEpilogue>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename Gemm>
+void cutlass_group_gemm_caller(
+    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt64).device(a_tensors.device());
+
+  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+
+  run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
+                            a_scales_ptrs, b_scales_ptrs, a_tensors, b_tensors,
+                            out_tensors, a_scales, b_scales);
+
+  using GemmKernel = typename Gemm::GemmKernel;
+  using StrideA = Stride<int64_t, Int<1>, Int<0>>;
+  using StrideB = Stride<int64_t, Int<1>, Int<0>>;
+  using StrideC = typename GemmKernel::InternalStrideC;
+
+  ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
+      static_cast<ProblemShape::UnderlyingProblemShape*>(
+          problem_sizes.data_ptr());
+  ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
+
+  typename GemmKernel::MainloopArguments mainloop_args;
+  if constexpr (swap_ab) {
+    mainloop_args = typename GemmKernel::MainloopArguments{
+        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
+        static_cast<StrideB*>(b_strides.data_ptr()),
+        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
+        static_cast<StrideA*>(a_strides.data_ptr())};
+  } else {
+    mainloop_args = typename GemmKernel::MainloopArguments{
+        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
+        static_cast<StrideA*>(a_strides.data_ptr()),
+        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
+        static_cast<StrideB*>(b_strides.data_ptr())};
+  }
+
+  // Currently, we are only able to do broadcast on either all or none a_scales
+  // and on either all or none b_scales
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          swap_ab ? static_cast<const ElementAccumulator**>(
+                        b_scales_ptrs.data_ptr())
+                  : static_cast<const ElementAccumulator**>(
+                        a_scales_ptrs.data_ptr()),
+          swap_ab ? static_cast<const ElementAccumulator**>(
+                        a_scales_ptrs.data_ptr())
+                  : static_cast<const ElementAccumulator**>(
+                        b_scales_ptrs.data_ptr()),
+          swap_ab ? per_out_ch : per_act_token,
+          swap_ab ? per_act_token : per_out_ch),
+      nullptr, static_cast<StrideC*>(c_strides.data_ptr()),
+      static_cast<ElementD**>(out_ptrs.data_ptr()),
+      static_cast<StrideC*>(c_strides.data_ptr())};
+
+  int device_id = a_tensors.device().index();
+  static const cutlass::KernelHardwareInfo hw_info{
+      device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+                     device_id)};
+
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args,
+      epilogue_args, hw_info};
+
+  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  GemmOp gemm_op;
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a_tensors.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
+
+}  // namespace
--- a/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
@@ -0,0 +1,140 @@
+#include <cudaTypedefs.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include "cutlass/cutlass.h"
+#include "grouped_mm_c3x.cuh"
+
+using namespace cute;
+
+namespace {
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_default {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
+  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
+  using TileShape = cute::Shape<cute::_128, cute::_256, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
+  using ArchTag = cutlass::arch::Sm100;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M64 {
+  // M in [1,64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
+  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
+  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
+  using ArchTag = cutlass::arch::Sm100;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule,
+                            true>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_N8192 {
+  // N in [8192, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
+  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
+  using TileShape = cute::Shape<cute::_128, cute::_256, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
+  using ArchTag = cutlass::arch::Sm100;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType>
+void run_cutlass_moe_mm_sm100(
+    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
+  TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
+  TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
+  TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
+
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn,
+              "A tensors must be of type float8_e4m3fn.");
+  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn,
+              "B tensors must be of type float8_e4m3fn.");
+
+  using Cutlass3xGemmDefault = typename sm100_fp8_config_default<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmN8192 = typename sm100_fp8_config_N8192<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 = typename sm100_fp8_config_M64<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+
+  uint32_t const m = a_tensors.size(0);
+  uint32_t const n = out_tensors.size(1);
+
+  if (m <= 64) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM64>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else if (n >= 8192) {
+    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else {
+    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  }
+}
+}  // namespace
+
+void dispatch_moe_mm_sm100(
+    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
+  if (out_tensors.dtype() == torch::kBFloat16) {
+    run_cutlass_moe_mm_sm100<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else {
+    run_cutlass_moe_mm_sm100<cutlass::float_e4m3_t, cutlass::half_t>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  }
+}
+
+void cutlass_moe_mm_sm100(
+    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
+  dispatch_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+                        expert_offsets, problem_sizes, a_strides, b_strides,
+                        c_strides, per_act_token, per_out_ch);
+}
--- a/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
@@ -0,0 +1,198 @@
+#include <cudaTypedefs.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include "cutlass/cutlass.h"
+#include "grouped_mm_c3x.cuh"
+
+using namespace cute;
+
+namespace {
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_default {
+  // M in (16, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule =
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+  using TileShape = cute::Shape<cute::_64, cute::_256, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_2, cute::_1>;
+  using ArchTag = cutlass::arch::Sm90;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M4 {
+  // M in [1, 4]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule =
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
+  using ArchTag = cutlass::arch::Sm90;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule,
+                            true>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M64 {
+  // M in (4, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule =
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_256>;
+  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
+  using ArchTag = cutlass::arch::Sm90;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule,
+                            true>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_K8192 {
+  // K in [8192, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule =
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+  using TileShape = cute::Shape<cute::_128, cute::_128, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
+  using ArchTag = cutlass::arch::Sm90;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_N8192 {
+  // N in [8192, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule =
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+  using TileShape = cute::Shape<cute::_64, cute::_128, cute::_256>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
+  using ArchTag = cutlass::arch::Sm90;
+
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
+                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType>
+void run_cutlass_moe_mm_sm90(
+    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
+  TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
+  TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
+  TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
+
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn,
+              "A tensors must be of type float8_e4m3fn.");
+  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn,
+              "B tensors must be of type float8_e4m3fn.");
+
+  using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmM4 = typename sm90_fp8_config_M4<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 = typename sm90_fp8_config_M64<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmDefault = typename sm90_fp8_config_default<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+
+  uint32_t const m = a_tensors.size(0);
+  uint32_t const n = out_tensors.size(1);
+  uint32_t const k = a_tensors.size(1);
+
+  // Use swap_ab for M <= 64 by default to reduce padding
+  if (m <= 4) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM4>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else if (m <= 64) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM64>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else if (n >= 8192) {
+    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else if (k >= 8192) {
+    cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else {
+    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  }
+}
+
+void dispatch_moe_mm_sm90(
+    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
+  if (out_tensors.dtype() == torch::kBFloat16) {
+    run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else {
+    run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  }
+}
+
+}  // namespace
+
+void cutlass_moe_mm_sm90(
+    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
+  dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+                       expert_offsets, problem_sizes, a_strides, b_strides,
+                       c_strides, per_act_token, per_out_ch);
+}
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -0,0 +1,252 @@
+#include <cudaTypedefs.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include <iostream>
+
+constexpr uint64_t THREADS_PER_EXPERT = 512;
+// threshold must match the dispatch logic in run_cutlass_moe_mm_sm90()
+constexpr int SWAP_AB_THRESHOLD = 64;
+
+template <bool SWAP_AB>
+__global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
+                                      int32_t* problem_sizes1,
+                                      int32_t* problem_sizes2,
+                                      int32_t* atomic_buffer,
+                                      const int topk_length, const int n,
+                                      const int k) {
+  int expert_id = blockIdx.x;
+
+  int occurrences = 0;
+  for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
+    occurrences += (topk_ids[i] == expert_id);
+  }
+  atomicAdd(&atomic_buffer[expert_id], occurrences);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    int final_occurrences = atomic_buffer[expert_id];
+    if constexpr (!SWAP_AB) {
+      problem_sizes1[expert_id * 3] = final_occurrences;
+      problem_sizes1[expert_id * 3 + 1] = 2 * n;
+      problem_sizes1[expert_id * 3 + 2] = k;
+      problem_sizes2[expert_id * 3] = final_occurrences;
+      problem_sizes2[expert_id * 3 + 1] = k;
+      problem_sizes2[expert_id * 3 + 2] = n;
+    } else {
+      problem_sizes1[expert_id * 3] = 2 * n;
+      problem_sizes1[expert_id * 3 + 1] = final_occurrences;
+      problem_sizes1[expert_id * 3 + 2] = k;
+      problem_sizes2[expert_id * 3] = k;
+      problem_sizes2[expert_id * 3 + 1] = final_occurrences;
+      problem_sizes2[expert_id * 3 + 2] = n;
+    }
+  }
+}
+
+__global__ void compute_expert_offsets(
+    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
+    int32_t* atomic_buffer, const int num_experts, const bool swap_ab) {
+  int32_t tot_offset = 0;
+  expert_offsets[0] = 0;
+  for (int i = 0; i < num_experts; ++i) {
+    atomic_buffer[i] = tot_offset;
+    tot_offset += swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
+    expert_offsets[i + 1] = tot_offset;
+  }
+}
+
+__global__ void compute_expert_blockscale_offsets(
+    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
+    int32_t* blockscale_offsets, int32_t* atomic_buffer, const int num_experts,
+    const bool swap_ab) {
+  int32_t tot_offset = 0;
+  int32_t tot_offset_round = 0;
+  expert_offsets[0] = 0;
+  blockscale_offsets[0] = 0;
+  for (int i = 0; i < num_experts; ++i) {
+    int32_t cur_offset =
+        swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
+    atomic_buffer[i] = tot_offset;
+    tot_offset += cur_offset;
+    expert_offsets[i + 1] = tot_offset;
+    tot_offset_round += (cur_offset + (128 - 1)) / 128 * 128;
+    blockscale_offsets[i + 1] = tot_offset_round;
+  }
+}
+
+__global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
+                                  const int32_t* __restrict__ expert_offsets,
+                                  int32_t* input_permutation,
+                                  int32_t* output_permutation,
+                                  int32_t* atomic_buffer, const int topk_length,
+                                  const int topk) {
+  int const blk_expert_id = blockIdx.x;
+  int const num_experts = gridDim.x;
+  int32_t const num_tokens = expert_offsets[num_experts];
+
+  for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
+    int const expert_id = topk_ids[i];
+    if (expert_id == -1 && blockIdx.x == 0) {
+      // output_permutation is used to re-order the moe outputs. It is
+      // used as c2 = c2[c_map], where c2 is a torch.tensor that is the
+      // output of the cutlass kernels and c_map is the output_permutation.
+      // c2 is initialized to zeros, therefore by setting the output_permutation
+      // to num_tokens, we are guaranteed to fill the moe outputs to zero
+      // for "invalid" topk_ids.
+      output_permutation[i] = num_tokens;
+    } else if (expert_id == blk_expert_id) {
+      int start = atomicAdd(&atomic_buffer[expert_id], 1);
+      input_permutation[start] = i / topk;
+      output_permutation[i] = start;
+    }
+  }
+}
+
+namespace {
+inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
+                                         torch::Tensor& problem_sizes1,
+                                         torch::Tensor& problem_sizes2,
+                                         torch::Tensor& atomic_buffer,
+                                         int64_t num_experts, int64_t n,
+                                         int64_t k, cudaStream_t stream,
+                                         const bool swap_ab) {
+  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
+
+  const int32_t* topk_ptr = static_cast<const int32_t*>(topk_ids.data_ptr());
+  int32_t* ps1_ptr = static_cast<int32_t*>(problem_sizes1.data_ptr());
+  int32_t* ps2_ptr = static_cast<int32_t*>(problem_sizes2.data_ptr());
+  int32_t* atomic_ptr = static_cast<int32_t*>(atomic_buffer.data_ptr());
+
+  if (swap_ab) {
+    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
+        topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
+        static_cast<int>(topk_ids.numel()), static_cast<int>(n),
+        static_cast<int>(k));
+  } else {
+    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
+        topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
+        static_cast<int>(topk_ids.numel()), static_cast<int>(n),
+        static_cast<int>(k));
+  }
+}
+}  // namespace
+
+void get_cutlass_moe_mm_problem_sizes_caller(
+    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt) {
+  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
+  auto options_int32 =
+      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
+  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
+
+  // Swap-AB should be disabled for FP4 path
+  bool may_swap_ab =
+      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
+                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));
+
+  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
+                               atomic_buffer, num_experts, n, k, stream,
+                               may_swap_ab);
+}
+
+void get_cutlass_moe_mm_data_caller(
+    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
+    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets) {
+  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
+  auto options_int32 =
+      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
+  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
+
+  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
+
+  // Swap-AB should be disabled for FP4 path
+  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
+                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
+
+  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
+                               atomic_buffer, num_experts, n, k, stream,
+                               may_swap_ab);
+
+  if (blockscale_offsets.has_value()) {
+    // fp4 path
+    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
+        may_swap_ab);
+  } else {
+    compute_expert_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
+        may_swap_ab);
+  }
+  compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(expert_offsets.data_ptr()),
+      static_cast<int32_t*>(input_permutation.data_ptr()),
+      static_cast<int32_t*>(output_permutation.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
+      topk_ids.size(1));
+}
+
+template <bool SWAP_AB>
+__global__ void compute_pplx_data(int32_t* expert_offsets,
+                                  int32_t* problem_sizes1,
+                                  int32_t* problem_sizes2,
+                                  const int32_t* __restrict__ expert_num_tokens,
+                                  const int padded_m, const int n,
+                                  const int k) {
+  int expert_idx = threadIdx.x;
+  expert_offsets[expert_idx] = expert_idx * padded_m;
+
+  if constexpr (!SWAP_AB) {
+    problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx];
+    problem_sizes1[expert_idx * 3 + 1] = 2 * n;
+    problem_sizes1[expert_idx * 3 + 2] = k;
+    problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx];
+    problem_sizes2[expert_idx * 3 + 1] = k;
+    problem_sizes2[expert_idx * 3 + 2] = n;
+  } else {
+    problem_sizes1[expert_idx * 3] = 2 * n;
+    problem_sizes1[expert_idx * 3 + 1] = expert_num_tokens[expert_idx];
+    problem_sizes1[expert_idx * 3 + 2] = k;
+    problem_sizes2[expert_idx * 3] = k;
+    problem_sizes2[expert_idx * 3 + 1] = expert_num_tokens[expert_idx];
+    problem_sizes2[expert_idx * 3 + 2] = n;
+  }
+}
+
+void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
+                                         torch::Tensor& problem_sizes1,
+                                         torch::Tensor& problem_sizes2,
+                                         const torch::Tensor& expert_num_tokens,
+                                         const int64_t num_local_experts,
+                                         const int64_t padded_m,
+                                         const int64_t n, const int64_t k) {
+  auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
+
+  if (num_local_experts * padded_m > SWAP_AB_THRESHOLD) {
+    compute_pplx_data<false><<<1, num_local_experts, 0, stream>>>(
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(problem_sizes2.data_ptr()),
+        static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
+        k);
+  } else {
+    compute_pplx_data<true><<<1, num_local_experts, 0, stream>>>(
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(problem_sizes2.data_ptr()),
+        static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
+        k);
+  }
+}
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu
@@ -0,0 +1,199 @@
+#include <stddef.h>
+#include <torch/all.h>
+#include "cutlass/cutlass.h"
+
+#include "scaled_mm_c2x.cuh"
+#include "scaled_mm_c2x_sm75_dispatch.cuh"
+#include "scaled_mm_c2x_sm80_dispatch.cuh"
+#include "scaled_mm_c2x_sm89_fp8_dispatch.cuh"
+#include "scaled_mm_c2x_sm89_int8_dispatch.cuh"
+
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp"
+
+using namespace vllm;
+
+/*
+   This file defines quantized GEMM operations using the CUTLASS 2.x API, for
+   NVIDIA GPUs with SM versions prior to sm90 (Hopper).
+*/
+
+template <template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm75_dispatch<int8_t, cutlass::bfloat16_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm75_dispatch<int8_t, cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
+void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales,
+                            std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm75_epilogue<c2x::ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm75_epilogue<c2x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                torch::Tensor const& a_scales,
+                                torch::Tensor const& b_scales,
+                                torch::Tensor const& azp_adj,
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+
+  if (azp) {
+    return cutlass_scaled_mm_sm75_epilogue<c2x::ScaledEpilogueBiasAzpToken>(
+        out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
+  } else {
+    return cutlass_scaled_mm_sm75_epilogue<c2x::ScaledEpilogueBiasAzp>(
+        out, a, b, a_scales, b_scales, azp_adj, bias);
+  }
+}
+
+template <template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
+void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales,
+                            std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm80_epilogue<c2x::ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm80_epilogue<c2x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                torch::Tensor const& a_scales,
+                                torch::Tensor const& b_scales,
+                                torch::Tensor const& azp_adj,
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+
+  if (azp) {
+    return cutlass_scaled_mm_sm80_epilogue<c2x::ScaledEpilogueBiasAzpToken>(
+        out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
+  } else {
+    return cutlass_scaled_mm_sm80_epilogue<c2x::ScaledEpilogueBiasAzp>(
+        out, a, b, a_scales, b_scales, azp_adj, bias);
+  }
+}
+
+template <template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
+  if (a.dtype() == torch::kInt8) {
+    TORCH_CHECK(b.dtype() == torch::kInt8);
+
+    if (out.dtype() == torch::kBFloat16) {
+      return cutlass_gemm_sm89_int8_dispatch<int8_t, cutlass::bfloat16_t,
+                                             Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+    } else {
+      assert(out.dtype() == torch::kFloat16);
+      return cutlass_gemm_sm89_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+    }
+  } else {
+    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+    TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+    if (out.dtype() == torch::kBFloat16) {
+      return cutlass_gemm_sm89_fp8_dispatch<cutlass::float_e4m3_t,
+                                            cutlass::bfloat16_t, Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+    } else {
+      TORCH_CHECK(out.dtype() == torch::kFloat16);
+      return cutlass_gemm_sm89_fp8_dispatch<cutlass::float_e4m3_t,
+                                            cutlass::half_t, Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+    }
+  }
+}
+
+void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales,
+                            std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm89_epilogue<c2x::ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm89_epilogue<c2x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                torch::Tensor const& a_scales,
+                                torch::Tensor const& b_scales,
+                                torch::Tensor const& azp_adj,
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+
+  if (azp) {
+    return cutlass_scaled_mm_sm89_epilogue<c2x::ScaledEpilogueBiasAzpToken>(
+        out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
+  } else {
+    return cutlass_scaled_mm_sm89_epilogue<c2x::ScaledEpilogueBiasAzp>(
+        out, a, b, a_scales, b_scales, azp_adj, bias);
+  }
+}
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
@@ -0,0 +1,225 @@
+#pragma once
+#include <stddef.h>
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+// clang-format will break include orders
+// clang-format off
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm_coord.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
+#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h"
+
+#include "core/math.hpp"
+#include "cutlass_extensions/common.hpp"
+// clang-format on
+
+using namespace cute;
+
+/*
+   Epilogues defined in,
+   csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
+   must contain a public type named EVTCompute of type Sm80EVT,
+   as well as a static prepare_args function that constructs an
+   EVTCompute::Arguments struct.
+*/
+
+namespace vllm {
+
+// Wrappers for the GEMM kernel that is used to guard against compilation on
+// architectures that will never use the kernel. The purpose of this is to
+// reduce the size of the compiled binary.
+// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
+// into code that will be executed on the device where it is defined.
+template <typename Kernel>
+struct enable_sm75_to_sm80 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm80_to_sm89 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm89_to_sm90 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+template <typename Arch, template <typename> typename ArchGuard,
+          typename ElementAB_, typename ElementD_,
+          template <typename, typename> typename Epilogue_, typename TileShape,
+          typename WarpShape, typename InstructionShape, int32_t MainLoopStages,
+          typename FP8MathOperator = cutlass::arch::OpMultiplyAdd>
+struct cutlass_2x_gemm {
+  using ElementAB = ElementAB_;
+  using ElementD = ElementD_;
+
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Operator =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>,
+                                cutlass::arch::OpMultiplyAddSaturate,
+                                FP8MathOperator>::type;
+
+  using OutputTileThreadMap =
+      cutlass::epilogue::threadblock::OutputTileThreadLayout<
+          TileShape, WarpShape, float, 4, 1 /* epilogue stages */
+          >;
+
+  using Epilogue = Epilogue_<ElementD, OutputTileThreadMap>;
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  using D = cutlass::epilogue::threadblock::VisitorAuxStore<
+      OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest,
+      Stride<int64_t, Int<1>, Int<0>>>;
+
+  using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
+
+  // These are the minimum alignments needed for the kernels to compile
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD = 4;
+
+  // clang-format off
+  using RowMajor = typename cutlass::layout::RowMajor;
+  using ColumnMajor = typename cutlass::layout::ColumnMajor;
+  using KernelType =
+    ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
+      ElementAB, RowMajor, cutlass::ComplexTransform::kNone, AlignmentAB,
+      ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, AlignmentAB,
+      float, cutlass::layout::RowMajor, AlignmentCD,
+      ElementAcc, float, cutlass::arch::OpClassTensorOp,
+      Arch,
+      TileShape, WarpShape, InstructionShape,
+      EVTD,
+      cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
+      MainLoopStages, Operator,
+      1 /* epilogue stages */
+      >::GemmKernel>;
+  // clang-format on
+
+  using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
+};
+
+template <typename Gemm, typename... EpilogueArgs>
+inline void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                EpilogueArgs&&... epilogue_params) {
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+
+  int32_t m = a.size(0);
+  int32_t n = b.size(1);
+  int32_t k = a.size(1);
+  cutlass::gemm::GemmCoord problem_size{m, n, k};
+
+  int64_t lda = a.stride(0);
+  int64_t ldb = b.stride(1);
+  int64_t ldc = out.stride(0);
+
+  using StrideC = Stride<int64_t, Int<1>, Int<0>>;
+  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename Gemm::D::Arguments d_args{c_ptr, c_stride};
+
+  using Epilogue = typename Gemm::Epilogue;
+  auto evt_args =
+      Epilogue::prepare_args(std::forward<EpilogueArgs>(epilogue_params)...);
+
+  typename Gemm::EVTD::Arguments epilogue_args{
+      evt_args,
+      d_args,
+  };
+
+  typename Gemm::Op::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel,  // universal mode
+      problem_size,                                           // problem size
+      1,                                                      // batch count
+      epilogue_args,
+      a_ptr,
+      b_ptr,
+      nullptr,
+      nullptr,
+      0,
+      0,
+      0,
+      0,
+      lda,
+      ldb,
+      ldc,
+      ldc};
+
+  // Launch the CUTLASS GEMM kernel.
+  typename Gemm::Op gemm_op;
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+  cutlass::Status status = gemm_op(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
+
+template <typename Gemm, typename FallbackGemm, typename... EpilogueArgs>
+inline void fallback_cutlass_gemm_caller(torch::Tensor& out,
+                                         torch::Tensor const& a,
+                                         torch::Tensor const& b,
+                                         EpilogueArgs&&... args) {
+  // In some cases, the GPU isn't able to accommodate the
+  // shared memory requirements of the Gemm. In such cases, use
+  // the FallbackGemm instead.
+  static const int max_shared_mem_per_block_opt_in =
+      get_cuda_max_shared_memory_per_block_opt_in(0);
+
+  size_t const gemm_shared_mem_size =
+      sizeof(typename Gemm::KernelType::SharedStorage);
+  size_t const fallback_gemm_shared_mem_size =
+      sizeof(typename FallbackGemm::KernelType::SharedStorage);
+
+  if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) {
+    return cutlass_gemm_caller<Gemm>(out, a, b,
+                                     std::forward<EpilogueArgs>(args)...);
+  } else {
+    TORCH_CHECK(fallback_gemm_shared_mem_size <=
+                max_shared_mem_per_block_opt_in);
+    return cutlass_gemm_caller<FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm75_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm75_dispatch.cuh
@@ -0,0 +1,123 @@
+#pragma once
+
+#include "scaled_mm_c2x.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM75 based on the Gemm
+ * shape.
+ */
+
+namespace vllm {
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm75_config_default {
+  // This config is used in 2 cases,
+  // - M in (256, inf]
+  // - M in (64, 128]
+  // Shared memory required by this Gemm 32768
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 2>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm75_config_M256 {
+  // M in (128, 256]
+  // Shared memory required by this Gemm 65536
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 2>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm75_config_M64 {
+  // M in (32, 64]
+  // Shared memory required by this Gemm 49152
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 2>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm75_config_M32 {
+  // M in [1, 32]
+  // Shared memory required by this Gemm 49152
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<32, 128, 64>;
+  using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 2>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm75_dispatch(torch::Tensor& out,
+                                       torch::Tensor const& a,
+                                       torch::Tensor const& b,
+                                       EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  using Cutlass2xGemmDefault =
+      typename sm75_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM256 =
+      typename sm75_config_M256<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM128 = Cutlass2xGemmDefault;
+  using Cutlass2xGemmM64 =
+      typename sm75_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM32 =
+      typename sm75_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
+
+  // Due to shared memory requirements, some Gemms may fail to run on some
+  // GPUs. As the name indicates, the Fallback Gemm is used as an alternative
+  // in such cases.
+  // sm75_config_default has the least shared-memory requirements.
+  using FallbackGemm = Cutlass2xGemmDefault;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2
+  if (mp2 <= 32) {
+    // M in [1, 32]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 64) {
+    // M in (32, 64]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // M in (64, 128]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM128, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 256) {
+    // M in (128, 256]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM256, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // M in (256, inf)
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+}  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm80_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm80_dispatch.cuh
@@ -0,0 +1,139 @@
+#pragma once
+
+#include "scaled_mm_c2x.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM80 based on the Gemm
+ * shape.
+ */
+
+namespace vllm {
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_default {
+  // This config is used in 2 cases,
+  //  - M in (128, inf)
+  //  - M in (64, 128] and N >= 8192
+  // Shared Memory required by this Gemm - 81920 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M64 {
+  // This config is used in 2 cases,
+  // - M in (32, 64]
+  // - M in (64, 128] and N < 8192
+  // Shared Memory required by this Gemm - 122880 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M32 {
+  // M in (16, 32]
+  // Shared Memory required by this Gemm - 61440 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M16 {
+  // M in [1, 16]
+  // Shared Memory required by this Gemm - 51200 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm80_dispatch(torch::Tensor& out,
+                                       torch::Tensor const& a,
+                                       torch::Tensor const& b,
+                                       EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  using Cutlass2xGemmDefault =
+      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM128BigN =
+      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM128SmallN =
+      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM64 =
+      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM32 =
+      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM16 =
+      typename sm80_config_M16<InType, OutType, Epilogue>::Cutlass2xGemm;
+
+  // Due to shared memory requirements, some Gemms may fail to run on some
+  // GPUs. As the name indicates, the Fallback Gemm is used as an alternative
+  // in such cases.
+  // sm80_config_M16 has the least shared-memory requirement. However,
+  // based on some profiling, we select sm80_config_M32 as a better alternative
+  // performance wise.
+  using FallbackGemm =
+      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
+  if (mp2 <= 16) {
+    // M in [1, 16]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM16, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 32) {
+    // M in (16, 32]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 64) {
+    // M in (32, 64]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // M in (64, 128]
+    uint32_t const n = out.size(1);
+    bool const small_n = n < 8192;
+    if (small_n) {
+      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128SmallN,
+                                          FallbackGemm>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    } else {
+      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128BigN, FallbackGemm>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    }
+  } else {
+    // M in (128, inf)
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+}  // namespace vllm
--- a/Show More
+++ b/Show More