Support FP4 gemm (1/2) (#3899)

2025-03-24 19:50:23 -07:00
parent 22c3702e1e
commit e9f8e42318
11 changed files with 1245 additions and 5 deletions
--- a/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
@@ -0,0 +1,29 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <torch/all.h>
+
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+void scaled_fp4_quant_sm100a(
+    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf);
+#endif
+
+void scaled_fp4_quant(
+    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization");
+}
--- a/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
@@ -0,0 +1,394 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda_fp8.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <torch/all.h>
+
+#include "utils.h"
+
+// Get type2 from type or vice versa (applied to half and bfloat16)
+template <typename T>
+struct TypeConverter {
+  using Type = half2;
+};  // keep for generality
+
+template <>
+struct TypeConverter<half2> {
+  using Type = half;
+};
+
+template <>
+struct TypeConverter<half> {
+  using Type = half2;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat162> {
+  using Type = __nv_bfloat16;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat16> {
+  using Type = __nv_bfloat162;
+};
+
+#define ELTS_PER_THREAD 8
+
+constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
+constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+
+// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
+  // PTX instructions used here requires sm100a.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0]),
+        "f"(array[1]),
+        "f"(array[2]),
+        "f"(array[3]),
+        "f"(array[4]),
+        "f"(array[5]),
+        "f"(array[6]),
+        "f"(array[7]));
+  return val;
+#else
+  return 0;
+#endif
+}
+
+// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
+  // PTX instructions used here requires sm100a.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0].x),
+        "f"(array[0].y),
+        "f"(array[1].x),
+        "f"(array[1].y),
+        "f"(array[2].x),
+        "f"(array[2].y),
+        "f"(array[3].x),
+        "f"(array[3].y));
+  return val;
+#else
+  return 0;
+#endif
+}
+
+// Fast reciprocal.
+inline __device__ float reciprocal_approximate_ftz(float a) {
+  float b;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
+  return b;
+}
+
+template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
+__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, int numCols, SFType* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2);
+
+  // One pair of threads write one SF to global memory.
+  // TODO: stage through smem for packed STG.32
+  // is it better than STG.8 from 4 threads ?
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
+    // SF vector index (16 elements share one SF in the K dimension).
+    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+    int32_t mIdx = rowIdx;
+
+    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
+    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
+
+    int32_t mTileIdx = mIdx / (32 * 4);
+    // SF vector size 16.
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    int32_t numKTiles = (numCols + factor - 1) / factor;
+    int64_t mTileStride = numKTiles * 32 * 4 * 4;
+
+    int32_t kTileIdx = (kIdx / 4);
+    int64_t kTileStride = 32 * 4 * 4;
+
+    // M tile layout [32, 4] is column-major.
+    int32_t outerMIdx = (mIdx % 32);
+    int64_t outerMStride = 4 * 4;
+
+    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
+    int64_t innerMStride = 4;
+
+    int32_t innerKIdx = (kIdx % 4);
+    int64_t innerKStride = 1;
+
+    // Compute the global offset.
+    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + outerMIdx * outerMStride +
+                       innerMIdx * innerMStride + innerKIdx * innerKStride;
+
+    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
+  }
+#endif
+  return nullptr;
+}
+
+// Define a 16 bytes packed data type.
+template <class Type>
+struct PackedVec {
+  typename TypeConverter<Type>::Type elts[4];
+};
+
+template <>
+struct PackedVec<__nv_fp8_e4m3> {
+  __nv_fp8x2_e4m3 elts[8];
+};
+
+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(vec.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    __nv_fp8_e8m0 tmp;
+    tmp.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
+    SFValue = static_cast<float>(tmp);
+    fp8SFVal = tmp.__x;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    fp8SFVal = tmp.__x;
+    SFValue = static_cast<float>(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal)) : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+#else
+  return 0;
+#endif
+}
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false>
+__global__ void
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__launch_bounds__(512, 4) cvt_fp16_to_fp4(
+#else
+cvt_fp16_to_fp4(
+#endif
+    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched.");
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is
+  // (448.f / (Alpha_A / 6.f)).
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+
+  // Input tensor row/col loops.
+  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; colIdx += blockDim.x) {
+      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      // Get the output tensor offset.
+      // Same as inOffset because 8 elements are packed into one uint32_t.
+      int64_t outOffset = inOffset;
+      auto& out_pos = out[outOffset];
+
+      auto sf_out =
+          cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF>(rowIdx, colIdx, numCols, SFout);
+
+      out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+    }
+  }
+#endif
+}
+
+template <typename T>
+void invokeFP4Quantization(
+    int m,
+    int n,
+    T const* input,
+    float const* SFScale,
+    int64_t* output,
+    int32_t* SFOuput,
+    bool useUE8M0,
+    int multiProcessorCount,
+    cudaStream_t stream) {
+  // Grid, Block size.
+  // Each thread converts 8 values.
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
+  // Launch the cvt kernel.
+  if (useUE8M0) {
+    cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput));
+  } else {
+    cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput));
+  }
+}
+
+// Instantiate the function.
+template void invokeFP4Quantization(
+    int m,
+    int n,
+    half const* input,
+    float const* SFScale,
+    int64_t* output,
+    int32_t* SFOuput,
+    bool useUE8M0,
+    int multiProcessorCount,
+    cudaStream_t stream);
+
+template void invokeFP4Quantization(
+    int m,
+    int n,
+    __nv_bfloat16 const* input,
+    float const* SFScale,
+    int64_t* output,
+    int32_t* SFOuput,
+    bool useUE8M0,
+    int multiProcessorCount,
+    cudaStream_t stream);
+
+inline int getMultiProcessorCount() {
+  static int multi_processor_count = []() {
+    int device_id = 0;
+    int count = 0;
+
+    // Get the current CUDA device ID
+    CHECK_CUDA_SUCCESS(cudaGetDevice(&device_id));
+
+    // Get the number of multiprocessors for the current device
+    CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, device_id));
+
+    return count;  // Initialize the static variable
+  }();
+
+  return multi_processor_count;  // Return the cached value on subsequent calls
+}
+
+void scaled_fp4_quant_sm100a(
+    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
+  int32_t m = input.size(0);
+  int32_t n = input.size(1);
+
+  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+
+  int multiProcessorCount = getMultiProcessorCount();
+
+  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
+  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
+  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
+  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(input.get_device());
+
+  // We don't support e8m0 scales at this moment.
+  bool useUE8M0 = false;
+
+  switch (input.scalar_type()) {
+    case torch::kHalf: {
+      auto input_ptr = reinterpret_cast<half const*>(input.data_ptr());
+      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, useUE8M0, multiProcessorCount, stream);
+      break;
+    }
+    case torch::kBFloat16: {
+      auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr());
+      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, useUE8M0, multiProcessorCount, stream);
+      break;
+    }
+    default: {
+      std::cerr << "Observing: " << input.scalar_type() << " for the input datatype which is invalid";
+      throw std::runtime_error("Unsupported input data type for quantize_to_fp4.");
+    }
+  }
+}
--- a/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_entry.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_entry.cu
@@ -0,0 +1,39 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <torch/all.h>
+
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+void cutlass_scaled_fp4_mm_sm100a(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha);
+#endif
+
+void cutlass_scaled_fp4_mm(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel.");
+}
--- a/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_kernels.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_kernels.cu
@@ -0,0 +1,365 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+// clang-format off
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/util/packed_stride.hpp"
+// clang-format on
+
+/**
+ * Helper function for checking CUTLASS errors
+ */
+#define CUTLASS_CHECK(status)                                                       \
+  {                                                                                 \
+    cutlass::Status error = status;                                                 \
+    TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \
+  }
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+// Kernel Perf config
+template <typename T>
+struct KernelTraits;
+
+template <>
+struct KernelTraits<float> {
+  using MmaTileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _128, _256>;
+};
+
+template <>
+struct KernelTraits<cutlass::half_t> {
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<_4, _4, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
+};
+
+template <>
+struct KernelTraits<cutlass::bfloat16_t> {
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<_4, _4, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
+};
+
+template <typename T>
+struct Fp4GemmSm100 {
+  // A matrix configuration
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutATag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 32;
+
+  // B matrix configuration
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutBTag = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 32;
+
+  // C/D matrix configuration
+  using ElementD = T;
+  using ElementC = T;
+  using LayoutCTag = cutlass::layout::RowMajor;
+  using LayoutDTag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+  // Kernel functional config
+  using ElementAccumulator = float;
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+
+  // Kernel Perf config
+  using MmaTileShape = typename KernelTraits<T>::MmaTileShape;
+  using ClusterShape = typename KernelTraits<T>::ClusterShape;
+  using PerSmTileShape_MNK = typename KernelTraits<T>::PerSmTileShape_MNK;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      PerSmTileShape_MNK,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementAccumulator,
+      ElementC,
+      LayoutCTag,
+      AlignmentC,
+      ElementD,
+      LayoutDTag,
+      AlignmentD,
+      cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      LayoutATag,
+      AlignmentA,
+      ElementB,
+      LayoutBTag,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using LayoutA = decltype(cute::make_layout(make_shape(0, 0, 0), StrideA{}));
+  using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using LayoutB = decltype(cute::make_layout(make_shape(0, 0, 0), StrideB{}));
+  using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutC = decltype(cute::make_layout(make_shape(0, 0, 0), StrideC{}));
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using LayoutD = decltype(cute::make_layout(make_shape(0, 0, 0), StrideD{}));
+};
+
+template <typename T>
+typename T::Gemm::Arguments args_from_options(
+    at::Tensor& D,
+    at::Tensor const& A,
+    at::Tensor const& B,
+    at::Tensor const& A_sf,
+    at::Tensor const& B_sf,
+    at::Tensor const& alpha,
+    int64_t M,
+    int64_t N,
+    int64_t K) {
+  using ElementA = typename T::Gemm::ElementA;
+  using ElementB = typename T::Gemm::ElementB;
+  using ElementSFA = cutlass::float_ue4m3_t;
+  using ElementSFB = cutlass::float_ue4m3_t;
+  using ElementD = typename T::Gemm::ElementD;
+  using ElementCompute = float;
+  using StrideA = typename T::StrideA;
+  using StrideB = typename T::StrideB;
+  using StrideD = typename T::StrideD;
+  using Sm100BlkScaledConfig = typename T::Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+
+  int m = static_cast<int>(M);
+  int n = static_cast<int>(N);
+  int k = static_cast<int>(K);
+  auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {m, k, 1});
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, 1});
+  auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {m, n, 1});
+
+  auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
+  auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
+
+  typename T::Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      {// Mainloop arguments
+       static_cast<ElementA const*>(A.data_ptr()),
+       stride_A,
+       static_cast<ElementB const*>(B.data_ptr()),
+       stride_B,
+       static_cast<ElementSFA const*>(A_sf.data_ptr()),
+       layout_SFA,
+       static_cast<ElementSFB const*>(B_sf.data_ptr()),
+       layout_SFB},
+      {     // Epilogue arguments
+       {},  // epilogue.thread
+       static_cast<ElementD const*>(D.data_ptr()),
+       stride_D,
+       static_cast<ElementD*>(D.data_ptr()),
+       stride_D}};
+  auto& fusion_args = arguments.epilogue.thread;
+  fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr());
+  return arguments;
+}
+
+template <typename T>
+void runGemm(
+    at::Tensor& D,
+    at::Tensor const& A,
+    at::Tensor const& B,
+    at::Tensor const& A_sf,
+    at::Tensor const& B_sf,
+    at::Tensor const& alpha,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    cudaStream_t stream) {
+  typename Fp4GemmSm100<T>::Gemm gemm;
+
+  auto arguments = args_from_options<Fp4GemmSm100<T>>(D, A, B, A_sf, B_sf, alpha, m, n, k);
+
+  size_t workspace_size = Fp4GemmSm100<T>::Gemm::get_workspace_size(arguments);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  CUTLASS_CHECK(gemm.can_implement(arguments));
+
+  CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream));
+}
+#else
+template <typename T>
+void runGemm(
+    at::Tensor& D,
+    at::Tensor const& A,
+    at::Tensor const& B,
+    at::Tensor const& A_sf,
+    at::Tensor const& B_sf,
+    at::Tensor const& alpha,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    cudaStream_t stream) {
+  TORCH_CHECK(
+      false,
+      "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
+      "a CUTLASS 3.8 source directory to enable support.");
+}
+#endif  // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+
+#define CHECK_TYPE(x, st, m) TORCH_CHECK(x.scalar_type() == st, "Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) TORCH_CHECK(x.is_contiguous(), m, "must be contiguous")
+#define CHECK_INPUT(x, st, m) \
+  CHECK_TH_CUDA(x, m);        \
+  CHECK_CONTIGUOUS(x, m);     \
+  CHECK_TYPE(x, st, m)
+
+constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+
+void cutlass_scaled_fp4_mm_sm100a(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha) {
+  CHECK_INPUT(A, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(B, FLOAT4_E2M1X2, "b");
+
+  CHECK_INPUT(A_sf, SF_DTYPE, "scale_a");
+  CHECK_INPUT(B_sf, SF_DTYPE, "scale_b");
+
+  CHECK_INPUT(alpha, at::ScalarType::Float, "alpha");
+
+  TORCH_CHECK(A.dim() == 2, "a must be a matrix");
+  TORCH_CHECK(B.dim() == 2, "b must be a matrix");
+  TORCH_CHECK(
+      A.sizes()[1] == B.sizes()[1],
+      "a and b shapes cannot be multiplied (",
+      A.sizes()[0],
+      "x",
+      A.sizes()[1],
+      " and ",
+      B.sizes()[0],
+      "x",
+      B.sizes()[1],
+      ")");
+
+  auto const m = A.sizes()[0];
+  auto const n = B.sizes()[0];
+  auto const k = A.sizes()[1] * 2;
+
+  constexpr int alignment = 32;
+  TORCH_CHECK(
+      k % alignment == 0,
+      "Expected k to be divisible by ",
+      alignment,
+      ", but got a shape: (",
+      A.sizes()[0],
+      "x",
+      A.sizes()[1],
+      "), k: ",
+      k,
+      ".");
+  TORCH_CHECK(
+      n % alignment == 0,
+      "Expected n to be divisible by ",
+      alignment,
+      ", but got b shape: (",
+      B.sizes()[0],
+      "x",
+      B.sizes()[1],
+      ").");
+
+  auto round_up = [](int x, int y) { return (x + y - 1) / y * y; };
+  int rounded_m = round_up(m, 128);
+  int rounded_n = round_up(n, 128);
+  // Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an
+  // integer.
+  int rounded_k = round_up(k / 16, 4);
+
+  TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix");
+  TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix");
+  TORCH_CHECK(
+      A_sf.sizes()[1] == B_sf.sizes()[1],
+      "scale_a and scale_b shapes cannot be multiplied (",
+      A_sf.sizes()[0],
+      "x",
+      A_sf.sizes()[1],
+      " and ",
+      B_sf.sizes()[0],
+      "x",
+      B_sf.sizes()[1],
+      ")");
+  TORCH_CHECK(
+      A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k,
+      "scale_a must be padded and swizzled to a shape (",
+      rounded_m,
+      "x",
+      rounded_k,
+      "), but got a shape (",
+      A_sf.sizes()[0],
+      "x",
+      A_sf.sizes()[1],
+      ")");
+  TORCH_CHECK(
+      B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k,
+      "scale_b must be padded and swizzled to a shape (",
+      rounded_n,
+      "x",
+      rounded_k,
+      "), but got a shape (",
+      B_sf.sizes()[0],
+      "x",
+      B_sf.sizes()[1],
+      ")");
+
+  auto out_dtype = D.dtype();
+  at::cuda::CUDAGuard device_guard{(char)A.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
+
+  if (out_dtype == at::ScalarType::Half) {
+    runGemm<cutlass::half_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (out_dtype == at::ScalarType::BFloat16) {
+    runGemm<cutlass::bfloat16_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (out_dtype == at::ScalarType::Float) {
+    runGemm<float>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm");
+  }
+}
--- a/sgl-kernel/csrc/torch_extension.cc
+++ b/sgl-kernel/csrc/torch_extension.cc
@@ -114,6 +114,17 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
      " ScalarType out_dtype, int cublas_handle, int cuda_stream) -> ()");
  m.impl("cublas_grouped_gemm", torch::kCUDA, &cublas_grouped_gemm);

+  m.def(
+      "cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b,"
+      "                      Tensor block_scale_a, Tensor block_scale_b,"
+      "                      Tensor alpha) -> ()");
+  m.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
+
+  m.def(
+      "scaled_fp4_quant(Tensor! output, Tensor! input,"
+      "                 Tensor! output_scale, Tensor! input_scale) -> ()");
+  m.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
+
  /*
   * From csrc/moe
   */