[1/2] Add Kernel support for Cutlass based Fused FP4 MoE (#6093)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-06-02 13:48:03 -07:00
parent df7f61ee7d
commit eb38c7d1ca
12 changed files with 1677 additions and 22 deletions
--- a/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
@@ -18,6 +18,15 @@ limitations under the License.
 #if defined ENABLE_NVFP4 && ENABLE_NVFP4
 void scaled_fp4_quant_sm100a(
    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf);
+
+void scaled_fp4_experts_quant_sm100a(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts);
+
 #endif

 void scaled_fp4_quant(
@@ -27,3 +36,17 @@ void scaled_fp4_quant(
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization");
 }
+
+void scaled_fp4_experts_quant(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return scaled_fp4_experts_quant_sm100a(
+      output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 experts quantization kernel");
+}