[NVIDIA] [2/N] Optimize silu_and_mul_scaled_fp4_grouped_quant perf (#9556)

2025-08-29 17:17:03 -07:00
parent ff9b561817
commit 5c34b4f1c7
7 changed files with 297 additions and 61 deletions
--- a/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
@@ -32,9 +32,8 @@ void silu_and_mul_scaled_fp4_experts_quant_sm100a(
    torch::Tensor& output_scale,
    torch::Tensor const& input,
    torch::Tensor const& input_global_scale,
-    torch::Tensor const& input_offset_by_experts,
-    torch::Tensor const& output_scale_offset_by_experts,
-    torch::Tensor const& mask);
+    torch::Tensor const& mask,
+    bool use_silu_and_mul);

 #endif

@@ -65,12 +64,11 @@ void silu_and_mul_scaled_fp4_experts_quant(
    torch::Tensor& output_scale,
    torch::Tensor const& input,
    torch::Tensor const& input_global_scale,
-    torch::Tensor const& input_offset_by_experts,
-    torch::Tensor const& output_scale_offset_by_experts,
-    torch::Tensor const& mask) {
+    torch::Tensor const& mask,
+    bool use_silu_and_mul) {
 #if defined ENABLE_NVFP4 && ENABLE_NVFP4
  return silu_and_mul_scaled_fp4_experts_quant_sm100a(
-      output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts, mask);
+      output, output_scale, input, input_global_scale, mask, use_silu_and_mul);
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 experts quantization kernel");
 }