[NVIDIA] [2/N] Optimize silu_and_mul_scaled_fp4_grouped_quant perf (#9556)
This commit is contained in:
@@ -32,9 +32,8 @@ void silu_and_mul_scaled_fp4_experts_quant_sm100a(
|
||||
torch::Tensor& output_scale,
|
||||
torch::Tensor const& input,
|
||||
torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
torch::Tensor const& output_scale_offset_by_experts,
|
||||
torch::Tensor const& mask);
|
||||
torch::Tensor const& mask,
|
||||
bool use_silu_and_mul);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -65,12 +64,11 @@ void silu_and_mul_scaled_fp4_experts_quant(
|
||||
torch::Tensor& output_scale,
|
||||
torch::Tensor const& input,
|
||||
torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
torch::Tensor const& output_scale_offset_by_experts,
|
||||
torch::Tensor const& mask) {
|
||||
torch::Tensor const& mask,
|
||||
bool use_silu_and_mul) {
|
||||
#if defined ENABLE_NVFP4 && ENABLE_NVFP4
|
||||
return silu_and_mul_scaled_fp4_experts_quant_sm100a(
|
||||
output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts, mask);
|
||||
output, output_scale, input, input_global_scale, mask, use_silu_and_mul);
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 experts quantization kernel");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user