[Quant Kernel] refactored per token group quant fp8 to support int8 up to 2x faster (#4396)

This commit is contained in:
Chunan Zeng
2025-03-23 23:44:17 -07:00
committed by GitHub
parent 3980ff1be6
commit 65c24c28f9
8 changed files with 191 additions and 127 deletions

View File

@@ -141,6 +141,14 @@ void sgl_per_token_group_quant_fp8(
double eps,
double fp8_min,
double fp8_max);
// Per-token-group int8 quantization: quantizes `input` in groups of
// `group_size` elements, writing quantized values to `output_q` and the
// per-group scale factors to `output_s`.
// NOTE(review): parameter roles inferred from names and from the parallel
// fp8 declaration above — confirm against the kernel implementation.
//   input     : source tensor to quantize
//   output_q  : destination for quantized (int8) values
//   output_s  : destination for per-group scales
//   group_size: number of elements sharing one scale
//   eps       : presumably a lower clamp on the scale denominator to avoid
//               division by zero — verify in the kernel
//   int8_min / int8_max : clamp range for the quantized output
void sgl_per_token_group_quant_int8(
at::Tensor input,
at::Tensor output_q,
at::Tensor output_s,
int64_t group_size,
double eps,
double int8_min,
double int8_max);
// Per-tensor fp8 quantization of `input` into `output_q` with a single scale
// in `output_s`. NOTE(review): `is_static` presumably selects a precomputed
// (static) scale in `output_s` versus computing it dynamically from the
// input — confirm against the kernel implementation.
void sgl_per_tensor_quant_fp8(at::Tensor input, at::Tensor output_q, at::Tensor output_s, bool is_static);
// Per-token fp8 quantization: quantizes `input` into `output_q` with one
// scale per token written to `output_s` (dynamic scaling — no eps/min/max
// parameters are exposed here, unlike the group-quant variants above).
void sgl_per_token_quant_fp8(at::Tensor input, at::Tensor output_q, at::Tensor output_s);
void cublas_grouped_gemm(