[Quant Kernel] refactored per-token-group quant fp8 to support int8, up to 2x faster (#4396)
This commit is contained in:
@@ -141,6 +141,14 @@ void sgl_per_token_group_quant_fp8(
|
||||
double eps,
|
||||
double fp8_min,
|
||||
double fp8_max);
|
||||
// Quantizes `input` in per-token groups to int8, writing quantized values to
// `output_q` and one scale factor per group to `output_s`.
// NOTE(review): parameter semantics inferred from names and from the sibling
// fp8 declaration — confirm against the kernel implementation.
//   input      : source tensor to quantize
//   output_q   : destination tensor for quantized int8 values
//   output_s   : destination tensor for per-group scale factors
//   group_size : number of elements per quantization group
//   eps        : lower bound applied when computing scales (avoids div-by-zero)
//   int8_min   : minimum representable quantized value (presumably -128)
//   int8_max   : maximum representable quantized value (presumably 127)
void sgl_per_token_group_quant_int8(
    at::Tensor input,
    at::Tensor output_q,
    at::Tensor output_s,
    int64_t group_size,
    double eps,
    double int8_min,
    double int8_max);
// Quantizes `input` to fp8 using a single per-tensor scale written to (or read
// from) `output_s`; quantized values go to `output_q`.
// NOTE(review): `is_static` presumably selects a precomputed (static) scale in
// `output_s` versus computing the scale dynamically — confirm against the
// kernel implementation.
void sgl_per_tensor_quant_fp8(at::Tensor input, at::Tensor output_q, at::Tensor output_s, bool is_static);
// Quantizes `input` to fp8 with one scale per token: quantized values are
// written to `output_q`, per-token scales to `output_s`.
// NOTE(review): per-token granularity inferred from the function name —
// confirm against the kernel implementation.
void sgl_per_token_quant_fp8(at::Tensor input, at::Tensor output_q, at::Tensor output_s);
void cublas_grouped_gemm(
|
||||
|
||||
Reference in New Issue
Block a user