Reland [1/2] Optimizations and refactors about quant kernel (#10312)

Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
fzyzcjy
2025-10-11 15:59:03 +08:00
committed by GitHub
parent 129d299278
commit 21337b22b9
13 changed files with 1065 additions and 178 deletions

View File

@@ -121,7 +121,7 @@ void sgl_per_token_group_quant_8bit(
double eps,
double min_8bit,
double max_8bit,
bool scale_ue8m0 = false) {
bool scale_ue8m0) {
CHECK_INPUT(input);
CHECK_INPUT(output_q);
@@ -215,26 +215,3 @@ void sgl_per_token_group_quant_8bit(
#undef LAUNCH_KERNEL
}
// Int8 entry point for per-token-group quantization.
//
// Thin forwarding wrapper around sgl_per_token_group_quant_8bit: every
// argument is handed through unchanged, with int8_min / int8_max filling the
// callee's min_8bit / max_8bit parameter positions.
//
// NOTE(review): the call does not pass scale_ue8m0, so it depends on the
// declaration of sgl_per_token_group_quant_8bit in scope providing a default
// for that parameter — confirm before changing the callee's signature.
void sgl_per_token_group_quant_int8(
    torch::Tensor input,
    torch::Tensor output_q,
    torch::Tensor output_s,
    int64_t group_size,
    double eps,
    double int8_min,
    double int8_max) {
  sgl_per_token_group_quant_8bit(
      input,
      output_q,
      output_s,
      group_size,
      eps,
      int8_min,   // forwarded as min_8bit
      int8_max);  // forwarded as max_8bit
}
// FP8 entry point for per-token-group quantization.
//
// Thin forwarding wrapper around sgl_per_token_group_quant_8bit: every
// argument is handed through unchanged, with fp8_min / fp8_max filling the
// callee's min_8bit / max_8bit parameter positions and scale_ue8m0 passed
// along explicitly.
void sgl_per_token_group_quant_fp8(
    torch::Tensor input,
    torch::Tensor output_q,
    torch::Tensor output_s,
    int64_t group_size,
    double eps,
    double fp8_min,
    double fp8_max,
    bool scale_ue8m0) {
  sgl_per_token_group_quant_8bit(
      input,
      output_q,
      output_s,
      group_size,
      eps,
      fp8_min,  // forwarded as min_8bit
      fp8_max,  // forwarded as max_8bit
      scale_ue8m0);
}