[7/n] decouple quantization impl from vllm dependency - gguf kernel (#11019)
This commit is contained in:
@@ -114,6 +114,37 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
|
||||
"cu_seqlens_q) -> ()");
|
||||
m.impl("fast_topk_transform_fused", torch::kCUDA, &fast_topk_transform_interface);
|
||||
|
||||
/*
|
||||
* From gguf quantization
|
||||
*/
|
||||
m.def(
|
||||
"ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? "
|
||||
"dtype) -> Tensor");
|
||||
m.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
|
||||
|
||||
m.def(
|
||||
"ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) "
|
||||
"-> Tensor");
|
||||
m.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
|
||||
|
||||
m.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor");
|
||||
m.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
|
||||
|
||||
m.def(
|
||||
"ggml_moe_a8(Tensor X, Tensor W, "
|
||||
"Tensor sorted_token_ids, Tensor expert_ids, Tensor "
|
||||
"num_tokens_post_padded, "
|
||||
"int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor");
|
||||
m.impl("ggml_moe_a8", torch::kCUDA, &ggml_moe_a8);
|
||||
|
||||
m.def(
|
||||
"ggml_moe_a8_vec(Tensor X, Tensor W, "
|
||||
"Tensor topk_ids, int top_k, "
|
||||
"int type, SymInt row, SymInt tokens) -> Tensor");
|
||||
m.impl("ggml_moe_a8_vec", torch::kCUDA, &ggml_moe_a8_vec);
|
||||
|
||||
m.def("ggml_moe_get_block_size", &ggml_moe_get_block_size);
|
||||
|
||||
/*
|
||||
* From csrc/gemm
|
||||
*/
|
||||
@@ -226,17 +257,23 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
|
||||
|
||||
m.def("moe_sum_reduce(Tensor input, Tensor output, float routed_scaling_factor) -> ()");
|
||||
m.impl("moe_sum_reduce", torch::kCUDA, &moe_sum_reduce);
|
||||
|
||||
m.def("moe_sum(Tensor input, Tensor! output) -> ()");
|
||||
m.impl("moe_sum", torch::kCUDA, &moe_sum);
|
||||
|
||||
m.def(
|
||||
"moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
|
||||
"num_fused_shared_experts, float routed_scaling_factor, bool apply_routed_scaling_factor_on_output) -> "
|
||||
"(Tensor[])");
|
||||
m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
|
||||
|
||||
m.def(
|
||||
"fp8_blockwise_scaled_grouped_mm(Tensor output, Tensor a_ptrs, Tensor b_ptrs, Tensor out_ptrs, Tensor "
|
||||
"a_scales_ptrs, Tensor b_scales_ptrs, Tensor a, Tensor b, Tensor scales_a, Tensor scales_b, Tensor "
|
||||
"stride_a, Tensor stride_b, Tensor stride_c, Tensor layout_sfa, Tensor layout_sfb, Tensor problem_sizes, Tensor "
|
||||
"expert_offsets, Tensor workspace) -> ()");
|
||||
m.impl("fp8_blockwise_scaled_grouped_mm", torch::kCUDA, &fp8_blockwise_scaled_grouped_mm);
|
||||
|
||||
m.def(
|
||||
"prepare_moe_input(Tensor topk_ids, Tensor expert_offsets, Tensor? blockscale_offsets, Tensor problem_sizes1,"
|
||||
" Tensor problem_sizes2, Tensor input_permutation, Tensor output_permutation, int num_experts, int n, int k) -> "
|
||||
|
||||
Reference in New Issue
Block a user