[7/n] decouple quantization impl from vllm dependency - gguf kernel (#11019)

This commit is contained in:
PGFLMG
2025-10-12 05:04:57 +08:00
committed by GitHub
parent b5dcfd4154
commit 8fdcd98efe
19 changed files with 7936 additions and 1 deletions

View File

@@ -186,6 +186,32 @@ void fast_topk_transform_interface(
// Declaration only: takes a mutable `out` tensor and a read-only `input` tensor; returns nothing.
// NOTE(review): presumably writes a "quick GELU" activation of `input` into `out` — confirm against the implementation.
// NOTE(review): the following #endif suggests this declaration is conditionally compiled; the condition is outside this view.
void gelu_quick(at::Tensor& out, const at::Tensor& input);
#endif
/*
* From gguf quantization
*/
// Declaration of the GGUF dequantize op (implementation lives elsewhere).
//   W     - tensor, passed by value.
//   type  - 64-bit integer. NOTE(review): presumably a GGML quantization type id — confirm.
//   m, n  - 64-bit integers. NOTE(review): presumably the shape of the dequantized result — confirm.
//   dtype - optional scalar type; may be empty. The behavior for an empty value is not visible here —
//           TODO confirm the default against the kernel.
// Returns a torch::Tensor.
torch::Tensor
ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, int64_t n, std::optional<at::ScalarType> const& dtype);
// Declaration of a GGUF op taking tensors W and X (both by value) plus two 64-bit integers; returns a torch::Tensor.
// NOTE(review): by its name this looks like a quantized matrix-vector product with 8-bit activations ("a8") — confirm.
// NOTE(review): `type` is presumably a GGML quantization type id and `row` a row count of W — verify against the kernel.
torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row);
// Declaration of a GGUF op taking tensors W and X (both by value) plus two 64-bit integers; returns a torch::Tensor.
// NOTE(review): by its name this looks like a quantized matrix-matrix product with 8-bit activations ("a8") — confirm.
// NOTE(review): `type` is presumably a GGML quantization type id and `row` a row count of W — verify against the kernel.
torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row);
// Declaration of a GGUF mixture-of-experts op; returns a torch::Tensor.
// Note the argument order: X comes before W here.
// All five tensor arguments are passed by value; the remaining four arguments are 64-bit integers.
// NOTE(review): sorted_token_ids / expert_ids / num_tokens_post_padded look like the outputs of a
// token-alignment step performed by the caller — confirm the expected layout against the kernel.
// NOTE(review): `type` is presumably a GGML quantization type id; the meanings of `row`, `top_k`
// and `tokens` are not established in this header — verify against the implementation.
torch::Tensor ggml_moe_a8(
torch::Tensor X,
torch::Tensor W,
torch::Tensor sorted_token_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_padded,
int64_t type,
int64_t row,
int64_t top_k,
int64_t tokens);
// Declaration of a GGUF mixture-of-experts op; returns a torch::Tensor.
// Takes tensors X, W and topk_ids by value, followed by four 64-bit integers.
// Note that `top_k` precedes `type` in this parameter list.
// NOTE(review): the "_vec" suffix suggests a vector-oriented variant driven directly by `topk_ids` — confirm.
// NOTE(review): meanings of `type`, `row` and `tokens` are not established in this header — verify against the kernel.
torch::Tensor ggml_moe_a8_vec(
torch::Tensor X, torch::Tensor W, torch::Tensor topk_ids, int64_t top_k, int64_t type, int64_t row, int64_t tokens);
// Declaration: maps a 64-bit integer `type` to a 64-bit integer result.
// NOTE(review): presumably returns the block size the GGUF MoE kernel uses for the given quantization type — confirm.
// NOTE(review): behavior for an unsupported `type` is not visible here.
int64_t ggml_moe_get_block_size(int64_t type);
/*
* From csrc/gemm
*/
@@ -306,6 +332,8 @@ void topk_softmax(
// Declaration only: takes `input` and `output` tensors by non-const reference plus a double; returns nothing.
// NOTE(review): presumably reduces `input` into `output` and applies `routed_scaling_factor` — confirm against the kernel.
void moe_sum_reduce(at::Tensor& input, at::Tensor& output, double routed_scaling_factor);
// Declaration only: takes `input` and `output` tensors by non-const reference; returns nothing.
// NOTE(review): presumably sums `input` into `output` — confirm the reduced dimension against the kernel.
void moe_sum(torch::Tensor& input, torch::Tensor& output);
std::vector<at::Tensor> moe_fused_gate(
at::Tensor& input,
at::Tensor& bias,