[7/n] decouple quantization impl from vllm dependency - gguf kernel (#11019)
This commit is contained in:
@@ -186,6 +186,32 @@ void fast_topk_transform_interface(
|
||||
void gelu_quick(at::Tensor& out, const at::Tensor& input);
|
||||
#endif
|
||||
|
||||
/*
 * From gguf quantization
 */
torch::Tensor
ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, int64_t n, std::optional<at::ScalarType> const& dtype);

torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row);

torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row);
torch::Tensor ggml_moe_a8(
    torch::Tensor X,
    torch::Tensor W,
    torch::Tensor sorted_token_ids,
    torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_padded,
    int64_t type,
    int64_t row,
    int64_t top_k,
    int64_t tokens);

torch::Tensor ggml_moe_a8_vec(
    torch::Tensor X, torch::Tensor W, torch::Tensor topk_ids, int64_t top_k, int64_t type, int64_t row, int64_t tokens);

int64_t ggml_moe_get_block_size(int64_t type);

/*
 * From csrc/gemm
 */
|
||||
@@ -306,6 +332,8 @@ void topk_softmax(

void moe_sum_reduce(at::Tensor& input, at::Tensor& output, double routed_scaling_factor);

void moe_sum(torch::Tensor& input, torch::Tensor& output);

std::vector<at::Tensor> moe_fused_gate(
    at::Tensor& input,
    at::Tensor& bias,
|
||||
|
||||
@@ -19,6 +19,10 @@ limitations under the License.
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
#define _DISPATCH_CASE_F16(c_type, ...) \
|
||||
@@ -326,6 +330,13 @@ inline bool getEnvEnablePDL() {
|
||||
#define DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
|
||||

#define DISPATCH_CASE_FLOAT_TYPES(...)                   \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)   \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)    \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)

#define DISPATCH_FLOAT_TYPES(TYPE, NAME, ...) AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_FLOAT_TYPES(__VA_ARGS__))
|
||||
|
||||
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
||||
|
||||
#ifndef USE_ROCM
|
||||
@@ -447,3 +458,12 @@ inline uint32_t next_pow2(uint32_t x) noexcept {
|
||||
if (x <= 1) return 1;
|
||||
return 1u << (32 - __builtin_clz(x - 1));
|
||||
}
|
||||
|
||||
/*
 * LDG Support
 */
#ifndef USE_ROCM
#define SGLANG_LDG(arg) __ldg(arg)
#else
#define SGLANG_LDG(arg) *(arg)
#endif
|
||||
|
||||
Reference in New Issue
Block a user