[7/n] decouple quantization impl from vllm dependency - gguf kernel (#11019)

2025-10-12 05:04:57 +08:00
parent b5dcfd4154
commit 8fdcd98efe
19 changed files with 7936 additions and 1 deletions
--- a/sgl-kernel/include/sgl_kernel_ops.h
+++ b/sgl-kernel/include/sgl_kernel_ops.h
@@ -186,6 +186,32 @@ void fast_topk_transform_interface(
 void gelu_quick(at::Tensor& out, const at::Tensor& input);
 #endif

+/*
+ * From gguf quantization. NOTE(review): `type` is presumably the GGML quantization type id - confirm in kernels.
+ */
+torch::Tensor
+ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, int64_t n, std::optional<at::ScalarType> const& dtype);
+
+torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row);
+
+torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row);
+
+torch::Tensor ggml_moe_a8(  // takes X before W, whereas the ggml_mul_mat_* declarations take W before X
+    torch::Tensor X,
+    torch::Tensor W,
+    torch::Tensor sorted_token_ids,
+    torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_padded,
+    int64_t type,
+    int64_t row,
+    int64_t top_k,
+    int64_t tokens);
+
+torch::Tensor ggml_moe_a8_vec(  // int64_t order here is top_k, type, row, tokens (ggml_moe_a8: type, row, top_k, tokens)
+    torch::Tensor X, torch::Tensor W, torch::Tensor topk_ids, int64_t top_k, int64_t type, int64_t row, int64_t tokens);
+
+int64_t ggml_moe_get_block_size(int64_t type);  // the only declaration in this group that returns int64_t, not a tensor
+
 /*
 * From csrc/gemm
 */
@@ -306,6 +332,8 @@ void topk_softmax(

 void moe_sum_reduce(at::Tensor& input, at::Tensor& output, double routed_scaling_factor);

+void moe_sum(torch::Tensor& input, torch::Tensor& output);  // takes no scaling factor, unlike moe_sum_reduce
+
 std::vector<at::Tensor> moe_fused_gate(
    at::Tensor& input,
    at::Tensor& bias,
--- a/sgl-kernel/include/utils.h
+++ b/sgl-kernel/include/utils.h
@@ -19,6 +19,10 @@ limitations under the License.
 #include <cuda_runtime.h>
 #include <torch/all.h>

+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#endif
+
 #ifdef USE_ROCM
 // Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
 #define _DISPATCH_CASE_F16(c_type, ...) \
@@ -326,6 +330,13 @@ inline bool getEnvEnablePDL() {
 #define DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))

+#define DISPATCH_CASE_FLOAT_TYPES(...)                 \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+// Dispatch on TYPE over the cases above (Float, Half, BFloat16); TYPE and NAME are forwarded to AT_DISPATCH_SWITCH.
+#define DISPATCH_FLOAT_TYPES(TYPE, NAME, ...) AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_FLOAT_TYPES(__VA_ARGS__))
+
 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

 #ifndef USE_ROCM
@@ -447,3 +458,12 @@ inline uint32_t next_pow2(uint32_t x) noexcept {
  if (x <= 1) return 1;
  return 1u << (32 - __builtin_clz(x - 1));
 }
+
+/*
+ * LDG Support: SGLANG_LDG(ptr) loads *ptr - through __ldg() unless USE_ROCM is defined, else by plain dereference.
+ */
+#ifndef USE_ROCM
+#define SGLANG_LDG(arg) __ldg(arg)
+#else
+#define SGLANG_LDG(arg) (*(arg))  // parenthesized so postfix operators (., [], ()) bind to the loaded value
+#endif