Add fp8 fused_experts kernel for CPU in sgl-kernel and add UT (#6404)

2025-05-23 17:01:55 +08:00
parent 4ba1eea83f
commit 3ded6235c9
7 changed files with 752 additions and 157 deletions
--- a/sgl-kernel/csrc/cpu/gemm.h
+++ b/sgl-kernel/csrc/cpu/gemm.h
@@ -85,6 +85,32 @@ void fused_experts_int8_kernel_impl(
    int64_t topk,
    int64_t num_tokens_post_pad);

+// moe implementations for fp8 w8a16
+template <typename scalar_t>
+void fused_experts_fp8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic0,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ ic2,
+    scalar_t* __restrict__ A_tmp,
+    const scalar_t* __restrict__ input,
+    const at::Float8_e4m3fn* __restrict__ packed_w1,
+    const at::Float8_e4m3fn* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    int64_t block_size_N,
+    int64_t block_size_K,
+    const float* __restrict__ topk_weights,
+    const int32_t* __restrict__ sorted_ids,
+    const int32_t* __restrict__ expert_ids,
+    const int32_t* __restrict__ offsets,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t E,
+    int64_t topk,
+    int64_t num_tokens_post_pad);
+
 // shared expert implementation for int8 w8a8
 template <typename scalar_t>
 void shared_expert_int8_kernel_impl(