[CPU] [BF16] Call fused_experts_cpu, weight_packed_linear and bmm_cpu kernel in DeepSeek model (#6641)

Co-authored-by: Thien Tran <gau.nernst@yahoo.com.sg>
This commit is contained in:
Chunyuan WU
2025-06-25 16:43:33 +08:00
committed by GitHub
parent bc2e5645c4
commit 7eb47b0f3d
9 changed files with 343 additions and 23 deletions

View File

@@ -318,8 +318,8 @@ void weight_packed_linear_kernel_impl(
const int64_t MB = div_up(M, BLOCK_M);
const int64_t NB = div_up(N, BLOCK_N);
// use avx512-bf16 when a) M is small; b) dtype is bfloat16, otherwise use amx
const bool use_brgemm = (M > 4) || (!std::is_same_v<scalar_t, at::BFloat16>);
// use avx512-bf16 when a) M is small; b) dtype is bfloat16; c) N is not small — otherwise use amx
const bool use_brgemm = (M > 4) || (!std::is_same_v<scalar_t, at::BFloat16>) || (N < 64);
// parallel on [MB, NB]
AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] {