[CPU] [BF16] Call fused_experts_cpu, weight_packed_linear and bmm_cpu kernel in DeepSeek model (#6641)

Co-authored-by: Thien Tran <gau.nernst@yahoo.com.sg>
This commit is contained in:
Chunyuan WU
2025-06-25 16:43:33 +08:00
committed by GitHub
parent bc2e5645c4
commit 7eb47b0f3d
9 changed files with 343 additions and 23 deletions

View File

@@ -318,8 +318,8 @@ void weight_packed_linear_kernel_impl(
const int64_t MB = div_up(M, BLOCK_M);
const int64_t NB = div_up(N, BLOCK_N);
// use avx512-bf16 when a) M is small; b) dtype is bfloat16, otherwise use amx
const bool use_brgemm = (M > 4) || (!std::is_same_v<scalar_t, at::BFloat16>);
// use avx512-bf16 when a) M is small; b) dtype is bfloat16; c) N is not small — otherwise use amx
const bool use_brgemm = (M > 4) || (!std::is_same_v<scalar_t, at::BFloat16>) || (N < 64);
// parallel on [MB, NB]
AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] {