Optimize prefill performance on cpu backend (#8750)
This commit is contained in:
@@ -100,8 +100,7 @@ void segment_gemm_kernel_impl(
|
||||
const int64_t NB1 = div_up(N1, BLOCK_N);
|
||||
const int64_t NB = NB0 + NB1;
|
||||
|
||||
- // TODO: brgemm u8s8 depends on PyTorch 2.7 release.
|
||||
- const bool use_brgemm = false;
|
||||
+ const bool use_brgemm = can_use_brgemm<int8_t>(M);
|
||||
|
||||
// K + 4 after compensation
|
||||
const int64_t packed_row_size = get_row_size<int8_t>(K);
|
||||
|
||||
Reference in New Issue
Block a user