Optimize prefill performance on cpu backend (#8750)

This commit is contained in:
Ma Mingfei
2025-08-29 08:21:55 +08:00
committed by GitHub
parent 9f81d741a2
commit 5ad296bda1
9 changed files with 680 additions and 273 deletions

View File

@@ -27,10 +27,10 @@ template <>
inline bool can_use_brgemm<at::Half>(int M) {
return true;
}
// TODO: add u8s8 brgemm, this requires PyTorch 2.7
// this requires PyTorch 2.7 or above
// Explicit specialization of can_use_brgemm for int8_t: takes M and returns a bool.
// NOTE(review): this text appears to be a rendered diff hunk whose +/- markers were
// stripped, so two return statements sit back to back in the body below. Read
// literally as C++, `return false;` always executes and `return M > 4;` is
// unreachable. Presumably `return false;` is the removed (pre-change) line and
// `return M > 4;` is the added (post-change) line -- TODO confirm against the
// actual header before relying on either behavior.
template <>
inline bool can_use_brgemm<int8_t>(int M) {
return false;
return M > 4;
}
template <>
@@ -198,4 +198,5 @@ void tinygemm_kernel(
int64_t ldb,
int64_t ldc,
bool brg,
int64_t block_size_K);
int64_t block_size_K,
bool do_unpack = true);