Optimize prefill performance on cpu backend (#8750)

2025-08-29 08:21:55 +08:00
parent 9f81d741a2
commit 5ad296bda1
9 changed files with 680 additions and 273 deletions
--- a/sgl-kernel/csrc/cpu/gemm.h
+++ b/sgl-kernel/csrc/cpu/gemm.h
@@ -27,10 +27,10 @@ template <>
 inline bool can_use_brgemm<at::Half>(int M) {
  return true;
 }
-// TODO: add u8s8 brgemm, this requires PyTorch 2.7
+// int8 (u8s8) brgemm requires PyTorch 2.7 or above; it is selected only when M > 4
 template <>
 inline bool can_use_brgemm<int8_t>(int M) {
-  return false;
+  return M > 4;
 }

 template <>
@@ -198,4 +198,5 @@ void tinygemm_kernel(
    int64_t ldb,
    int64_t ldc,
    bool brg,
-    int64_t block_size_K);
+    int64_t block_size_K,
+    bool do_unpack = true);