Optimize prefill performance on cpu backend (#8750)

2025-08-29 08:21:55 +08:00
parent 9f81d741a2
commit 5ad296bda1
9 changed files with 680 additions and 273 deletions
--- a/sgl-kernel/csrc/cpu/qkv_proj.cpp
+++ b/sgl-kernel/csrc/cpu/qkv_proj.cpp
@@ -100,8 +100,7 @@ void segment_gemm_kernel_impl(
  const int64_t NB1 = div_up(N1, BLOCK_N);
  const int64_t NB = NB0 + NB1;

-  // TODO: brgemm u8s8 depends on PyTorch 2.7 release.
-  const bool use_brgemm = false;
+  const bool use_brgemm = can_use_brgemm<int8_t>(M);

  // K + 4 after compensation
  const int64_t packed_row_size = get_row_size<int8_t>(K);