[1/2] Speed up trtllm_mla attention backend (>10% e2e) (#10473)

This commit is contained in:
fzyzcjy
2025-09-16 02:53:21 +08:00
committed by GitHub
parent 5c08d7d21d
commit 3b25dc127a
6 changed files with 119 additions and 3 deletions

View File

@@ -104,6 +104,9 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
m.def("concat_mla_k(Tensor! k, Tensor k_nope, Tensor k_rope) -> ()");
m.impl("concat_mla_k", torch::kCUDA, &concat_mla_k);
m.def("concat_mla_absorb_q(Tensor a, Tensor b, Tensor! out) -> ()");
m.impl("concat_mla_absorb_q", torch::kCUDA, &concat_mla_absorb_q);
/*
* From csrc/gemm
*/