[1/2] Speed up trtllm_mla attention backend (>10% e2e) (#10473)
This commit is contained in:
@@ -104,6 +104,9 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
m.def("concat_mla_k(Tensor! k, Tensor k_nope, Tensor k_rope) -> ()");
m.impl("concat_mla_k", torch::kCUDA, &concat_mla_k);

m.def("concat_mla_absorb_q(Tensor a, Tensor b, Tensor! out) -> ()");
m.impl("concat_mla_absorb_q", torch::kCUDA, &concat_mla_absorb_q);

/*
* From csrc/gemm
*/

Reference in New Issue
Block a user