[OPS] add bmm_transpose ops (#3990)

### What this PR does / why we need it? Add a new fused op to custom_op that combines torch.bmm() and transpose to achieve better performance. This op is used in mla_v1 to replace the bmm and transpose. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - vLLM version: v0.11.2 --------- Signed-off-by: hust17yixuan <303660421@qq.com>
2025-12-01 09:09:51 +08:00
parent bc67696a02
commit c68ddc11ce
15 changed files with 1737 additions and 14 deletions
--- a/csrc/torch_binding_meta.cpp
+++ b/csrc/torch_binding_meta.cpp
@@ -151,6 +151,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> grouped_matmul_swiglu_quant_weigh
    return std::tuple<at::Tensor, at::Tensor, at::Tensor>(output, output_scale, output_offset);
 }

+// Meta-dispatch implementation of batch_matmul_transpose (registered through ops.impl under the Meta key).
+// The body is empty: none of the arguments is read or modified here.
+void batch_matmul_transpose(const at::Tensor &tensor_a, const at::Tensor &tensor_b, at::Tensor &tensor_c,
+                            c10::optional<c10::string_view> format_mode,
+                            c10::optional<c10::string_view> quant_mode)
+{
+}
+
 } // namespace meta
 } // namespace vllm_ascend

@@ -172,5 +180,7 @@ TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
    ops.impl("grouped_matmul_swiglu_quant", &vllm_ascend::meta::grouped_matmul_swiglu_quant);
    // Grouped matmul swiglu quant weight nz tensor list
    ops.impl("grouped_matmul_swiglu_quant_weight_nz_tensor_list", &vllm_ascend::meta::grouped_matmul_swiglu_quant_weight_nz_tensor_list_meta);
+    // batch_matmul_transpose
+    ops.impl("batch_matmul_transpose", &vllm_ascend::meta::batch_matmul_transpose);
 }
 }