[OPS] add bmm_transpose ops (#3990)

### What this PR does / why we need it?
Add a new fusion ops to custom_op, which can cobime the torch.bmm() and
transpsose to achieve better peformance. This ops is used in mla_v1 to
replace the bmm and transpose

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?


- vLLM version: v0.11.2

---------

Signed-off-by: hust17yixuan <303660421@qq.com>
This commit is contained in:
Wang Yixuan
2025-12-01 09:09:51 +08:00
committed by GitHub
parent bc67696a02
commit c68ddc11ce
15 changed files with 1737 additions and 14 deletions

View File

@@ -151,6 +151,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> grouped_matmul_swiglu_quant_weigh
return std::tuple<at::Tensor, at::Tensor, at::Tensor>(output, output_scale, output_offset);
}
void batch_matmul_transpose(const at::Tensor &tensor_a, const at::Tensor &tensor_b, at::Tensor &tensor_c,
c10::optional<c10::string_view> format_mode,
c10::optional<c10::string_view> quant_mode)
{
return;
}
} // namespace meta
} // namespace vllm_ascend
@@ -172,5 +180,7 @@ TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
ops.impl("grouped_matmul_swiglu_quant", &vllm_ascend::meta::grouped_matmul_swiglu_quant);
// Grouped matmul swiglu quant weight nz tensor list
ops.impl("grouped_matmul_swiglu_quant_weight_nz_tensor_list", &vllm_ascend::meta::grouped_matmul_swiglu_quant_weight_nz_tensor_list_meta);
// batch_matmul_transpose
ops.impl("batch_matmul_transpose", &vllm_ascend::meta::batch_matmul_transpose);
}
}