[MOE] commit GMM custom operator (#7010)
### What this PR does / why we need it?
Optimizes the GMM (grouped matrix multiplication) custom operator for small-batch scenarios.
### How was this patch tested?
Submit the GMM custom operator for subsequent integration into the MOE
process.
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
---------
Signed-off-by: chenxi-hh <chen464822955@163.com>
Signed-off-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
This commit is contained in:
@@ -457,6 +457,35 @@ void transpose_kv_cache_by_block_meta(
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/// Meta (shape-inference only) implementation of the `moe_grouped_matmul` op.
///
/// Registered on the Meta dispatch key so torch.compile / fake-tensor tracing
/// can derive the output shape without touching the NPU kernel.
///
/// @param x                activation tensor; row count gives the output M.
///                         NOTE(review): assumed [m, k] — confirm against the
///                         NPU kernel's expected layout.
/// @param weight           expert weight tensor; last dim gives the output N.
///                         NOTE(review): assumed [num_groups, k, n] — confirm.
/// @param group_list       per-group row counts; irrelevant to the fused
///                         output shape, so unused here.
/// @param split_item       kernel partitioning mode; shape-irrelevant here.
/// @param group_type       kernel grouping mode; shape-irrelevant here.
/// @param group_list_type  group_list encoding; shape-irrelevant here.
/// @return a single-element vector holding one meta tensor of shape [m, n]
///         with x's dtype/device options.
std::vector<at::Tensor> moe_grouped_matmul_meta(
    at::Tensor x,
    at::Tensor weight,
    const at::Tensor& group_list,
    int64_t split_item,
    int64_t group_type,
    int64_t group_list_type
)
{
    // The kernel is always launched with a non-transposed weight layout;
    // keep the flag (and the transposed branch) so this stays in sync with
    // the kernel-side convention if that ever changes.
    const bool transpose_weight = false;

    // Fix: the original indexed with x[0]/weight[0], which *slices* the
    // tensors rather than indexing a list — x[0].sizes()[0] is x.size(1)
    // (K, not M), and weight[0].sizes()[2] reads past the end of the 2-D
    // slice's sizes(). Index the tensors directly instead.
    const int64_t m = x.sizes()[0];
    const int64_t n = transpose_weight ? weight.sizes()[1] : weight.sizes()[2];

    // Silence unused-parameter warnings: these control how the real kernel
    // partitions work, not the fused output shape.
    (void)group_list;
    (void)split_item;
    (void)group_type;
    (void)group_list_type;

    // On the Meta device at::zeros allocates no storage — only the
    // shape/dtype/device metadata matters. x.options() already carries
    // x's scalar type, so no explicit .dtype() is needed.
    std::vector<at::Tensor> y;
    y.emplace_back(at::zeros({m, n}, x.options()));
    return y;
}
|
||||
|
||||
} // namespace meta
|
||||
} // namespace vllm_ascend
|
||||
|
||||
@@ -498,5 +527,7 @@ TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
|
||||
ops.impl("npu_add_rms_norm_bias", &vllm_ascend::meta::npu_add_rms_norm_bias_meta);
|
||||
// transpose_kv_cache_by_block
|
||||
ops.impl("transpose_kv_cache_by_block", &vllm_ascend::meta::transpose_kv_cache_by_block_meta);
|
||||
// moe_grouped_matmul
|
||||
ops.impl("moe_grouped_matmul", &vllm_ascend::meta::moe_grouped_matmul_meta);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user