[MOE] commit GMM custom operator (#7010)

### What this PR does / why we need it?
GMM custom operator optimization in small batch scenarios

### How was this patch tested?
Submitted the GMM custom operator for subsequent integration into the MoE
process.


- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

---------

Signed-off-by: chenxi-hh <chen464822955@163.com>
Signed-off-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
This commit is contained in:
chenxi-hh
2026-03-09 09:56:31 +08:00
committed by GitHub
parent 01d3515dcf
commit 737dfcf638
16 changed files with 1214 additions and 3 deletions

View File

@@ -457,6 +457,35 @@ void transpose_kv_cache_by_block_meta(
{
return;
}
/// Meta (shape-inference) implementation of the `moe_grouped_matmul` custom op.
///
/// Runs on the Meta dispatch key: it only allocates an output tensor with the
/// correct shape/dtype so torch.compile / fake-tensor tracing can propagate
/// shapes, without executing the real NPU kernel.
///
/// @param x               Grouped input activations; x[0] provides m (rows) and
///                        the output dtype/device options.
/// @param weight          Grouped weights; stored non-transposed, so the output
///                        column count n is weight[0].sizes()[2].
///                        (NOTE(review): the original code had a hard-coded
///                        `transpose_weight = false`, making the sizes()[1]
///                        branch dead — removed here.)
/// @param group_list      Per-group split metadata (unused for shape inference).
/// @param split_item      Split mode flag (unused for shape inference).
/// @param group_type      Grouping mode flag (unused for shape inference).
/// @param group_list_type Encoding of group_list (unused for shape inference).
/// @return A single-element vector holding a zero tensor of shape {m, n}.
std::vector<at::Tensor> moe_grouped_matmul_meta(
    at::Tensor x,
    at::Tensor weight,
    const at::Tensor& group_list,
    int64_t split_item,
    int64_t group_type,
    int64_t group_list_type
)
{
    // Output options mirror the input's dtype/device.
    c10::TensorOptions options = x[0].options().dtype(x[0].scalar_type());
    const auto m = x[0].sizes()[0];
    // Weights are non-transposed ([groups, k, n]) — presumably; the real
    // kernel's layout should be confirmed if transpose support is ever added.
    const auto n = weight[0].sizes()[2];

    std::vector<at::Tensor> y;
    y.emplace_back(at::zeros(at::IntArrayRef{m, n}, options));
    return y;
}
} // namespace meta
} // namespace vllm_ascend
@@ -498,5 +527,7 @@ TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
ops.impl("npu_add_rms_norm_bias", &vllm_ascend::meta::npu_add_rms_norm_bias_meta);
// transpose_kv_cache_by_block
ops.impl("transpose_kv_cache_by_block", &vllm_ascend::meta::transpose_kv_cache_by_block_meta);
// moe_grouped_matmul
ops.impl("moe_grouped_matmul", &vllm_ascend::meta::moe_grouped_matmul_meta);
}
}