[MOE] commit GMM custom operator (#7010)
### What this PR does / why we need it?
Optimizes the GMM (grouped matrix multiplication) custom operator for small-batch scenarios.
### How was this patch tested?
Submit the GMM custom operator for subsequent integration into the MOE
process.
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
---------
Signed-off-by: chenxi-hh <chen464822955@163.com>
Signed-off-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
This commit is contained in:
@@ -457,6 +457,35 @@ void transpose_kv_cache_by_block_meta(
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/// Meta (shape-inference only) implementation of the `moe_grouped_matmul` op.
///
/// Registered on the Meta dispatch key so torch.compile / fake-tensor tracing
/// can derive the output shape without touching the NPU kernel.
///
/// @param x                activation tensor; row count gives the output M.
///                         NOTE(review): assumed [m, k] — confirm against the
///                         NPU kernel's expected layout.
/// @param weight           expert weight tensor; last dim gives the output N.
///                         NOTE(review): assumed [num_groups, k, n] — confirm.
/// @param group_list       per-group row counts; irrelevant to the fused
///                         output shape, so unused here.
/// @param split_item       kernel partitioning mode; shape-irrelevant here.
/// @param group_type       kernel grouping mode; shape-irrelevant here.
/// @param group_list_type  group_list encoding; shape-irrelevant here.
/// @return a single-element vector holding one meta tensor of shape [m, n]
///         with x's dtype/device options.
std::vector<at::Tensor> moe_grouped_matmul_meta(
    at::Tensor x,
    at::Tensor weight,
    const at::Tensor& group_list,
    int64_t split_item,
    int64_t group_type,
    int64_t group_list_type
)
{
    // The kernel is always launched with a non-transposed weight layout;
    // keep the flag (and the transposed branch) so this stays in sync with
    // the kernel-side convention if that ever changes.
    const bool transpose_weight = false;

    // Fix: the original indexed with x[0]/weight[0], which *slices* the
    // tensors rather than indexing a list — x[0].sizes()[0] is x.size(1)
    // (K, not M), and weight[0].sizes()[2] reads past the end of the 2-D
    // slice's sizes(). Index the tensors directly instead.
    const int64_t m = x.sizes()[0];
    const int64_t n = transpose_weight ? weight.sizes()[1] : weight.sizes()[2];

    // Silence unused-parameter warnings: these control how the real kernel
    // partitions work, not the fused output shape.
    (void)group_list;
    (void)split_item;
    (void)group_type;
    (void)group_list_type;

    // On the Meta device at::zeros allocates no storage — only the
    // shape/dtype/device metadata matters. x.options() already carries
    // x's scalar type, so no explicit .dtype() is needed.
    std::vector<at::Tensor> y;
    y.emplace_back(at::zeros({m, n}, x.options()));
    return y;
}
|
||||
|
||||
} // namespace meta
|
||||
} // namespace vllm_ascend
|
||||
|
||||
@@ -498,5 +527,7 @@ TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
|
||||
ops.impl("npu_add_rms_norm_bias", &vllm_ascend::meta::npu_add_rms_norm_bias_meta);
|
||||
// transpose_kv_cache_by_block
|
||||
ops.impl("transpose_kv_cache_by_block", &vllm_ascend::meta::transpose_kv_cache_by_block_meta);
|
||||
// moe_grouped_matmul
|
||||
ops.impl("moe_grouped_matmul", &vllm_ascend::meta::moe_grouped_matmul_meta);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user