Revert "GMM custom operator optimization in small batch scenarios (vllm-project#7100)" (#7557)
### What this PR does / why we need it? This reverts commit 42bcad7e9b. The commit caused an accuracy decrease for qwen3Next (150 items of gsm8k, 98 -> 91). - vLLM version: v0.18.0 - vLLM main: 6a9cceb219 Signed-off-by: Your Name <you@example.com> Co-authored-by: Your Name <you@example.com>
This commit is contained in:
@@ -691,7 +691,7 @@ std::vector<at::Tensor> moe_grouped_matmul(
|
||||
y.emplace_back(y_0);
|
||||
at::TensorList result = at::TensorList(y);
|
||||
|
||||
EXEC_NPU_CMD(aclnnMoeGroupedMatmul,
|
||||
EXEC_NPU_CMD(aclnnMoeGroupedMatmulWeightNz,
|
||||
x_list, weight_list, group_list, transpose_weight, result);
|
||||
|
||||
return y;
|
||||
|
||||
Reference in New Issue
Block a user