[Kernel] add custom moe ops for prefill (#4194)

### What this PR does / why we need it? 1. Add the implementation of the normal aclnn operators: MoeCombineNormal, MoeDispatchNormal, NotifyDispatch, and DispatchLayout. - MoeCombineNormal: Implements the combine logic within MoE operations. - MoeDispatchNormal: Implements the dispatch logic within MoE operations. - NotifyDispatch: Exchanges topk_idx information among different ranks to calculate the device memory required for the dispatch stage. - DispatchLayout: Used to calculate information related to the device memory layout for the dispatch stage. 2. Provide PyTorch interfaces for the normal operators—get_dispatch_layout, dispatch_prefill, and combine_prefill—to be used for MoE communication during the prefill stage in vLLM. - get_dispatch_layout: Calculates information related to the device memory layout for the dispatch operator, and is called before dispatch_prefill. - dispatch_prefill: Initiates the dispatch operation. - combine_prefill: Initiates the combine operation. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? The functionality has already been validated using a local Qwen model. Test cases will be added after support for multi-NPU use cases in the CI pipeline is finalized. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: shiro-zzzz <zhangdianhao@huawei.com>
2025-12-08 19:11:58 +08:00
parent f0876b5d88
commit 0617d7d394
39 changed files with 5562 additions and 2 deletions
--- a/csrc/dispatch_layout/op_host/aclnn_dispatch_layout.cpp
+++ b/csrc/dispatch_layout/op_host/aclnn_dispatch_layout.cpp
@@ -0,0 +1,64 @@
+#include <string.h>
+#include "graph/types.h"
+#include "aclnn_dispatch_layout.h"
+
+enum NnopbaseHcclServerType {  // Selector type taken by NnopbaseSetHcclServerType()'s sType parameter.
+    NNOPBASE_HCCL_SERVER_TYPE_AICPU = 0,  // NOTE(review): presumably "HCCL served by AICPU" -- confirm against nnopbase.
+    NNOPBASE_HCCL_SERVER_TYPE_MTE,  // Implicitly 1; the value aclnnDispatchLayout() passes for its executor.
+    NNOPBASE_HCCL_SERVER_TYPE_END  // Implicitly 2; NOTE(review): looks like an end-of-range sentinel -- confirm.
+};
+extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);  // Declared weak; aclnnDispatchLayout() tests the symbol for non-null before calling it.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern aclnnStatus aclnnInnerDispatchLayoutGetWorkspaceSize(  // Declared here, not defined in this file; aclnnDispatchLayoutGetWorkspaceSize() forwards to it.
+    const aclTensor *topkIdx,
+    int64_t numTokens,
+    int64_t numRanks,
+    int64_t numExperts,
+    int64_t numTopk,
+    const aclTensor *numTokensPerRank,
+    const aclTensor *numTokensPerExpert,
+    const aclTensor *isTokenInRank,
+    uint64_t *workspaceSize,
+    aclOpExecutor **executor);  // Parameter list is identical to the public wrapper's.
+
+extern aclnnStatus aclnnInnerDispatchLayout(  // Declared here, not defined in this file; aclnnDispatchLayout() forwards to it.
+    void *workspace,
+    uint64_t workspaceSize,
+    aclOpExecutor *executor,
+    aclrtStream stream);  // Parameter list is identical to the public wrapper's.
+
+aclnnStatus aclnnDispatchLayoutGetWorkspaceSize(  // Public entry point: forwards every argument unchanged to the inner GetWorkspaceSize function.
+    const aclTensor *topkIdx,
+    int64_t numTokens,
+    int64_t numRanks,
+    int64_t numExperts,
+    int64_t numTopk,
+    const aclTensor *numTokensPerRank,  // NOTE(review): const here, yet the names suggest these three tensors are results -- confirm.
+    const aclTensor *numTokensPerExpert,
+    const aclTensor *isTokenInRank,
+    uint64_t *workspaceSize,  // Passed through by pointer to the inner call.
+    aclOpExecutor **executor)  // Passed through by pointer to the inner call.
+{
+    return aclnnInnerDispatchLayoutGetWorkspaceSize(topkIdx, numTokens, numRanks, numExperts, numTopk, numTokensPerRank,  // No validation or extra work is done here.
+                                                    numTokensPerExpert, isTokenInRank, workspaceSize, executor);  // The inner status is returned as-is.
+}
+
+aclnnStatus aclnnDispatchLayout(  // Public entry point: optionally sets the HCCL server type, then forwards to the inner op.
+    void *workspace,
+    uint64_t workspaceSize,
+    aclOpExecutor *executor,
+    aclrtStream stream)
+{
+    if (NnopbaseSetHcclServerType) {  // The symbol is declared weak, so it is tested for non-null before being called.
+        NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_MTE);  // Sets the MTE server type on this executor; skipped when the setter is absent.
+    }
+    return aclnnInnerDispatchLayout(workspace, workspaceSize, executor, stream);  // Arguments forwarded unchanged; inner status returned as-is.
+}
+
+#ifdef __cplusplus
+}
+#endif