[Kernel] Add moe normal ops (#4810)
### What this PR does / why we need it?
1. Add the implementation of normal Aclnn operators: MoeCombineNormal,
MoeDispatchNormal, NotifyDispatch, and DispatchLayout.
- MoeCombineNormal: Implements the combine logic within MoE operations.
- MoeDispatchNormal: Implements the dispatch logic within MoE
operations.
- NotifyDispatch: Exchanges topk_idx information among different ranks
to calculate the device memory required for the dispatch stage.
- DispatchLayout: Used to calculate information related to the device
memory layout for the dispatch stage.
2. Provide PyTorch interfaces for the normal operators — get_dispatch_layout,
dispatch_prefill, and combine_prefill — to be used for MoE communication
during the prefill stage in vLLM.
- get_dispatch_layout: Calculates information related to the device
memory layout for the dispatch operator, and is called before
dispatch_prefill.
- dispatch_prefill: Initiates the dispatch operation.
- combine_prefill: Initiates the combine operation.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
The functionality has already been validated using the local Qwen model.
Test cases will be added after support for multi-NPU use cases in the CI
pipeline is finalized.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: shiro-zzzz <zhangdianhao@huawei.com>
This commit is contained in:
84
csrc/notify_dispatch/op_host/aclnn_notify_dispatch.cpp
Normal file
84
csrc/notify_dispatch/op_host/aclnn_notify_dispatch.cpp
Normal file
@@ -0,0 +1,84 @@
|
||||
#include <string.h>
|
||||
#include "graph/types.h"
|
||||
#include "aclnn_notify_dispatch.h"
|
||||
|
||||
extern void NnopbaseOpLogE(const aclnnStatus code, const char *const expr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Backend that services HCCL communication for an nnopbase operator launch.
// Values are fixed by the nnopbase runtime ABI — do not reorder.
enum NnopbaseHcclServerType {
    NNOPBASE_HCCL_SERVER_TYPE_AICPU = 0,  // communication driven by AICPU
    NNOPBASE_HCCL_SERVER_TYPE_MTE = 1,    // communication driven by the MTE engine
    NNOPBASE_HCCL_SERVER_TYPE_END = 2     // sentinel: number of valid server types
};
|
||||
extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);
|
||||
|
||||
extern aclnnStatus aclnnInnerNotifyDispatchGetWorkspaceSize(
|
||||
const aclTensor *sendData,
|
||||
const aclTensor *tokenPerExpertData,
|
||||
int64_t sendCount,
|
||||
int64_t numTokens,
|
||||
char *commGroup,
|
||||
int64_t rankSize,
|
||||
int64_t rankId,
|
||||
int64_t localRankSize,
|
||||
int64_t localRankId,
|
||||
const aclTensor *sendDataOffset,
|
||||
const aclTensor *recvData,
|
||||
uint64_t *workspaceSize,
|
||||
aclOpExecutor **executor);
|
||||
|
||||
extern aclnnStatus aclnnInnerNotifyDispatch(
|
||||
void *workspace,
|
||||
uint64_t workspaceSize,
|
||||
aclOpExecutor *executor,
|
||||
aclrtStream stream);
|
||||
|
||||
/**
 * Public two-phase entry: computes the workspace size and builds the executor
 * for the NotifyDispatch operator (topk_idx exchange across ranks).
 *
 * Thin pass-through to aclnnInnerNotifyDispatchGetWorkspaceSize; all argument
 * semantics are defined by the inner implementation.
 *
 * @param sendData          tokens/data to be sent by this rank
 * @param tokenPerExpertData per-expert token counts on this rank
 * @param sendCount         number of elements to send
 * @param numTokens         number of tokens on this rank
 * @param commGroup         HCCL communication group name
 * @param rankSize          total number of ranks in the group
 * @param rankId            this rank's global id
 * @param localRankSize     number of ranks on the local node
 * @param localRankId       this rank's id within the local node
 * @param sendDataOffset    output: offsets of the sent data
 * @param recvData          output: received per-rank layout information
 * @param workspaceSize     output: required device workspace in bytes
 * @param executor          output: executor consumed by aclnnNotifyDispatch
 * @return aclnnStatus from the inner implementation
 */
aclnnStatus aclnnNotifyDispatchGetWorkspaceSize(const aclTensor *sendData, const aclTensor *tokenPerExpertData,
                                                int64_t sendCount, int64_t numTokens, char *commGroup,
                                                int64_t rankSize, int64_t rankId, int64_t localRankSize,
                                                int64_t localRankId, const aclTensor *sendDataOffset,
                                                const aclTensor *recvData, uint64_t *workspaceSize,
                                                aclOpExecutor **executor)
{
    return aclnnInnerNotifyDispatchGetWorkspaceSize(sendData, tokenPerExpertData, sendCount, numTokens, commGroup,
                                                    rankSize, rankId, localRankSize, localRankId, sendDataOffset,
                                                    recvData, workspaceSize, executor);
}
|
||||
|
||||
/**
 * Public launch entry for the NotifyDispatch operator.
 *
 * If the weak nnopbase hook is linked in, routes the operator's HCCL
 * communication through the MTE server before delegating to the inner
 * implementation.
 *
 * @param workspace     device workspace (size from the GetWorkspaceSize call)
 * @param workspaceSize size of @p workspace in bytes
 * @param executor      executor produced by aclnnNotifyDispatchGetWorkspaceSize
 * @param stream        stream on which the operator is launched
 * @return aclnnStatus from the inner implementation
 */
aclnnStatus aclnnNotifyDispatch(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream)
{
    // Weak symbol: only present when the nnopbase runtime provides the hook.
    if (NnopbaseSetHcclServerType != nullptr) {
        NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_MTE);
    }
    return aclnnInnerNotifyDispatch(workspace, workspaceSize, executor, stream);
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user