[Kernel] Add moe normal ops (#4810)

### What this PR does / why we need it? 1. Add the implementation of normal Aclnn operators: MoeCombineNormal, MoeDispatchNormal, NotifyDispatch, and DispatchLayout. - MoeCombineNormal: Implements the combine logic within MoE operations. - MoeDispatchNormal: Implements the dispatch logic within MoE operations. - NotifyDispatch: Exchanges topk_idx information among different ranks to calculate the device memory required for the dispatch stage. - DispatchLayout: Calculates information related to the device memory layout for the dispatch stage. 2. Provide PyTorch interfaces for normal operators—get_dispatch_layout, dispatch_prefill, and combine_prefill—to be used for MoE communication during the prefill stage in vLLM. - get_dispatch_layout: Calculates information related to the device memory layout for the dispatch operator, and is called before dispatch_prefill. - dispatch_prefill: Initiates the dispatch operation. - combine_prefill: Initiates the combine operation. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? The functionality has already been validated using the local Qwen model. Test cases will be added after support for multi-NPU use cases in the CI pipeline is finalized. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: shiro-zzzz <zhangdianhao@huawei.com>
2025-12-10 17:15:28 +08:00
parent c77dca54b2
commit bd8be2e759
39 changed files with 5365 additions and 4 deletions
--- a/csrc/notify_dispatch/op_host/aclnn_notify_dispatch.h
+++ b/csrc/notify_dispatch/op_host/aclnn_notify_dispatch.h
@@ -0,0 +1,61 @@
+
+#ifndef ACLNN_NOTIFY_DISPATCH_H_
+#define ACLNN_NOTIFY_DISPATCH_H_
+
+#include "aclnn/acl_meta.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* function: aclnnNotifyDispatchGetWorkspaceSize (exported with default visibility; returns an aclnnStatus)
+ * parameters :
+ * sendData : required, aclTensor pointer
+ * tokenPerExpertData : required, aclTensor pointer
+ * sendCount : required, 64-bit signed integer
+ * numTokens : required, 64-bit signed integer
+ * commGroup : required, char pointer (presumably a NUL-terminated communication group name - TODO confirm)
+ * rankSize : required, 64-bit signed integer
+ * rankId : required, 64-bit signed integer
+ * localRankSize : required, 64-bit signed integer
+ * localRankId : required, 64-bit signed integer
+ * sendDataOffset : required, aclTensor pointer
+ * recvData : required, aclTensor pointer (declared const here; whether the op writes it is not visible - TODO confirm)
+ * workspaceSize : size of workspace(output); written through the pointer.
+ * executor : executor context(output); written through the pointer.
+ */
+__attribute__((visibility("default")))
+aclnnStatus aclnnNotifyDispatchGetWorkspaceSize(
+    const aclTensor *sendData,
+    const aclTensor *tokenPerExpertData,
+    int64_t sendCount,
+    int64_t numTokens,
+    char *commGroup,
+    int64_t rankSize,
+    int64_t rankId,
+    int64_t localRankSize,
+    int64_t localRankId,
+    const aclTensor *sendDataOffset,
+    const aclTensor *recvData,
+    uint64_t *workspaceSize,
+    aclOpExecutor **executor);
+
+/* function: aclnnNotifyDispatch (exported with default visibility; returns an aclnnStatus)
+ * parameters :
+ * workspace : workspace memory addr(input).
+ * workspaceSize : size of workspace(input); presumably the size reported by aclnnNotifyDispatchGetWorkspaceSize - TODO confirm.
+ * executor : executor context(input); presumably the one produced by aclnnNotifyDispatchGetWorkspaceSize - TODO confirm.
+ * stream : acl stream.
+ */
+__attribute__((visibility("default")))
+aclnnStatus aclnnNotifyDispatch(
+    void *workspace,
+    uint64_t workspaceSize,
+    aclOpExecutor *executor,
+    aclrtStream stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif