#ifndef CAM_DATACOPY_GM2GM_H
#define CAM_DATACOPY_GM2GM_H

#include "kernel_operator.h"
#include "comm_args.h"

using namespace AscendC;
using namespace Moe;

// Select the atomic mode applied to subsequent UB->GM copies for element type T.
template <typename T>
FORCE_INLINE_AICORE void SetAtomicOpType(int op)
{
    switch (op) {
        case ADD:
            AscendC::SetAtomicAdd<T>();
            break;
        case MUL:
            // Ignore setting the atomic register when performing mul
            break;
        case MAX:
            AscendC::SetAtomicMax<T>();
            break;
        case MIN:
            AscendC::SetAtomicMin<T>();
            break;
        default:
            AscendC::SetAtomicNone();
            break;
    }
}

// Copy `size` bytes from a UB buffer to GM.
template <typename T>
FORCE_INLINE_AICORE void CpUB2GM(__gm__ T *gmAddr, __ubuf__ T *ubAddr, uint32_t size)
{
    LocalTensor<uint8_t> ubTensor;
    GlobalTensor<uint8_t> gmTensor;
    DataCopyExtParams dataCopyParams(1, size, 0, 0, 0);
    ubTensor.address_.logicPos = static_cast<uint8_t>(TPosition::VECIN);
    ubTensor.address_.bufferAddr = reinterpret_cast<uint64_t>(ubAddr);
    gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr));
    DataCopyPad(gmTensor, ubTensor, dataCopyParams);
}

// Copy `size` bytes from GM to a UB buffer.
template <typename T>
FORCE_INLINE_AICORE void CpGM2UB(__ubuf__ T *ubAddr, __gm__ T *gmAddr, uint32_t size)
{
    LocalTensor<uint8_t> ubTensor;
    GlobalTensor<uint8_t> gmTensor;
    DataCopyExtParams dataCopyParams(1, size, 0, 0, 0);
    ubTensor.address_.logicPos = static_cast<uint8_t>(TPosition::VECIN);
    ubTensor.address_.bufferAddr = reinterpret_cast<uint64_t>(ubAddr);
    gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr));
    DataCopyPadExtParams<uint8_t> padParams;
    DataCopyPad(ubTensor, gmTensor, dataCopyParams, padParams);
}

// Copy `calCount` elements of type T between two UB buffers.
template <typename T>
FORCE_INLINE_AICORE void CopyUB2UB(__ubuf__ T *dst, __ubuf__ T *src, const uint32_t calCount)
{
    LocalTensor<T> srcTensor;
    LocalTensor<T> dstTensor;
    TBuffAddr srcAddr, dstAddr;
    srcAddr.bufferAddr = reinterpret_cast<uint64_t>(src);
    dstAddr.bufferAddr = reinterpret_cast<uint64_t>(dst);
    srcTensor.SetAddr(srcAddr);
    dstTensor.SetAddr(dstAddr);
    DataCopy(dstTensor, srcTensor, calCount);
}

#endif  // CAM_DATACOPY_GM2GM_H
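
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, kept in a comment so it is not compiled):
// a minimal AIV-side routine that stages a tile of GM data through UB and
// writes it back with an atomic accumulate. The function name AtomicAddTile,
// the `stage` buffer, and the `tileBytes` parameter are hypothetical and not
// defined by this header; only CpGM2UB, CpUB2GM, SetAtomicOpType, and the ADD
// opcode come from it.
//
//   template <typename T>
//   FORCE_INLINE_AICORE void AtomicAddTile(__gm__ T *dst, __gm__ T *src,
//                                          __ubuf__ T *stage, uint32_t tileBytes)
//   {
//       CpGM2UB(stage, src, tileBytes);        // GM -> UB staging copy
//       AscendC::PipeBarrier<PIPE_ALL>();      // wait for the inbound copy to land
//       SetAtomicOpType<T>(ADD);               // enable atomic-add for the UB -> GM path
//       CpUB2GM(dst, stage, tileBytes);        // UB -> GM with atomic accumulate
//       AscendC::SetAtomicNone();              // restore the default (non-atomic) copy mode
//   }
// ---------------------------------------------------------------------------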