#include "add_scalar.mluh"
// clang-format off
#include <algorithm>  // std::min (restored: the include target was lost in the original line)
// clang-format on

namespace tmo {
namespace kernels {

// Number of ints that fit in 3/4 of on-chip NRAM.
// NOTE(review): assumes __MLU_NRAM_SIZE__ is expressed in KB (hence the
// `* 1024`) — confirm against the toolchain's definition.
#define ONCHIP_DATA_NUM ((int)(__MLU_NRAM_SIZE__ * 3 / 4 * 1024 / sizeof(int)))

// Per-core NRAM scratch buffer shared by all launches of this kernel.
__nram__ int nram_buffer[ONCHIP_DATA_NUM];

// Adds `scalar` to each of `count` ints, element-wise, src -> dst.
// Block-task kernel: each task processes one ONCHIP_DATA_NUM-sized tile at
// offset taskId * ONCHIP_DATA_NUM. dst/src are GDRAM pointers. Tasks whose
// tile starts past `count` exit early, so over-provisioned launches are safe.
__mlu_global__ void MLUBlockAddScalar(int *dst, int *src, int count,
                                      int scalar) {
  int offset = ONCHIP_DATA_NUM * taskId;
  // Last tile may be partial; negative/zero means this task has no work.
  int deal_num = std::min(ONCHIP_DATA_NUM, count - offset);
  if (deal_num <= 0) {
    return;
  }
  __memcpy(nram_buffer, src + offset, deal_num * sizeof(int), GDRAM2NRAM);
  __bang_add_scalar(nram_buffer, nram_buffer, scalar, deal_num);
  __memcpy(dst + offset, nram_buffer, deal_num * sizeof(int), NRAM2GDRAM);
}

}  // namespace kernels

// Host-side launcher: enqueues MLUBlockAddScalar on `queue` with one BLOCK
// task per ONCHIP_DATA_NUM-sized tile of the input.
//
// Parameters:
//   queue  - CNRT queue the kernel is enqueued on (asynchronous).
//   dst    - device (GDRAM) output buffer, at least `count` ints.
//   src    - device (GDRAM) input buffer, at least `count` ints.
//   count  - number of elements; count <= 0 is a no-op success.
//   scalar - value added to every element.
KernelStatus invokeMLUAddScalar(cnrtQueue_t queue, int *dst, int *src,
                                int count, int scalar) {
  // Guard: a 0-dimension launch is invalid, so bail out before computing dims.
  if (count <= 0) {
    return KernelStatus::KERNEL_STATUS_SUCCESS;
  }
  // Ceil-divide so the tail tile gets its own task.
  uint32_t task_dim = (count + ONCHIP_DATA_NUM - 1) / ONCHIP_DATA_NUM;
  cnrtDim3_t dim{task_dim, 1, 1};
  // Bug fix: the original launch was `<<>>` with no configuration at all,
  // leaving `dim` and `queue` unused. BANG launch syntax is
  // <<<dim, function_type, queue>>>; BLOCK matches the per-task tiling above.
  kernels::MLUBlockAddScalar<<<dim, CNRT_FUNC_TYPE_BLOCK, queue>>>(
      dst, src, count, scalar);
  return KernelStatus::KERNEL_STATUS_SUCCESS;
}

}  // namespace tmo