29 lines
1.0 KiB
Plaintext
29 lines
1.0 KiB
Plaintext
#include "add_scalar.mluh"
|
|
// clang-format off
|
|
#include <mlu.h>
|
|
// clang-format on
|
|
|
|
namespace tmo {
|
|
namespace kernels {
|
|
|
|
// Number of int elements processed per task: three quarters of
// __MLU_NRAM_SIZE__, scaled by 1024 and divided by sizeof(int).
// NOTE(review): the * 1024 suggests __MLU_NRAM_SIZE__ is expressed in KiB —
// confirm against the toolchain documentation.
#define ONCHIP_DATA_NUM ((int)(__MLU_NRAM_SIZE__ * 3 / 4 * 1024 / sizeof(int)))
|
|
// On-chip (__nram__) staging buffer holding up to ONCHIP_DATA_NUM ints; the
// kernel below loads a slice into it, updates it in place, and stores it back.
__nram__ int nram_buffer[ONCHIP_DATA_NUM];
|
|
|
|
// Adds `scalar` to one slice of `src` and writes the result to `dst`.
//
// The slice handled by a task starts at element ONCHIP_DATA_NUM * taskId and
// is at most ONCHIP_DATA_NUM elements long. Tasks whose slice would start at
// or beyond `count` return without touching memory.
//
//   dst    - output array (copied to with NRAM2GDRAM)
//   src    - input array (copied from with GDRAM2NRAM)
//   count  - total number of int elements in src/dst
//   scalar - value passed to __bang_add_scalar for every element of the slice
//
// NOTE(review): __bang_add_scalar may impose element-count alignment
// requirements on some architectures — confirm for the final partial slice.
__mlu_global__ void MLUBlockAddScalar(int *dst, int *src, int count, int scalar) {
  const int begin = ONCHIP_DATA_NUM * taskId;
  const int remaining = count - begin;

  // Nothing left for this task.
  if (remaining <= 0) {
    return;
  }

  // Clamp the slice to what fits in nram_buffer.
  const int chunk = (remaining < ONCHIP_DATA_NUM) ? remaining : ONCHIP_DATA_NUM;
  const auto bytes = chunk * sizeof(int);

  // Load the slice, add in place, store the slice.
  __memcpy(nram_buffer, src + begin, bytes, GDRAM2NRAM);
  __bang_add_scalar(nram_buffer, nram_buffer, scalar, chunk);
  __memcpy(dst + begin, nram_buffer, bytes, NRAM2GDRAM);
}
|
|
} // namespace kernels
|
|
|
|
// Enqueues kernels::MLUBlockAddScalar on `queue` so that every element of
// `src[0..count)` has `scalar` added and is written to `dst`.
//
// One Block-type task is launched per ONCHIP_DATA_NUM elements (rounded up).
//
//   queue  - queue the kernel is launched on
//   dst    - output array, forwarded to the kernel
//   src    - input array, forwarded to the kernel
//   count  - number of int elements; values <= 0 are treated as "nothing to do"
//   scalar - value to add, forwarded to the kernel
//
// Returns KernelStatus::KERNEL_STATUS_SUCCESS in all cases.
// NOTE(review): the launch result is not checked here — consider surfacing
// launch errors if the runtime exposes them.
KernelStatus invokeMLUAddScalar(cnrtQueue_t queue, int *dst, int *src, int count, int scalar) {
  // With count == 0 the task dimension would be 0, and a sufficiently negative
  // count would produce a negative quotient that wraps when stored in
  // uint32_t. The kernel itself treats such inputs as a no-op, so skip the
  // launch entirely.
  if (count <= 0) {
    return KernelStatus::KERNEL_STATUS_SUCCESS;
  }

  // Ceil-division written as (count - 1) / N + 1 so that the intermediate
  // never exceeds count; count + N - 1 could overflow int for large counts.
  const uint32_t task_dim = static_cast<uint32_t>((count - 1) / ONCHIP_DATA_NUM + 1);

  cnrtDim3_t dim{task_dim, 1, 1};
  kernels::MLUBlockAddScalar<<<dim, cnrtFuncTypeBlock, queue>>>(dst, src, count, scalar);
  return KernelStatus::KERNEL_STATUS_SUCCESS;
}
|
|
} // namespace tmo
|