Files
enginex-mlu370-vllm/torch_mlu_ops-v1.3.2/csrc/kernels/add_scalar.mlu
2026-02-04 17:39:32 +08:00

29 lines
1.0 KiB
Plaintext

#include "add_scalar.mluh"
// clang-format off
#include <mlu.h>
// clang-format on
namespace tmo {
namespace kernels {
// Number of int elements that fit in three quarters of the NRAM size reported
// by __MLU_NRAM_SIZE__.
// NOTE(review): the "* 1024" factor suggests __MLU_NRAM_SIZE__ is expressed in
// KiB -- confirm against the compiler documentation.
#define ONCHIP_DATA_NUM ((int)(__MLU_NRAM_SIZE__ * 3 / 4 * 1024 / sizeof(int)))
// Staging buffer declared with the __nram__ qualifier; MLUBlockAddScalar copies
// one tile of at most ONCHIP_DATA_NUM ints into it, updates it in place, and
// copies it back out.
__nram__ int nram_buffer[ONCHIP_DATA_NUM];
/**
 * Adds `scalar` to the elements of `src` and stores the sums in `dst`.
 *
 * Each task processes one contiguous tile of at most ONCHIP_DATA_NUM ints,
 * chosen by its taskId: the tile is copied into nram_buffer, updated in place
 * with __bang_add_scalar, and copied back out. A task whose tile would start
 * at or past `count` returns without touching memory.
 *
 * @param dst    destination buffer (written with an NRAM2GDRAM copy)
 * @param src    source buffer (read with a GDRAM2NRAM copy)
 * @param count  total number of ints to process
 * @param scalar value added to every element
 */
__mlu_global__ void MLUBlockAddScalar(int *dst, int *src, int count, int scalar) {
  // First element of the tile owned by this task.
  const int tile_begin = taskId * ONCHIP_DATA_NUM;
  const int remaining = count - tile_begin;
  if (remaining <= 0) {
    return;  // this task's tile lies entirely beyond the end of the data
  }

  // The last tile may be shorter than a full buffer.
  const int tile_len = remaining < ONCHIP_DATA_NUM ? remaining : ONCHIP_DATA_NUM;
  const size_t tile_bytes = tile_len * sizeof(int);

  int *tile_src = src + tile_begin;
  int *tile_dst = dst + tile_begin;

  __memcpy(nram_buffer, tile_src, tile_bytes, GDRAM2NRAM);
  __bang_add_scalar(nram_buffer, nram_buffer, scalar, tile_len);
  __memcpy(tile_dst, nram_buffer, tile_bytes, NRAM2GDRAM);
}
} // namespace kernels
/**
 * Enqueues MLUBlockAddScalar on `queue` so that dst[i] = src[i] + scalar for
 * i in [0, count).
 *
 * The work is split into ceil(count / ONCHIP_DATA_NUM) Block-type tasks, one
 * per tile of ONCHIP_DATA_NUM ints.
 *
 * @param queue  queue the kernel is launched on
 * @param dst    destination buffer, passed through to the kernel
 * @param src    source buffer, passed through to the kernel
 * @param count  number of ints to process; values <= 0 are treated as
 *               "nothing to do" and no kernel is launched
 * @param scalar value added to every element
 * @return KernelStatus::KERNEL_STATUS_SUCCESS in all cases; the launch result
 *         itself is not inspected here.
 */
KernelStatus invokeMLUAddScalar(cnrtQueue_t queue, int *dst, int *src, int count, int scalar) {
  // An empty (or negative) length has no elements to process. Without this
  // guard the task dimension would be 0, or -- for sufficiently negative
  // counts -- a negative quotient converted to a huge uint32_t.
  if (count <= 0) {
    return KernelStatus::KERNEL_STATUS_SUCCESS;
  }

  // Ceil-divide as (count - 1) / N + 1: for count > 0 no intermediate exceeds
  // count, whereas count + N - 1 can overflow int when count is near INT_MAX.
  uint32_t task_dim = static_cast<uint32_t>((count - 1) / ONCHIP_DATA_NUM + 1);
  cnrtDim3_t dim{task_dim, 1, 1};
  kernels::MLUBlockAddScalar<<<dim, cnrtFuncTypeBlock, queue>>>(dst, src, count, scalar);
  return KernelStatus::KERNEL_STATUS_SUCCESS;
}
} // namespace tmo