forked from EngineX-Cambricon/enginex-mlu370-vllm
add ops
This commit is contained in:
28
torch_mlu_ops-v1.3.2/csrc/kernels/add_scalar.mlu
Normal file
28
torch_mlu_ops-v1.3.2/csrc/kernels/add_scalar.mlu
Normal file
@@ -0,0 +1,28 @@
|
||||
#include "add_scalar.mluh"
|
||||
// clang-format off
|
||||
#include <mlu.h>
|
||||
// clang-format on
|
||||
|
||||
namespace tmo {
|
||||
namespace kernels {
|
||||
|
||||
// Number of int elements in the on-chip staging buffer: three quarters of
// __MLU_NRAM_SIZE__, scaled by 1024 and divided by sizeof(int), cast to int.
// NOTE(review): the `* 1024` implies __MLU_NRAM_SIZE__ is expressed in KiB — confirm.
#define ONCHIP_DATA_NUM ((int)(__MLU_NRAM_SIZE__ * 3 / 4 * 1024 / sizeof(int)))
|
||||
// Staging buffer of ONCHIP_DATA_NUM ints declared with the __nram__ qualifier.
// MLUBlockAddScalar copies one chunk of the input into it, updates it in place,
// and copies it back out to the destination.
__nram__ int nram_buffer[ONCHIP_DATA_NUM];
|
||||
|
||||
// Applies __bang_add_scalar with `scalar` to one chunk of `src` and writes the
// result to the same positions of `dst`.
//
// Each task owns the chunk that starts at element ONCHIP_DATA_NUM * taskId and
// spans at most ONCHIP_DATA_NUM ints; the last chunk is shortened to fit `count`.
// Tasks whose chunk starts at or beyond `count` exit without touching memory.
//
//   dst    - output array, at least `count` ints
//   src    - input array, at least `count` ints
//   count  - total number of elements to process
//   scalar - value passed to __bang_add_scalar for every element
__mlu_global__ void MLUBlockAddScalar(int *dst, int *src, int count, int scalar) {
  const int chunk_begin = ONCHIP_DATA_NUM * taskId;
  const int remaining = count - chunk_begin;

  // This task's chunk lies entirely past the end of the data.
  if (remaining <= 0) {
    return;
  }

  // Clamp the chunk to the staging buffer capacity.
  const int chunk_len = remaining < ONCHIP_DATA_NUM ? remaining : ONCHIP_DATA_NUM;
  const auto chunk_bytes = chunk_len * sizeof(int);

  // Load the chunk, update it in place in the staging buffer, then store it.
  __memcpy(nram_buffer, src + chunk_begin, chunk_bytes, GDRAM2NRAM);
  __bang_add_scalar(nram_buffer, nram_buffer, scalar, chunk_len);
  __memcpy(dst + chunk_begin, nram_buffer, chunk_bytes, NRAM2GDRAM);
}
|
||||
} // namespace kernels
|
||||
|
||||
// Enqueues kernels::MLUBlockAddScalar on `queue` to process `count` ints from
// `src` into `dst` with the given `scalar`.
//
// One Block-type task is launched per ONCHIP_DATA_NUM-element chunk (rounded
// up), which matches how the kernel derives its chunk from taskId.
//
//   queue  - CNRT queue the kernel is launched on
//   dst    - output array, at least `count` ints
//   src    - input array, at least `count` ints
//   count  - number of elements to process; a non-positive value is a no-op
//   scalar - value forwarded to the kernel
//
// Returns KERNEL_STATUS_SUCCESS. The launch is only enqueued: this function
// neither synchronizes the queue nor checks the launch status.
KernelStatus invokeMLUAddScalar(cnrtQueue_t queue, int *dst, int *src, int count, int scalar) {
  // Nothing to process. This also avoids a zero-sized task dimension when
  // count == 0 and a wrapped-around uint32_t task count when count is negative.
  if (count <= 0) {
    return KernelStatus::KERNEL_STATUS_SUCCESS;
  }

  // Ceil-divide in 64 bits: `count + ONCHIP_DATA_NUM - 1` overflows int when
  // count is close to INT_MAX.
  const int64_t chunk_elems = ONCHIP_DATA_NUM;
  const uint32_t task_dim =
      static_cast<uint32_t>((static_cast<int64_t>(count) + chunk_elems - 1) / chunk_elems);

  cnrtDim3_t dim{task_dim, 1, 1};
  kernels::MLUBlockAddScalar<<<dim, cnrtFuncTypeBlock, queue>>>(dst, src, count, scalar);
  return KernelStatus::KERNEL_STATUS_SUCCESS;
}
|
||||
} // namespace tmo
|
||||
Reference in New Issue
Block a user