CANN: Fix type float_t to float (#15736)

Signed-off-by: noemotiovon <757486878@qq.com>
This commit is contained in:
Chenguang Li
2025-09-03 10:43:53 +08:00
committed by GitHub
parent 3de008208b
commit 8a2234ea0c

View File

@@ -1767,10 +1767,10 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
case GGML_TYPE_F16: { case GGML_TYPE_F16: {
aclTensor* acl_src0 = ggml_cann_create_tensor(src0); aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
ggml_cann_pool_alloc src_buffer_allocator( ggml_cann_pool_alloc src_buffer_allocator(
ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); ctx.pool(), ggml_nelements(src0) * sizeof(float));
void* src_trans_buffer = src_buffer_allocator.get(); void* src_trans_buffer = src_buffer_allocator.get();
size_t src_trans_nb[GGML_MAX_DIMS]; size_t src_trans_nb[GGML_MAX_DIMS];
src_trans_nb[0] = sizeof(float_t); src_trans_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
} }
@@ -1814,14 +1814,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// [3,4,5,64] -> [3,4,5,2,32] // [3,4,5,64] -> [3,4,5,2,32]
dequant_ne = weight_ne; dequant_ne = weight_ne;
dequant_nb[0] = sizeof(float_t); dequant_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) { for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1]; dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
} }
scale_offset = ggml_nelements(src0) * sizeof(int8_t); scale_offset = ggml_nelements(src0) * sizeof(int8_t);
ggml_cann_pool_alloc dequant_buffer_allocator( ggml_cann_pool_alloc dequant_buffer_allocator(
ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); ctx.pool(), ggml_nelements(src0) * sizeof(float));
aclTensor* acl_weight_tensor = ggml_cann_create_tensor( aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
@@ -1830,11 +1830,11 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb, src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
aclTensor* dequant_tensor = ggml_cann_create_tensor( aclTensor* dequant_tensor = ggml_cann_create_tensor(
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t), dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
dequant_ne, dequant_nb, GGML_MAX_DIMS + 1); dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
dequant_nb[0] = sizeof(float_t); dequant_nb[0] = sizeof(float);
dequant_ne = src0->ne; dequant_ne = src0->ne;
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1]; dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
@@ -2282,8 +2282,8 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
int64_t theta_scale_length = src0->ne[0] / 2; int64_t theta_scale_length = src0->ne[0] / 2;
int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1}; int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float),
theta_scale_length * sizeof(float_t)}; theta_scale_length * sizeof(float)};
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
int64_t position_length = src1->ne[0]; int64_t position_length = src1->ne[0];
@@ -2293,7 +2293,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1}; int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
size_t theta_nb[GGML_MAX_DIMS]; size_t theta_nb[GGML_MAX_DIMS];
theta_nb[0] = sizeof(float_t); theta_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
} }
@@ -2314,10 +2314,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
if (ctx.rope_cache.theta_scale_cache != nullptr) { if (ctx.rope_cache.theta_scale_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache)); ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
} }
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
acl_theta_scale_tensor = acl_theta_scale_tensor =
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
float start = 0; float start = 0;
@@ -2383,20 +2383,20 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
} else { } else {
// use cache // use cache
acl_theta_scale_tensor = acl_theta_scale_tensor =
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
} }
ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool()); ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
// freq_factors // freq_factors
if (src2) { if (src2) {
freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float_t)); freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
void* freq_fac_res_ptr = freq_fac_res_allocator.get(); void* freq_fac_res_ptr = freq_fac_res_allocator.get();
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
src2->data, ggml_cann_type_mapping(src2->type), src2->data, ggml_cann_type_mapping(src2->type),
ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
aclTensor* acl_freq_fac_res_tensor = ggml_cann_create_tensor( aclTensor* acl_freq_fac_res_tensor = ggml_cann_create_tensor(
freq_fac_res_ptr, ACL_FLOAT, sizeof(float_t), freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, acl_freq_fac_res_tensor); aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor); std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
@@ -2411,29 +2411,29 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
// power * position // power * position
int64_t theta_length = theta_scale_length * position_length; int64_t theta_length = theta_scale_length * position_length;
ggml_cann_pool_alloc theta_allocator(ctx.pool(), ggml_cann_pool_alloc theta_allocator(ctx.pool(),
theta_length * sizeof(float_t)); theta_length * sizeof(float));
void* theta_buffer = theta_allocator.get(); void* theta_buffer = theta_allocator.get();
aclTensor* acl_theta_tensor = aclTensor* acl_theta_tensor =
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float),
theta_ne, theta_nb, GGML_MAX_DIMS); theta_ne, theta_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
acl_theta_tensor); acl_theta_tensor);
// sin/cos // sin/cos
ggml_cann_pool_alloc sin_allocator(ctx.pool(), ggml_cann_pool_alloc sin_allocator(ctx.pool(),
theta_length * sizeof(float_t)); theta_length * sizeof(float));
void* sin_buffer = sin_allocator.get(); void* sin_buffer = sin_allocator.get();
aclTensor* acl_sin_tensor = ggml_cann_create_tensor( aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
GGML_MAX_DIMS, ACL_FORMAT_ND); GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
ggml_cann_pool_alloc cos_allocator(ctx.pool(), ggml_cann_pool_alloc cos_allocator(ctx.pool(),
theta_length * sizeof(float_t)); theta_length * sizeof(float));
void* cos_buffer = cos_allocator.get(); void* cos_buffer = cos_allocator.get();
aclTensor* acl_cos_tensor = ggml_cann_create_tensor( aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
GGML_MAX_DIMS, ACL_FORMAT_ND); GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
@@ -2449,15 +2449,15 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
size_t sin_reshape_nb[GGML_MAX_DIMS]; size_t sin_reshape_nb[GGML_MAX_DIMS];
sin_reshape_nb[0] = sizeof(float_t); sin_reshape_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
} }
aclTensor* acl_sin_repeat_tensor = aclTensor* acl_sin_repeat_tensor =
ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_cos_repeat_tensor = aclTensor* acl_cos_repeat_tensor =
ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
// repeat // repeat
@@ -2543,15 +2543,15 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1}; int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
size_t sin_reshape_nb[GGML_MAX_DIMS]; size_t sin_reshape_nb[GGML_MAX_DIMS];
sin_reshape_nb[0] = sizeof(float_t); sin_reshape_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
} }
aclTensor* acl_sin_reshape_tensor = aclTensor* acl_sin_reshape_tensor =
ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_cos_reshape_tensor = aclTensor* acl_cos_reshape_tensor =
ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_src = ggml_cann_create_tensor(src0); aclTensor* acl_src = ggml_cann_create_tensor(src0);
@@ -2566,7 +2566,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
void* minus_one_scale_buffer = nullptr; void* minus_one_scale_buffer = nullptr;
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
ggml_cann_pool_alloc minus_one_scale_allocator( ggml_cann_pool_alloc minus_one_scale_allocator(
ctx.pool(), sizeof(float_t) * src0->ne[0]); ctx.pool(), sizeof(float) * src0->ne[0]);
if (!is_neox) { if (!is_neox) {
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
input_roll_buffer = roll_allocator.get(); input_roll_buffer = roll_allocator.get();
@@ -2596,13 +2596,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
size_t minus_one_nb[GGML_MAX_DIMS]; size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float_t); minus_one_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
} }
acl_minus_one_tensor = aclnn_values( acl_minus_one_tensor = aclnn_values(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
int64_t dim = 3; int64_t dim = 3;
int64_t* index = new int64_t[src0->ne[0]]; int64_t* index = new int64_t[src0->ne[0]];
for (int i = 0; i < src0->ne[0]; i++) { for (int i = 0; i < src0->ne[0]; i++) {
@@ -2630,22 +2630,22 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
minus_one_scale_buffer = minus_one_scale_allocator.get(); minus_one_scale_buffer = minus_one_scale_allocator.get();
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
size_t minus_one_nb[GGML_MAX_DIMS]; size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float_t); minus_one_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
} }
acl_minus_one_tensor = aclnn_values( acl_minus_one_tensor = aclnn_values(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
// -1 * first half // -1 * first half
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
size_t first_half_nb[GGML_MAX_DIMS]; size_t first_half_nb[GGML_MAX_DIMS];
first_half_nb[0] = sizeof(float_t); first_half_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
} }
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor( aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, minus_one_scale_buffer, ACL_FLOAT, sizeof(float), first_half_ne,
first_half_nb, GGML_MAX_DIMS); first_half_nb, GGML_MAX_DIMS);
bool inplace = true; bool inplace = true;
float scale = -1; float scale = -1;
@@ -2685,28 +2685,28 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// TODO: ne0 != n_dims in mode2 // TODO: ne0 != n_dims in mode2
} else if (src0->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16) {
size_t input_fp32_nb[GGML_MAX_DIMS]; size_t input_fp32_nb[GGML_MAX_DIMS];
input_fp32_nb[0] = sizeof(float_t); input_fp32_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) { for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
} }
ggml_cann_pool_alloc fp32_allocator1( ggml_cann_pool_alloc fp32_allocator1(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); ctx.pool(), ggml_nelements(dst) * sizeof(float));
void* input_fp32_buffer1 = fp32_allocator1.get(); void* input_fp32_buffer1 = fp32_allocator1.get();
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor( aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne, input_fp32_buffer1, ACL_FLOAT, sizeof(float), dst->ne,
input_fp32_nb, GGML_MAX_DIMS); input_fp32_nb, GGML_MAX_DIMS);
ggml_cann_pool_alloc fp32_allocator2( ggml_cann_pool_alloc fp32_allocator2(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); ctx.pool(), ggml_nelements(dst) * sizeof(float));
void* input_fp32_buffer2 = fp32_allocator2.get(); void* input_fp32_buffer2 = fp32_allocator2.get();
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor( aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne, input_fp32_buffer2, ACL_FLOAT, sizeof(float), dst->ne,
input_fp32_nb, GGML_MAX_DIMS); input_fp32_nb, GGML_MAX_DIMS);
ggml_cann_pool_alloc fp32_allocator( ggml_cann_pool_alloc fp32_allocator(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); ctx.pool(), ggml_nelements(dst) * sizeof(float));
output_fp32_buffer = fp32_allocator.get(); output_fp32_buffer = fp32_allocator.get();
aclTensor* output_fp32_tensor = ggml_cann_create_tensor( aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne, output_fp32_buffer, ACL_FLOAT, sizeof(float), dst->ne,
input_fp32_nb, GGML_MAX_DIMS); input_fp32_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1); aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,