Files
enginex-mlu370-vllm/torch_mlu_ops-v1.3.2/csrc/kernels/generate_alibi_slope.mlu
2026-02-04 17:39:32 +08:00

131 lines
5.1 KiB
Plaintext

#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <map>
#include <ostream>
#include "cnnl.h"
#include "cnrt.h"
#include "generate_alibi_slope.mluh"
// clang-format off
#include <mlu.h>
// clang-format on
namespace tmo {
namespace kernels {
// Usable per-core NRAM scratch: total NRAM size minus 32 KiB of reserved headroom.
#define NRAM_SIZE (__MLU_NRAM_SIZE__ * 1024 - 32 * 1024)
// Shared on-chip scratch buffer; the kernel below carves it into three
// head_num-sized float regions (exponents, bases, slopes).
__nram__ int8_t nram_buffer[NRAM_SIZE];
// Base exponent tile for the "close" ALiBi heads: 64 consecutive integers
// starting at 1. genRange() tiles and offsets this to build longer sequences.
__nram__ float range_1[64] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64};
// Base exponent tile for the "far" ALiBi heads: the first 64 odd numbers
// 1, 3, 5, ..., 127.
__nram__ float range_2[64] = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51,
53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103,
105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
// Tiles the `base_num`-element sequence in `range_base_nram` across
// `range_nram` until `fill_num` floats have been written, shifting each
// copy by its running position (plus `offset`) so the output forms one
// continuous arithmetic sequence, e.g. offset + [1..base_num, base_num+1..].
__mlu_func__ void genRange(float *range_nram,
                           float *range_base_nram,
                           int fill_num,
                           int base_num,
                           int offset = 0) {
  int remaining = fill_num;
  int tile = 0;
  while (remaining > 0) {
    int chunk = remaining < base_num ? remaining : base_num;
    float *dst = range_nram + tile * base_num;
    __bang_move(dst, range_base_nram, chunk * sizeof(float));
    // Shift this tile so it continues where the previous tile ended.
    __bang_add_scalar(dst, dst, tile * base_num + offset, chunk);
    remaining -= chunk;
    ++tile;
  }
}
// Computes ALiBi slopes for one batch element per task (taskIdX is the batch
// index) for the head slice [head_start, head_start + head_num) out of
// head_num_total heads. Heads with global index below closest_power_of_2
// ("close" group) get slope base^k for consecutive exponents k; the remaining
// "far" heads get extra_base raised to consecutive odd exponents 1, 3, 5, ...
// With use_dynamic, the base is rescaled by the per-sample ratio
// true_seq_len / max_sequence_length spread across all heads.
// NOTE(review): `farthest_power_of_2` is accepted but never read here —
// confirm whether it is dead or intended for a future path.
__mlu_global__ void MLUAlibiSlopeKernel(float *alibi_slopes,
int *true_seq_lens,
int batch_num,
int head_start,
int head_num,
int head_num_total,
int max_sequence_length,
bool use_dynamic,
int closest_power_of_2,
int farthest_power_of_2,
float base,
float extra_base) {
// Carve three head_num-sized float regions out of the shared NRAM buffer:
// exponents (range), per-head bases, and the resulting slopes.
float *range_nram = (float *)nram_buffer;
float *base_nram = range_nram + head_num;
float *slope_nram = base_nram + head_num;
float scale = 1.0;
float dynamic_base = base;
if (use_dynamic) {
float a0 = 1.0;
// Length ratio of this sample vs. the trained maximum, clamped to >= 1 so
// short sequences keep the static slopes.
float a = a0 * true_seq_lens[taskIdX] / max_sequence_length;
a = std::max(a, 1.0f);
// Distribute the ratio geometrically across heads: scale = a^(1/(H-1)).
scale = powf(a, (1.0 / (head_num_total - 1)));
dynamic_base = base / scale;
}
// How many of this task's heads fall in the "close" group
// (global head index < closest_power_of_2); the rest are "far".
int close_head_num = 0;
if (head_start >= closest_power_of_2) {
close_head_num = 0;
} else if (head_start + head_num <= closest_power_of_2) {
close_head_num = head_num;
} else {
close_head_num = closest_power_of_2 - head_start;
}
int far_head_num = head_num - close_head_num;
// fill range: 1, 2..., n1, 1, 3, (n - n1) * 2 - 1
if (close_head_num) {
// Close heads: consecutive exponents starting after head_start.
genRange(range_nram, range_1, close_head_num, 64, head_start);
__bang_write_value(base_nram, close_head_num, dynamic_base);
}
if (far_head_num) {
// Far heads: consecutive odd exponents, offset past the odd exponents
// consumed by far heads that precede this slice.
genRange(range_nram + close_head_num, range_2, far_head_num, 64,
(head_start + close_head_num - closest_power_of_2) * 2);
__bang_write_value(base_nram + close_head_num, far_head_num, extra_base);
}
// base_nram ** range_nram
// Element-wise b^r computed as pow2(log(b) * r); this is only b^r if
// __bang_log is a base-2 log — TODO confirm against the BANG C reference.
__bang_log(base_nram, base_nram, head_num);
__bang_mul(slope_nram, base_nram, range_nram, head_num);
__bang_pow2(slope_nram, slope_nram, head_num);
if (use_dynamic) {
// NOTE(review): scale is applied only to the first close_head_num slopes,
// not all head_num — the far-group heads (built from extra_base, which was
// not divided by scale) stay unscaled. Confirm this asymmetry is intended.
__bang_mul_scalar(slope_nram, slope_nram, scale, close_head_num);
}
// Write this task's head_num slopes to its row of the output matrix.
__memcpy(alibi_slopes + taskIdX * head_num, slope_nram, head_num * sizeof(float), NRAM2GDRAM);
}
} // namespace kernels
// Host-side launcher: enqueues one Block-type MLU task per batch element on
// `queue` to fill `alibi_slopes` (device memory, batch_num x head_num floats)
// with ALiBi slopes for heads [head_start, head_start + head_num).
// `true_seq_lens` (device, batch_num ints) is read only when use_dynamic.
KernelStatus invokeGenerateAlibiSlope(cnrtQueue_t queue,
                                      void *alibi_slopes,
                                      void *true_seq_lens,
                                      int batch_num,
                                      int head_start,
                                      int head_num,
                                      int head_num_total,
                                      int max_sequence_length,
                                      bool use_dynamic) {
  // One task per batch entry; the kernel indexes the batch via taskIdX.
  cnrtDim3_t task_dim{.x = (uint32_t)batch_num, .y = 1, .z = 1};
  // Largest power of two <= head_num_total; standard ALiBi treats heads below
  // this boundary and the overflow heads with two different geometric bases.
  int closest_power_of_2 = pow(2, floor(log2(head_num_total)));
  int farthest_power_of_2 = closest_power_of_2 * 2;
  // Bases from the ALiBi paper: 2^(-2^-(log2(n)-3)) for each group size.
  float base = pow(2, (-pow(2, -(log2(closest_power_of_2) - 3))));
  float extra_base = pow(2, (-pow(2, -(log2(2 * closest_power_of_2) - 3))));
  kernels::MLUAlibiSlopeKernel<<<task_dim, cnrtFuncTypeBlock, queue>>>(
      (float *)alibi_slopes, (int *)true_seq_lens, batch_num, head_start,
      head_num, head_num_total, max_sequence_length, use_dynamic,
      closest_power_of_2, farthest_power_of_2, base, extra_base);
  return KernelStatus::KERNEL_STATUS_SUCCESS;
}
} // namespace tmo