init

148  csrc_musa/cpu/activation.cpp  Normal file
@@ -0,0 +1,148 @@
#include "cpu_types.hpp"

namespace {
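// Generic element-wise activation kernel. For the gated variants
// (is_gated == true) each input row holds 2 * d elements laid out as [x | y]
// and the kernel writes output[j] = y[j] * func(x[j]); for the non-gated
// variants it simply writes output[j] = func(input[j]).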
template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8 &),
          bool is_gated>
void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input,
                       scalar_t *__restrict__ output) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

  TORCH_CHECK(d % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    for (int j = 0; j < d; j += VEC_ELEM_NUM) {
      int start = i * d;
      if constexpr (is_gated) {
        start *= 2;
      }

      const scalar_vec_t x(input + start + j);
      const vec_op::FP32Vec8 f32_x(x);
      vec_op::FP32Vec8 f32_ans = func(f32_x);

      if constexpr (is_gated) {
        const scalar_vec_t y(input + start + d + j);
        const vec_op::FP32Vec8 f32_y(y);
        f32_ans = f32_y * f32_ans;
      }

      const scalar_vec_t result(f32_ans);
      result.save(output + i * d + j);
    }
  }
}

FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 zeros(0.0);
  const vec_op::FP32Vec8 ones(1.0);
  return x / (ones + (zeros - x).exp());
}
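
// The gelu_new/gelu_fast/gelu_tanh variants below all implement the tanh
// approximation 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// (0.79788456f ~= sqrt(2/pi)); gelu_act uses the exact erf form
// 0.5 * x * (1 + erf(x / sqrt(2))).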
FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f);
  const vec_op::FP32Vec8 w2(0.044715f);
  const vec_op::FP32Vec8 w3(0.5);
  const vec_op::FP32Vec8 x3 = x * x * x;
  const vec_op::FP32Vec8 t = (w1 * (x + w2 * x3)).tanh();
  return w3 * x * (ones + t);
}

FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f);
  const vec_op::FP32Vec8 w2(0.044715f);
  const vec_op::FP32Vec8 w3(0.5);
  const vec_op::FP32Vec8 t = (x * w1 * (ones + x * w2 * x)).tanh();
  return w3 * x * (ones + t);
}

FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT1_2);
  const vec_op::FP32Vec8 w2(0.5);
  return x * w2 * (ones + (x * w1).er());
}

FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5);
  const vec_op::FP32Vec8 w2(0.5);
  const vec_op::FP32Vec8 w3(0.044715);
  const vec_op::FP32Vec8 x_3 = x * x * x;
  const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3);
  return x * w2 * (ones + inner.tanh());
}
}; // namespace

void silu_and_mul(torch::Tensor &out, torch::Tensor &input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "silu_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
        activation_kernel<scalar_t, silu_act, true>(num_tokens, d,
                                                    input.data_ptr<scalar_t>(),
                                                    out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
      });
}

void gelu_and_mul(torch::Tensor &out,   // [..., d]
                  torch::Tensor &input) // [..., 2 * d]
{
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "gelu_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
        activation_kernel<scalar_t, gelu_act, true>(num_tokens, d,
                                                    input.data_ptr<scalar_t>(),
                                                    out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
      });
}

void gelu_tanh_and_mul(torch::Tensor &out,   // [..., d]
                       torch::Tensor &input) // [..., 2 * d]
{
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "gelu_tanh_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(gelu_tanh_and_mul_impl)
        activation_kernel<scalar_t, gelu_tanh_act, true>(
            num_tokens, d, input.data_ptr<scalar_t>(),
            out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(gelu_tanh_and_mul_impl)
      });
}

void gelu_new(torch::Tensor &out, torch::Tensor &input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_new_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_new_impl)
    activation_kernel<scalar_t, gelu_new_act, false>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(gelu_new_impl)
  });
}

void gelu_fast(torch::Tensor &out, torch::Tensor &input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_fast_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_fast_impl)
    activation_kernel<scalar_t, gelu_fast_act, false>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
  });
}
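
// Minimal host-side usage sketch (hypothetical shapes; assumes the extension
// is built and these symbols are bound):
//   torch::Tensor input = torch::randn({num_tokens, 2 * d});
//   torch::Tensor out = torch::empty({num_tokens, d});
//   silu_and_mul(out, input); // out = silu(input[..., :d]) * input[..., d:]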
746  csrc_musa/cpu/attention.cpp  Normal file
@@ -0,0 +1,746 @@
#include "cpu_types.hpp"

namespace {

template <typename scalar_t> struct KernelVecType {
  using q_load_vec_type = void;
  using q_vec_type = void;
  using k_load_vec_type = void;
  using k_vec_type = void;
  using qk_acc_vec_type = void;
  using v_load_vec_type = void;
};

template <> struct KernelVecType<float> {
  using q_load_vec_type = vec_op::FP32Vec4;
  using q_vec_type = vec_op::FP32Vec16;
  using k_load_vec_type = vec_op::FP32Vec16;
  using k_vec_type = vec_op::FP32Vec16;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP32Vec16;
};

#ifdef __AVX512BF16__
template <> struct KernelVecType<c10::BFloat16> {
  using q_load_vec_type = vec_op::BF16Vec8;
  using q_vec_type = vec_op::BF16Vec32;
  using k_load_vec_type = vec_op::BF16Vec32;
  using k_vec_type = vec_op::BF16Vec32;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::BF16Vec16;
};
#else
template <> struct KernelVecType<c10::BFloat16> {
  using q_load_vec_type = vec_op::BF16Vec8;
  using q_vec_type = vec_op::FP32Vec16;
  using k_load_vec_type = vec_op::BF16Vec16;
  using k_vec_type = vec_op::FP32Vec16;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
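
// KernelVecType picks load/compute vector types per dtype: with __AVX512BF16__
// Q and K stay in bf16 and the QK dot product accumulates in fp32 through the
// bf16 fma in cpu_types.hpp; otherwise K is widened to fp32 before the FMA.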

template <typename T>
FORCE_INLINE std::pair<T, T> reduceSoftmax(T *data, const int size,
                                           const int capacity) {
  T max = data[0];
  for (int i = 1; i < size; ++i) {
    max = max >= data[i] ? max : data[i];
  }

  T sum = 0;
  for (int i = 0; i < size; ++i) {
    data[i] = std::exp(data[i] - max);
    sum += data[i];
  }

  int i = 0;
  for (; i < size; ++i) {
    data[i] /= sum;
  }

  for (; i < capacity; ++i) {
    data[i] = 0;
  }

  return {max, sum};
}

template <typename T>
FORCE_INLINE std::pair<T, T>
reduceSoftmaxAlibi(T *data, const int size, const int capacity,
                   const float alibi_slope, const int start_index,
                   const int seq_len) {
  data[0] += alibi_slope * (start_index - seq_len + 1);
  T max = data[0];
  for (int i = 1; i < size; ++i) {
    T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
    data[i] = qk;
    max = max >= qk ? max : qk;
  }

  T sum = 0;
  for (int i = 0; i < size; ++i) {
    data[i] = std::exp(data[i] - max);
    sum += data[i];
  }

  int i = 0;
  for (; i < size; ++i) {
    data[i] /= sum;
  }

  for (; i < capacity; ++i) {
    data[i] = 0;
  }

  return {max, sum};
}
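
// Both reductions subtract the running max before exponentiating (the usual
// numerically stable softmax), zero-fill the tail up to `capacity`, and
// return {max, sum} so partial per-partition softmaxes can be merged later;
// the ALiBi variant first biases each logit by
// alibi_slope * (token_position - seq_len + 1).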

template <typename T>
FORCE_INLINE void reducePartitonSoftmax(const T *max_data, T *sum_data,
                                        const int size) {
  T max = max_data[0];
  for (int i = 1; i < size; ++i) {
    max = max >= max_data[i] ? max : max_data[i];
  }

  T rescaled_sum = 0;
  for (int i = 0; i < size; ++i) {
    T rescale_factor = std::exp(max_data[i] - max);
    rescaled_sum += rescale_factor * sum_data[i];
    sum_data[i] *= rescale_factor;
  }
  for (int i = 0; i < size; ++i) {
    sum_data[i] /= rescaled_sum + 1e-8;
  }
}
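
// Merge math: with per-partition maxima m_i and sums s_i, the global max is
// m = max_i(m_i) and each partition's weight becomes
// exp(m_i - m) * s_i / sum_j(exp(m_j - m) * s_j); the 1e-8 term guards
// against division by zero when all sums vanish.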

template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
struct reduceQKBlockKernel {
  using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
  using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
  using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
  using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
  using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;

  constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
  constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
  constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;

  static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
  static_assert(k_load_vec_type::get_elem_num() % x == 0);
  static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);

  FORCE_INLINE static void call(const scalar_t *__restrict__ q,
                                const scalar_t *__restrict__ k_block,
                                float *__restrict__ logits, float scale,
                                const int token_num) {
    const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;

    qk_acc_vec_type group_accums[MAX_GROUP_NUM];
    if (token_num == BLOCK_SIZE) {
      for (int q_offset = 0; q_offset < HEAD_SIZE;
           q_offset += x, k_block += x * BLOCK_SIZE) {
        q_load_vec_type q_load_group_vec(q + q_offset);
        q_vec_type q_group_vec(q_load_group_vec);

        vec_op::unroll_loop<int, MAX_GROUP_NUM>(
            [k_block, &q_group_vec, &group_accums](int token_group_idx) {
              k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
                                                             TOKEN_PER_GROUP);
              k_vec_type k_group_vec(k_load_group_vec);
              vec_op::fma(group_accums[token_group_idx], q_group_vec,
                          k_group_vec);
              vec_op::prefetch(k_block + x * BLOCK_SIZE +
                               token_group_idx * x * TOKEN_PER_GROUP);
            });
      }
    } else {
      for (int q_offset = 0; q_offset < HEAD_SIZE;
           q_offset += x, k_block += x * BLOCK_SIZE) {
        q_load_vec_type q_load_group_vec(q + q_offset);
        q_vec_type q_group_vec(q_load_group_vec);
        for (int token_group_start = 0; token_group_start < group_num;
             token_group_start += UNROLL_GROUP_NUM) {
          vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
              [token_group_start, k_block, &q_group_vec,
               &group_accums](int token_group_idx) {
                token_group_idx += token_group_start;
                k_load_vec_type k_load_group_vec(
                    k_block + token_group_idx * x * TOKEN_PER_GROUP);
                k_vec_type k_group_vec(k_load_group_vec);
                vec_op::fma(group_accums[token_group_idx], q_group_vec,
                            k_group_vec);
                vec_op::prefetch(k_block + x * BLOCK_SIZE +
                                 token_group_idx * x * TOKEN_PER_GROUP);
              });
        }
      }
    }

    for (int token_group_idx = 0; token_group_idx < group_num;
         ++token_group_idx) {
      vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
          [&group_accums, logits, scale, token_group_idx](int token_idx) {
            float dot_v =
                group_accums[token_group_idx]
                    .template reduce_sub_sum<qk_acc_vec_type::get_elem_num() /
                                             TOKEN_PER_GROUP>(token_idx);
            logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
                dot_v * scale;
          });
    }
  }
};
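
// reduceQKBlockKernel computes one query head against one key block. Keys are
// stored x-interleaved (x = 16 / sizeof(scalar_t)), so each FMA accumulates
// partial dot products for TOKEN_PER_GROUP tokens at once; reduce_sub_sum
// then extracts the per-token scalar results, each multiplied by `scale`.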

template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE,
          int HEAD_PARTITION_SIZE, typename acc_t>
FORCE_INLINE void reduceValueBlock(const float *prob, const scalar_t *v_block,
                                   acc_t &&acc) {
  using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
  constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
  static_assert(BLOCK_SIZE == ELEM_NUM);
  vec_op::FP32Vec16 prob_vec(prob);

  vec_op::unroll_loop<int, HEAD_PARTITION_SIZE>([&](int head_elem_idx) {
    v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx);
    vec_op::FP32Vec16 fp32_v_vec(v_vec);
    acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
  });
}
}; // namespace

// Paged attention v1
namespace {
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
struct paged_attention_v1_impl {
  static void
  call(scalar_t *__restrict__ out,     // [num_seqs, num_heads, head_size]
       const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size]
       const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads,
                                             // head_size/x, block_size, x]
       const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads,
                                             // head_size, block_size]
       const int num_kv_heads, const float scale,
       const int
           *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
       const int *__restrict__ seq_lens, // [num_seqs]
       const int max_num_blocks_per_seq,
       const float *__restrict__ alibi_slopes, // [num_heads]
       const int q_stride, const int kv_block_stride, const int kv_head_stride,
       const int num_seqs, const int num_heads) {
    constexpr int x = 16 / sizeof(scalar_t);
    const int num_queries_per_kv = num_heads / num_kv_heads;

    static_assert(BLOCK_SIZE == 16);

    int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
    int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
    TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);

    const int parallel_work_item_num = omp_get_max_threads();

    size_t logits_bytes =
        parallel_work_item_num * max_seq_len_padded * sizeof(float);
    float *logits = (float *)std::aligned_alloc(
        64, logits_bytes); // Cacheline alignment for each context token.
                           // [parallel_work_item_num, max_seq_len_padded]

#pragma omp parallel for collapse(2) schedule(dynamic, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        int seq_len = seq_lens[seq_idx];
        const int *seq_block_table =
            block_tables + max_num_blocks_per_seq * seq_idx;
        const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
        const int64_t kv_head_idx = head_idx / num_queries_per_kv;
        const scalar_t *__restrict__ q_vec_ptr =
            q + seq_idx * q_stride + head_idx * HEAD_SIZE;
        const int last_block_token_num =
            seq_len - (block_num - 1) * BLOCK_SIZE;
        float *__restrict__ thread_block_logits =
            logits + omp_get_thread_num() * max_seq_len_padded;

        // Compute logits
        for (int block_idx = 0; block_idx < block_num; ++block_idx) {
          const int64_t physical_block_idx = seq_block_table[block_idx];
          const scalar_t *__restrict__ k_block_cache_ptr =
              k_cache + physical_block_idx * kv_block_stride +
              kv_head_idx * kv_head_stride;
          float *__restrict__ head_block_logits =
              thread_block_logits + block_idx * BLOCK_SIZE;

          reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
              q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
              block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
        }

        // Compute softmax
        if (alibi_slopes) {
          reduceSoftmaxAlibi(thread_block_logits, seq_len,
                             block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
                             seq_len);
        } else {
          reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
        }

        // Compute value
        constexpr int head_elem_num_per_partition = 16;
        constexpr int head_partition_num =
            HEAD_SIZE / head_elem_num_per_partition;
        for (int head_part_idx = 0; head_part_idx < head_partition_num;
             ++head_part_idx) {
          vec_op::FP32Vec16 accums[head_elem_num_per_partition];
          scalar_t *__restrict__ out_ptr =
              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
              head_part_idx * head_elem_num_per_partition;
          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
            const int64_t physical_block_idx = seq_block_table[block_idx];
            const float *__restrict__ prob_vec_ptr =
                thread_block_logits + block_idx * BLOCK_SIZE;
            const scalar_t *__restrict__ v_block_cache_ptr =
                v_cache + physical_block_idx * kv_block_stride +
                kv_head_idx * kv_head_stride +
                BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
            reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
                             head_elem_num_per_partition>(
                prob_vec_ptr, v_block_cache_ptr, accums);

            if (block_idx != block_num - 1) {
              const int64_t next_physical_block_idx =
                  seq_block_table[block_idx + 1];
              const scalar_t *__restrict__ next_v_block_cache_ptr =
                  v_cache + next_physical_block_idx * kv_block_stride +
                  kv_head_idx * kv_head_stride +
                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
              vec_op::unroll_loop<int, head_elem_num_per_partition>(
                  [&](int head_elem_idx) {
                    if (head_elem_idx % 2 == 0) {
                      vec_op::prefetch(next_v_block_cache_ptr +
                                       BLOCK_SIZE * head_elem_idx);
                    }
                  });
            }
          }

          vec_op::unroll_loop<int, head_elem_num_per_partition>(
              [&](int head_elem_idx) {
                float value = accums[head_elem_idx].reduce_sum();
                vec_op::storeFP32(value, out_ptr + head_elem_idx);
              });
        }
      }
    }
    std::free(logits);
  }
};
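
// v1 keeps one 64-byte-aligned logits scratch row per OpenMP thread and
// parallelizes over (seq, head) pairs with dynamic scheduling; prefetching
// the next value block hides part of the cache-miss latency in the PV pass.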

#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
  paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call(                     \
      out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
      block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,                  \
      alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs,   \
      num_heads);

template <typename T, int BLOCK_SIZE>
void paged_attention_v1_impl_launcher(
    torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache,
    torch::Tensor &value_cache, int num_kv_heads, float scale,
    torch::Tensor &block_tables, torch::Tensor &seq_lens, int max_seq_len,
    const c10::optional<torch::Tensor> &alibi_slopes) {
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);
  int q_stride = query.stride(0);
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

  // NOTE: alibi_slopes is optional.
  const float *alibi_slopes_ptr =
      alibi_slopes
          ? reinterpret_cast<const float *>(alibi_slopes.value().data_ptr())
          : nullptr;

  T *out_ptr = reinterpret_cast<T *>(out.data_ptr());
  T *query_ptr = reinterpret_cast<T *>(query.data_ptr());
  T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
  T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
  int *block_tables_ptr = block_tables.data_ptr<int>();
  int *seq_lens_ptr = seq_lens.data_ptr<int>();

  switch (head_size) {
  case 64:
    LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
    break;
  case 80:
    LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
    break;
  case 96:
    LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
    break;
  case 112:
    LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
    break;
  case 128:
    LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
    break;
  case 256:
    LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
    break;
  default:
    TORCH_CHECK(false, "Unsupported head size: ", head_size);
    break;
  }
}

#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE)                                 \
  paged_attention_v1_impl_launcher<T, BLOCK_SIZE>(                             \
      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,   \
      seq_lens, max_seq_len, alibi_slopes);

#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T)                                  \
  switch (block_size) {                                                        \
  case 16:                                                                     \
    CALL_V1_KERNEL_LAUNCHER(T, 16);                                            \
    break;                                                                     \
  default:                                                                     \
    TORCH_CHECK(false, "Unsupported block size: ", block_size);                \
    break;                                                                     \
  }
} // namespace

void paged_attention_v1(torch::Tensor &out, torch::Tensor &query,
                        torch::Tensor &key_cache, torch::Tensor &value_cache,
                        int num_kv_heads, float scale,
                        torch::Tensor &block_tables, torch::Tensor &seq_lens,
                        int block_size, int max_seq_len,
                        const c10::optional<torch::Tensor> &alibi_slopes,
                        const std::string &kv_cache_dtype, float kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
                               [&] {
                                 CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
                                 CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
                                 CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
                               });
}
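
// v2 differs from v1 by splitting each sequence into PARTITION_SIZE-token
// partitions processed independently in parallel, writing per-partition
// {max_logit, exp_sum, tmp_out} and then merging partitions in two follow-up
// reduction passes.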

// Paged attention v2
namespace {
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
struct paged_attention_v2_impl {
  static void call(
      scalar_t *__restrict__ out,   // [num_seqs, num_heads, head_size]
      float *__restrict__ exp_sums, // [num_seqs, num_heads,
                                    // max_num_partitions]
      float *__restrict__ max_logits, // [num_seqs, num_heads,
                                      // max_num_partitions]
      scalar_t *__restrict__ tmp_out, // [num_seqs, num_heads,
                                      // max_num_partitions, head_size]
      const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size]
      const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads,
                                            // head_size/x, block_size, x]
      const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads,
                                            // head_size, block_size]
      const int num_kv_heads, const float scale,
      const int
          *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
      const int *__restrict__ seq_lens, // [num_seqs]
      const int max_num_blocks_per_seq,
      const float *__restrict__ alibi_slopes, // [num_heads]
      const int q_stride, const int kv_block_stride, const int kv_head_stride,
      const int num_seqs, const int num_heads, const int max_num_partitions) {
    constexpr int x = 16 / sizeof(scalar_t);
    const int num_queries_per_kv = num_heads / num_kv_heads;

    static_assert(BLOCK_SIZE == 16);
    static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
    static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);

#pragma omp parallel for collapse(3) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int partition_idx = 0; partition_idx < max_num_partitions;
           ++partition_idx) {
        for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
          const int seq_len = seq_lens[seq_idx];
          const int start_token_idx = partition_idx * PARTITION_SIZE;

          if (start_token_idx >= seq_len)
            continue;

          const int partition_num =
              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
          const bool no_reduce = (partition_num == 1);
          const int token_num =
              (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
               start_token_idx);
          const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
          const int last_block_token_num =
              token_num - (block_num - 1) * BLOCK_SIZE;
          const int *seq_block_table = block_tables +
                                       max_num_blocks_per_seq * seq_idx +
                                       start_token_idx / BLOCK_SIZE;
          const int64_t kv_head_idx = head_idx / num_queries_per_kv;
          const scalar_t *__restrict__ q_vec_ptr =
              q + seq_idx * q_stride + head_idx * HEAD_SIZE;

          float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};

          // Compute logits
          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
            const int64_t physical_block_idx = seq_block_table[block_idx];
            const scalar_t *__restrict__ k_block_cache_ptr =
                k_cache + physical_block_idx * kv_block_stride +
                kv_head_idx * kv_head_stride;
            float *__restrict__ head_block_logits =
                logits + block_idx * BLOCK_SIZE;

            reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
                q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
                block_idx == block_num - 1 ? last_block_token_num
                                           : BLOCK_SIZE);
          }

          std::pair<float, float> max_and_sum;
          if (alibi_slopes) {
            max_and_sum = reduceSoftmaxAlibi(
                logits, token_num, block_num * BLOCK_SIZE,
                alibi_slopes[head_idx], start_token_idx, seq_len);
          } else {
            max_and_sum =
                reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
          }

          auto &&[max_logit, exp_sum] = max_and_sum;

          scalar_t *__restrict__ output_buffer = nullptr;
          if (!no_reduce) {
            auto idx = seq_idx * num_heads * max_num_partitions +
                       head_idx * max_num_partitions + partition_idx;
            max_logits[idx] = max_logit;
            exp_sums[idx] = exp_sum;
            output_buffer =
                tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
                head_idx * max_num_partitions * HEAD_SIZE +
                partition_idx * HEAD_SIZE;
          } else {
            output_buffer =
                out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
          }

          // Compute value
          constexpr int head_elem_num_per_partition = 16;
          constexpr int head_partition_num =
              HEAD_SIZE / head_elem_num_per_partition;
          for (int head_part_idx = 0; head_part_idx < head_partition_num;
               ++head_part_idx) {
            vec_op::FP32Vec16 accums[head_elem_num_per_partition];
            scalar_t *__restrict__ out_ptr =
                output_buffer + head_part_idx * head_elem_num_per_partition;
            for (int block_idx = 0; block_idx < block_num; ++block_idx) {
              const int64_t physical_block_idx = seq_block_table[block_idx];
              const float *__restrict__ prob_vec_ptr =
                  logits + block_idx * BLOCK_SIZE;
              const scalar_t *__restrict__ v_block_cache_ptr =
                  v_cache + physical_block_idx * kv_block_stride +
                  kv_head_idx * kv_head_stride +
                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
              reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
                               head_elem_num_per_partition>(
                  prob_vec_ptr, v_block_cache_ptr, accums);

              if (block_idx != block_num - 1) {
                const int64_t next_physical_block_idx =
                    seq_block_table[block_idx + 1];
                const scalar_t *__restrict__ next_v_block_cache_ptr =
                    v_cache + next_physical_block_idx * kv_block_stride +
                    kv_head_idx * kv_head_stride +
                    BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
                vec_op::unroll_loop<int, head_elem_num_per_partition>(
                    [&](int head_elem_idx) {
                      if (head_elem_idx % 2 == 0) {
                        vec_op::prefetch(next_v_block_cache_ptr +
                                         BLOCK_SIZE * head_elem_idx);
                      }
                    });
              }
            }

            vec_op::unroll_loop<int, head_elem_num_per_partition>(
                [&](int head_elem_idx) {
                  float value = accums[head_elem_idx].reduce_sum();
                  vec_op::storeFP32(value, out_ptr + head_elem_idx);
                });
          }
        }
      }
    }

    // Rescale partition softmax and store the factors to exp_sums
#pragma omp parallel for collapse(2) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        const int seq_len = seq_lens[seq_idx];
        const int partition_num =
            (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;

        if (partition_num == 1)
          continue;

        reducePartitonSoftmax(
            max_logits + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            exp_sums + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            partition_num);
      }
    }

    // Reduce values
    using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
    static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
    constexpr int head_elem_num_per_group =
        16; // Note: not aligned to the cacheline size, because some HEAD_SIZE
            // values are not multiples of 64 bytes.
    static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
    constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
    const float *__restrict__ rescale_factors = exp_sums;
#pragma omp parallel for collapse(3) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
          const int seq_len = seq_lens[seq_idx];
          const int partition_num =
              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;

          if (partition_num == 1)
            continue;

          const float *__restrict__ seq_head_rescale_factors =
              rescale_factors + seq_idx * num_heads * max_num_partitions +
              head_idx * max_num_partitions;
          const scalar_t *__restrict__ seq_head_tmp_out =
              tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
              head_idx * max_num_partitions * HEAD_SIZE +
              group_idx * head_elem_num_per_group;
          scalar_t *__restrict__ seq_head_output =
              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
              group_idx * head_elem_num_per_group;

          vec_op::FP32Vec16 acc;
          for (int i = 0; i < partition_num; ++i) {
            vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
            v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
            vec_op::FP32Vec16 fp32_value(value);
            acc = acc + fp32_value * rescale_factor;
          }
          v_load_vec_type cast_acc(acc);
          cast_acc.save(seq_head_output);
        }
      }
    }
  }
};

#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
  paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call(     \
      out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr,           \
      key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr,   \
      seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,        \
      kv_block_stride, kv_head_stride, num_seqs, num_heads,                    \
      max_num_partitions);

template <typename T, int BLOCK_SIZE, int PARTITION_SIZE = 512>
void paged_attention_v2_impl_launcher(
    torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits,
    torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache,
    torch::Tensor &value_cache, int num_kv_heads, float scale,
    torch::Tensor &block_tables, torch::Tensor &seq_lens, int block_size,
    int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);
  int q_stride = query.stride(0);
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);
  int max_num_partitions = exp_sums.size(-1);

  // NOTE: alibi_slopes is optional.
  const float *alibi_slopes_ptr =
      alibi_slopes
          ? reinterpret_cast<const float *>(alibi_slopes.value().data_ptr())
          : nullptr;

  T *out_ptr = reinterpret_cast<T *>(out.data_ptr());
  float *exp_sums_ptr = reinterpret_cast<float *>(exp_sums.data_ptr());
  float *max_logits_ptr = reinterpret_cast<float *>(max_logits.data_ptr());
  T *tmp_out_ptr = reinterpret_cast<T *>(tmp_out.data_ptr());
  T *query_ptr = reinterpret_cast<T *>(query.data_ptr());
  T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
  T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
  int *block_tables_ptr = block_tables.data_ptr<int>();
  int *seq_lens_ptr = seq_lens.data_ptr<int>();

  switch (head_size) {
  case 64:
    LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
    break;
  case 80:
    LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
    break;
  case 96:
    LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
    break;
  case 112:
    LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
    break;
  case 128:
    LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
    break;
  case 256:
    LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
    break;
  default:
    TORCH_CHECK(false, "Unsupported head size: ", head_size);
    break;
  }
}

#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE)                                 \
  paged_attention_v2_impl_launcher<T, BLOCK_SIZE>(                             \
      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,       \
      num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,    \
      alibi_slopes);

#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T)                                  \
  switch (block_size) {                                                        \
  case 16:                                                                     \
    CALL_V2_KERNEL_LAUNCHER(T, 16);                                            \
    break;                                                                     \
  default:                                                                     \
    TORCH_CHECK(false, "Unsupported block size: ", block_size);                \
    break;                                                                     \
  }
} // namespace

void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums,
                        torch::Tensor &max_logits, torch::Tensor &tmp_out,
                        torch::Tensor &query, torch::Tensor &key_cache,
                        torch::Tensor &value_cache, int num_kv_heads,
                        float scale, torch::Tensor &block_tables,
                        torch::Tensor &seq_lens, int block_size,
                        int max_seq_len,
                        const c10::optional<torch::Tensor> &alibi_slopes,
                        const std::string &kv_cache_dtype, float kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
                               [&] {
                                 CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
                                 CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
                                 CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
                               });
}
141  csrc_musa/cpu/cache.cpp  Normal file
@@ -0,0 +1,141 @@
#include <map>
#include <vector>

#include "cpu_types.hpp"

namespace {
template <typename scalar_t>
void copy_blocks_cpu_impl(
    std::vector<torch::Tensor> &key_caches,
    std::vector<torch::Tensor> &value_caches,
    const std::vector<std::pair<int64_t, int64_t>> mapping_pairs,
    const int element_num_per_block, const int layer_num) {
  const size_t pair_num = mapping_pairs.size();
  const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
#pragma omp parallel for collapse(2)
  for (int layer = 0; layer < layer_num; ++layer) {
    for (size_t pair = 0; pair < pair_num; ++pair) {
      int64_t source_offset = element_num_per_block * mapping_pairs[pair].first;
      int64_t target_offset =
          element_num_per_block * mapping_pairs[pair].second;
      scalar_t *key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
      scalar_t *source_ptr = key_cache_ptr + source_offset;
      scalar_t *target_ptr = key_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);

      scalar_t *value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
      source_ptr = value_cache_ptr + source_offset;
      target_ptr = value_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);
    }
  }
}
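
// reshape_and_cache scatters per-token K/V into the paged cache. Keys are
// written in [num_blocks, num_heads, head_size/x, block_size, x] order (the
// x-interleaving consumed by the QK kernel), values in [num_blocks,
// num_heads, head_size, block_size]; a negative slot_idx marks a padding
// token and is skipped.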
template <typename scalar_t>
void reshape_and_cache_cpu_impl(
    const scalar_t *__restrict__ key, const scalar_t *__restrict__ value,
    scalar_t *__restrict__ key_cache, scalar_t *__restrict__ value_cache,
    const int64_t *__restrict__ slot_mapping, const int num_tokens,
    const int key_stride, const int value_stride, const int num_heads,
    const int head_size, const int block_size, const int x) {
  const int block_elem_num = num_heads * head_size * block_size;

#pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
      const int64_t slot_idx = slot_mapping[token_idx];
      if (slot_idx >= 0) {
        int src_key_head_idx = token_idx * key_stride + head_idx * head_size;
        int src_value_head_idx =
            token_idx * value_stride + head_idx * head_size;
        const scalar_t *src_key_head_ptr = key + src_key_head_idx;
        const scalar_t *src_value_head_ptr = value + src_value_head_idx;
        const int64_t block_index = slot_idx / block_size;
        const int64_t block_offset = slot_idx % block_size;
        scalar_t *target_key_head_ptr = key_cache +
                                        block_elem_num * block_index +
                                        head_idx * block_size * head_size;
        scalar_t *target_value_head_ptr = value_cache +
                                          block_elem_num * block_index +
                                          head_idx * block_size * head_size;

        for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) {
          const int64_t target_offset =
              src_key_idx * block_size + block_offset * x;
          for (int i = 0; i < x; ++i) {
            target_key_head_ptr[target_offset + i] =
                src_key_head_ptr[src_key_idx + i];
          }
        }

        for (int src_value_idx = 0; src_value_idx < head_size;
             ++src_value_idx) {
          const int64_t target_offset =
              src_value_idx * block_size + block_offset;
          target_value_head_ptr[target_offset] =
              src_value_head_ptr[src_value_idx];
        }
      }
    }
  }
}
}; // namespace

void copy_blocks(std::vector<torch::Tensor> &key_caches,
                 std::vector<torch::Tensor> &value_caches,
                 const std::map<int64_t, std::vector<int64_t>> &block_mapping) {
  int num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
  if (num_layers == 0) {
    return;
  }

  std::vector<std::pair<int64_t, int64_t>> mapping_pairs;
  mapping_pairs.reserve(block_mapping.size());
  for (const auto &pair : block_mapping) {
    for (const auto &dst : pair.second) {
      mapping_pairs.emplace_back(pair.first, dst);
    }
  }

  const int element_num_per_block = key_caches[0][0].numel();
  VLLM_DISPATCH_FLOATING_TYPES(
      key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
        CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
        copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, mapping_pairs,
                                       element_num_per_block, num_layers);
        CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
      });
}

void reshape_and_cache(torch::Tensor &key, torch::Tensor &value,
                       torch::Tensor &key_cache, torch::Tensor &value_cache,
                       torch::Tensor &slot_mapping,
                       const std::string &kv_cache_dtype, float kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);

  int num_tokens = key.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
  int block_size = key_cache.size(3);
  int x = key_cache.size(4);

  int key_stride = key.stride(0);
  int value_stride = value.stride(0);

  VLLM_DISPATCH_FLOATING_TYPES(
      key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
        CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
        reshape_and_cache_cpu_impl<scalar_t>(
            key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
            key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
            slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
            value_stride, num_heads, head_size, block_size, x);
        CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
      });
}

void swap_blocks(torch::Tensor &src, torch::Tensor &dst,
                 const std::map<int64_t, int64_t> &block_mapping) {
  TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
}
352  csrc_musa/cpu/cpu_types.hpp  Normal file
@@ -0,0 +1,352 @@

#ifndef CPU_TYPES_HPP
#define CPU_TYPES_HPP

#include <immintrin.h>
#include <torch/extension.h>

namespace vec_op {

// FIXME: FP16 is not fully supported in Torch-CPU
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)

#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)
#else
#define CPU_KERNEL_GUARD_IN(NAME)                                              \
  std::cout << #NAME << " invoked." << std::endl;
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
#endif

#define FORCE_INLINE __attribute__((always_inline)) inline

namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
  (f(std::integral_constant<T, indexes>{}), ...);
}
}; // namespace
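
// unroll_loop calls f(0), f(1), ..., f(count - 1) with each index available
// as a compile-time constant; the fold expression above forces full
// unrolling at compile time.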
template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F &&f) {
  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}

template <typename T> struct Vec {
  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
};

struct FP32Vec8;
struct FP32Vec16;

#ifdef __AVX512FP16__
struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __m128h reg;

  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}

  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}

  explicit FP16Vec8(__m128h data) : reg(data) {}

  FP16Vec8 operator*(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_mul_ph(reg, b.reg));
  }

  FP16Vec8 operator+(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_add_ph(reg, b.reg));
  }

  FP16Vec8 operator-(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_sub_ph(reg, b.reg));
  }

  FP16Vec8 operator/(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_div_ph(reg, b.reg));
  }

  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
};
#endif

struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __m128i reg;

  explicit BF16Vec8(const void *ptr)
      : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}

  explicit BF16Vec8(const FP32Vec8 &);

  void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
};

struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  __m256i reg;

  explicit BF16Vec16(const void *ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}

  explicit BF16Vec16(const FP32Vec16 &);

  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
};

struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;

  __m512i reg;

  explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}

  explicit BF16Vec32(__m512i data) : reg(data) {}

  explicit BF16Vec32(BF16Vec8 &vec8_data)
      : reg((__m512i)_mm512_inserti32x4(
            _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                                                      (__m128i)vec8_data.reg),
                                                  (__m128i)vec8_data.reg, 1),
                               (__m128i)vec8_data.reg, 2),
            (__m128i)vec8_data.reg, 3)) {}

  void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
};

struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;
  union AliasReg {
    __m128 reg;
    float values[VEC_ELEM_NUM];
  };

  __m128 reg;

  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}

  explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}

  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}

  explicit FP32Vec4(__m128 data) : reg(data) {}

  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
};

struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  union AliasReg {
    __m256 reg;
    float values[VEC_ELEM_NUM];
  };

  __m256 reg;

  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}

  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}

  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}

  explicit FP32Vec8(__m256 data) : reg(data) {}

  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}

#ifdef __AVX512FP16__
  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
#endif

  explicit FP32Vec8(const BF16Vec8 &v)
      : reg(_mm256_castsi256_ps(
            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}

  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&result, &ar](int i) { result += ar.values[i]; });

    return result;
  }

  FP32Vec8 exp() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
                                  expf(ar.values[5]), expf(ar.values[4]),
                                  expf(ar.values[3]), expf(ar.values[2]),
                                  expf(ar.values[1]), expf(ar.values[0])));
  }

  FP32Vec8 tanh() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
  }

  FP32Vec8 er() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
                                  erf(ar.values[5]), erf(ar.values[4]),
                                  erf(ar.values[3]), erf(ar.values[2]),
                                  erf(ar.values[1]), erf(ar.values[0])));
  }

  FP32Vec8 operator*(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
  }

  FP32Vec8 operator+(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_add_ps(reg, b.reg));
  }

  FP32Vec8 operator-(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
  }

  FP32Vec8 operator/(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_div_ps(reg, b.reg));
  }

  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};
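
// exp/tanh/er (elementwise erf) have no single AVX2 instruction, so they
// spill the register to a stack array through the AliasReg union and apply
// the libm scalar functions lane by lane.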
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  union AliasReg {
    __m512 reg;
    float values[VEC_ELEM_NUM];
  };

  __m512 reg;

  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}

  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}

  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

  explicit FP32Vec16(const FP32Vec4 &data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
                                   (__m128i)data.reg, 1),
                (__m128i)data.reg, 2),
            (__m128i)data.reg, 3)) {}

  explicit FP32Vec16(const FP32Vec8 &data)
      : reg((__m512)_mm512_inserti32x8(
            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}

  explicit FP32Vec16(const BF16Vec16 &v)
      : reg(_mm512_castsi512_ps(
            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}

  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_add_ps(reg, b.reg));
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }

  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }

  template <int group_size> float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }

  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
};

template <typename T> struct VecType { using vec_type = void; };

template <typename T> using vec_t = typename VecType<T>::vec_type;

template <> struct VecType<float> { using vec_type = FP32Vec8; };

#ifdef __AVX512FP16__
template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
#endif

template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };

template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }

#ifdef __AVX512FP16__
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  *reinterpret_cast<_Float16 *>(ptr) = v;
}
#endif

inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
  acc = acc + a * b;
}

#ifdef __AVX512BF16__
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
}

inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}

inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}

inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
}
#else
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = *(v_ptr + 1);
}

inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg(_mm256_cvtepi32_epi16(
          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}

inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg(_mm512_cvtepi32_epi16(
          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
#endif
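
// Note: these fallback fp32 -> bf16 conversions truncate the low 16 mantissa
// bits (no round-to-nearest-even), unlike the _mm*_cvtneps_pbh path above.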

inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }

}; // namespace vec_op

#endif
117  csrc_musa/cpu/layernorm.cpp  Normal file
@@ -0,0 +1,117 @@
#include "cpu_types.hpp"

namespace {
template <typename scalar_t>
void rms_norm_impl(scalar_t *__restrict__ out,
                   const scalar_t *__restrict__ input,
                   const scalar_t *__restrict__ weight, const float epsilon,
                   const int num_tokens, const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    vec_op::FP32Vec8 variance(0.0);
    auto input_p = input + i * hidden_size;
    auto output_p = out + i * hidden_size;
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      vec_op::FP32Vec8 fp32_x(x);
      variance = variance + fp32_x * fp32_x;
    }

    float s_variance =
        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
    vec_op::FP32Vec8 fp32_s_variance(s_variance);

    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      scalar_vec_t w(weight + j);

      vec_op::FP32Vec8 fp32_x(x);
      vec_op::FP32Vec8 fp32_w(w);

      vec_op::FP32Vec8 fp32_out = fp32_x * fp32_s_variance * fp32_w;

      scalar_vec_t out(fp32_out);
      out.save(output_p + j);
    }
  }
}
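
// RMSNorm: out = x * w / sqrt(mean(x^2) + eps), with the reduction done in
// fp32. The fused variant below first adds the residual, writes x + residual
// back into `residual`, and stores the normalized result into `input` (both
// tensors are updated in place).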
template <typename scalar_t>
void fused_add_rms_norm_impl(scalar_t *__restrict__ input,
                             scalar_t *__restrict__ residual,
                             const scalar_t *__restrict__ weight,
                             const float epsilon, const int num_tokens,
                             const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    vec_op::FP32Vec8 variance(0.0);
    auto input_p = input + i * hidden_size;
    auto residual_p = residual + i * hidden_size;
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      scalar_vec_t res(residual_p + j);
      vec_op::FP32Vec8 fp32_x(x);
      vec_op::FP32Vec8 fp32_res(res);

      fp32_x = fp32_x + fp32_res;
      variance = variance + fp32_x * fp32_x;
      scalar_vec_t out(fp32_x);
      out.save(residual_p + j);
    }

    float s_variance =
        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
    vec_op::FP32Vec8 fp32_s_variance(s_variance);

    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t w(weight + j);
      scalar_vec_t res(residual_p + j);

      vec_op::FP32Vec8 fp32_w(w);
      vec_op::FP32Vec8 fp32_res(res);

      vec_op::FP32Vec8 fp32_out = fp32_res * fp32_s_variance * fp32_w;

      scalar_vec_t out(fp32_out);
      out.save(input_p + j);
    }
  }
}
} // namespace

void rms_norm(torch::Tensor &out, torch::Tensor &input, torch::Tensor &weight,
              float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] {
    CPU_KERNEL_GUARD_IN(rms_norm_impl)
    rms_norm_impl(out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
                  weight.data_ptr<scalar_t>(), epsilon, num_tokens,
                  hidden_size);
    CPU_KERNEL_GUARD_OUT(rms_norm_impl)
  });
}

void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual,
                        torch::Tensor &weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "fused_add_rms_norm_impl", [&] {
        CPU_KERNEL_GUARD_IN(fused_add_rms_norm_impl)
        fused_add_rms_norm_impl(
            input.data_ptr<scalar_t>(), residual.data_ptr<scalar_t>(),
            weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
        CPU_KERNEL_GUARD_OUT(fused_add_rms_norm_impl)
      });
}
199
csrc_musa/cpu/pos_encoding.cpp
Normal file
@@ -0,0 +1,199 @@
#include "cpu_types.hpp"

namespace {
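// rotary_embedding_impl: GPT-NeoX style rotary embedding. Within each head's
// rotary slice, the first half holds the "x" components and the second half
// the "y" components, so contiguous vector loads can rotate VEC_ELEM_NUM
// pairs at a time: x' = x*cos - y*sin, y' = y*cos + x*sin.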
template <typename scalar_t>
void rotary_embedding_impl(
    const int64_t *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
    scalar_t *__restrict__ query,          // [batch_size, seq_len, num_heads, head_size] or
                                           // [num_tokens, num_heads, head_size]
    scalar_t *__restrict__ key,            // [batch_size, seq_len, num_kv_heads, head_size] or
                                           // [num_tokens, num_kv_heads, head_size]
    const scalar_t *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  constexpr int ELEM_SIZE = sizeof(scalar_t);

  const int embed_dim = rot_dim / 2;
  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    int64_t pos = positions[token_idx];
    const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;

    for (int i = 0; i < num_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
        const int rot_offset = j;
        const int x_index = rot_offset;
        const int y_index = embed_dim + rot_offset;

        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;

        const scalar_vec_t cos(cache_ptr + x_index);
        const scalar_vec_t sin(cache_ptr + y_index);

        const scalar_vec_t q_x(query + out_x);
        const scalar_vec_t q_y(query + out_y);

        vec_op::FP32Vec8 fp32_cos(cos);
        vec_op::FP32Vec8 fp32_sin(sin);

        vec_op::FP32Vec8 fp32_q_x(q_x);
        vec_op::FP32Vec8 fp32_q_y(q_y);

        auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
        scalar_vec_t(out1).save(query + out_x);

        auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
        scalar_vec_t(out2).save(query + out_y);
      }
    }

    for (int i = 0; i < num_kv_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
        const int rot_offset = j;
        const int x_index = rot_offset;
        const int y_index = embed_dim + rot_offset;

        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;

        const scalar_vec_t cos(cache_ptr + x_index);
        const scalar_vec_t sin(cache_ptr + y_index);

        const scalar_vec_t k_x(key + out_x);
        const scalar_vec_t k_y(key + out_y);

        vec_op::FP32Vec8 fp32_cos(cos);
        vec_op::FP32Vec8 fp32_sin(sin);

        vec_op::FP32Vec8 fp32_k_x(k_x);
        vec_op::FP32Vec8 fp32_k_y(k_y);

        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
        scalar_vec_t(out1).save(key + out_x);
        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
        scalar_vec_t(out2).save(key + out_y);
      }
    }
  }
}

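// rotary_embedding_gptj_impl: GPT-J style rotary embedding. The x/y
// components of each rotation pair are interleaved as (2j, 2j+1) within the
// head, which defeats contiguous vector loads, so the rotation is applied
// scalar-wise and the token/head loops are collapsed with OpenMP instead.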
template <typename scalar_t>
void rotary_embedding_gptj_impl(
    const int64_t *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
    scalar_t *__restrict__ query,          // [batch_size, seq_len, num_heads, head_size] or
                                           // [num_tokens, num_heads, head_size]
    scalar_t *__restrict__ key,            // [batch_size, seq_len, num_kv_heads, head_size] or
                                           // [num_tokens, num_kv_heads, head_size]
    const scalar_t *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
  const int embed_dim = rot_dim / 2;

#pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_heads; ++i) {
      int64_t pos = positions[token_idx];
      const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t *cos_cache_ptr = cache_ptr;
      const scalar_t *sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
      scalar_t *head_query = query + token_head;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
        const int y_index = 2 * rot_offset + 1;

        const float cos = cos_cache_ptr[rot_offset];
        const float sin = sin_cache_ptr[rot_offset];

        const float x = head_query[x_index];
        const float y = head_query[y_index];

        head_query[x_index] = x * cos - y * sin;
        head_query[y_index] = y * cos + x * sin;
      }
    }
  }

#pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_kv_heads; ++i) {
      int64_t pos = positions[token_idx];
      const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t *cos_cache_ptr = cache_ptr;
      const scalar_t *sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
      scalar_t *head_key = key + token_head;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
        const int y_index = 2 * rot_offset + 1;

        const float cos = cos_cache_ptr[rot_offset];
        const float sin = sin_cache_ptr[rot_offset];

        const float x = head_key[x_index];
        const float y = head_key[y_index];

        head_key[x_index] = x * cos - y * sin;
        head_key[y_index] = y * cos + x * sin;
      }
    }
  }
}
}; // namespace

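// rotary_embedding: entry point. Head counts and strides are recovered from
// the flattened [..., num_heads * head_size] query/key layouts, and is_neox
// selects between the two cache layouts handled above.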
void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
                      torch::Tensor &key, int head_size,
                      torch::Tensor &cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
  int64_t key_stride = key.stride(-2);
  int64_t query_stride = query.stride(-2);

  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "rotary_embedding_impl", [&] {
        CPU_KERNEL_GUARD_IN(rotary_embedding_impl)
        if (is_neox) {
          rotary_embedding_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
              head_size, num_tokens);
        } else {
          rotary_embedding_gptj_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
              head_size, num_tokens);
        }

        CPU_KERNEL_GUARD_OUT(rotary_embedding_impl)
      });
}
73
csrc_musa/cpu/pybind.cpp
Normal file
@@ -0,0 +1,73 @@
#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>

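// The bindings below expose the CPU kernels to Python. A typical call site
// (assuming the extension builds as the usual vLLM `_C` module; the exact
// import path depends on the build setup) would be:
//   from vllm._C import ops
//   ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache, True)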
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

  // Attention ops
  ops.def(
      "paged_attention_v1",
      &paged_attention_v1,
      "Compute the attention between an input query and the cached "
      "keys/values using PagedAttention.");
  ops.def(
      "paged_attention_v2",
      &paged_attention_v2,
      "PagedAttention V2.");

  // Activation ops
  ops.def(
      "silu_and_mul",
      &silu_and_mul,
      "Activation function used in SwiGLU.");
  ops.def(
      "gelu_and_mul",
      &gelu_and_mul,
      "Activation function used in GeGLU with `none` approximation.");
  ops.def(
      "gelu_tanh_and_mul",
      &gelu_tanh_and_mul,
      "Activation function used in GeGLU with `tanh` approximation.");
  ops.def(
      "gelu_new",
      &gelu_new,
      "GELU implementation used in GPT-2.");
  ops.def(
      "gelu_fast",
      &gelu_fast,
      "Approximate GELU implementation.");

  // Layernorm
  ops.def(
      "rms_norm",
      &rms_norm,
      "Apply Root Mean Square (RMS) Normalization to the input tensor.");

  ops.def(
      "fused_add_rms_norm",
      &fused_add_rms_norm,
      "In-place fused Add and RMS Normalization");

  // Rotary embedding
  ops.def(
      "rotary_embedding",
      &rotary_embedding,
      "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def(
      "swap_blocks",
      &swap_blocks,
      "Swap in (out) the cache blocks from src to dst");
  cache_ops.def(
      "copy_blocks",
      &copy_blocks,
      "Copy the cache blocks from src to dst");
  cache_ops.def(
      "reshape_and_cache",
      &reshape_and_cache,
      "Reshape the key and value tensors and cache them");
}