CPU: map changes from developing branch in sgl-kernel (#6833)

Author: YanbingJiang
Date: 2025-06-10 16:08:15 +08:00 (committed by GitHub)
Co-authored-by: mingfeima <mingfei.ma@intel.com>
Commit: fcde67b016 (parent: 81372f3bef)

20 changed files with 1321 additions and 321 deletions


@@ -44,7 +44,10 @@ std::tuple<at::Tensor, at::Tensor> grouped_topk_cpu(
     int64_t topk,
     bool renormalize,
     int64_t num_expert_group,
-    int64_t topk_group);
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded);
 
 std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
     at::Tensor& hidden_states,
@@ -53,7 +56,10 @@ std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
     int64_t topk,
     bool renormalize,
     int64_t num_expert_group,
-    int64_t topk_group);
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded);
 
 // attention
 void decode_attention_cpu(
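Both routing entry points gain the same three trailing parameters: num_fused_shared_experts, an optional routed_scaling_factor, and an optional num_token_non_padded tensor. A minimal hedged sketch of calling the extended declaration (tensor shapes and argument values are illustrative assumptions, not taken from this commit):

    #include <ATen/ATen.h>
    #include <optional>
    #include <tuple>

    // Extended declaration from the header above.
    std::tuple<at::Tensor, at::Tensor> grouped_topk_cpu(
        at::Tensor& hidden_states, at::Tensor& gating_output, int64_t topk,
        bool renormalize, int64_t num_expert_group, int64_t topk_group,
        int64_t num_fused_shared_experts,
        std::optional<double> routed_scaling_factor,
        std::optional<at::Tensor> num_token_non_padded);

    void example() {
      at::Tensor hidden_states = at::randn({4, 7168});  // [num_tokens, hidden_size], assumed
      at::Tensor gating_output = at::randn({4, 256});   // [num_tokens, num_experts], assumed
      auto [topk_weights, topk_ids] = grouped_topk_cpu(
          hidden_states, gating_output,
          /*topk=*/8, /*renormalize=*/true,
          /*num_expert_group=*/8, /*topk_group=*/4,
          /*num_fused_shared_experts=*/1,          // new in this commit
          /*routed_scaling_factor=*/2.5,           // new in this commit
          /*num_token_non_padded=*/std::nullopt);  // new in this commit
    }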
@@ -182,6 +188,26 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope(
     bool is_vnni,
     std::optional<std::vector<int64_t>> block_size);
 
+std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope_fused_weight(
+    at::Tensor& hidden_states,
+    at::Tensor& qkv_a_proj_weight,
+    at::Tensor& q_b_proj_weight,
+    at::Tensor& w_kc,
+    at::Tensor& q_a_layernorm_weight,
+    at::Tensor& kv_a_layernorm_weight,
+    at::Tensor& positions,
+    at::Tensor& cos_sin_cache,
+    double eps,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    std::optional<at::Tensor> qkv_a_proj_scale,
+    std::optional<at::Tensor> q_b_proj_scale,
+    bool is_vnni,
+    std::optional<std::vector<int64_t>> block_size,
+    int64_t q_lora_rank,
+    int64_t kv_lora_rank,
+    int64_t qk_rope_head_dim);
+
 // shared memory init
 void initialize(int64_t size, int64_t rank);
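Unlike the existing qkv_proj_with_rope, whose schema (below) carries a separate kv_a_proj_scale, the fused-weight variant receives a single qkv_a_proj_weight plus explicit q_lora_rank, kv_lora_rank, and qk_rope_head_dim so the kernel can split the fused projection internally. A hedged call sketch, assuming the declaration above is in scope; the ranks and flag values are illustrative DeepSeek-style assumptions, and the names of the three returned tensors are mine:

    // Hedged sketch of invoking the new fused-weight declaration; all shapes,
    // ranks, and flag values are illustrative assumptions.
    void example_fused(
        at::Tensor& hidden_states,      // [num_tokens, hidden_size]
        at::Tensor& qkv_a_proj_weight,  // fused low-rank q/kv "a" projection
        at::Tensor& q_b_proj_weight,
        at::Tensor& w_kc,
        at::Tensor& q_a_layernorm_weight,
        at::Tensor& kv_a_layernorm_weight,
        at::Tensor& positions,
        at::Tensor& cos_sin_cache) {
      auto [q, k, v] = qkv_proj_with_rope_fused_weight(
          hidden_states, qkv_a_proj_weight, q_b_proj_weight, w_kc,
          q_a_layernorm_weight, kv_a_layernorm_weight, positions, cos_sin_cache,
          /*eps=*/1e-6,
          /*use_int8_w8a8=*/false, /*use_fp8_w8a16=*/false,
          /*qkv_a_proj_scale=*/std::nullopt, /*q_b_proj_scale=*/std::nullopt,
          /*is_vnni=*/true,
          /*block_size=*/std::nullopt,
          /*q_lora_rank=*/1536, /*kv_lora_rank=*/512,  // assumed ranks
          /*qk_rope_head_dim=*/64);                    // assumed head dim
    }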
@@ -221,13 +247,15 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
   m.impl("topk_softmax_cpu", torch::kCPU, &topk_softmax_cpu);
 
   m.def(
       "grouped_topk_cpu(Tensor hidden_states, Tensor gating_output, int topk, bool renormalize, int num_expert_group, "
-      "int topk_group) -> (Tensor, Tensor)");
+      "int topk_group, int num_fused_shared_experts, float? routed_scaling_factor, Tensor? num_token_non_padded) -> "
+      "(Tensor, Tensor)");
   m.impl("grouped_topk_cpu", torch::kCPU, &grouped_topk_cpu);
 
   // biased group topk
   m.def(
       "biased_grouped_topk_cpu(Tensor hidden_states, Tensor gating_output, Tensor correction_bias, int topk, bool "
-      "renormalize, int num_expert_group, int topk_group) -> (Tensor, Tensor)");
+      "renormalize, int num_expert_group, int topk_group, int num_fused_shared_experts, float? routed_scaling_factor, "
+      "Tensor? num_token_non_padded) -> (Tensor, Tensor)");
   m.impl("biased_grouped_topk_cpu", torch::kCPU, &biased_grouped_topk_cpu);
 
   // decode
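In these schema strings, int lowers to C++ int64_t, float? to std::optional<double>, and Tensor? to std::optional<at::Tensor>, which matches how the new arguments appear in the header declarations above. A hedged sketch of reaching the registered op through the PyTorch dispatcher (the lookup pattern is standard PyTorch; the argument values are illustrative):

    #include <ATen/ATen.h>
    #include <ATen/core/dispatch/Dispatcher.h>

    std::tuple<at::Tensor, at::Tensor> dispatch_grouped_topk(
        const at::Tensor& hidden_states, const at::Tensor& gating_output) {
      // The "sgl_kernel::" namespace comes from TORCH_LIBRARY_FRAGMENT(sgl_kernel, m).
      static auto op =
          c10::Dispatcher::singleton()
              .findSchemaOrThrow("sgl_kernel::grouped_topk_cpu", "")
              .typed<std::tuple<at::Tensor, at::Tensor>(
                  at::Tensor, at::Tensor, int64_t, bool, int64_t, int64_t,
                  int64_t, std::optional<double>, std::optional<at::Tensor>)>();
      return op.call(hidden_states, gating_output, /*topk=*/8,
                     /*renormalize=*/true, /*num_expert_group=*/8,
                     /*topk_group=*/4, /*num_fused_shared_experts=*/0,
                     /*routed_scaling_factor=*/std::nullopt,
                     /*num_token_non_padded=*/std::nullopt);
    }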
@@ -294,6 +322,14 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
       "q_b_proj_scale, Tensor? "
       "kv_a_proj_scale, bool is_vnni, int[]? block_size) -> (Tensor, Tensor, Tensor)");
   m.impl("qkv_proj_with_rope", torch::kCPU, &qkv_proj_with_rope);
+  m.def(
+      "qkv_proj_with_rope_fused_weight(Tensor hidden_states, Tensor qkv_a_proj_weight, Tensor q_b_proj_weight, "
+      "Tensor w_kc, Tensor q_a_layernorm_weight, Tensor kv_a_layernorm_weight, Tensor positions, "
+      "Tensor cos_sin_cache, float eps, bool use_int8_w8a8, bool use_fp8_w8a16, Tensor? qkv_a_proj_scale, Tensor? "
+      "q_b_proj_scale,"
+      "bool is_vnni, int[]? block_size, int q_lora_rank, int kv_lora_rank,"
+      "int qk_rope_head_dim) -> (Tensor, Tensor, Tensor)");
+  m.impl("qkv_proj_with_rope_fused_weight", torch::kCPU, &qkv_proj_with_rope_fused_weight);
 
   // shared expert
   m.def(
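One note on the design choice here: registering the fused-weight path as a separate op, rather than widening the existing qkv_proj_with_rope schema, leaves the original signature untouched for current callers. Since both are defined inside TORCH_LIBRARY_FRAGMENT(sgl_kernel, m), they surface in Python as torch.ops.sgl_kernel.qkv_proj_with_rope and torch.ops.sgl_kernel.qkv_proj_with_rope_fused_weight respectively, so callers can opt into the fused variant explicitly.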