CPU: map changes from developing branch in sgl-kernel (#6833)

Co-authored-by: mingfeima <mingfei.ma@intel.com>
2025-06-10 16:08:15 +08:00
parent 81372f3bef
commit fcde67b016
20 changed files with 1321 additions and 321 deletions
--- a/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp
+++ b/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp
@@ -44,7 +44,10 @@ std::tuple<at::Tensor, at::Tensor> grouped_topk_cpu(
    int64_t topk,
    bool renormalize,
    int64_t num_expert_group,
-    int64_t topk_group);
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded);

 std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
    at::Tensor& hidden_states,
@@ -53,7 +56,10 @@ std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
    int64_t topk,
    bool renormalize,
    int64_t num_expert_group,
-    int64_t topk_group);
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded);

 // attention
 void decode_attention_cpu(
@@ -182,6 +188,26 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope(
    bool is_vnni,
    std::optional<std::vector<int64_t>> block_size);

+std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope_fused_weight(
+    at::Tensor& hidden_states,
+    at::Tensor& qkv_a_proj_weight,
+    at::Tensor& q_b_proj_weight,
+    at::Tensor& w_kc,
+    at::Tensor& q_a_layernorm_weight,
+    at::Tensor& kv_a_layernorm_weight,
+    at::Tensor& positions,
+    at::Tensor& cos_sin_cache,
+    double eps,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    std::optional<at::Tensor> qkv_a_proj_scale,
+    std::optional<at::Tensor> q_b_proj_scale,
+    bool is_vnni,
+    std::optional<std::vector<int64_t>> block_size,
+    int64_t q_lora_rank,
+    int64_t kv_lora_rank,
+    int64_t qk_rope_head_dim);
+
 // shared memory init
 void initialize(int64_t size, int64_t rank);

@@ -221,13 +247,15 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
  m.impl("topk_softmax_cpu", torch::kCPU, &topk_softmax_cpu);
  m.def(
      "grouped_topk_cpu(Tensor hidden_states, Tensor gating_output, int topk, bool renormalize, int num_expert_group, "
-      "int topk_group) -> (Tensor, Tensor)");
+      "int topk_group, int num_fused_shared_experts, float? routed_scaling_factor, Tensor? num_token_non_padded) -> "
+      "(Tensor, Tensor)");
  m.impl("grouped_topk_cpu", torch::kCPU, &grouped_topk_cpu);

  // biased group topk
  m.def(
      "biased_grouped_topk_cpu(Tensor hidden_states, Tensor gating_output, Tensor correction_bias, int topk, bool "
-      "renormalize, int num_expert_group, int topk_group) -> (Tensor, Tensor)");
+      "renormalize, int num_expert_group, int topk_group, int num_fused_shared_experts, float? routed_scaling_factor, "
+      "Tensor? num_token_non_padded) -> (Tensor, Tensor)");
  m.impl("biased_grouped_topk_cpu", torch::kCPU, &biased_grouped_topk_cpu);

  // decode
@@ -294,6 +322,14 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
      "q_b_proj_scale, Tensor? "
      "kv_a_proj_scale, bool is_vnni, int[]? block_size) -> (Tensor, Tensor, Tensor)");
  m.impl("qkv_proj_with_rope", torch::kCPU, &qkv_proj_with_rope);
+  m.def(
+      "qkv_proj_with_rope_fused_weight(Tensor hidden_states, Tensor qkv_a_proj_weight, Tensor q_b_proj_weight, "
+      "Tensor w_kc, Tensor q_a_layernorm_weight, Tensor kv_a_layernorm_weight, Tensor positions, "
+      "Tensor cos_sin_cache, float eps, bool use_int8_w8a8, bool use_fp8_w8a16, Tensor? qkv_a_proj_scale, Tensor? "
+      "q_b_proj_scale,"
+      "bool is_vnni, int[]? block_size, int q_lora_rank, int kv_lora_rank,"
+      "int qk_rope_head_dim) -> (Tensor, Tensor, Tensor)");
+  m.impl("qkv_proj_with_rope_fused_weight", torch::kCPU, &qkv_proj_with_rope_fused_weight);

  // shared expert
  m.def(