CPU: map changes from developing branch in sgl-kernel (#6833)

Co-authored-by: mingfeima <mingfei.ma@intel.com>
2025-06-10 16:08:15 +08:00
parent 81372f3bef
commit fcde67b016
20 changed files with 1321 additions and 321 deletions
--- a/sgl-kernel/csrc/cpu/vec.h
+++ b/sgl-kernel/csrc/cpu/vec.h
@@ -30,6 +30,22 @@ convert_from_float_ext<at::BFloat16>(const Vectorized<float>& a, const Vectorize

 #define CVT_FP16_TO_FP32(a) _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))

+// this doesn't handle NaN.
+inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
+  const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);
+
+  const __m512i mant = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x07)), 4);
+  const __m512i raw_exp = _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x78)), 3);
+  const __m512i exp = _mm512_slli_epi16(_mm512_add_epi16(raw_exp, _mm512_set1_epi16(120)), 7);
+  const __m512i nonsign = _mm512_or_si512(exp, mant);
+
+  const __m512i sign = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x80)), 8);
+  const __m512i combined = _mm512_or_si512(nonsign, sign);
+
+  const __mmask32 is_nonzero = _mm512_cmpneq_epi16_mask(x, _mm512_setzero_si512());
+  return (__m512bh)_mm512_maskz_mov_epi16(is_nonzero, combined);
+}
+
 inline __m512bh cvt_e4m3_bf16_intrinsic_without_denorm(__m256i fp8_vec) {
  // The following conversion is without denorm behavior, that is to say,
  //   Max subnorm   : S.0000.111 = 0.875 ∗ 2**(−6)
@@ -84,7 +100,7 @@ inline __m512bh cvt_e4m3_bf16_intrinsic_with_denorm(__m256i fp8_vec) {

 inline __m512bh CVT_FP8_TO_BF16(__m256i a) {
 #ifdef SGLANG_CPU_FP8_CVT_FTZ
-  return cvt_e4m3_bf16_intrinsic_without_denorm(a);
+  return cvt_e4m3_bf16_intrinsic_no_nan(a);
 #else
  return cvt_e4m3_bf16_intrinsic_with_denorm(a);
 #endif
@@ -172,4 +188,102 @@ inline void quantize_row_int8<at::BFloat16>(
 }
 #endif

+// transpose utils
+// taken from my PR in ggml: https://github.com/ggml-org/llama.cpp/pull/8998
+#if defined(CPU_CAPABILITY_AVX512)
+inline void transpose_16x16_32bit(__m512i* v) {
+  __m512i v1[16];
+  v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
+  v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
+  v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
+  v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
+  v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
+  v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
+  v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
+  v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
+  v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
+  v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
+  v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
+  v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
+  v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
+  v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
+  v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
+  v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);
+
+  v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
+  v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
+  v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
+  v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
+  v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
+  v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
+  v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
+  v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
+  v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
+  v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
+  v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
+  v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
+  v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
+  v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
+  v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
+  v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);
+
+  v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
+  v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
+  v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
+  v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
+  v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
+  v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
+  v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
+  v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
+  v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
+  v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
+  v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
+  v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
+  v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
+  v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
+  v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
+  v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);
+
+  v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
+  v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
+  v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
+  v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
+  v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
+  v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
+  v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
+  v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
+  v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
+  v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
+  v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
+  v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
+  v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
+  v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
+  v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
+  v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
+}
+
+// remove warning : ignoring attributes on template argument ‘__m512i’ [-Wignored-attributes]
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+
+// transpose from [2, 32] to [32, 2]
+inline std::tuple<__m512i, __m512i> transpose_2x32_16bit(__m512i r0, __m512i r1) {
+  // r0: {a0, a1, ..., a31}
+  // r1: {b0, b1, ..., b31}
+  //
+  // d0: {a0,   b0, ..., a15, b15}
+  // d1: {a16, b16, ..., a31, b31}
+  //
+  __m512i d0 = _mm512_unpacklo_epi16(r0, r1);
+  __m512i d1 = _mm512_unpackhi_epi16(r0, r1);
+  r0 = _mm512_shuffle_i32x4(d0, d1, 0x88);
+  r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd);
+  d0 = _mm512_shuffle_i32x4(r0, r1, 0x88);
+  d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd);
+  return std::make_tuple(d0, d1);
+}
+#pragma GCC diagnostic pop
+
+#endif
+
 }  // anonymous namespace