From 75cd34d1723871bf405def8378dec2a2c3f308a6 Mon Sep 17 00:00:00 2001 From: maxiao1 Date: Mon, 3 Nov 2025 10:17:53 +0800 Subject: [PATCH] change sgl_kernel WARP_SIZE to 64 --- sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu | 2 +- sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu | 2 +- sgl-kernel/csrc/kvcacheio/transfer.cu | 2 +- sgl-kernel/csrc/quantization/gguf/ggml-common.h | 2 +- sgl-kernel/include/utils.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu b/sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu index 79180737f..297e51daa 100644 --- a/sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu +++ b/sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu @@ -25,7 +25,7 @@ #define INTRIN_M 16 #define INTRIN_N 16 #define INTRIN_K 32 -#define WARP_SIZE 32 +#define WARP_SIZE 64 #define SMEM_PAD_A 0 #define SMEM_PAD_B 0 #define PACK_SIZE 16 diff --git a/sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu b/sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu index a99a203e8..ebf2604c3 100644 --- a/sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu +++ b/sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu @@ -25,7 +25,7 @@ #define INTRIN_M 16 #define INTRIN_N 16 #define INTRIN_K 32 -#define WARP_SIZE 32 +#define WARP_SIZE 64 #define SMEM_PAD_A 0 #define SMEM_PAD_B 0 #define PACK_SIZE 16 diff --git a/sgl-kernel/csrc/kvcacheio/transfer.cu b/sgl-kernel/csrc/kvcacheio/transfer.cu index bca9f326c..8898d4cfa 100644 --- a/sgl-kernel/csrc/kvcacheio/transfer.cu +++ b/sgl-kernel/csrc/kvcacheio/transfer.cu @@ -5,7 +5,7 @@ #include #ifndef USE_ROCM -#define WARP_SIZE 32 +#define WARP_SIZE 64 #include "pytorch_extension_utils.h" #else #include "pytorch_extension_utils_rocm.h" diff --git a/sgl-kernel/csrc/quantization/gguf/ggml-common.h b/sgl-kernel/csrc/quantization/gguf/ggml-common.h index f6fbe57aa..b246fca44 100644 --- a/sgl-kernel/csrc/quantization/gguf/ggml-common.h +++ b/sgl-kernel/csrc/quantization/gguf/ggml-common.h @@ -3,7 +3,7 @@ // copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-common.h #define QK_K 256 #define K_QUANTS_PER_ITERATION 2 -#define WARP_SIZE_GGUF 32 +#define WARP_SIZE_GGUF 64 #define K_SCALE_SIZE 12 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 #define CUDA_QUANTIZE_BLOCK_SIZE 256 diff --git a/sgl-kernel/include/utils.h b/sgl-kernel/include/utils.h index d34ee86c3..c9152f5a2 100644 --- a/sgl-kernel/include/utils.h +++ b/sgl-kernel/include/utils.h @@ -340,7 +340,7 @@ inline bool getEnvEnablePDL() { #define CEILDIV(x, y) (((x) + (y) - 1) / (y)) #ifndef USE_ROCM -#define WARP_SIZE 32 +#define WARP_SIZE 64 #else #if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__) #define WARP_SIZE 64