From 98522149ff422d4700bf43dc6c944ee70cf2b516 Mon Sep 17 00:00:00 2001 From: yizhang2077 <1109276519@qq.com> Date: Sat, 25 Jan 2025 18:26:41 +0800 Subject: [PATCH] mirror fix for custom allreduce (#3124) --- sgl-kernel/src/sgl-kernel/csrc/trt_reduce_internal.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgl-kernel/src/sgl-kernel/csrc/trt_reduce_internal.cu b/sgl-kernel/src/sgl-kernel/csrc/trt_reduce_internal.cu index 006c3200d..8bdb50125 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/trt_reduce_internal.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/trt_reduce_internal.cu @@ -160,7 +160,7 @@ __inline__ __device__ void block_barrier(uint32_t** signals, uint32_t const flag } template -static __global__ void oneShotAllReduceKernel(AllReduceParams params) { +static __global__ void __launch_bounds__(512, 1) oneShotAllReduceKernel(AllReduceParams params) { // Suppose that two GPUs participate in the AR exchange, and we start four blocks. // The message is partitioned into chunks as detailed below: // message