mirror fix for custom allreduce (#3124)
This commit is contained in:
@@ -160,7 +160,7 @@ __inline__ __device__ void block_barrier(uint32_t** signals, uint32_t const flag
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, int RANKS_PER_NODE, bool COPY_INPUT = true>
|
template <typename T, int RANKS_PER_NODE, bool COPY_INPUT = true>
|
||||||
static __global__ void oneShotAllReduceKernel(AllReduceParams params) {
|
static __global__ void __launch_bounds__(512, 1) oneShotAllReduceKernel(AllReduceParams params) {
|
||||||
// Suppose that two GPUs participate in the AR exchange, and we start four blocks.
|
// Suppose that two GPUs participate in the AR exchange, and we start four blocks.
|
||||||
// The message is partitioned into chunks as detailed below:
|
// The message is partitioned into chunks as detailed below:
|
||||||
// message
|
// message
|
||||||
|
|||||||
Reference in New Issue
Block a user