Fix sgl-kernel cu118 compile issue (#2750)
This commit is contained in:
@@ -302,8 +302,10 @@ static __global__ void __launch_bounds__(512, 1) twoShotAllReduceKernel(AllReduc
|
|||||||
buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
|
buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12))
|
||||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||||
cudaGridDependencySynchronize();
|
cudaGridDependencySynchronize();
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
block_barrier(params.peer_barrier_ptrs_in, params.barrier_flag, params.local_rank, RANKS_PER_NODE, tidx, bidx,
|
block_barrier(params.peer_barrier_ptrs_in, params.barrier_flag, params.local_rank, RANKS_PER_NODE, tidx, bidx,
|
||||||
@@ -350,10 +352,11 @@ static __global__ void __launch_bounds__(512, 1) twoShotAllReduceKernel(AllReduc
|
|||||||
*reinterpret_cast<int4*>(&local_output_buffer[offset_rank]) = *reinterpret_cast<int4*>(&buffers[ii][offset_rank]);
|
*reinterpret_cast<int4*>(&local_output_buffer[offset_rank]) = *reinterpret_cast<int4*>(&buffers[ii][offset_rank]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12))
|
||||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||||
cudaTriggerProgrammaticLaunchCompletion();
|
cudaTriggerProgrammaticLaunchCompletion();
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|||||||
Reference in New Issue
Block a user