optimize custom allreduce kernel (#2904)

This commit is contained in:
yizhang2077
2025-01-16 03:04:25 +08:00
committed by GitHub
parent f65c13b559
commit 6cb3974e77
9 changed files with 244 additions and 80 deletions

View File

@@ -40,7 +40,7 @@ nvcc_flags = [
"-U__CUDA_NO_HALF2_OPERATORS__",
]
cxx_flags = ["-O3"]
-libraries = ["c10", "torch", "torch_python"]
+libraries = ["c10", "torch", "torch_python", "cuda"]
extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib"]
ext_modules = [
CUDAExtension(