diff --git a/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp b/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp index 4dc4f704f..44257dec5 100644 --- a/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp +++ b/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp @@ -342,7 +342,6 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) { // all reduce m.def("initialize(int size, int rank) -> ()"); - m.impl("initialize", torch::kCPU, &initialize); m.def("shm_allreduce(Tensor data, int reduce_op) -> ()"); m.impl("shm_allreduce", torch::kCPU, &shm_allreduce); m.def("shm_allgather(Tensor data, int dim) -> Tensor"); @@ -360,6 +359,7 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) { TORCH_LIBRARY_IMPL(sgl_kernel, CatchAll, m) { m.impl("init_cpu_threads_env", init_cpu_threads_env); + m.impl("initialize", &initialize); } REGISTER_EXTENSION(common_ops)