[Feature] Integrate quick allreduce and select the best allreduce implementation (#6619)
Signed-off-by: Haoyang Li <Haoyang.Li@amd.com> Co-authored-by: ilmarkov <imarkov@redhat.com>
This commit is contained in:
@@ -66,6 +66,13 @@ void register_graph_buffers(
|
||||
fptr_t _fa, const std::vector<std::string>& handles, const std::vector<std::vector<int64_t>>& offsets);
|
||||
// Returns a torch::Tensor for the requested `size`.
// NOTE(review): presumably `size` is a byte count for the meta buffer; the
// unit, device and dtype are not visible in this header - confirm against
// the definition.
torch::Tensor allocate_meta_buffer(int64_t size);
|
||||
// Takes `inp` by non-const reference and returns a torch::Tensor.
// NOTE(review): presumably the returned tensor carries an IPC handle for the
// memory backing `inp` - confirm against the definition.
torch::Tensor get_meta_buffer_ipc_handle(torch::Tensor& inp);
|
||||
// quick allreduce
|
||||
// Creates a quick-allreduce context and returns it as an opaque fptr_t.
//   rank        - index of the calling participant.
//   world_size  - total number of participants.
//   qr_max_size - optional size limit; defaults to std::nullopt.
// NOTE(review): the behavior when qr_max_size is empty is not visible here;
// presumably a built-in default is used - confirm against the definition.
fptr_t init_custom_qr(int64_t rank, int64_t world_size, std::optional<int64_t> qr_max_size = std::nullopt);
|
||||
// Tears down the quick-allreduce context identified by `_fa`.
// NOTE(review): presumably `_fa` must be a value returned by init_custom_qr
// and must not be used afterwards - confirm against the definition.
void qr_destroy(fptr_t _fa);
|
||||
// Returns a torch::Tensor associated with the quick-allreduce context `_fa`.
// NOTE(review): presumably this is this rank's handle, to be exchanged with
// peers and passed to qr_open_handles - confirm against the definition.
torch::Tensor qr_get_handle(fptr_t _fa);
|
||||
// Supplies a list of handle tensors to the quick-allreduce context `_fa`.
// NOTE(review): presumably `handles` holds one qr_get_handle result per rank;
// the expected ordering and length are not visible here - confirm.
void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
|
||||
// Runs a quick allreduce on context `_fa`, taking `inp` and `out` by
// non-const reference.
//   quant_level   - integer selector. NOTE(review): presumably chooses the
//                   quantization mode; valid values are not visible here.
//   cast_bf2half  - defaults to false. NOTE(review): the name suggests a
//                   bfloat16-to-half cast - confirm against the definition.
void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, int64_t quant_level, bool cast_bf2half = false);
|
||||
// Returns a size limit for quick allreduce as an int64_t.
// NOTE(review): the unit (presumably bytes) and how it relates to the
// `qr_max_size` argument of init_custom_qr are not visible here - confirm.
int64_t qr_max_size();
|
||||
#else
|
||||
// custom allreduce
|
||||
fptr_t
|
||||
@@ -77,6 +84,8 @@ std::tuple<std::vector<int64_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta
|
||||
// Registers a buffer with the custom-allreduce context `_fa`, described by a
// list of fptr_t values.
// NOTE(review): presumably `fake_ipc_ptrs` holds one peer pointer per rank -
// confirm against the definition.
void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs);
|
||||
// Registers graph buffers with the custom-allreduce context `_fa`.
// In this #else branch `handles` is a vector of int64_t vectors; the
// declaration before the #else takes std::vector<std::string> instead.
// NOTE(review): presumably `handles` and `offsets` are indexed per rank -
// confirm against the definition.
void register_graph_buffers(
|
||||
fptr_t _fa, const std::vector<std::vector<int64_t>>& handles, const std::vector<std::vector<int64_t>>& offsets);
|
||||
|
||||
// mscclpp
|
||||
// Returns a torch::Tensor holding a unique id, and takes no arguments.
// NOTE(review): presumably this is the `unique_id` later passed to
// mscclpp_init_context - confirm against the definition.
torch::Tensor mscclpp_generate_unique_id();
|
||||
fptr_t mscclpp_init_context(
|
||||
const torch::Tensor& unique_id,
|
||||
|
||||
Reference in New Issue
Block a user