Support copying a tensor from CPU to GPU without using copy engines (#10007)

This commit is contained in:
fzyzcjy
2025-09-05 20:07:19 +08:00
committed by GitHub
parent 5e5c30d9ab
commit bd7f882142
6 changed files with 70 additions and 1 deletion

View File

@@ -445,6 +445,9 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
"qserve_w4a8_per_group_gemm(Tensor _in_feats, Tensor _kernel, Tensor _zeros, Tensor _scales_i8, Tensor _wscales, "
"Tensor _ascales, Tensor! _out_feats) -> ()");
m.impl("qserve_w4a8_per_group_gemm", torch::kCUDA, &qserve_w4a8_per_group_gemm);
// Declare the operator schema for copy_to_gpu_no_ce: it takes two tensors,
// `input` and `output`, and returns nothing. The `Tensor!` annotation marks
// `output` as written in place by the op.
// NOTE(review): per the commit title this op copies a tensor from CPU to GPU
// without using the copy engines; presumably `input` is the host-side tensor
// and `output` the device-side destination — confirm against the kernel.
m.def("copy_to_gpu_no_ce(Tensor input, Tensor! output) -> ()");
// Register the C++ function copy_to_gpu_no_ce as this op's implementation for
// the CUDA dispatch key.
m.impl("copy_to_gpu_no_ce", torch::kCUDA, &copy_to_gpu_no_ce);
}
REGISTER_EXTENSION(common_ops)