Support copying tensor from cpu to gpu without using copy engines (#10007)

2025-09-05 20:07:19 +08:00
parent 5e5c30d9ab
commit bd7f882142
6 changed files with 70 additions and 1 deletion
--- a/sgl-kernel/csrc/common_extension.cc
+++ b/sgl-kernel/csrc/common_extension.cc
@@ -445,6 +445,9 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
      "qserve_w4a8_per_group_gemm(Tensor _in_feats, Tensor _kernel, Tensor _zeros, Tensor _scales_i8, Tensor _wscales, "
      "Tensor _ascales, Tensor! _out_feats) -> ()");
  m.impl("qserve_w4a8_per_group_gemm", torch::kCUDA, &qserve_w4a8_per_group_gemm);
+
+  m.def("copy_to_gpu_no_ce(Tensor input, Tensor! output) -> ()");
+  m.impl("copy_to_gpu_no_ce", torch::kCUDA, &copy_to_gpu_no_ce);
 }

 REGISTER_EXTENSION(common_ops)