Overlapped weight offload (#8034)

This commit is contained in:
fzyzcjy
2025-08-23 17:06:46 +08:00
committed by GitHub
parent ccd3fb946e
commit 2600fc0d47
9 changed files with 584 additions and 10 deletions

View File

@@ -2954,3 +2954,13 @@ class ConcurrentCounter:
@lru_cache(maxsize=1)
def is_triton_kernels_available() -> bool:
    """Report whether the ``triton_kernels`` package can be located.

    Only the import machinery is queried (via ``find_spec``); the package
    itself is not imported here. The answer is memoized by ``lru_cache``,
    so the lookup runs at most once per process.
    """
    triton_kernels_spec = importlib.util.find_spec("triton_kernels")
    return triton_kernels_spec is not None
def check_cuda_result(raw_output) -> list:
    """Unwrap the result of a ``cuda.bindings.runtime`` call.

    Args:
        raw_output: An iterable whose first element is the status code of
            the call and whose remaining elements are its outputs.
            NOTE(review): presumably the tuple returned by cuda-python
            runtime functions -- confirm against callers.

    Returns:
        A list with every element of ``raw_output`` after the status code
        (empty when the call produced no outputs).

    Raises:
        RuntimeError: If the status code is not ``cudaSuccess``.
    """
    # Imported lazily so that importing this module does not require the
    # cuda bindings to be installed.
    import cuda.bindings.runtime as cuda_rt

    err, *results = raw_output
    if err != cuda_rt.cudaError_t.cudaSuccess:
        # RuntimeError (a subclass of Exception) lets callers catch CUDA
        # failures specifically while existing ``except Exception``
        # handlers keep working.
        raise RuntimeError(f"CUDA error: {err}")
    return results