CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630)

2025-01-14 03:38:51 +08:00
parent d08c77c434
commit 923f518337
12 changed files with 406 additions and 60 deletions
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -12,8 +12,9 @@ bash "${SCRIPT_DIR}/killall_sglang.sh"
 pip install --upgrade pip
 pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/

-# Force reinstall flashinfer
+# Force reinstall flashinfer and torch_memory_saver
 pip install flashinfer==0.1.6 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
+pip install torch_memory_saver --force-reinstall

 pip install transformers==4.45.2 sentence_transformers accelerate peft