CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630)

2025-01-14 03:38:51 +08:00
parent d08c77c434
commit 923f518337
12 changed files with 406 additions and 60 deletions
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -44,6 +44,7 @@ srt_hpu = ["sglang[runtime_common]"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
+torch_memory_saver = ["torch_memory_saver"]
 test = [
    "jsonlines",
    "matplotlib",