CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630)

2025-01-14 03:38:51 +08:00
parent d08c77c434
commit 923f518337
12 changed files with 406 additions and 60 deletions
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -29,6 +29,7 @@ suites = {
        "test_openai_server.py",
        "test_pytorch_sampling_backend.py",
        "test_radix_attention.py",
+        "test_release_memory_occupation.py",
        "test_retract_decode.py",
        "test_server_args.py",
        "test_session_control.py",