CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630)

This commit is contained in:
fzyzcjy
2025-01-14 03:38:51 +08:00
committed by GitHub
parent d08c77c434
commit 923f518337
12 changed files with 406 additions and 60 deletions

View File

@@ -29,6 +29,7 @@ suites = {
"test_openai_server.py",
"test_pytorch_sampling_backend.py",
"test_radix_attention.py",
"test_release_memory_occupation.py",
"test_retract_decode.py",
"test_server_args.py",
"test_session_control.py",