CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630)

2025-01-14 03:38:51 +08:00
parent d08c77c434
commit 923f518337
12 changed files with 406 additions and 60 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -23,7 +23,6 @@ from typing import List, Optional
 import torch

 from sglang.srt.hf_transformers_utils import check_gguf_file
-from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
    get_amdgpu_memory_capacity,
    get_hpu_memory_capacity,
@@ -157,6 +156,7 @@ class ServerArgs:
    triton_attention_num_kv_splits: int = 8
    num_continuous_decode_steps: int = 1
    delete_ckpt_after_loading: bool = False
+    enable_memory_saver: bool = False

    def __post_init__(self):
        # Set missing default values
@@ -854,6 +854,11 @@ class ServerArgs:
            action="store_true",
            help="Delete the model checkpoint after loading the model.",
        )
+        parser.add_argument(
+            "--enable-memory-saver",
+            action="store_true",
+            help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
+        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):