CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630)

This commit is contained in:
fzyzcjy
2025-01-14 03:38:51 +08:00
committed by GitHub
parent d08c77c434
commit 923f518337
12 changed files with 406 additions and 60 deletions

View File

@@ -23,7 +23,6 @@ from typing import List, Optional
import torch
from sglang.srt.hf_transformers_utils import check_gguf_file
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.utils import (
get_amdgpu_memory_capacity,
get_hpu_memory_capacity,
@@ -157,6 +156,7 @@ class ServerArgs:
triton_attention_num_kv_splits: int = 8
num_continuous_decode_steps: int = 1
delete_ckpt_after_loading: bool = False
enable_memory_saver: bool = False
def __post_init__(self):
# Set missing default values
@@ -854,6 +854,11 @@ class ServerArgs:
action="store_true",
help="Delete the model checkpoint after loading the model.",
)
parser.add_argument(
"--enable-memory-saver",
action="store_true",
help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):