Multi-Stage Awake: Support Resume and Pause KV Cache and Weights separately (#7099)

This commit is contained in:
Stefan He
2025-06-19 00:56:37 -07:00
committed by GitHub
parent 9179ea1595
commit 3774f07825
14 changed files with 297 additions and 108 deletions

View File

@@ -30,6 +30,7 @@ from sglang.srt import debug_utils
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.model_config import AttentionArch, ModelConfig
from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
from sglang.srt.distributed import (
get_tp_group,
get_world_group,
@@ -222,6 +223,7 @@ class ModelRunner:
def initialize(self, min_per_gpu_memory: float):
server_args = self.server_args
self.memory_saver_adapter = TorchMemorySaverAdapter.create(
enable=self.server_args.enable_memory_saver
)
@@ -547,7 +549,7 @@ class ModelRunner:
monkey_patch_vllm_parallel_state()
monkey_patch_isinstance_for_vllm_base_layer()
with self.memory_saver_adapter.region():
with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_WEIGHTS):
self.model = get_model(
model_config=self.model_config,
load_config=self.load_config,