Multi-Stage Awake: Support Resume and Pause KV Cache and Weights separately (#7099)
This commit is contained in:
@@ -30,6 +30,7 @@ from sglang.srt import debug_utils
|
||||
from sglang.srt.configs.device_config import DeviceConfig
|
||||
from sglang.srt.configs.load_config import LoadConfig
|
||||
from sglang.srt.configs.model_config import AttentionArch, ModelConfig
|
||||
+ from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
|
||||
from sglang.srt.distributed import (
|
||||
get_tp_group,
|
||||
get_world_group,
|
||||
@@ -222,6 +223,7 @@ class ModelRunner:
|
||||
|
||||
def initialize(self, min_per_gpu_memory: float):
|
||||
server_args = self.server_args
|
||||
|
||||
self.memory_saver_adapter = TorchMemorySaverAdapter.create(
|
||||
enable=self.server_args.enable_memory_saver
|
||||
)
|
||||
@@ -547,7 +549,7 @@ class ModelRunner:
|
||||
monkey_patch_vllm_parallel_state()
|
||||
monkey_patch_isinstance_for_vllm_base_layer()
|
||||
|
||||
- with self.memory_saver_adapter.region():
|
||||
+ with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_WEIGHTS):
|
||||
self.model = get_model(
|
||||
model_config=self.model_config,
|
||||
load_config=self.load_config,
|
||||
|
||||
Reference in New Issue
Block a user