[Feature] Option to save model weights to CPU when memory saver mode is enabled (#10873)

Co-authored-by: molocule <34072934+molocule@users.noreply.github.com>
This commit is contained in:
Matt Nappo
2025-10-03 04:48:19 -04:00
committed by GitHub
parent 34151f173b
commit 8c57490210
7 changed files with 78 additions and 22 deletions

View File

@@ -25,9 +25,7 @@ import time
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from urllib.parse import urlparse
import requests
import torch
import torch.distributed as dist
@@ -36,7 +34,6 @@ from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig, LoadFormat
from sglang.srt.configs.model_config import AttentionArch, ModelConfig
from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp
from sglang.srt.connector import ConnectorType
from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
from sglang.srt.distributed import (
get_pp_group,
@@ -132,7 +129,6 @@ from sglang.srt.utils import (
get_bool_env_var,
get_cpu_ids_by_node,
init_custom_process_group,
is_blackwell,
is_fa3_default_architecture,
is_flashinfer_available,
is_hip,
@@ -143,7 +139,6 @@ from sglang.srt.utils import (
log_info_on_rank0,
monkey_patch_p2p_access_check,
monkey_patch_vllm_gguf_config,
parse_connector_type,
set_cuda_arch,
)
from sglang.srt.weight_sync.tensor_bucket import (
@@ -616,7 +611,7 @@ class ModelRunner:
server_args.hicache_io_backend = "direct"
logger.warning(
"FlashAttention3 decode backend is not compatible with hierarchical cache. "
f"Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
"Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
)
def init_torch_distributed(self):
@@ -778,7 +773,10 @@ class ModelRunner:
monkey_patch_vllm_parallel_state()
monkey_patch_isinstance_for_vllm_base_layer()
with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_WEIGHTS):
with self.memory_saver_adapter.region(
GPU_MEMORY_TYPE_WEIGHTS,
enable_cpu_backup=self.server_args.enable_weights_cpu_backup,
):
self.model = get_model(
model_config=self.model_config,
load_config=self.load_config,
@@ -1106,7 +1104,7 @@ class ModelRunner:
handle.wait()
self.model.load_weights(weights)
return True, f"Succeeded to update parameter online."
return True, "Succeeded to update parameter online."
except Exception as e:
error_msg = (
@@ -1749,8 +1747,8 @@ class ModelRunner:
f"prefill_backend={self.prefill_attention_backend_str}."
)
logger.warning(
f"Warning: Attention backend specified by --attention-backend or default backend might be overridden."
f"The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
"Warning: Attention backend specified by --attention-backend or default backend might be overridden."
"The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
)
else:
attn_backend = self._get_attention_backend_from_str(