[Feature] Option to save model weights to CPU when memory saver mode is enabled (#10873)

Co-authored-by: molocule <34072934+molocule@users.noreply.github.com>
This commit is contained in:
Matt Nappo
2025-10-03 04:48:19 -04:00
committed by GitHub
parent 34151f173b
commit 8c57490210
7 changed files with 78 additions and 22 deletions

View File

@@ -25,8 +25,6 @@ configurations (tp=1, tp=2) to ensure proper memory management in distributed se
data parallel size, we test it in verl.
"""
import gc
import os
import time
import unittest
@@ -52,7 +50,14 @@ def get_gpu_memory_gb():
class TestReleaseMemoryOccupation(CustomTestCase):
def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
def _setup_engine(
self,
model_name,
mem_fraction_static=0.8,
tp_size=1,
ep_size=1,
enable_weights_cpu_backup=False,
):
"""Common setup for engine and HF model."""
engine = sgl.Engine(
model_path=model_name,
@@ -61,6 +66,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
mem_fraction_static=mem_fraction_static,
tp_size=tp_size,
ep_size=ep_size,
enable_weights_cpu_backup=enable_weights_cpu_backup,
# disable_cuda_graph=True, # for debugging only
)
@@ -153,6 +159,53 @@ class TestReleaseMemoryOccupation(CustomTestCase):
self.assertEqual(outputs, params["expect_output_after_update_weights"])
engine.shutdown()
def test_release_and_resume_occupation_with_weights_cpu_backup(self):
# Test release and resume occupation with weights CPU backup
model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
print("Testing test_release_and_resume_occupation_with_weights_cpu_backup")
engine = self._setup_engine(
model_name=model_name,
mem_fraction_static=0.6,
enable_weights_cpu_backup=True,
)
params = self._common_test_params()
self._test_initial_generation(
engine,
params["prompt"],
params["sampling_params"],
params["expect_output_before_update_weights"],
)
t = time.perf_counter()
gpu_memory_usage_before_release = get_gpu_memory_gb()
engine.release_memory_occupation()
gpu_memory_usage_after_release = get_gpu_memory_gb()
self.assertLess(
gpu_memory_usage_after_release,
gpu_memory_usage_before_release,
)
print(
f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
)
if _DEBUG_EXTRA:
time.sleep(3)
t = time.perf_counter()
engine.resume_memory_occupation()
print(
f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
)
print("generate post resume")
outputs = engine.generate(params["prompt"], params["sampling_params"])["text"]
self.assertEqual(outputs, params["expect_output_before_update_weights"])
engine.shutdown()
def test_multi_stage_release_and_resume(self):
# With multi-stage release and resume, we can set the memory fraction to 0.85 without concern of OOM
model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST