[Feature] Option to save model weights to CPU when memory saver mode is enabled (#10873)
Co-authored-by: molocule <34072934+molocule@users.noreply.github.com>
This commit is contained in:
@@ -25,8 +25,6 @@ configurations (tp=1, tp=2) to ensure proper memory management in distributed se
|
||||
data parallel size, we test it in verl.
|
||||
"""
|
||||
|
||||
import gc
|
||||
import os
|
||||
import time
|
||||
import unittest
|
||||
|
||||
@@ -52,7 +50,14 @@ def get_gpu_memory_gb():
|
||||
|
||||
|
||||
class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
|
||||
def _setup_engine(
|
||||
self,
|
||||
model_name,
|
||||
mem_fraction_static=0.8,
|
||||
tp_size=1,
|
||||
ep_size=1,
|
||||
enable_weights_cpu_backup=False,
|
||||
):
|
||||
"""Common setup for engine and HF model."""
|
||||
engine = sgl.Engine(
|
||||
model_path=model_name,
|
||||
@@ -61,6 +66,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
mem_fraction_static=mem_fraction_static,
|
||||
tp_size=tp_size,
|
||||
ep_size=ep_size,
|
||||
enable_weights_cpu_backup=enable_weights_cpu_backup,
|
||||
# disable_cuda_graph=True, # for debugging only
|
||||
)
|
||||
|
||||
@@ -153,6 +159,53 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
self.assertEqual(outputs, params["expect_output_after_update_weights"])
|
||||
engine.shutdown()
|
||||
|
||||
def test_release_and_resume_occupation_with_weights_cpu_backup(self):
|
||||
# Test release and resume occupation with weights CPU backup
|
||||
model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
|
||||
|
||||
print("Testing test_release_and_resume_occupation_with_weights_cpu_backup")
|
||||
engine = self._setup_engine(
|
||||
model_name=model_name,
|
||||
mem_fraction_static=0.6,
|
||||
enable_weights_cpu_backup=True,
|
||||
)
|
||||
params = self._common_test_params()
|
||||
|
||||
self._test_initial_generation(
|
||||
engine,
|
||||
params["prompt"],
|
||||
params["sampling_params"],
|
||||
params["expect_output_before_update_weights"],
|
||||
)
|
||||
|
||||
t = time.perf_counter()
|
||||
gpu_memory_usage_before_release = get_gpu_memory_gb()
|
||||
engine.release_memory_occupation()
|
||||
gpu_memory_usage_after_release = get_gpu_memory_gb()
|
||||
|
||||
self.assertLess(
|
||||
gpu_memory_usage_after_release,
|
||||
gpu_memory_usage_before_release,
|
||||
)
|
||||
|
||||
print(
|
||||
f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
|
||||
)
|
||||
|
||||
if _DEBUG_EXTRA:
|
||||
time.sleep(3)
|
||||
|
||||
t = time.perf_counter()
|
||||
engine.resume_memory_occupation()
|
||||
print(
|
||||
f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
|
||||
)
|
||||
|
||||
print("generate post resume")
|
||||
outputs = engine.generate(params["prompt"], params["sampling_params"])["text"]
|
||||
self.assertEqual(outputs, params["expect_output_before_update_weights"])
|
||||
engine.shutdown()
|
||||
|
||||
def test_multi_stage_release_and_resume(self):
|
||||
# With multi-stage release and resume, we can set the memory fraction to 0.85 without concern of OOM
|
||||
model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
|
||||
|
||||
Reference in New Issue
Block a user