[Feature] Option to save model weights to CPU when memory saver mode is enabled (#10873)

Co-authored-by: molocule <34072934+molocule@users.noreply.github.com>
2025-10-03 04:48:19 -04:00
parent 34151f173b
commit 8c57490210
7 changed files with 78 additions and 22 deletions
--- a/test/srt/test_release_memory_occupation.py
+++ b/test/srt/test_release_memory_occupation.py
@@ -25,8 +25,6 @@ configurations (tp=1, tp=2) to ensure proper memory management in distributed se
 data parallel size, we test it in verl.
 """

-import gc
-import os
 import time
 import unittest

@@ -52,7 +50,14 @@ def get_gpu_memory_gb():


 class TestReleaseMemoryOccupation(CustomTestCase):
-    def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
+    def _setup_engine(
+        self,
+        model_name,
+        mem_fraction_static=0.8,
+        tp_size=1,
+        ep_size=1,
+        enable_weights_cpu_backup=False,
+    ):
        """Common setup for engine and HF model."""
        engine = sgl.Engine(
            model_path=model_name,
@@ -61,6 +66,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
            mem_fraction_static=mem_fraction_static,
            tp_size=tp_size,
            ep_size=ep_size,
+            enable_weights_cpu_backup=enable_weights_cpu_backup,
            # disable_cuda_graph=True,  # for debugging only
        )

@@ -153,6 +159,53 @@ class TestReleaseMemoryOccupation(CustomTestCase):
            self.assertEqual(outputs, params["expect_output_after_update_weights"])
            engine.shutdown()

+    def test_release_and_resume_occupation_with_weights_cpu_backup(self):
+        # Test release and resume occupation with weights CPU backup
+        model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+        print("Testing test_release_and_resume_occupation_with_weights_cpu_backup")
+        engine = self._setup_engine(
+            model_name=model_name,
+            mem_fraction_static=0.6,
+            enable_weights_cpu_backup=True,
+        )
+        params = self._common_test_params()
+
+        self._test_initial_generation(
+            engine,
+            params["prompt"],
+            params["sampling_params"],
+            params["expect_output_before_update_weights"],
+        )
+
+        t = time.perf_counter()
+        gpu_memory_usage_before_release = get_gpu_memory_gb()
+        engine.release_memory_occupation()
+        gpu_memory_usage_after_release = get_gpu_memory_gb()
+
+        self.assertLess(
+            gpu_memory_usage_after_release,
+            gpu_memory_usage_before_release,
+        )
+
+        print(
+            f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
+        )
+
+        if _DEBUG_EXTRA:
+            time.sleep(3)
+
+        t = time.perf_counter()
+        engine.resume_memory_occupation()
+        print(
+            f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
+        )
+
+        print("generate post resume")
+        outputs = engine.generate(params["prompt"], params["sampling_params"])["text"]
+        self.assertEqual(outputs, params["expect_output_before_update_weights"])
+        engine.shutdown()
+
    def test_multi_stage_release_and_resume(self):
        # With multi-stage release and resume, we can set the memory fraction to 0.85 without concern of OOM
        model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST