[UT for RL] Add UT to cover release/resume memory case for moe model (#8803)

2025-09-10 10:25:12 +08:00
parent 676a7b51bd
commit dccf52f9c8
4 changed files with 79 additions and 5 deletions
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
 # MLA test models
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
--- a/test/srt/test_expert_distribution.py
+++ b/test/srt/test_expert_distribution.py
@@ -8,7 +8,7 @@ import torch
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
-    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
--- a/test/srt/test_release_memory_occupation.py
+++ b/test/srt/test_release_memory_occupation.py
@@ -38,6 +38,8 @@ from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGH
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
    CustomTestCase,
 )
@@ -50,7 +52,7 @@ def get_gpu_memory_gb():
 class TestReleaseMemoryOccupation(CustomTestCase):
-    def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1):
+    def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
        """Common setup for engine and HF model."""
        engine = sgl.Engine(
            model_path=model_name,
@@ -58,6 +60,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
            enable_memory_saver=True,
            mem_fraction_static=mem_fraction_static,
            tp_size=tp_size,
            ep_size=ep_size,
            # disable_cuda_graph=True,  # for debugging only
        )
@@ -70,6 +73,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
            "sampling_params": {"temperature": 0, "max_new_tokens": 8},
            "expect_output_before_update_weights": " to spend it outdoors. I decided to",
            "expect_output_after_update_weights": " to go for a walk. I like",
            "prompt_moe": "The weather is nice today, and I want to",
            "sampling_params_moe": {"temperature": 0, "max_new_tokens": 16},
            "expect_output_before_update_weights_moe": " go to the park. I have a picnic basket, a book, and a",
            "expect_output_after_update_weights_moe": " go to the park. I have a lot of things to do, but I",
        }
    def _test_initial_generation(
@@ -250,6 +257,72 @@ class TestReleaseMemoryOccupation(CustomTestCase):
            self.assertEqual(outputs, params["expect_output_after_update_weights"])
            engine.shutdown()
    def test_moe_model_release_and_resume(self):
        """Release and resume GPU memory on a MoE model, then swap its weights.

        Steps:
          1. Launch an engine for the chat MoE checkpoint with
             ``tp_size == ep_size == 2`` and run ``_test_initial_generation``
             against the expected pre-update text.
          2. Call ``release_memory_occupation`` and assert that the value
             reported by ``get_gpu_memory_gb`` decreased.
          3. Call ``resume_memory_occupation``, load the base MoE checkpoint
             through Hugging Face and push its named parameters into the
             engine with ``update_weights_from_tensor``.
          4. Generate again and compare with the expected post-update text.

        The engine is shut down in a ``finally`` clause so that a failing
        assertion or a raising engine call does not skip ``engine.shutdown()``.
        """
        # Test with MoE model
        model_name = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT
        tp_size = ep_size = 2
        print(
            f"Testing tp_size={tp_size} and ep_size={ep_size} for test_moe_model_release_and_resume"
        )
        # NOTE(review): _setup_engine also accepts ep_size; it could probably be
        # reused here — confirm what it returns before switching.
        engine = sgl.Engine(
            model_path=model_name,
            random_seed=42,
            enable_memory_saver=True,
            mem_fraction_static=0.5,
            tp_size=tp_size,
            ep_size=ep_size,
        )
        try:
            params = self._common_test_params()
            self._test_initial_generation(
                engine,
                params["prompt_moe"],
                params["sampling_params_moe"],
                params["expect_output_before_update_weights_moe"],
            )

            # Releasing must lower the measured GPU memory usage.
            t = time.perf_counter()
            gpu_memory_usage_before_release = get_gpu_memory_gb()
            engine.release_memory_occupation()
            gpu_memory_usage_after_release = get_gpu_memory_gb()
            self.assertLess(
                gpu_memory_usage_after_release,
                gpu_memory_usage_before_release,
            )
            print(
                f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
            )

            if _DEBUG_EXTRA:
                time.sleep(3)

            t = time.perf_counter()
            engine.resume_memory_occupation()
            print(
                f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
            )

            # Load the base checkpoint and push its weights into the engine.
            hf_model_new = AutoModelForCausalLM.from_pretrained(
                DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
                torch_dtype="bfloat16",
                device_map="cuda",
            )
            try:
                engine.update_weights_from_tensor(
                    list(hf_model_new.named_parameters())
                )
            finally:
                # destroy the hf model even if the weight update raised
                del hf_model_new
                torch.cuda.empty_cache()

            print("generate (#2)")
            outputs = engine.generate(
                params["prompt_moe"], params["sampling_params_moe"]
            )["text"]
            self.assertEqual(
                outputs, params["expect_output_after_update_weights_moe"]
            )
        finally:
            # Always stop the engine, including when a check above failed.
            engine.shutdown()
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_torch_compile_moe.py
+++ b/test/srt/test_torch_compile_moe.py
@@ -7,7 +7,7 @@ import requests
 from sglang.srt.utils import is_cuda, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
-    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
 class TestTorchCompileMoe(CustomTestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,