[UT for RL] Add UT to cover release/resume memory case for moe model (#8803)
This commit is contained in:
@@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
|
||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
|
||||
|
||||
# MLA test models
|
||||
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
|
||||
|
||||
@@ -8,7 +8,7 @@ import torch
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
|
||||
@@ -38,6 +38,8 @@ from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGH
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
|
||||
CustomTestCase,
|
||||
)
|
||||
|
||||
@@ -50,7 +52,7 @@ def get_gpu_memory_gb():
|
||||
|
||||
|
||||
class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1):
|
||||
def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
|
||||
"""Common setup for engine and HF model."""
|
||||
engine = sgl.Engine(
|
||||
model_path=model_name,
|
||||
@@ -58,6 +60,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
enable_memory_saver=True,
|
||||
mem_fraction_static=mem_fraction_static,
|
||||
tp_size=tp_size,
|
||||
ep_size=ep_size,
|
||||
# disable_cuda_graph=True, # for debugging only
|
||||
)
|
||||
|
||||
@@ -70,6 +73,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
"sampling_params": {"temperature": 0, "max_new_tokens": 8},
|
||||
"expect_output_before_update_weights": " to spend it outdoors. I decided to",
|
||||
"expect_output_after_update_weights": " to go for a walk. I like",
|
||||
"prompt_moe": "The weather is nice today, and I want to",
|
||||
"sampling_params_moe": {"temperature": 0, "max_new_tokens": 16},
|
||||
"expect_output_before_update_weights_moe": " go to the park. I have a picnic basket, a book, and a",
|
||||
"expect_output_after_update_weights_moe": " go to the park. I have a lot of things to do, but I",
|
||||
}
|
||||
|
||||
def _test_initial_generation(
|
||||
@@ -250,6 +257,72 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
self.assertEqual(outputs, params["expect_output_after_update_weights"])
|
||||
engine.shutdown()
|
||||
|
||||
def test_moe_model_release_and_resume(self):
    """Exercise release/resume of GPU memory on a MoE model, then swap weights.

    Steps, in order:
      1. Start an engine on the chat MoE checkpoint with ``tp_size`` and
         ``ep_size`` both set to 2 and the memory saver enabled.
      2. Run the initial-generation check against the expected
         "before update" output.
      3. Release the engine's memory occupation and assert that the value
         reported by ``get_gpu_memory_gb()`` dropped.
      4. Resume memory occupation, load the base MoE checkpoint through
         ``AutoModelForCausalLM`` and push its named parameters into the
         engine with ``update_weights_from_tensor``.
      5. Generate again and assert the text equals the expected
         "after update" output.

    The engine is always shut down, including when a step above fails.
    NOTE(review): tp_size = ep_size = 2 presumably needs at least two GPUs;
    confirm against the CI runner this test is scheduled on.
    """
    # Test with MoE model
    model_name = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT

    tp_size = ep_size = 2

    print(
        f"Testing tp_size={tp_size} and ep_size={ep_size} for test_moe_model_release_and_resume"
    )
    engine = sgl.Engine(
        model_path=model_name,
        random_seed=42,
        enable_memory_saver=True,
        mem_fraction_static=0.5,
        tp_size=tp_size,
        ep_size=ep_size,
    )

    # Everything after engine creation runs under try/finally so that a
    # failing assertion cannot leave the engine (and the GPU memory it
    # holds) alive for the tests that run afterwards.
    try:
        params = self._common_test_params()

        self._test_initial_generation(
            engine,
            params["prompt_moe"],
            params["sampling_params_moe"],
            params["expect_output_before_update_weights_moe"],
        )

        t = time.perf_counter()
        gpu_memory_usage_before_release = get_gpu_memory_gb()
        engine.release_memory_occupation()
        gpu_memory_usage_after_release = get_gpu_memory_gb()
        # Releasing must shrink the reported GPU memory usage.
        self.assertLess(
            gpu_memory_usage_after_release,
            gpu_memory_usage_before_release,
        )

        print(
            f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
        )

        if _DEBUG_EXTRA:
            time.sleep(3)

        t = time.perf_counter()
        engine.resume_memory_occupation()
        print(
            f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
        )

        # Load the base checkpoint with HF and feed its parameters to the
        # engine, replacing the chat weights it was started with.
        hf_model_new = AutoModelForCausalLM.from_pretrained(
            DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
            torch_dtype="bfloat16",
            device_map="cuda",
        )
        engine.update_weights_from_tensor(list(hf_model_new.named_parameters()))

        # destroy the hf model
        del hf_model_new
        torch.cuda.empty_cache()

        print("generate (#2)")
        outputs = engine.generate(
            params["prompt_moe"], params["sampling_params_moe"]
        )["text"]
        self.assertEqual(outputs, params["expect_output_after_update_weights_moe"])
    finally:
        engine.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -7,7 +7,7 @@ import requests
|
||||
from sglang.srt.utils import is_cuda, kill_process_tree
|
||||
from sglang.test.run_eval import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
|
||||
class TestTorchCompileMoe(CustomTestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
|
||||
cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE
|
||||
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||
cls.process = popen_launch_server(
|
||||
cls.model,
|
||||
|
||||
Reference in New Issue
Block a user