[UT for RL] Add UT to cover release/resume memory case for moe model (#8803)
This commit is contained in:
@@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
|||||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
|
||||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
|
||||||
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
|
||||||
|
|
||||||
# MLA test models
|
# MLA test models
|
||||||
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
|
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import torch
|
|||||||
|
|
||||||
from sglang.srt.utils import kill_process_tree
|
from sglang.srt.utils import kill_process_tree
|
||||||
from sglang.test.test_utils import (
|
from sglang.test.test_utils import (
|
||||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
|
||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
DEFAULT_URL_FOR_TEST,
|
DEFAULT_URL_FOR_TEST,
|
||||||
CustomTestCase,
|
CustomTestCase,
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGH
|
|||||||
from sglang.test.test_utils import (
|
from sglang.test.test_utils import (
|
||||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
|
||||||
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
|
||||||
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
|
||||||
CustomTestCase,
|
CustomTestCase,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -50,7 +52,7 @@ def get_gpu_memory_gb():
|
|||||||
|
|
||||||
|
|
||||||
class TestReleaseMemoryOccupation(CustomTestCase):
|
class TestReleaseMemoryOccupation(CustomTestCase):
|
||||||
def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1):
|
def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
|
||||||
"""Common setup for engine and HF model."""
|
"""Common setup for engine and HF model."""
|
||||||
engine = sgl.Engine(
|
engine = sgl.Engine(
|
||||||
model_path=model_name,
|
model_path=model_name,
|
||||||
@@ -58,6 +60,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
|||||||
enable_memory_saver=True,
|
enable_memory_saver=True,
|
||||||
mem_fraction_static=mem_fraction_static,
|
mem_fraction_static=mem_fraction_static,
|
||||||
tp_size=tp_size,
|
tp_size=tp_size,
|
||||||
|
ep_size=ep_size,
|
||||||
# disable_cuda_graph=True, # for debugging only
|
# disable_cuda_graph=True, # for debugging only
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -70,6 +73,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
|||||||
"sampling_params": {"temperature": 0, "max_new_tokens": 8},
|
"sampling_params": {"temperature": 0, "max_new_tokens": 8},
|
||||||
"expect_output_before_update_weights": " to spend it outdoors. I decided to",
|
"expect_output_before_update_weights": " to spend it outdoors. I decided to",
|
||||||
"expect_output_after_update_weights": " to go for a walk. I like",
|
"expect_output_after_update_weights": " to go for a walk. I like",
|
||||||
|
"prompt_moe": "The weather is nice today, and I want to",
|
||||||
|
"sampling_params_moe": {"temperature": 0, "max_new_tokens": 16},
|
||||||
|
"expect_output_before_update_weights_moe": " go to the park. I have a picnic basket, a book, and a",
|
||||||
|
"expect_output_after_update_weights_moe": " go to the park. I have a lot of things to do, but I",
|
||||||
}
|
}
|
||||||
|
|
||||||
def _test_initial_generation(
|
def _test_initial_generation(
|
||||||
@@ -250,6 +257,72 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
|||||||
self.assertEqual(outputs, params["expect_output_after_update_weights"])
|
self.assertEqual(outputs, params["expect_output_after_update_weights"])
|
||||||
engine.shutdown()
|
engine.shutdown()
|
||||||
|
|
||||||
|
def test_moe_model_release_and_resume(self):
    """Release and resume GPU memory on an MoE engine, then swap in new weights.

    Steps:
      1. Start an engine on the chat MoE checkpoint with tp_size = ep_size = 2
         and the memory saver enabled.
      2. Check that greedy generation matches the expected pre-update text.
      3. Release the engine's memory occupation and assert that the measured
         GPU memory usage went down.
      4. Resume the memory occupation, load the base MoE checkpoint through
         Hugging Face and push its parameters into the engine with
         ``update_weights_from_tensor``.
      5. Check that greedy generation now matches the expected post-update
         text.

    The engine is shut down in a ``finally`` clause so that it is torn down
    even when one of the assertions above fails.
    """
    # Test with MoE model
    model_name = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT

    tp_size = ep_size = 2

    print(
        f"Testing tp_size={tp_size} and ep_size={ep_size} for test_moe_model_release_and_resume"
    )
    engine = sgl.Engine(
        model_path=model_name,
        random_seed=42,
        enable_memory_saver=True,
        mem_fraction_static=0.5,
        tp_size=tp_size,
        ep_size=ep_size,
    )
    try:
        params = self._common_test_params()

        self._test_initial_generation(
            engine,
            params["prompt_moe"],
            params["sampling_params_moe"],
            params["expect_output_before_update_weights_moe"],
        )

        t = time.perf_counter()
        gpu_memory_usage_before_release = get_gpu_memory_gb()
        engine.release_memory_occupation()
        gpu_memory_usage_after_release = get_gpu_memory_gb()
        self.assertLess(
            gpu_memory_usage_after_release,
            gpu_memory_usage_before_release,
        )

        print(
            f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
        )

        if _DEBUG_EXTRA:
            time.sleep(3)

        t = time.perf_counter()
        engine.resume_memory_occupation()
        print(
            f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
        )

        hf_model_new = AutoModelForCausalLM.from_pretrained(
            DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
            torch_dtype="bfloat16",
            device_map="cuda",
        )
        try:
            engine.update_weights_from_tensor(
                list(hf_model_new.named_parameters())
            )
        finally:
            # destroy the hf model, also when the weight update raises
            del hf_model_new
            torch.cuda.empty_cache()

        print("generate (#2)")
        outputs = engine.generate(
            params["prompt_moe"], params["sampling_params_moe"]
        )["text"]
        self.assertEqual(outputs, params["expect_output_after_update_weights_moe"])
    finally:
        # Always tear the engine down, including on assertion failure.
        engine.shutdown()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import requests
|
|||||||
from sglang.srt.utils import is_cuda, kill_process_tree
|
from sglang.srt.utils import is_cuda, kill_process_tree
|
||||||
from sglang.test.run_eval import run_eval
|
from sglang.test.run_eval import run_eval
|
||||||
from sglang.test.test_utils import (
|
from sglang.test.test_utils import (
|
||||||
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
|
||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
DEFAULT_URL_FOR_TEST,
|
DEFAULT_URL_FOR_TEST,
|
||||||
CustomTestCase,
|
CustomTestCase,
|
||||||
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
|
|||||||
class TestTorchCompileMoe(CustomTestCase):
|
class TestTorchCompileMoe(CustomTestCase):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
|
cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE
|
||||||
cls.base_url = DEFAULT_URL_FOR_TEST
|
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
cls.process = popen_launch_server(
|
cls.process = popen_launch_server(
|
||||||
cls.model,
|
cls.model,
|
||||||
|
|||||||
Reference in New Issue
Block a user