diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index bd962a7f8..0d3d769f4 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct" DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct" DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B" DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" -DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B" +DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B" +DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat" # MLA test models DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct" diff --git a/test/srt/test_expert_distribution.py b/test/srt/test_expert_distribution.py index f98c97766..5d4add72f 100755 --- a/test/srt/test_expert_distribution.py +++ b/test/srt/test_expert_distribution.py @@ -8,7 +8,7 @@ import torch from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( - DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST, + DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, diff --git a/test/srt/test_release_memory_occupation.py b/test/srt/test_release_memory_occupation.py index eb20fc46b..35be029df 100644 --- a/test/srt/test_release_memory_occupation.py +++ b/test/srt/test_release_memory_occupation.py @@ -38,6 +38,8 @@ from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGH from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE, + DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE, + DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT, CustomTestCase, ) @@ -50,7 +52,7 @@ def get_gpu_memory_gb(): class TestReleaseMemoryOccupation(CustomTestCase): - def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1): + def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1): """Common setup for engine and HF model.""" engine = sgl.Engine( model_path=model_name, @@ -58,6 +60,7 @@ class TestReleaseMemoryOccupation(CustomTestCase): enable_memory_saver=True, mem_fraction_static=mem_fraction_static, tp_size=tp_size, + ep_size=ep_size, # disable_cuda_graph=True, # for debugging only ) @@ -70,6 +73,10 @@ class TestReleaseMemoryOccupation(CustomTestCase): "sampling_params": {"temperature": 0, "max_new_tokens": 8}, "expect_output_before_update_weights": " to spend it outdoors. I decided to", "expect_output_after_update_weights": " to go for a walk. I like", + "prompt_moe": "The weather is nice today, and I want to", + "sampling_params_moe": {"temperature": 0, "max_new_tokens": 16}, + "expect_output_before_update_weights_moe": " go to the park. I have a picnic basket, a book, and a", + "expect_output_after_update_weights_moe": " go to the park. I have a lot of things to do, but I", } def _test_initial_generation( @@ -250,6 +257,72 @@ class TestReleaseMemoryOccupation(CustomTestCase): self.assertEqual(outputs, params["expect_output_after_update_weights"]) engine.shutdown() + def test_moe_model_release_and_resume(self): + # Test with MoE model + model_name = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT + + tp_size = ep_size = 2 + + print( + f"Testing tp_size={tp_size} and ep_size={ep_size} for test_moe_model_release_and_resume" + ) + engine = sgl.Engine( + model_path=model_name, + random_seed=42, + enable_memory_saver=True, + mem_fraction_static=0.5, + tp_size=tp_size, + ep_size=ep_size, + ) + params = self._common_test_params() + + self._test_initial_generation( + engine, + params["prompt_moe"], + params["sampling_params_moe"], + params["expect_output_before_update_weights_moe"], + ) + + t = time.perf_counter() + gpu_memory_usage_before_release = get_gpu_memory_gb() + engine.release_memory_occupation() + gpu_memory_usage_after_release = get_gpu_memory_gb() + self.assertLess( + gpu_memory_usage_after_release, + gpu_memory_usage_before_release, + ) + + print( + f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB" + ) + + if _DEBUG_EXTRA: + time.sleep(3) + + t = time.perf_counter() + engine.resume_memory_occupation() + print( + f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB" + ) + + hf_model_new = AutoModelForCausalLM.from_pretrained( + DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE, + torch_dtype="bfloat16", + device_map="cuda", + ) + engine.update_weights_from_tensor(list(hf_model_new.named_parameters())) + + # destroy the hf model + del hf_model_new + torch.cuda.empty_cache() + + print("generate (#2)") + outputs = engine.generate(params["prompt_moe"], params["sampling_params_moe"])[ + "text" + ] + self.assertEqual(outputs, params["expect_output_after_update_weights_moe"]) + engine.shutdown() + if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_torch_compile_moe.py b/test/srt/test_torch_compile_moe.py index 62c7f8078..8bc7b45d3 100644 --- a/test/srt/test_torch_compile_moe.py +++ b/test/srt/test_torch_compile_moe.py @@ -7,7 +7,7 @@ import requests from sglang.srt.utils import is_cuda, kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( - DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST, + DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, @@ -18,7 +18,7 @@ from sglang.test.test_utils import ( class TestTorchCompileMoe(CustomTestCase): @classmethod def setUpClass(cls): - cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST + cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model,