CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630)
This commit is contained in:
@@ -29,6 +29,7 @@ suites = {
|
||||
"test_openai_server.py",
|
||||
"test_pytorch_sampling_backend.py",
|
||||
"test_radix_attention.py",
|
||||
"test_release_memory_occupation.py",
|
||||
"test_retract_decode.py",
|
||||
"test_server_args.py",
|
||||
"test_session_control.py",
|
||||
|
||||
98
test/srt/test_release_memory_occupation.py
Normal file
98
test/srt/test_release_memory_occupation.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
|
||||
|
||||
# (temporarily) set to true to observe memory usage in nvidia-smi more clearly
|
||||
_DEBUG_EXTRA = True
|
||||
|
||||
|
||||
class TestReleaseMemoryOccupation(unittest.TestCase):
    """End-to-end test of releasing and resuming an engine's GPU memory.

    Flow: generate -> release_memory_occupation (GPU memory becomes
    allocatable by others) -> resume_memory_occupation ->
    update_weights_from_tensor -> generate again, asserting the output is
    identical before and after the release/resume cycle.
    """

    def test_release_and_resume_occupation(self):
        prompt = "Today is a sunny day and I like"
        # temperature=0 makes generation deterministic so the exact-output
        # assertions below are stable across the release/resume cycle.
        sampling_params = {"temperature": 0, "max_new_tokens": 8}
        model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        expect_output = " to spend it outdoors. I decided to"

        engine = sgl.Engine(
            model_path=model_name,
            random_seed=42,
            enable_memory_saver=True,
            # disable_cuda_graph=True, # for debugging only
        )
        # Fresh HF weights standing in for weights produced by a trainer;
        # synced into the engine via update_weights_from_tensor below.
        hf_model_new = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype="bfloat16"
        )

        print("generate (#1)")
        outputs = engine.generate(prompt, sampling_params)["text"]
        self.assertEqual(outputs, expect_output)

        if _DEBUG_EXTRA:
            time.sleep(3)

        # While the engine holds its weights + KV cache, a huge extra
        # allocation on the same device must fail.
        self.assertEqual(
            _try_allocate_big_tensor(),
            False,
            "Should not be able to allocate big tensors before releasing",
        )

        print("release_memory_occupation start")
        t = time.time()
        engine.release_memory_occupation()
        if _DEBUG_EXTRA:
            print("release_memory_occupation", time.time() - t)

        if _DEBUG_EXTRA:
            time.sleep(5)

        # After releasing, the same huge allocation should now succeed.
        self.assertEqual(
            _try_allocate_big_tensor(),
            True,
            "Should be able to allocate big tensors after releasing",
        )

        if _DEBUG_EXTRA:
            time.sleep(5)

        print("resume_memory_occupation start")
        t = time.time()
        engine.resume_memory_occupation()
        if _DEBUG_EXTRA:
            print("resume_memory_occupation", time.time() - t)

        # Engine memory is re-acquired, so the big allocation fails again.
        self.assertEqual(
            _try_allocate_big_tensor(),
            False,
            "Should not be able to allocate big tensors after resuming",
        )

        print("update_weights_from_tensor")
        # As if: PPO has updated hf model's weights, and now we sync it to SGLang
        engine.update_weights_from_tensor(list(hf_model_new.named_parameters()))

        print("generate (#2)")
        outputs = engine.generate(prompt, sampling_params)["text"]
        self.assertEqual(outputs, expect_output)

        if _DEBUG_EXTRA:
            time.sleep(4)

        engine.shutdown()
|
||||
|
||||
|
||||
def _try_allocate_big_tensor(size: int = 20_000_000_000):
    """Return True iff a ``size``-byte uint8 CUDA tensor can be allocated.

    The probe tensor is discarded immediately and ``empty_cache`` returns the
    freed blocks to the driver, so the check does not perturb later probes.
    """
    try:
        torch.empty((size,), dtype=torch.uint8, device="cuda")
    except torch.cuda.OutOfMemoryError:
        return False
    torch.cuda.empty_cache()
    return True
|
||||
|
||||
|
||||
# Script entry point: run this file's test cases directly via unittest.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user