From 6535fda1279d300bce88e6d18190ef0d8d51f0cd Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Mon, 29 Sep 2025 17:36:48 -0700
Subject: [PATCH] [Profile] dump memory trace when cuda graph profile is enabled (#11083)

---
 python/sglang/srt/model_executor/cuda_graph_runner.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index 4f09e621a..2ed78ea58 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -437,6 +437,7 @@ class CudaGraphRunner:
                 activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True,
             )
+            torch.cuda.memory._record_memory_history()
 
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
@@ -485,6 +486,8 @@ class CudaGraphRunner:
                     save_gemlite_cache()
 
         if self.enable_profile_cuda_graph:
+            torch.cuda.memory._dump_snapshot(f"cuda_graph_runner_memory_usage.pickle")
+            torch.cuda.memory._record_memory_history(enabled=None)
             log_message = (
                 "Sorted by CUDA Time:\n"
                 + prof.key_averages(group_by_input_shape=True).table(
@@ -494,6 +497,7 @@ class CudaGraphRunner:
                 + prof.key_averages(group_by_input_shape=True).table(
                     sort_by="cpu_time_total", row_limit=10
                 )
+                + "\n\nMemory Usage is saved to cuda_graph_runner_memory_usage.pickle\n"
             )
             logger.info(log_message)
 