[Profile] dump memory trace when cuda graph profile is enabled (#11083)
This commit is contained in:
@@ -437,6 +437,7 @@ class CudaGraphRunner:
|
|||||||
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
|
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
|
||||||
record_shapes=True,
|
record_shapes=True,
|
||||||
)
|
)
|
||||||
|
torch.cuda.memory._record_memory_history()
|
||||||
|
|
||||||
# Trigger CUDA graph capture for specific shapes.
|
# Trigger CUDA graph capture for specific shapes.
|
||||||
# Capture the large shapes first so that the smaller shapes
|
# Capture the large shapes first so that the smaller shapes
|
||||||
@@ -485,6 +486,8 @@ class CudaGraphRunner:
|
|||||||
save_gemlite_cache()
|
save_gemlite_cache()
|
||||||
|
|
||||||
if self.enable_profile_cuda_graph:
|
if self.enable_profile_cuda_graph:
|
||||||
|
torch.cuda.memory._dump_snapshot(f"cuda_graph_runner_memory_usage.pickle")
|
||||||
|
torch.cuda.memory._record_memory_history(enabled=None)
|
||||||
log_message = (
|
log_message = (
|
||||||
"Sorted by CUDA Time:\n"
|
"Sorted by CUDA Time:\n"
|
||||||
+ prof.key_averages(group_by_input_shape=True).table(
|
+ prof.key_averages(group_by_input_shape=True).table(
|
||||||
@@ -494,6 +497,7 @@ class CudaGraphRunner:
|
|||||||
+ prof.key_averages(group_by_input_shape=True).table(
|
+ prof.key_averages(group_by_input_shape=True).table(
|
||||||
sort_by="cpu_time_total", row_limit=10
|
sort_by="cpu_time_total", row_limit=10
|
||||||
)
|
)
|
||||||
|
+ "\n\nMemory Usage is saved to cuda_graph_runner_memory_usage.pickle\n"
|
||||||
)
|
)
|
||||||
logger.info(log_message)
|
logger.info(log_message)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user