Make profiler output file names consistent (#5548)
This commit is contained in:
@@ -834,6 +834,7 @@ class ProfileReq:
|
|||||||
activities: Optional[List[str]] = None
|
activities: Optional[List[str]] = None
|
||||||
with_stack: Optional[bool] = None
|
with_stack: Optional[bool] = None
|
||||||
record_shapes: Optional[bool] = None
|
record_shapes: Optional[bool] = None
|
||||||
|
profile_id: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -391,6 +391,7 @@ class Scheduler(
|
|||||||
self.torch_profiler = None
|
self.torch_profiler = None
|
||||||
self.torch_profiler_output_dir: Optional[str] = None
|
self.torch_profiler_output_dir: Optional[str] = None
|
||||||
self.profiler_activities: Optional[List[str]] = None
|
self.profiler_activities: Optional[List[str]] = None
|
||||||
|
self.profiler_id: Optional[str] = None
|
||||||
self.profiler_target_forward_ct: Optional[int] = None
|
self.profiler_target_forward_ct: Optional[int] = None
|
||||||
|
|
||||||
# Init metrics stats
|
# Init metrics stats
|
||||||
@@ -1805,6 +1806,7 @@ class Scheduler(
|
|||||||
recv_req.activities,
|
recv_req.activities,
|
||||||
recv_req.with_stack,
|
recv_req.with_stack,
|
||||||
recv_req.record_shapes,
|
recv_req.record_shapes,
|
||||||
|
recv_req.profile_id,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return self.stop_profile()
|
return self.stop_profile()
|
||||||
@@ -1816,6 +1818,7 @@ class Scheduler(
|
|||||||
activities: Optional[List[str]],
|
activities: Optional[List[str]],
|
||||||
with_stack: Optional[bool],
|
with_stack: Optional[bool],
|
||||||
record_shapes: Optional[bool],
|
record_shapes: Optional[bool],
|
||||||
|
profile_id: Optional[str],
|
||||||
) -> None:
|
) -> None:
|
||||||
if self.profiler_activities:
|
if self.profiler_activities:
|
||||||
return ProfileReqOutput(
|
return ProfileReqOutput(
|
||||||
@@ -1830,9 +1833,11 @@ class Scheduler(
|
|||||||
|
|
||||||
self.torch_profiler_output_dir = output_dir
|
self.torch_profiler_output_dir = output_dir
|
||||||
self.profiler_activities = activities
|
self.profiler_activities = activities
|
||||||
|
self.profiler_id = profile_id
|
||||||
logger.info(
|
logger.info(
|
||||||
"Profiling starts. Traces will be saved to: %s",
|
"Profiling starts. Traces will be saved to: %s (with id %s)",
|
||||||
self.torch_profiler_output_dir,
|
self.torch_profiler_output_dir,
|
||||||
|
self.profiler_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
activity_map = {
|
activity_map = {
|
||||||
@@ -1874,14 +1879,14 @@ class Scheduler(
|
|||||||
self.torch_profiler.export_chrome_trace(
|
self.torch_profiler.export_chrome_trace(
|
||||||
os.path.join(
|
os.path.join(
|
||||||
self.torch_profiler_output_dir,
|
self.torch_profiler_output_dir,
|
||||||
str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
|
self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if "MEM" in self.profiler_activities:
|
if "MEM" in self.profiler_activities:
|
||||||
memory_profile_path = os.path.join(
|
memory_profile_path = os.path.join(
|
||||||
self.torch_profiler_output_dir,
|
self.torch_profiler_output_dir,
|
||||||
str(time.time()) + f"-TP-{self.tp_rank}-memory" + ".pickle",
|
self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle",
|
||||||
)
|
)
|
||||||
torch.cuda.memory._dump_snapshot(memory_profile_path)
|
torch.cuda.memory._dump_snapshot(memory_profile_path)
|
||||||
torch.cuda.memory._record_memory_history(enabled=None)
|
torch.cuda.memory._record_memory_history(enabled=None)
|
||||||
|
|||||||
@@ -650,6 +650,7 @@ class TokenizerManager:
|
|||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
num_steps=num_steps,
|
num_steps=num_steps,
|
||||||
activities=activities,
|
activities=activities,
|
||||||
|
profile_id=str(time.time()),
|
||||||
)
|
)
|
||||||
result = (await self.start_profile_communicator(req))[0]
|
result = (await self.start_profile_communicator(req))[0]
|
||||||
if not result.success:
|
if not result.success:
|
||||||
|
|||||||
Reference in New Issue
Block a user