[feature] Add start step profile argument in /start_profile (#7608)
This commit is contained in:
@@ -418,6 +418,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
|
||||
|
||||
await _global_state.tokenizer_manager.start_profile(
|
||||
output_dir=obj.output_dir,
|
||||
start_step=obj.start_step,
|
||||
num_steps=obj.num_steps,
|
||||
activities=obj.activities,
|
||||
with_stack=obj.with_stack,
|
||||
|
||||
@@ -905,6 +905,7 @@ class ProfileReqInput:
|
||||
# If set, it profiles as many as this number of steps.
|
||||
# If it is set, profiling is automatically stopped after this step, and
|
||||
# the caller doesn't need to run stop_profile.
|
||||
start_step: Optional[int] = None
|
||||
num_steps: Optional[int] = None
|
||||
activities: Optional[List[str]] = None
|
||||
profile_by_stage: bool = False
|
||||
@@ -932,6 +933,7 @@ class ExpertDistributionReqOutput:
|
||||
class ProfileReq:
|
||||
type: ProfileReqType
|
||||
output_dir: Optional[str] = None
|
||||
start_step: Optional[int] = None
|
||||
num_steps: Optional[int] = None
|
||||
activities: Optional[List[str]] = None
|
||||
profile_by_stage: bool = False
|
||||
|
||||
@@ -485,6 +485,8 @@ class Scheduler(
|
||||
enable=server_args.enable_memory_saver
|
||||
)
|
||||
self.init_profier()
|
||||
|
||||
# Init metrics stats
|
||||
self.init_metrics()
|
||||
self.init_kv_events(server_args.kv_events_config)
|
||||
|
||||
@@ -628,6 +630,7 @@ class Scheduler(
|
||||
self.torch_profiler_output_dir: Optional[str] = None
|
||||
self.profiler_activities: Optional[List[str]] = None
|
||||
self.profile_id: Optional[str] = None
|
||||
self.profiler_start_forward_ct: Optional[int] = None
|
||||
self.profiler_target_forward_ct: Optional[int] = None
|
||||
self.profiler_target_prefill_ct: Optional[int] = None
|
||||
self.profiler_target_decode_ct: Optional[int] = None
|
||||
@@ -2389,9 +2392,10 @@ class Scheduler(
|
||||
|
||||
def profile(self, recv_req: ProfileReq):
|
||||
if recv_req.type == ProfileReqType.START_PROFILE:
|
||||
if recv_req.profile_by_stage:
|
||||
if recv_req.profile_by_stage or recv_req.start_step:
|
||||
return self.init_profile(
|
||||
recv_req.output_dir,
|
||||
recv_req.start_step,
|
||||
recv_req.num_steps,
|
||||
recv_req.activities,
|
||||
recv_req.with_stack,
|
||||
@@ -2402,6 +2406,7 @@ class Scheduler(
|
||||
else:
|
||||
self.init_profile(
|
||||
recv_req.output_dir,
|
||||
recv_req.start_step,
|
||||
recv_req.num_steps,
|
||||
recv_req.activities,
|
||||
recv_req.with_stack,
|
||||
@@ -2416,6 +2421,7 @@ class Scheduler(
|
||||
def init_profile(
|
||||
self,
|
||||
output_dir: Optional[str],
|
||||
start_step: Optional[int],
|
||||
num_steps: Optional[int],
|
||||
activities: Optional[List[str]],
|
||||
with_stack: Optional[bool],
|
||||
@@ -2442,6 +2448,9 @@ class Scheduler(
|
||||
self.profiler_activities = activities
|
||||
self.profile_id = profile_id
|
||||
|
||||
if start_step:
|
||||
self.profiler_start_forward_ct = max(start_step, self.forward_ct + 1)
|
||||
|
||||
if num_steps:
|
||||
self.profile_steps = num_steps
|
||||
if self.profile_by_stage:
|
||||
@@ -2449,6 +2458,10 @@ class Scheduler(
|
||||
self.profiler_target_decode_ct = num_steps
|
||||
self.profiler_prefill_ct = 0
|
||||
self.profiler_decode_ct = 0
|
||||
elif start_step:
|
||||
self.profiler_target_forward_ct = (
|
||||
self.profiler_start_forward_ct + num_steps
|
||||
)
|
||||
else:
|
||||
self.profiler_target_forward_ct = self.forward_ct + num_steps
|
||||
# The caller will be notified when reaching profiler_target_forward_ct
|
||||
@@ -2521,6 +2534,7 @@ class Scheduler(
|
||||
|
||||
if "CUDA_PROFILER" in activities:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
self.profile_in_progress = True
|
||||
|
||||
return ProfileReqOutput(success=True, message="Succeeded")
|
||||
|
||||
@@ -2584,6 +2598,7 @@ class Scheduler(
|
||||
)
|
||||
self.torch_profiler = None
|
||||
self.profile_in_progress = False
|
||||
self.profiler_start_forward_ct = None
|
||||
|
||||
return ProfileReqOutput(success=True, message="Succeeded.")
|
||||
|
||||
@@ -2617,6 +2632,11 @@ class Scheduler(
|
||||
and self.profiler_target_forward_ct <= self.forward_ct
|
||||
):
|
||||
self.stop_profile()
|
||||
if (
|
||||
self.profiler_start_forward_ct
|
||||
and self.profiler_start_forward_ct == self.forward_ct
|
||||
):
|
||||
self.start_profile()
|
||||
|
||||
def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
|
||||
if recv_req == ExpertDistributionReq.START_RECORD:
|
||||
|
||||
@@ -863,6 +863,7 @@ class TokenizerManager:
|
||||
async def start_profile(
|
||||
self,
|
||||
output_dir: Optional[str] = None,
|
||||
start_step: Optional[int] = None,
|
||||
num_steps: Optional[int] = None,
|
||||
activities: Optional[List[str]] = None,
|
||||
with_stack: Optional[bool] = None,
|
||||
@@ -875,6 +876,7 @@ class TokenizerManager:
|
||||
req = ProfileReq(
|
||||
type=ProfileReqType.START_PROFILE,
|
||||
output_dir=output_dir,
|
||||
start_step=start_step,
|
||||
num_steps=num_steps,
|
||||
activities=activities,
|
||||
with_stack=with_stack,
|
||||
|
||||
Reference in New Issue
Block a user