Improve profiler and integrate profiler in bench_one_batch_server (#6787)
@@ -34,7 +34,6 @@ import zmq
-from torch.distributed import barrier
 
 from sglang.global_config import global_config
 from sglang.srt import two_batch_overlap
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import create_grammar_backend
 from sglang.srt.disaggregation.decode import (
@@ -63,7 +62,6 @@ from sglang.srt.hf_transformers_utils import (
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.expert_distribution import (
-    ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,
 )
 from sglang.srt.managers.io_struct import (
@@ -140,6 +138,7 @@ from sglang.srt.utils import (
     broadcast_pyobj,
     configure_logger,
     disable_request_logging,
+    get_available_gpu_memory,
     get_bool_env_var,
     get_zmq_socket,
     kill_itself_when_parent_died,
@@ -213,7 +212,6 @@ class Scheduler(
         self.gpu_id = gpu_id
         self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
         self.page_size = server_args.page_size
-        # Distributed rank info
         self.dp_size = server_args.dp_size
         self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
             compute_dp_attention_world_info(
@@ -333,12 +331,16 @@ class Scheduler(
 
         # Print debug info
         if tp_rank == 0:
+            avail_mem = get_available_gpu_memory(
+                self.device, self.gpu_id, empty_cache=False
+            )
             logger.info(
                 f"max_total_num_tokens={self.max_total_num_tokens}, "
                 f"chunked_prefill_size={server_args.chunked_prefill_size}, "
                 f"max_prefill_tokens={self.max_prefill_tokens}, "
                 f"max_running_requests={self.max_running_requests}, "
-                f"context_len={self.model_config.context_len}"
+                f"context_len={self.model_config.context_len}, "
+                f"available_gpu_mem={avail_mem:.2f} GB"
             )
 
         # Init memory pool and cache
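The new debug line reports free device memory at startup through the `get_available_gpu_memory` helper imported above. As a rough stand-in, the same number can be read with plain PyTorch; this sketch is illustrative, not sglang's implementation (the real helper also handles non-CUDA devices and optional cache emptying):

```python
import torch

def available_gpu_memory_gb(gpu_id: int = 0) -> float:
    # Free CUDA memory in GB, comparable to the avail_mem value logged above.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(gpu_id)
    return free_bytes / (1 << 30)

print(f"available_gpu_mem={available_gpu_memory_gb(0):.2f} GB")
```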
@@ -362,6 +364,7 @@ class Scheduler(
         self.current_stream = torch.get_device_module(self.device).current_stream()
         if self.device == "cpu":
             self.current_stream.synchronize = lambda: None  # No-op for CPU
+        self.forward_sleep_time = None
 
         # Init session info
         self.sessions: Dict[str, Session] = {}
@@ -425,8 +428,14 @@ class Scheduler(
         self.profiler_activities: Optional[List[str]] = None
         self.profiler_id: Optional[str] = None
         self.profiler_target_forward_ct: Optional[int] = None
-
-        self.forward_sleep_time = None
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profile_by_stage: bool = False
+        self.profile_steps: Optional[int] = None
+        self.profile_in_progress: bool = False
+        self.rpd_profiler = None
 
         # Init metrics stats
         self.init_metrics()
@@ -1518,7 +1527,7 @@ class Scheduler(
         self.new_token_ratio = new_token_ratio
 
         logger.info(
-            "Decode out of memory happened. "
+            "KV cache pool is full. Retract requests. "
             f"#retracted_reqs: {len(retracted_reqs)}, "
             f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
         )
@@ -1542,13 +1551,8 @@ class Scheduler(
         """Run a batch."""
         self.forward_ct += 1
 
-        # Check profiler
-        if (
-            self.profiler_target_forward_ct
-            and self.profiler_target_forward_ct <= self.forward_ct
-        ):
-            self.send_to_tokenizer.send_pyobj(self.stop_profile())
-
+        # Whether to run the profiler
+        self._profile_batch_predicate(batch)
         if self.forward_sleep_time is not None:
             logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
             time.sleep(self.forward_sleep_time)
@@ -2121,46 +2125,82 @@ class Scheduler(
 
     def profile(self, recv_req: ProfileReq):
         if recv_req.type == ProfileReqType.START_PROFILE:
-            return self.start_profile(
-                recv_req.output_dir,
-                recv_req.num_steps,
-                recv_req.activities,
-                recv_req.with_stack,
-                recv_req.record_shapes,
-                recv_req.profile_id,
-            )
+            if recv_req.profile_by_stage:
+                return self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                )
+            else:
+                self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                )
+                return self.start_profile(True)
         else:
             return self.stop_profile()
 
-    def start_profile(
+    def init_profile(
         self,
         output_dir: Optional[str],
         num_steps: Optional[int],
         activities: Optional[List[str]],
         with_stack: Optional[bool],
         record_shapes: Optional[bool],
-        profile_id: Optional[str],
-    ) -> None:
-        if self.profiler_activities:
+        profile_by_stage: bool,
+    ) -> ProfileReqOutput:
+        if self.profile_in_progress:
             return ProfileReqOutput(
                 success=False,
                 message="Profiling is already in progress. Call /stop_profile first.",
             )
 
+        self.profile_by_stage = profile_by_stage
+
         if output_dir is None:
             output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
         if activities is None:
             activities = ["CPU", "GPU"]
 
         self.torch_profiler_output_dir = output_dir
         self.torch_profiler_with_stack = with_stack
         self.torch_profiler_record_shapes = record_shapes
         self.profiler_activities = activities
-        self.profiler_id = profile_id
 
+        if num_steps:
+            self.profile_steps = num_steps
+            if self.profile_by_stage:
+                self.profiler_target_prefill_ct = num_steps
+                self.profiler_target_decode_ct = num_steps
+                self.profiler_prefill_ct = 0
+                self.profiler_decode_ct = 0
+            else:
+                self.profiler_target_forward_ct = self.forward_ct + num_steps
+                # The caller will be notified when reaching profiler_target_forward_ct
+        else:
+            self.profiler_target_forward_ct = None
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def start_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        stage_str = f" for {stage.__str__()}" if stage else ""
         logger.info(
-            "Profiling starts. Traces will be saved to: %s (with id %s)",
-            self.torch_profiler_output_dir,
-            self.profiler_id,
+            f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir}",
         )
 
+        activities = self.profiler_activities
+        with_stack = self.torch_profiler_with_stack
+        record_shapes = self.torch_profiler_record_shapes
+
         activity_map = {
             "CPU": torch.profiler.ProfilerActivity.CPU,
             "GPU": torch.profiler.ProfilerActivity.CUDA,
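With this split, a profile request is handled in two phases: `init_profile` validates and stores the settings (arming either the per-stage counters or the one-shot forward counter), and `start_profile` actually creates the profilers. A hedged sketch of driving this through sglang's `/start_profile` and `/stop_profile` HTTP endpoints; the JSON field names are inferred from the `ProfileReq` fields above and may differ by server version:

```python
import requests

BASE = "http://localhost:30000"  # assumed default sglang server address

# One-shot mode: profile the next 5 forward passes; the scheduler stops
# and flushes the trace on its own once the counter is reached.
requests.post(
    BASE + "/start_profile",
    json={"num_steps": 5, "activities": ["CPU", "GPU"]},
)

# Stage-based mode (new in this commit): capture prefill and decode
# batches as separate traces.
requests.post(
    BASE + "/start_profile",
    json={"num_steps": 3, "activities": ["CPU", "GPU"], "profile_by_stage": True},
)

# Without a step budget, profiling runs until explicitly stopped.
requests.post(BASE + "/stop_profile")
```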
@@ -2169,48 +2209,97 @@ class Scheduler(
             activity_map[a] for a in activities if a in activity_map
         ]
 
-        if torchprof_activities:
+        if "RPD" in activities:
+            from rpdTracerControl import rpdTracerControl
+
+            rpdTracerControl.skipCreate()
+
+            self.rpd_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+            )
+
+            if self.tp_rank == 0:
+                import sqlite3
+
+                from rocpd.schema import RocpdSchema
+
+                if os.path.exists("trace.rpd"):
+                    os.unlink("trace.rpd")
+                schema = RocpdSchema()
+                connection = sqlite3.connect("trace.rpd")
+                schema.writeSchema(connection)
+                connection.commit()
+                del connection
+            torch.distributed.barrier(self.tp_cpu_group)
+
+            self.rpd_profiler = rpdTracerControl()
+            self.rpd_profiler.setPythonTrace(True)
+            self.rpd_profiler.start()
+            self.rpd_profiler.rangePush("", "rpd profile range", "")
+            self.profile_in_progress = True
+        elif torchprof_activities:
             self.torch_profiler = torch.profiler.profile(
                 activities=torchprof_activities,
                 with_stack=with_stack if with_stack is not None else True,
                 record_shapes=record_shapes if record_shapes is not None else False,
             )
             self.torch_profiler.start()
+            self.profile_in_progress = True
 
         if "MEM" in activities:
             torch.cuda.memory._record_memory_history(max_entries=100000)
+            self.profile_in_progress = True
 
         if "CUDA_PROFILER" in activities:
             torch.cuda.cudart().cudaProfilerStart()
 
-        if num_steps:
-            self.profiler_target_forward_ct = self.forward_ct + num_steps
-            # The caller will be notified when reaching profiler_target_forward_ct
-        else:
-            self.profiler_target_forward_ct = None
-            return ProfileReqOutput(success=True, message="Succeeded")
+        return ProfileReqOutput(success=True, message="Succeeded")
 
-    def stop_profile(self) -> None:
-        if self.profiler_activities is None:
+    def stop_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        if not self.profile_in_progress:
             return ProfileReqOutput(
                 success=False,
                 message="Profiling is not in progress. Call /start_profile first.",
             )
 
-        logger.info("Stop profiling...")
+        stage_suffix = f"-{stage.__str__()}" if stage else ""
+        logger.info("Stop profiling" + stage_suffix + "...")
         if self.torch_profiler is not None:
             self.torch_profiler.stop()
             self.torch_profiler.export_chrome_trace(
                 os.path.join(
                     self.torch_profiler_output_dir,
-                    self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+                    str(time.time())
+                    + f"-TP-{self.tp_rank}"
+                    + stage_suffix
+                    + ".trace.json.gz",
                 )
             )
+            torch.distributed.barrier(self.tp_cpu_group)
 
-        if "MEM" in self.profiler_activities:
+        if self.rpd_profiler is not None:
+            self.rpd_profiler.rangePop()
+            self.rpd_profiler.stop()
+            self.rpd_profiler.flush()
+
+            torch.distributed.barrier(self.tp_cpu_group)
+            if self.tp_rank == 0:
+                from sglang.srt.utils import rpd_to_chrome_trace
+
+                rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
+            self.rpd_profiler = None
+            self.rpd_profiler_path = None
+
+        if self.profiler_activities is not None and "MEM" in self.profiler_activities:
             memory_profile_path = os.path.join(
                 self.torch_profiler_output_dir,
-                self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle",
+                str(time.time())
+                + f"-TP-{self.tp_rank}-memory"
+                + stage_suffix
+                + ".pickle",
             )
             torch.cuda.memory._dump_snapshot(memory_profile_path)
             torch.cuda.memory._record_memory_history(enabled=None)
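Stripped of scheduler state, the torch-profiler branch above is the standard `torch.profiler` start/stop/export flow. A self-contained sketch of the same pattern (the output path mimics the trace naming used in `stop_profile`):

```python
import os
import time

import torch

prof = torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    with_stack=True,      # matches the default applied above
    record_shapes=False,
)
prof.start()

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1024, 1024, device=device)
for _ in range(5):
    x = x @ x  # stand-in for a model forward pass

prof.stop()
out_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
prof.export_chrome_trace(os.path.join(out_dir, f"{time.time()}-TP-0.trace.json.gz"))
```

The `"MEM"` activity follows the same start/stop shape with PyTorch's memory-history API: `torch.cuda.memory._record_memory_history(max_entries=...)` to begin recording, `torch.cuda.memory._dump_snapshot(path)` to write the pickle, and `_record_memory_history(enabled=None)` to disable it again.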
@@ -2223,11 +2312,38 @@ class Scheduler(
             self.torch_profiler_output_dir,
         )
         self.torch_profiler = None
-        self.torch_profiler_output_dir = None
-        self.profiler_activities = None
-        self.profiler_target_forward_ct = None
+        self.profile_in_progress = False
 
-        return ProfileReqOutput(success=True, message="Succeeded")
+        return ProfileReqOutput(success=True, message="Succeeded.")
+
+    def _profile_batch_predicate(self, batch):
+        if self.profile_by_stage:
+            if batch.forward_mode.is_prefill():
+                if self.profiler_prefill_ct == 0:
+                    self.start_profile(batch.forward_mode)
+                self.profiler_prefill_ct += 1
+                if self.profiler_prefill_ct > self.profiler_target_prefill_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.EXTEND)
+            elif batch.forward_mode.is_decode():
+                if self.profiler_decode_ct == 0:
+                    if self.profile_in_progress:
+                        # force trace flush
+                        self.stop_profile(ForwardMode.EXTEND)
+                    self.start_profile(batch.forward_mode)
+                self.profiler_decode_ct += 1
+                if self.profiler_decode_ct > self.profiler_target_decode_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.DECODE)
+            else:
+                raise RuntimeError("unsupported profile stage")
+        else:
+            # Check profiler
+            if (
+                self.profiler_target_forward_ct
+                and self.profiler_target_forward_ct <= self.forward_ct
+            ):
+                self.stop_profile()
 
     def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
         if recv_req == ExpertDistributionReq.START_RECORD:
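`_profile_batch_predicate` is a small counter-driven state machine: the first prefill batch starts a trace; once the prefill budget is exceeded the trace is flushed; the first decode batch then flushes any still-open prefill trace and starts a fresh one for decode. The same gating logic, extracted into a standalone toy with hypothetical names (no sglang dependencies):

```python
from dataclasses import dataclass

@dataclass
class StageGate:
    """Toy model of the per-stage gating in _profile_batch_predicate."""
    target: int        # number of batches to capture for this stage
    seen: int = 0
    active: bool = False

    def on_batch(self) -> None:
        if self.seen == 0:
            print("start_profile")  # stand-in for Scheduler.start_profile
            self.active = True
        self.seen += 1
        if self.seen > self.target and self.active:
            print("stop_profile")   # stand-in for Scheduler.stop_profile
            self.active = False

# One gate per stage: the scheduler feeds prefill (extend) batches to one
# gate and decode batches to the other, so each stage yields its own trace.
prefill_gate, decode_gate = StageGate(target=3), StageGate(target=3)
for _ in range(5):
    prefill_gate.on_batch()  # starts on batch 1, stops after batch 3
```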