[Feature] SGLang Tracing: Fine-Grained Tracking for Request Latency - Part 1 (#9962)
Signed-off-by: Feng Su <sufeng@linux.alibaba.com>
Signed-off-by: Huaixin Chang <changhuaixin@linux.alibaba.com>
Signed-off-by: Peng Wang <rocking@linux.alibaba.com>
This commit is contained in:
@@ -149,6 +149,15 @@ from sglang.srt.parser.reasoning_parser import ReasoningParser
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
||||
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
||||
from sglang.srt.tracing.trace import (
|
||||
process_tracing_init,
|
||||
trace_event,
|
||||
trace_set_proc_propagate_context,
|
||||
trace_set_thread_info,
|
||||
trace_slice,
|
||||
trace_slice_end,
|
||||
trace_slice_start,
|
||||
)
|
||||
from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
|
||||
from sglang.srt.utils import (
|
||||
DynamicGradMode,
|
||||
@@ -826,6 +835,10 @@ class Scheduler(
|
||||
batch = self.get_next_batch_to_run()
|
||||
self.cur_batch = batch
|
||||
|
||||
if batch:
|
||||
for req in batch.reqs:
|
||||
trace_event("schedule", req.rid)
|
||||
|
||||
if batch:
|
||||
result = self.run_batch(batch)
|
||||
self.process_batch_result(batch, result)
|
||||
@@ -847,6 +860,10 @@ class Scheduler(
|
||||
batch = self.get_next_batch_to_run()
|
||||
self.cur_batch = batch
|
||||
|
||||
if batch:
|
||||
for req in batch.reqs:
|
||||
trace_event("schedule", req.rid)
|
||||
|
||||
if batch:
|
||||
batch.launch_done = threading.Event()
|
||||
result = self.run_batch(batch)
|
||||
@@ -1110,6 +1127,12 @@ class Scheduler(
|
||||
self.tp_cpu_group,
|
||||
src=self.tp_group.ranks[0],
|
||||
)
|
||||
|
||||
for req in recv_reqs:
|
||||
if isinstance(req, (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput)):
|
||||
trace_set_proc_propagate_context(req.rid, req.trace_context)
|
||||
trace_slice_start("", req.rid, anonymous=True)
|
||||
|
||||
return recv_reqs
|
||||
|
||||
def process_input_requests(self, recv_reqs: List):
|
||||
@@ -1347,6 +1370,7 @@ class Scheduler(
|
||||
else:
|
||||
self._prefetch_kvcache(req)
|
||||
self.waiting_queue.append(req)
|
||||
trace_slice_end("process req", req.rid, auto_next_anon=True)
|
||||
|
||||
def _prefetch_kvcache(self, req: Req):
|
||||
if self.enable_hicache_storage:
|
||||
@@ -1914,8 +1938,23 @@ class Scheduler(
|
||||
):
|
||||
if batch.forward_mode.is_decode():
|
||||
self.process_batch_result_decode(batch, result, launch_done)
|
||||
for req in batch.reqs:
|
||||
trace_slice(
|
||||
"decode loop",
|
||||
req.rid,
|
||||
auto_next_anon=not req.finished(),
|
||||
thread_finish_flag=req.finished(),
|
||||
)
|
||||
|
||||
elif batch.forward_mode.is_extend():
|
||||
self.process_batch_result_prefill(batch, result, launch_done)
|
||||
for req in batch.reqs:
|
||||
trace_slice(
|
||||
"prefill",
|
||||
req.rid,
|
||||
auto_next_anon=not req.finished(),
|
||||
thread_finish_flag=req.finished(),
|
||||
)
|
||||
elif batch.forward_mode.is_idle():
|
||||
if self.enable_overlap:
|
||||
self.tp_worker.resolve_last_batch_result(launch_done)
|
||||
@@ -2600,6 +2639,12 @@ def run_scheduler_process(
|
||||
pipe_writer,
|
||||
balance_meta: Optional[DPBalanceMeta] = None,
|
||||
):
|
||||
if server_args.enable_trace:
|
||||
process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
|
||||
if server_args.disaggregation_mode == "null":
|
||||
thread_label = "Scheduler"
|
||||
trace_set_thread_info(thread_label, tp_rank, dp_rank)
|
||||
|
||||
if (numa_node := server_args.numa_node) is not None:
|
||||
numa_bind_to_node(numa_node[gpu_id])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user