[Feature] Sglang Tracing: Fine-Grained Tracking for Request Latency - Part 1 (#9962)

Signed-off-by: Feng Su <sufeng@linux.alibaba.com>
Signed-off-by: Huaixin Chang <changhuaixin@linux.alibaba.com>
Signed-off-by: Peng Wang <rocking@linux.alibaba.com>
Authored by Feng Su on 2025-09-15 02:08:02 +08:00; committed by GitHub.
parent 165abeebca
commit 4c21b09074
12 changed files with 1129 additions and 0 deletions

View File

@@ -149,6 +149,15 @@ from sglang.srt.parser.reasoning_parser import ReasoningParser
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.tracing.trace import (
process_tracing_init,
trace_event,
trace_set_proc_propagate_context,
trace_set_thread_info,
trace_slice,
trace_slice_end,
trace_slice_start,
)
from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
from sglang.srt.utils import (
DynamicGradMode,
@@ -826,6 +835,10 @@ class Scheduler(
batch = self.get_next_batch_to_run()
self.cur_batch = batch
if batch:
for req in batch.reqs:
trace_event("schedule", req.rid)
if batch:
result = self.run_batch(batch)
self.process_batch_result(batch, result)
@@ -847,6 +860,10 @@ class Scheduler(
batch = self.get_next_batch_to_run()
self.cur_batch = batch
if batch:
for req in batch.reqs:
trace_event("schedule", req.rid)
if batch:
batch.launch_done = threading.Event()
result = self.run_batch(batch)
@@ -1110,6 +1127,12 @@ class Scheduler(
self.tp_cpu_group,
src=self.tp_group.ranks[0],
)
for req in recv_reqs:
if isinstance(req, (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput)):
trace_set_proc_propagate_context(req.rid, req.trace_context)
trace_slice_start("", req.rid, anonymous=True)
return recv_reqs
def process_input_requests(self, recv_reqs: List):
@@ -1347,6 +1370,7 @@ class Scheduler(
else:
self._prefetch_kvcache(req)
self.waiting_queue.append(req)
trace_slice_end("process req", req.rid, auto_next_anon=True)
def _prefetch_kvcache(self, req: Req):
if self.enable_hicache_storage:
@@ -1914,8 +1938,23 @@ class Scheduler(
):
if batch.forward_mode.is_decode():
self.process_batch_result_decode(batch, result, launch_done)
for req in batch.reqs:
trace_slice(
"decode loop",
req.rid,
auto_next_anon=not req.finished(),
thread_finish_flag=req.finished(),
)
elif batch.forward_mode.is_extend():
self.process_batch_result_prefill(batch, result, launch_done)
for req in batch.reqs:
trace_slice(
"prefill",
req.rid,
auto_next_anon=not req.finished(),
thread_finish_flag=req.finished(),
)
elif batch.forward_mode.is_idle():
if self.enable_overlap:
self.tp_worker.resolve_last_batch_result(launch_done)
@@ -2600,6 +2639,12 @@ def run_scheduler_process(
pipe_writer,
balance_meta: Optional[DPBalanceMeta] = None,
):
if server_args.enable_trace:
process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
if server_args.disaggregation_mode == "null":
thread_label = "Scheduler"
trace_set_thread_info(thread_label, tp_rank, dp_rank)
if (numa_node := server_args.numa_node) is not None:
numa_bind_to_node(numa_node[gpu_id])