[Feature] Sglang Tracing: Fine-Grained Tracking for Request Latency - Part 1 (#9962)

Signed-off-by: Feng Su <sufeng@linux.alibaba.com>
Signed-off-by: Huaixin Chang <changhuaixin@linux.alibaba.com>
Signed-off-by: Peng Wang <rocking@linux.alibaba.com>
Authored by Feng Su on 2025-09-15 02:08:02 +08:00; committed by GitHub.
parent 165abeebca
commit 4c21b09074
12 changed files with 1129 additions and 0 deletions

View File

@@ -149,6 +149,15 @@ from sglang.srt.parser.reasoning_parser import ReasoningParser
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.tracing.trace import (
process_tracing_init,
trace_event,
trace_set_proc_propagate_context,
trace_set_thread_info,
trace_slice,
trace_slice_end,
trace_slice_start,
)
from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
from sglang.srt.utils import (
DynamicGradMode,
@@ -826,6 +835,10 @@ class Scheduler(
batch = self.get_next_batch_to_run()
self.cur_batch = batch
if batch:
for req in batch.reqs:
trace_event("schedule", req.rid)
if batch:
result = self.run_batch(batch)
self.process_batch_result(batch, result)
@@ -847,6 +860,10 @@ class Scheduler(
batch = self.get_next_batch_to_run()
self.cur_batch = batch
if batch:
for req in batch.reqs:
trace_event("schedule", req.rid)
if batch:
batch.launch_done = threading.Event()
result = self.run_batch(batch)
@@ -1110,6 +1127,12 @@ class Scheduler(
self.tp_cpu_group,
src=self.tp_group.ranks[0],
)
for req in recv_reqs:
if isinstance(req, (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput)):
trace_set_proc_propagate_context(req.rid, req.trace_context)
trace_slice_start("", req.rid, anonymous=True)
return recv_reqs
def process_input_requests(self, recv_reqs: List):
@@ -1347,6 +1370,7 @@ class Scheduler(
else:
self._prefetch_kvcache(req)
self.waiting_queue.append(req)
trace_slice_end("process req", req.rid, auto_next_anon=True)
def _prefetch_kvcache(self, req: Req):
if self.enable_hicache_storage:
@@ -1914,8 +1938,23 @@ class Scheduler(
):
if batch.forward_mode.is_decode():
self.process_batch_result_decode(batch, result, launch_done)
for req in batch.reqs:
trace_slice(
"decode loop",
req.rid,
auto_next_anon=not req.finished(),
thread_finish_flag=req.finished(),
)
elif batch.forward_mode.is_extend():
self.process_batch_result_prefill(batch, result, launch_done)
for req in batch.reqs:
trace_slice(
"prefill",
req.rid,
auto_next_anon=not req.finished(),
thread_finish_flag=req.finished(),
)
elif batch.forward_mode.is_idle():
if self.enable_overlap:
self.tp_worker.resolve_last_batch_result(launch_done)
@@ -2600,6 +2639,12 @@ def run_scheduler_process(
pipe_writer,
balance_meta: Optional[DPBalanceMeta] = None,
):
if server_args.enable_trace:
process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
if server_args.disaggregation_mode == "null":
thread_label = "Scheduler"
trace_set_thread_info(thread_label, tp_rank, dp_rank)
if (numa_node := server_args.numa_node) is not None:
numa_bind_to_node(numa_node[gpu_id])