[Feature] Sglang Tracing: Fine-Grained Tracking for Request Latency - Part 1 (#9962)

Signed-off-by: Feng Su <sufeng@linux.alibaba.com>
Signed-off-by: Huaixin Chang <changhuaixin@linux.alibaba.com>
Signed-off-by: Peng Wang <rocking@linux.alibaba.com>
This commit is contained in:
Feng Su
2025-09-15 02:08:02 +08:00
committed by GitHub
parent 165abeebca
commit 4c21b09074
12 changed files with 1129 additions and 0 deletions

View File

@@ -82,6 +82,13 @@ from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicat
from sglang.srt.metrics.collector import TokenizerMetricsCollector
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.tracing.trace import (
trace_get_proc_propagate_context,
trace_req_finish,
trace_req_start,
trace_slice_end,
trace_slice_start,
)
from sglang.srt.utils import (
configure_gc_warning,
dataclass_to_string_truncated,
@@ -358,6 +365,24 @@ class TokenizerManager(TokenizerCommunicatorMixin):
# If it's a single value, add worker_id prefix
obj.rid = f"{self.worker_id}_{obj.rid}"
if obj.is_single:
bootstrap_room = (
obj.bootstrap_room if hasattr(obj, "bootstrap_room") else None
)
trace_req_start(obj.rid, bootstrap_room, ts=int(created_time * 1e9))
trace_slice_start("", obj.rid, ts=int(created_time * 1e9), anonymous=True)
else:
for i in range(len(obj.rid)):
bootstrap_room = (
obj.bootstrap_room[i]
if hasattr(obj, "bootstrap_room") and obj.bootstrap_room
else None
)
trace_req_start(obj.rid[i], bootstrap_room, ts=int(created_time * 1e9))
trace_slice_start(
"", obj.rid[i], ts=int(created_time * 1e9), anonymous=True
)
if self.log_requests:
max_length, skip_names, _ = self.log_request_metadata
logger.info(
@@ -574,6 +599,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
mm_inputs = None
self._validate_one_request(obj, input_ids)
trace_slice_end("tokenize", obj.rid)
return self._create_tokenized_object(
obj, input_text, input_ids, input_embeds, mm_inputs, token_type_ids
)
@@ -752,6 +778,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
req, req.text, input_ids_list[i], None, None, token_type_ids
)
)
trace_slice_end("tokenize", req.rid)
logger.debug(f"Completed batch processing for {batch_size} requests")
return tokenized_objs
@@ -779,9 +806,12 @@ class TokenizerManager(TokenizerCommunicatorMixin):
tokenized_obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput],
created_time: Optional[float] = None,
):
trace_slice_start("dispatch", obj.rid)
tokenized_obj.trace_context = trace_get_proc_propagate_context(obj.rid)
self.send_to_scheduler.send_pyobj(tokenized_obj)
state = ReqState([], False, asyncio.Event(), obj, created_time=created_time)
self.rid_to_state[obj.rid] = state
trace_slice_end("dispatch", obj.rid, thread_finish_flag=True)
return state
def _send_batch_request(
@@ -1429,6 +1459,9 @@ class TokenizerManager(TokenizerCommunicatorMixin):
meta_info["spec_verify_ct"] = recv_obj.spec_verify_ct[i]
state.finished_time = time.time()
meta_info["e2e_latency"] = state.finished_time - state.created_time
trace_req_finish(rid, ts=int(state.finished_time * 1e9))
del self.rid_to_state[rid]
# Mark ongoing LoRA request as finished.