Simplify prometheus metrics (#1981)
Co-authored-by: Mohit Reddy <mohitreddy1996@users.noreply.github.com>
@@ -22,6 +22,7 @@ import logging
 import os
 import signal
 import sys
+import time
 from typing import Dict, List, Optional, Tuple, Union

 import fastapi
@@ -52,6 +53,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightReqInput,
     UpdateWeightReqOutput,
 )
+from sglang.srt.metrics.collector import TokenizerMetricsCollector
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import get_zmq_socket, kill_child_process
@@ -69,6 +71,10 @@ class ReqState:
     finished: bool
     event: asyncio.Event

+    # For metrics
+    created_time: float
+    first_token_time: Optional[float] = None
+

 class TokenizerManager:
     """TokenizerManager is a process that tokenizes the text."""
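For reference, the whole dataclass after this hunk would look roughly like the sketch below. The decorator, the docstring, and the name of the first field (out_list) are assumptions; only the fields shown above appear in this diff, and the call site further down passes a list, a bool, and an event positionally.

import asyncio
import dataclasses
from typing import List, Optional


@dataclasses.dataclass
class ReqState:
    """Store the state of a request."""  # docstring assumed

    out_list: List  # field name assumed; holds the streamed outputs of this request
    finished: bool
    event: asyncio.Event

    # For metrics
    created_time: float
    first_token_time: Optional[float] = None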
@@ -80,6 +86,7 @@ class TokenizerManager:
     ):
         # Parse args
         self.server_args = server_args
+        self.enable_metrics = server_args.enable_metrics

         # Init inter-process communication
         context = zmq.asyncio.Context(2)
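The flag read here comes from ServerArgs. A minimal sketch of turning it on, assuming ServerArgs can be constructed directly (the model path is a placeholder; in practice the value would normally be set through the launch-server command line):

from sglang.srt.server_args import ServerArgs

# Hypothetical illustration: metrics collection in TokenizerManager is gated on this flag.
server_args = ServerArgs(model_path="/path/to/model", enable_metrics=True)
print(server_args.enable_metrics)  # True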
@@ -142,11 +149,22 @@ class TokenizerManager:
         # Others
         self.gracefully_exit = False

+        # Metrics
+        if self.enable_metrics:
+            self.metrics_collector = TokenizerMetricsCollector(
+                labels={
+                    "model_name": self.server_args.served_model_name,
+                    # TODO: Add lora name/path in the future,
+                },
+            )
+
     async def generate_request(
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
         request: Optional[fastapi.Request] = None,
     ):
+        created_time = time.time()
+
         if self.to_create_loop:
             self.create_handle_loop()

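The collector itself lives in sglang/srt/metrics/collector.py and is not part of this diff. Below is a minimal sketch of a collector with the interface used here, built on prometheus_client; the constructor's labels dict and the method names match the calls in this file, while the metric names, help strings, and the choice of Counter/Histogram primitives are assumptions.

from typing import Dict

from prometheus_client import Counter, Histogram


class TokenizerMetricsCollector:
    """Sketch only; the real class is defined in sglang.srt.metrics.collector."""

    def __init__(self, labels: Dict[str, str]) -> None:
        self.labels = labels
        labelnames = list(labels.keys())

        # Metric names and help strings below are assumptions, not the real ones.
        self.prompt_tokens_total = Counter(
            "sglang_prompt_tokens_total", "Number of prefill tokens processed.", labelnames
        )
        self.generation_tokens_total = Counter(
            "sglang_generation_tokens_total", "Number of generation tokens processed.", labelnames
        )
        self.histogram_time_to_first_token = Histogram(
            "sglang_time_to_first_token_seconds", "Time to first token in seconds.", labelnames
        )
        self.histogram_time_per_output_token = Histogram(
            "sglang_time_per_output_token_seconds", "Time per output token in seconds.", labelnames
        )
        self.histogram_e2e_request_latency = Histogram(
            "sglang_e2e_request_latency_seconds", "End-to-end request latency in seconds.", labelnames
        )

    def inc_prompt_tokens(self, value: int) -> None:
        self.prompt_tokens_total.labels(**self.labels).inc(value)

    def inc_generation_tokens(self, value: int) -> None:
        self.generation_tokens_total.labels(**self.labels).inc(value)

    def observe_time_to_first_token(self, value: float) -> None:
        self.histogram_time_to_first_token.labels(**self.labels).observe(value)

    def observe_time_per_output_token(self, value: float) -> None:
        self.histogram_time_per_output_token.labels(**self.labels).observe(value)

    def observe_e2e_request_latency(self, value: float) -> None:
        self.histogram_e2e_request_latency.labels(**self.labels).observe(value)

Because the labels (here only model_name) are fixed per TokenizerManager, the sketch resolves them once per call with .labels(**self.labels) and records plain counts and durations.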
@@ -164,10 +182,12 @@ class TokenizerManager:
         if is_single:
             tokenized_obj = await self._tokenize_one_request(obj)
             self.send_to_scheduler.send_pyobj(tokenized_obj)
-            async for response in self._wait_one_response(obj, request):
+            async for response in self._wait_one_response(obj, request, created_time):
                 yield response
         else:
-            async for response in self._handle_batch_request(obj, request):
+            async for response in self._handle_batch_request(
+                obj, request, created_time
+            ):
                 yield response

     async def _tokenize_one_request(
@@ -231,10 +251,11 @@
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
         request: Optional[fastapi.Request] = None,
+        created_time: Optional[float] = None,
     ):
         """Wait for the response of one request."""
         event = asyncio.Event()
-        state = ReqState([], False, event)
+        state = ReqState([], False, event, created_time=created_time)
         self.rid_to_state[obj.rid] = state

         while True:
@@ -272,6 +293,7 @@
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
         request: Optional[fastapi.Request] = None,
+        created_time: Optional[float] = None,
     ):
         batch_size = obj.batch_size

@@ -283,7 +305,9 @@
                 tmp_obj = obj[i]
                 tokenized_obj = await self._tokenize_one_request(tmp_obj)
                 self.send_to_scheduler.send_pyobj(tokenized_obj)
-                generators.append(self._wait_one_response(tmp_obj, request))
+                generators.append(
+                    self._wait_one_response(tmp_obj, request, created_time)
+                )
                 rids.append(tmp_obj.rid)
         else:
             # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
@@ -303,7 +327,9 @@
                 tokenized_obj.sampling_params.max_new_tokens = 0
                 tokenized_obj.stream = False
                 self.send_to_scheduler.send_pyobj(tokenized_obj)
-                await self._wait_one_response(tmp_obj, request).__anext__()
+                await self._wait_one_response(
+                    tmp_obj, request, created_time
+                ).__anext__()

             # Expand requests, assign new rids for them, and send them
             for i in range(batch_size):
@@ -312,7 +338,9 @@
                     tokenized_obj = copy.copy(tokenized_objs[i])
                     tokenized_obj.rid = tmp_obj.regenerate_rid()
                     self.send_to_scheduler.send_pyobj(tokenized_obj)
-                    generators.append(self._wait_one_response(tmp_obj, request))
+                    generators.append(
+                        self._wait_one_response(tmp_obj, request, created_time)
+                    )
                     rids.append(tmp_obj.rid)

         # Wait for all requests
@@ -524,6 +552,34 @@
                 state.finished = recv_obj.finished_reason[i] is not None
                 state.event.set()

+                if self.enable_metrics:
+                    completion_tokens = recv_obj.meta_info[i]["completion_tokens"]
+
+                    if state.first_token_time is None:
+                        state.first_token_time = time.time()
+                        self.metrics_collector.observe_time_to_first_token(
+                            state.first_token_time - state.created_time
+                        )
+                    else:
+                        if completion_tokens >= 2:
+                            self.metrics_collector.observe_time_per_output_token(
+                                (time.time() - state.first_token_time)
+                                / (completion_tokens - 1)
+                            )
+
+                    if state.finished:
+                        self.metrics_collector.inc_prompt_tokens(
+                            recv_obj.meta_info[i]["prompt_tokens"]
+                        )
+                        self.metrics_collector.inc_generation_tokens(completion_tokens)
+                        self.metrics_collector.observe_e2e_request_latency(
+                            time.time() - state.created_time
+                        )
+                        if completion_tokens >= 1:
+                            self.metrics_collector.observe_time_per_output_token(
+                                (time.time() - state.created_time) / completion_tokens
+                            )
+
     def convert_logprob_style(
         self,
         ret: dict,
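To make the timing arithmetic above concrete, a small self-contained illustration with made-up timestamps: time to first token is measured from request creation, the steady-state time per output token divides the decode time by the number of inter-token gaps (completion_tokens - 1), and on the finished update the end-to-end latency spans creation to completion (that final observation divides the full latency, first token included, by completion_tokens).

created_time = 100.0       # when generate_request() was entered
first_token_time = 100.8   # when the first output chunk arrived
now = 103.3                # when the current output chunk arrived
completion_tokens = 26     # tokens generated so far for this request

time_to_first_token = first_token_time - created_time                       # 0.8 s
time_per_output_token = (now - first_token_time) / (completion_tokens - 1)  # 0.1 s per token
e2e_request_latency = now - created_time                                    # 3.3 s, observed when finished
print(time_to_first_token, time_per_output_token, e2e_request_latency)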