Improve metrics, logging, and importing orders (#2992)

This commit is contained in:
Lianmin Zheng
2025-01-19 18:36:59 -08:00
committed by GitHub
parent 61f42b5732
commit cd493b5afc
8 changed files with 64 additions and 49 deletions

View File

@@ -1,5 +1,6 @@
# SGL API Components
# SGLang public APIs
# Frontend Language APIs
from sglang.api import (
Engine,
Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
user_end,
video,
)
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.lang.choices import (
greedy_token_selection,
token_length_normalized,
unconditional_likelihood_normalized,
)
from sglang.utils import LazyImport
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
# Other configs
from sglang.global_config import global_config
from sglang.version import __version__
# SGLang DSL APIs
__all__ = [
"Runtime",
"Engine",
"Runtime",
"assistant",
"assistant_begin",
"assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
"user_begin",
"user_end",
"video",
"RuntimeEndpoint",
"greedy_token_selection",
"token_length_normalized",
"unconditional_likelihood_normalized",
"Anthropic",
"LiteLLM",
"OpenAI",
"VertexAI",
"global_config",
"__version__",
]
# Global Configurations
from sglang.global_config import global_config
__all__ += ["global_config"]
from sglang.version import __version__
__all__ += ["__version__"]
# SGLang Backends
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.utils import LazyImport
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]

View File

@@ -19,9 +19,6 @@ from sglang.lang.ir import (
REGEX_STR,
SglSamplingParams,
)
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import is_port_available, kill_process_tree
from sglang.utils import http_request
@@ -342,7 +339,7 @@ class Runtime:
using the command line interface.
It is mainly used for the frontend language.
You should use the Engine class if you want to do normal offline processing.
You should use the Engine class if you want to do normal offline processing without the frontend language.
"""
def __init__(
@@ -352,13 +349,14 @@ class Runtime:
**kwargs,
):
"""See the arguments in server_args.py::ServerArgs"""
# We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
# client code without installing SRT server and its dependency if they want.
from sglang.srt.server import launch_server
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import is_port_available
self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
# before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
atexit.register(self.shutdown)
# Pre-allocate ports
for port in range(self.server_args.port, 40000):
if is_port_available(port):
@@ -380,6 +378,10 @@ class Runtime:
pipe_writer.close()
self.pid = proc.pid
# Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
atexit.register(self.shutdown)
# TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
try:
init_state = pipe_reader.recv()
except EOFError:
@@ -394,6 +396,8 @@ class Runtime:
self.endpoint = RuntimeEndpoint(self.url)
def shutdown(self):
from sglang.srt.utils import kill_process_tree
if self.pid is not None:
kill_process_tree(self.pid)
self.pid = None
@@ -402,6 +406,8 @@ class Runtime:
self.endpoint.cache_prefix(prefix)
def get_tokenizer(self):
from sglang.srt.hf_transformers_utils import get_tokenizer
return get_tokenizer(
self.server_args.tokenizer_path,
tokenizer_mode=self.server_args.tokenizer_mode,

View File

@@ -785,8 +785,9 @@ class Scheduler:
f"gen throughput (token/s): {gen_throughput:.2f}, "
f"#queue-req: {len(self.waiting_queue)}"
)
spec_accept_length = 0
else:
accept_length = (
spec_accept_length = (
self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
)
self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
@@ -795,7 +796,7 @@ class Scheduler:
f"#running-req: {num_running_reqs}, "
f"#token: {num_used}, "
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
f"accept len: {accept_length:.2f}, "
f"accept len: {spec_accept_length:.2f}, "
f"gen throughput (token/s): {gen_throughput:.2f}, "
f"#queue-req: {len(self.waiting_queue)}"
)
@@ -807,6 +808,7 @@ class Scheduler:
self.stats.token_usage = num_used / self.max_total_num_tokens
self.stats.gen_throughput = gen_throughput
self.stats.num_queue_reqs = len(self.waiting_queue)
self.stats.spec_accept_length = spec_accept_length
self.metrics_collector.log_stats(self.stats)
def check_memory(self):

View File

@@ -25,6 +25,7 @@ class SchedulerStats:
gen_throughput: float = 0.0
num_queue_reqs: int = 0
cache_hit_rate: float = 0.0
spec_accept_length: float = 0.0
class SchedulerMetricsCollector:
@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
self.num_running_reqs = Gauge(
name="sglang:num_running_reqs",
documentation="The number of running requests",
documentation="The number of running requests.",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.num_used_tokens = Gauge(
name="sglang:num_used_tokens",
documentation="The number of used tokens",
documentation="The number of used tokens.",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.token_usage = Gauge(
name="sglang:token_usage",
documentation="The token usage",
documentation="The token usage.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
self.gen_throughput = Gauge(
name="sglang:gen_throughput",
documentation="The generate throughput (token/s)",
documentation="The generation throughput (token/s).",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.num_queue_reqs = Gauge(
name="sglang:num_queue_reqs",
documentation="The number of requests in the waiting queue",
documentation="The number of requests in the waiting queue.",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.cache_hit_rate = Gauge(
name="sglang:cache_hit_rate",
documentation="The cache hit rate",
documentation="The prefix cache hit rate.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
self.spec_accept_length = Gauge(
name="sglang:spec_accept_length",
documentation="The average acceptance length of speculative decoding.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
self._log_gauge(self.gen_throughput, stats.gen_throughput)
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
class TokenizerMetricsCollector: