From cd493b5afc27ed1b0f5700809c896af16204f0d9 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sun, 19 Jan 2025 18:36:59 -0800
Subject: [PATCH] Improve metrics, logging, and importing orders (#2992)

---
Notes for reviewers: two illustrative sketches (the lazy-import pattern and
the new speculative-decoding metric) are appended after the diff.

 .github/workflows/pr-test.yml                      |  2 +-
 .../runtime/engine/offline_batch_inference.py      |  5 +++
 python/sglang/__init__.py                          | 44 +++++++++----------
 .../sglang/lang/backend/runtime_endpoint.py        | 20 ++++++---
 python/sglang/srt/managers/scheduler.py            |  6 ++-
 python/sglang/srt/metrics/collector.py             | 21 ++++++---
 sgl-router/py_src/sglang_router/__init__.py        | 12 ++---
 test/srt/run_suite.py                              |  3 +-
 8 files changed, 64 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 51117127a..b910683e7 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -52,7 +52,7 @@ jobs:
     runs-on: 1-gpu-runner
     strategy:
       matrix:
-        range: [0-6, 6-16, 16-23, 23-30, 30-38, 38-100]
+        range: [0-6, 6-15, 15-22, 22-32, 32-37, 37-100]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/examples/runtime/engine/offline_batch_inference.py b/examples/runtime/engine/offline_batch_inference.py
index 724051eab..92e68dcd7 100644
--- a/examples/runtime/engine/offline_batch_inference.py
+++ b/examples/runtime/engine/offline_batch_inference.py
@@ -1,3 +1,8 @@
+"""
+Usage:
+python3 offline_batch_inference.py --model meta-llama/Llama-3.1-8B-Instruct
+"""
+
 import argparse
 import dataclasses
 
diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py
index de9134857..70d58043d 100644
--- a/python/sglang/__init__.py
+++ b/python/sglang/__init__.py
@@ -1,5 +1,6 @@
-# SGL API Components
+# SGLang public APIs
 
+# Frontend Language APIs
 from sglang.api import (
     Engine,
     Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+# Other configs
+from sglang.global_config import global_config
+from sglang.version import __version__
 
-# SGLang DSL APIs
 __all__ = [
-    "Runtime",
     "Engine",
+    "Runtime",
     "assistant",
     "assistant_begin",
     "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "RuntimeEndpoint",
     "greedy_token_selection",
     "token_length_normalized",
     "unconditional_likelihood_normalized",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "VertexAI",
+    "global_config",
+    "__version__",
 ]
-
-# Global Configurations
-from sglang.global_config import global_config
-
-__all__ += ["global_config"]
-
-from sglang.version import __version__
-
-__all__ += ["__version__"]
-
-# SGLang Backends
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.utils import LazyImport
-
-Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
-LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
-OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
-VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-
-__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py
index 23e9f1afb..c139db6f0 100644
--- a/python/sglang/lang/backend/runtime_endpoint.py
+++ b/python/sglang/lang/backend/runtime_endpoint.py
@@ -19,9 +19,6 @@ from sglang.lang.ir import (
     REGEX_STR,
     SglSamplingParams,
 )
-from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_port_available, kill_process_tree
 from sglang.utils import http_request
 
 
@@ -342,7 +339,7 @@ class Runtime:
     using the command line interface.
     It is mainly used for the frontend language.
 
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class if you want to do normal offline processing without the frontend language.
     """
 
     def __init__(
@@ -352,13 +349,14 @@ class Runtime:
         **kwargs,
     ):
         """See the arguments in server_args.py::ServerArgs"""
+        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+        # client code without installing the SRT server and its dependencies if they want.
        from sglang.srt.server import launch_server
+        from sglang.srt.server_args import ServerArgs
+        from sglang.srt.utils import is_port_available
 
         self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
 
-        # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
-        atexit.register(self.shutdown)
-
         # Pre-allocate ports
         for port in range(self.server_args.port, 40000):
             if is_port_available(port):
@@ -380,6 +378,10 @@ class Runtime:
         pipe_writer.close()
         self.pid = proc.pid
 
+        # Before the Python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown().
+        atexit.register(self.shutdown)
+
+        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
        try:
             init_state = pipe_reader.recv()
         except EOFError:
@@ -394,6 +396,8 @@ class Runtime:
         self.endpoint = RuntimeEndpoint(self.url)
 
     def shutdown(self):
+        from sglang.srt.utils import kill_process_tree
+
         if self.pid is not None:
             kill_process_tree(self.pid)
             self.pid = None
@@ -402,6 +406,8 @@ class Runtime:
         self.endpoint.cache_prefix(prefix)
 
     def get_tokenizer(self):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
         return get_tokenizer(
             self.server_args.tokenizer_path,
             tokenizer_mode=self.server_args.tokenizer_mode,
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index ece5b2664..416abe21c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -785,8 +785,9 @@ class Scheduler:
                 f"gen throughput (token/s): {gen_throughput:.2f}, "
                 f"#queue-req: {len(self.waiting_queue)}"
             )
+            spec_accept_length = 0
         else:
-            accept_length = (
+            spec_accept_length = (
                 self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
             )
             self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
@@ -795,7 +796,7 @@ class Scheduler:
                 f"#running-req: {num_running_reqs}, "
                 f"#token: {num_used}, "
                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-                f"accept len: {accept_length:.2f}, "
+                f"accept len: {spec_accept_length:.2f}, "
                 f"gen throughput (token/s): {gen_throughput:.2f}, "
                 f"#queue-req: {len(self.waiting_queue)}"
             )
@@ -807,6 +808,7 @@ class Scheduler:
             self.stats.token_usage = num_used / self.max_total_num_tokens
             self.stats.gen_throughput = gen_throughput
             self.stats.num_queue_reqs = len(self.waiting_queue)
+            self.stats.spec_accept_length = spec_accept_length
             self.metrics_collector.log_stats(self.stats)
 
     def check_memory(self):
diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py
index 070b405be..26eb2fc27 100644
--- a/python/sglang/srt/metrics/collector.py
+++ b/python/sglang/srt/metrics/collector.py
@@ -25,6 +25,7 @@ class SchedulerStats:
     gen_throughput: float = 0.0
     num_queue_reqs: int = 0
     cache_hit_rate: float = 0.0
+    spec_accept_length: float = 0.0
 
 
 class SchedulerMetricsCollector:
@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
 
         self.num_running_reqs = Gauge(
             name="sglang:num_running_reqs",
-            documentation="The number of running requests",
+            documentation="The number of running requests.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.num_used_tokens = Gauge(
             name="sglang:num_used_tokens",
-            documentation="The number of used tokens",
+            documentation="The number of used tokens.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.token_usage = Gauge(
             name="sglang:token_usage",
-            documentation="The token usage",
+            documentation="The token usage.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
 
         self.gen_throughput = Gauge(
             name="sglang:gen_throughput",
-            documentation="The generate throughput (token/s)",
+            documentation="The generation throughput (token/s).",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.num_queue_reqs = Gauge(
             name="sglang:num_queue_reqs",
-            documentation="The number of requests in the waiting queue",
+            documentation="The number of requests in the waiting queue.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.cache_hit_rate = Gauge(
             name="sglang:cache_hit_rate",
-            documentation="The cache hit rate",
+            documentation="The prefix cache hit rate.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.spec_accept_length = Gauge(
+            name="sglang:spec_accept_length",
+            documentation="The average acceptance length of speculative decoding.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.gen_throughput, stats.gen_throughput)
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
+        self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
 
 
 class TokenizerMetricsCollector:
diff --git a/sgl-router/py_src/sglang_router/__init__.py b/sgl-router/py_src/sglang_router/__init__.py
index 285ee173b..081740479 100644
--- a/sgl-router/py_src/sglang_router/__init__.py
+++ b/sgl-router/py_src/sglang_router/__init__.py
@@ -1,11 +1,7 @@
 # a lightweight wrapper on router with argument type and comments
+# no wrapper on policy type => direct export
+from sglang_router.router import Router
+from sglang_router.version import __version__
 from sglang_router_rs import PolicyType
 
-# no wrapper on policy type => direct export
-from .router import Router
-
-__all__ = ["Router", "PolicyType"]
-
-from sglang_router.version import __version__
-
-__all__ += ["__version__"]
+__all__ = ["Router", "PolicyType", "__version__"]
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 2ed252275..69a5470be 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -42,8 +42,7 @@ suites = {
         "test_srt_endpoint.py",
         "test_torch_compile.py",
         "test_torch_compile_moe.py",
-        # Temporarily disable this because it requires PyTorch >= 2.5
-        # "test_torch_native_attention_backend.py",
+        "test_torch_native_attention_backend.py",
         "test_torchao.py",
         "test_triton_attention_kernels.py",
         "test_triton_attention_backend.py",
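
Reviewer notes (illustrative sketches, not part of the patch):

1) Lazy imports. The reorganized python/sglang/__init__.py binds the
Anthropic, LiteLLM, OpenAI, and VertexAI backends to LazyImport proxies, and
Runtime now defers its `sglang.srt` imports into method bodies, so
`import sglang` stays cheap and client-only users do not need the SRT server
dependencies installed. Below is a minimal, self-contained sketch of how such
a proxy can work; it is not the actual `sglang.utils.LazyImport`
implementation, whose details may differ.

    import importlib
    from typing import Any


    class LazyImport:
        """Resolve `module_name.class_name` on first use, not at import time.

        A minimal sketch of the lazy-import pattern; the real
        `sglang.utils.LazyImport` may differ in details.
        """

        def __init__(self, module_name: str, class_name: str):
            self.module_name = module_name
            self.class_name = class_name
            self._resolved: Any = None

        def _resolve(self) -> Any:
            # Import the backing module only when the proxy is first touched.
            if self._resolved is None:
                module = importlib.import_module(self.module_name)
                self._resolved = getattr(module, self.class_name)
            return self._resolved

        def __getattr__(self, name: str) -> Any:
            return getattr(self._resolve(), name)

        def __call__(self, *args: Any, **kwargs: Any) -> Any:
            # Calling the proxy constructs the real class.
            return self._resolve()(*args, **kwargs)


    # Hypothetical usage with a standard-library class: nothing from `json`
    # is imported until the call below.
    JSONDecoder = LazyImport("json", "JSONDecoder")
    decoder = JSONDecoder()  # the real import happens here
    print(decoder.decode('{"ok": true}'))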
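
2) The speculative-decoding metric. The scheduler computes spec_accept_length
as the average number of tokens accepted per speculative forward pass,
spec_num_total_accepted_tokens / spec_num_total_forward_ct, resets both
counters after each logging interval, and reports 0 when speculative decoding
is disabled so the gauge is always defined. The sketch below exports such a
gauge with prometheus_client; the counter values and the model_name label are
made up, and the real collector also sets multiprocess_mode, which requires
the Prometheus multiprocess setup and is omitted here.

    from prometheus_client import CollectorRegistry, Gauge, generate_latest

    # Hypothetical running totals, mirroring the scheduler's bookkeeping.
    spec_num_total_accepted_tokens = 3120
    spec_num_total_forward_ct = 1000

    # Average number of tokens accepted per speculative forward pass.
    spec_accept_length = spec_num_total_accepted_tokens / spec_num_total_forward_ct

    registry = CollectorRegistry()
    spec_accept_length_gauge = Gauge(
        "sglang:spec_accept_length",
        "The average acceptance length of speculative decoding.",
        ["model_name"],
        registry=registry,
    )
    spec_accept_length_gauge.labels(model_name="demo-model").set(spec_accept_length)

    # Render the exposition text that a /metrics endpoint would serve.
    print(generate_latest(registry).decode())

An accept length of 3.12 here would mean each speculative step yields about
three tokens on average, which is also what the scheduler now prints in its
decode log line ("accept len: ...").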