From cd493b5afc27ed1b0f5700809c896af16204f0d9 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sun, 19 Jan 2025 18:36:59 -0800
Subject: [PATCH] Improve metrics, logging, and importing orders (#2992)

---
Notes for reviewers: two illustrative sketches (the lazy-import pattern and
the new speculative-decoding metric) are appended after the diff.

 .github/workflows/pr-test.yml                      |  2 +-
 .../runtime/engine/offline_batch_inference.py      |  5 +++
 python/sglang/__init__.py                          | 44 +++++++++----------
 .../sglang/lang/backend/runtime_endpoint.py        | 20 ++++++---
 python/sglang/srt/managers/scheduler.py            |  6 ++-
 python/sglang/srt/metrics/collector.py             | 21 ++++++---
 sgl-router/py_src/sglang_router/__init__.py        | 12 ++---
 test/srt/run_suite.py                              |  3 +-
 8 files changed, 64 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 51117127a..b910683e7 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -52,7 +52,7 @@ jobs:
     runs-on: 1-gpu-runner
     strategy:
       matrix:
-        range: [0-6, 6-16, 16-23, 23-30, 30-38, 38-100]
+        range: [0-6, 6-15, 15-22, 22-32, 32-37, 37-100]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/examples/runtime/engine/offline_batch_inference.py b/examples/runtime/engine/offline_batch_inference.py
index 724051eab..92e68dcd7 100644
--- a/examples/runtime/engine/offline_batch_inference.py
+++ b/examples/runtime/engine/offline_batch_inference.py
@@ -1,3 +1,8 @@
+"""
+Usage:
+python3 offline_batch_inference.py --model meta-llama/Llama-3.1-8B-Instruct
+"""
+
 import argparse
 import dataclasses
 
diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py
index de9134857..70d58043d 100644
--- a/python/sglang/__init__.py
+++ b/python/sglang/__init__.py
@@ -1,5 +1,6 @@
-# SGL API Components
+# SGLang public APIs
 
+# Frontend Language APIs
 from sglang.api import (
     Engine,
     Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+# Other configs
+from sglang.global_config import global_config
+from sglang.version import __version__
 
-# SGLang DSL APIs
 __all__ = [
-    "Runtime",
     "Engine",
+    "Runtime",
     "assistant",
     "assistant_begin",
     "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "RuntimeEndpoint",
     "greedy_token_selection",
     "token_length_normalized",
     "unconditional_likelihood_normalized",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "VertexAI",
+    "global_config",
+    "__version__",
 ]
-
-# Global Configurations
-from sglang.global_config import global_config
-
-__all__ += ["global_config"]
-
-from sglang.version import __version__
-
-__all__ += ["__version__"]
-
-# SGLang Backends
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.utils import LazyImport
-
-Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
-LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
-OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
-VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-
-__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py
index 23e9f1afb..c139db6f0 100644
--- a/python/sglang/lang/backend/runtime_endpoint.py
+++ b/python/sglang/lang/backend/runtime_endpoint.py
@@ -19,9 +19,6 @@ from sglang.lang.ir import (
     REGEX_STR,
     SglSamplingParams,
 )
-from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_port_available, kill_process_tree
 from sglang.utils import http_request
 
 
@@ -342,7 +339,7 @@ class Runtime:
     using the command line interface.
     It is mainly used for the frontend language.
 
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class if you want to do normal offline processing without the frontend language.
     """
 
     def __init__(
@@ -352,13 +349,14 @@ class Runtime:
         **kwargs,
     ):
         """See the arguments in server_args.py::ServerArgs"""
+        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+        # client code without installing the SRT server and its dependencies if they want.
        from sglang.srt.server import launch_server
+        from sglang.srt.server_args import ServerArgs
+        from sglang.srt.utils import is_port_available
 
         self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
 
-        # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
-        atexit.register(self.shutdown)
-
         # Pre-allocate ports
         for port in range(self.server_args.port, 40000):
             if is_port_available(port):
@@ -380,6 +378,10 @@ class Runtime:
         pipe_writer.close()
         self.pid = proc.pid
 
+        # Before the Python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown().
+        atexit.register(self.shutdown)
+
+        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
        try:
             init_state = pipe_reader.recv()
         except EOFError:
@@ -394,6 +396,8 @@ class Runtime:
         self.endpoint = RuntimeEndpoint(self.url)
 
     def shutdown(self):
+        from sglang.srt.utils import kill_process_tree
+
         if self.pid is not None:
             kill_process_tree(self.pid)
             self.pid = None
@@ -402,6 +406,8 @@ class Runtime:
         self.endpoint.cache_prefix(prefix)
 
     def get_tokenizer(self):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
         return get_tokenizer(
             self.server_args.tokenizer_path,
             tokenizer_mode=self.server_args.tokenizer_mode,
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index ece5b2664..416abe21c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -785,8 +785,9 @@ class Scheduler:
                 f"gen throughput (token/s): {gen_throughput:.2f}, "
                 f"#queue-req: {len(self.waiting_queue)}"
             )
+            spec_accept_length = 0
         else:
-            accept_length = (
+            spec_accept_length = (
                 self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
             )
             self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
@@ -795,7 +796,7 @@ class Scheduler:
                 f"#running-req: {num_running_reqs}, "
                 f"#token: {num_used}, "
                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-                f"accept len: {accept_length:.2f}, "
+                f"accept len: {spec_accept_length:.2f}, "
                 f"gen throughput (token/s): {gen_throughput:.2f}, "
                 f"#queue-req: {len(self.waiting_queue)}"
             )
@@ -807,6 +808,7 @@ class Scheduler:
             self.stats.token_usage = num_used / self.max_total_num_tokens
             self.stats.gen_throughput = gen_throughput
             self.stats.num_queue_reqs = len(self.waiting_queue)
+            self.stats.spec_accept_length = spec_accept_length
             self.metrics_collector.log_stats(self.stats)
 
     def check_memory(self):
diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py
index 070b405be..26eb2fc27 100644
--- a/python/sglang/srt/metrics/collector.py
+++ b/python/sglang/srt/metrics/collector.py
@@ -25,6 +25,7 @@ class SchedulerStats:
     gen_throughput: float = 0.0
     num_queue_reqs: int = 0
     cache_hit_rate: float = 0.0
+    spec_accept_length: float = 0.0
 
 
 class SchedulerMetricsCollector:
@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
 
         self.num_running_reqs = Gauge(
             name="sglang:num_running_reqs",
-            documentation="The number of running requests",
+            documentation="The number of running requests.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.num_used_tokens = Gauge(
             name="sglang:num_used_tokens",
-            documentation="The number of used tokens",
+            documentation="The number of used tokens.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.token_usage = Gauge(
             name="sglang:token_usage",
-            documentation="The token usage",
+            documentation="The token usage.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
 
         self.gen_throughput = Gauge(
             name="sglang:gen_throughput",
-            documentation="The generate throughput (token/s)",
+            documentation="The generation throughput (token/s).",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.num_queue_reqs = Gauge(
             name="sglang:num_queue_reqs",
-            documentation="The number of requests in the waiting queue",
+            documentation="The number of requests in the waiting queue.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )
 
         self.cache_hit_rate = Gauge(
             name="sglang:cache_hit_rate",
-            documentation="The cache hit rate",
+            documentation="The prefix cache hit rate.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.spec_accept_length = Gauge(
+            name="sglang:spec_accept_length",
+            documentation="The average acceptance length of speculative decoding.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.gen_throughput, stats.gen_throughput)
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
+        self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
 
 
 class TokenizerMetricsCollector:
diff --git a/sgl-router/py_src/sglang_router/__init__.py b/sgl-router/py_src/sglang_router/__init__.py
index 285ee173b..081740479 100644
--- a/sgl-router/py_src/sglang_router/__init__.py
+++ b/sgl-router/py_src/sglang_router/__init__.py
@@ -1,11 +1,7 @@
 # a lightweight wrapper on router with argument type and comments
+# no wrapper on policy type => direct export
+from sglang_router.router import Router
+from sglang_router.version import __version__
 from sglang_router_rs import PolicyType
 
-# no wrapper on policy type => direct export
-from .router import Router
-
-__all__ = ["Router", "PolicyType"]
-
-from sglang_router.version import __version__
-
-__all__ += ["__version__"]
+__all__ = ["Router", "PolicyType", "__version__"]
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 2ed252275..69a5470be 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -42,8 +42,7 @@ suites = {
         "test_srt_endpoint.py",
         "test_torch_compile.py",
         "test_torch_compile_moe.py",
-        # Temporarily disable this because it requires PyTorch >= 2.5
-        # "test_torch_native_attention_backend.py",
+        "test_torch_native_attention_backend.py",
         "test_torchao.py",
         "test_triton_attention_kernels.py",
         "test_triton_attention_backend.py",
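
Reviewer notes (illustrative sketches, not part of the patch):

1) Lazy imports. The reorganized python/sglang/__init__.py binds the
Anthropic, LiteLLM, OpenAI, and VertexAI backends to LazyImport proxies, and
Runtime now defers its `sglang.srt` imports into method bodies, so
`import sglang` stays cheap and client-only users do not need the SRT server
dependencies installed. Below is a minimal, self-contained sketch of how such
a proxy can work; it is not the actual `sglang.utils.LazyImport`
implementation, whose details may differ.

    import importlib
    from typing import Any


    class LazyImport:
        """Resolve `module_name.class_name` on first use, not at import time.

        A minimal sketch of the lazy-import pattern; the real
        `sglang.utils.LazyImport` may differ in details.
        """

        def __init__(self, module_name: str, class_name: str):
            self.module_name = module_name
            self.class_name = class_name
            self._resolved: Any = None

        def _resolve(self) -> Any:
            # Import the backing module only when the proxy is first touched.
            if self._resolved is None:
                module = importlib.import_module(self.module_name)
                self._resolved = getattr(module, self.class_name)
            return self._resolved

        def __getattr__(self, name: str) -> Any:
            return getattr(self._resolve(), name)

        def __call__(self, *args: Any, **kwargs: Any) -> Any:
            # Calling the proxy constructs the real class.
            return self._resolve()(*args, **kwargs)


    # Hypothetical usage with a standard-library class: nothing from `json`
    # is imported until the call below.
    JSONDecoder = LazyImport("json", "JSONDecoder")
    decoder = JSONDecoder()  # the real import happens here
    print(decoder.decode('{"ok": true}'))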
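
2) The speculative-decoding metric. The scheduler computes spec_accept_length
as the average number of tokens accepted per speculative forward pass,
spec_num_total_accepted_tokens / spec_num_total_forward_ct, resets both
counters after each logging interval, and reports 0 when speculative decoding
is disabled so the gauge is always defined. The sketch below exports such a
gauge with prometheus_client; the counter values and the model_name label are
made up, and the real collector also sets multiprocess_mode, which requires
the Prometheus multiprocess setup and is omitted here.

    from prometheus_client import CollectorRegistry, Gauge, generate_latest

    # Hypothetical running totals, mirroring the scheduler's bookkeeping.
    spec_num_total_accepted_tokens = 3120
    spec_num_total_forward_ct = 1000

    # Average number of tokens accepted per speculative forward pass.
    spec_accept_length = spec_num_total_accepted_tokens / spec_num_total_forward_ct

    registry = CollectorRegistry()
    spec_accept_length_gauge = Gauge(
        "sglang:spec_accept_length",
        "The average acceptance length of speculative decoding.",
        ["model_name"],
        registry=registry,
    )
    spec_accept_length_gauge.labels(model_name="demo-model").set(spec_accept_length)

    # Render the exposition text that a /metrics endpoint would serve.
    print(generate_latest(registry).decode())

An accept length of 3.12 here would mean each speculative step yields about
three tokens on average, which is also what the scheduler now prints in its
decode log line ("accept len: ...").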