Improve metrics, logging, and importing orders (#2992)
This commit is contained in:
2
.github/workflows/pr-test.yml
vendored
2
.github/workflows/pr-test.yml
vendored
@@ -52,7 +52,7 @@ jobs:
|
|||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
range: [0-6, 6-16, 16-23, 23-30, 30-38, 38-100]
|
range: [0-6, 6-15, 15-22, 22-32, 32-37, 37-100]
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
|||||||
@@ -1,3 +1,8 @@
|
|||||||
|
"""
|
||||||
|
Usage:
|
||||||
|
python3 offline_batch_inference.py --model meta-llama/Llama-3.1-8B-Instruct
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# SGL API Components
|
# SGLang public APIs
|
||||||
|
|
||||||
|
# Frontend Language APIs
|
||||||
from sglang.api import (
|
from sglang.api import (
|
||||||
Engine,
|
Engine,
|
||||||
Runtime,
|
Runtime,
|
||||||
@@ -23,16 +24,26 @@ from sglang.api import (
|
|||||||
user_end,
|
user_end,
|
||||||
video,
|
video,
|
||||||
)
|
)
|
||||||
|
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
||||||
from sglang.lang.choices import (
|
from sglang.lang.choices import (
|
||||||
greedy_token_selection,
|
greedy_token_selection,
|
||||||
token_length_normalized,
|
token_length_normalized,
|
||||||
unconditional_likelihood_normalized,
|
unconditional_likelihood_normalized,
|
||||||
)
|
)
|
||||||
|
from sglang.utils import LazyImport
|
||||||
|
|
||||||
|
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
|
||||||
|
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
|
||||||
|
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
|
||||||
|
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
|
||||||
|
|
||||||
|
# Other configs
|
||||||
|
from sglang.global_config import global_config
|
||||||
|
from sglang.version import __version__
|
||||||
|
|
||||||
# SGLang DSL APIs
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Runtime",
|
|
||||||
"Engine",
|
"Engine",
|
||||||
|
"Runtime",
|
||||||
"assistant",
|
"assistant",
|
||||||
"assistant_begin",
|
"assistant_begin",
|
||||||
"assistant_end",
|
"assistant_end",
|
||||||
@@ -52,27 +63,14 @@ __all__ = [
|
|||||||
"user_begin",
|
"user_begin",
|
||||||
"user_end",
|
"user_end",
|
||||||
"video",
|
"video",
|
||||||
|
"RuntimeEndpoint",
|
||||||
"greedy_token_selection",
|
"greedy_token_selection",
|
||||||
"token_length_normalized",
|
"token_length_normalized",
|
||||||
"unconditional_likelihood_normalized",
|
"unconditional_likelihood_normalized",
|
||||||
|
"Anthropic",
|
||||||
|
"LiteLLM",
|
||||||
|
"OpenAI",
|
||||||
|
"VertexAI",
|
||||||
|
"global_config",
|
||||||
|
"__version__",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Global Configurations
|
|
||||||
from sglang.global_config import global_config
|
|
||||||
|
|
||||||
__all__ += ["global_config"]
|
|
||||||
|
|
||||||
from sglang.version import __version__
|
|
||||||
|
|
||||||
__all__ += ["__version__"]
|
|
||||||
|
|
||||||
# SGLang Backends
|
|
||||||
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
|
||||||
from sglang.utils import LazyImport
|
|
||||||
|
|
||||||
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
|
|
||||||
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
|
|
||||||
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
|
|
||||||
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
|
|
||||||
|
|
||||||
__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
|
|
||||||
|
|||||||
@@ -19,9 +19,6 @@ from sglang.lang.ir import (
|
|||||||
REGEX_STR,
|
REGEX_STR,
|
||||||
SglSamplingParams,
|
SglSamplingParams,
|
||||||
)
|
)
|
||||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
|
||||||
from sglang.srt.server_args import ServerArgs
|
|
||||||
from sglang.srt.utils import is_port_available, kill_process_tree
|
|
||||||
from sglang.utils import http_request
|
from sglang.utils import http_request
|
||||||
|
|
||||||
|
|
||||||
@@ -342,7 +339,7 @@ class Runtime:
|
|||||||
using the command line interface.
|
using the command line interface.
|
||||||
|
|
||||||
It is mainly used for the frontend language.
|
It is mainly used for the frontend language.
|
||||||
You should use the Engine class if you want to do normal offline processing.
|
You should use the Engine class if you want to do normal offline processing without the frontend language.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -352,13 +349,14 @@ class Runtime:
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""See the arguments in server_args.py::ServerArgs"""
|
"""See the arguments in server_args.py::ServerArgs"""
|
||||||
|
# We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
|
||||||
|
# client code without installing SRT server and its dependency if they want.
|
||||||
from sglang.srt.server import launch_server
|
from sglang.srt.server import launch_server
|
||||||
|
from sglang.srt.server_args import ServerArgs
|
||||||
|
from sglang.srt.utils import is_port_available
|
||||||
|
|
||||||
self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
|
self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
|
||||||
|
|
||||||
# before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
|
|
||||||
atexit.register(self.shutdown)
|
|
||||||
|
|
||||||
# Pre-allocate ports
|
# Pre-allocate ports
|
||||||
for port in range(self.server_args.port, 40000):
|
for port in range(self.server_args.port, 40000):
|
||||||
if is_port_available(port):
|
if is_port_available(port):
|
||||||
@@ -380,6 +378,10 @@ class Runtime:
|
|||||||
pipe_writer.close()
|
pipe_writer.close()
|
||||||
self.pid = proc.pid
|
self.pid = proc.pid
|
||||||
|
|
||||||
|
# Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
|
||||||
|
atexit.register(self.shutdown)
|
||||||
|
|
||||||
|
# TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
|
||||||
try:
|
try:
|
||||||
init_state = pipe_reader.recv()
|
init_state = pipe_reader.recv()
|
||||||
except EOFError:
|
except EOFError:
|
||||||
@@ -394,6 +396,8 @@ class Runtime:
|
|||||||
self.endpoint = RuntimeEndpoint(self.url)
|
self.endpoint = RuntimeEndpoint(self.url)
|
||||||
|
|
||||||
def shutdown(self):
|
def shutdown(self):
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
|
||||||
if self.pid is not None:
|
if self.pid is not None:
|
||||||
kill_process_tree(self.pid)
|
kill_process_tree(self.pid)
|
||||||
self.pid = None
|
self.pid = None
|
||||||
@@ -402,6 +406,8 @@ class Runtime:
|
|||||||
self.endpoint.cache_prefix(prefix)
|
self.endpoint.cache_prefix(prefix)
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self):
|
||||||
|
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||||
|
|
||||||
return get_tokenizer(
|
return get_tokenizer(
|
||||||
self.server_args.tokenizer_path,
|
self.server_args.tokenizer_path,
|
||||||
tokenizer_mode=self.server_args.tokenizer_mode,
|
tokenizer_mode=self.server_args.tokenizer_mode,
|
||||||
|
|||||||
@@ -785,8 +785,9 @@ class Scheduler:
|
|||||||
f"gen throughput (token/s): {gen_throughput:.2f}, "
|
f"gen throughput (token/s): {gen_throughput:.2f}, "
|
||||||
f"#queue-req: {len(self.waiting_queue)}"
|
f"#queue-req: {len(self.waiting_queue)}"
|
||||||
)
|
)
|
||||||
|
spec_accept_length = 0
|
||||||
else:
|
else:
|
||||||
accept_length = (
|
spec_accept_length = (
|
||||||
self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
|
self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
|
||||||
)
|
)
|
||||||
self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
|
self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
|
||||||
@@ -795,7 +796,7 @@ class Scheduler:
|
|||||||
f"#running-req: {num_running_reqs}, "
|
f"#running-req: {num_running_reqs}, "
|
||||||
f"#token: {num_used}, "
|
f"#token: {num_used}, "
|
||||||
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
|
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
|
||||||
f"accept len: {accept_length:.2f}, "
|
f"accept len: {spec_accept_length:.2f}, "
|
||||||
f"gen throughput (token/s): {gen_throughput:.2f}, "
|
f"gen throughput (token/s): {gen_throughput:.2f}, "
|
||||||
f"#queue-req: {len(self.waiting_queue)}"
|
f"#queue-req: {len(self.waiting_queue)}"
|
||||||
)
|
)
|
||||||
@@ -807,6 +808,7 @@ class Scheduler:
|
|||||||
self.stats.token_usage = num_used / self.max_total_num_tokens
|
self.stats.token_usage = num_used / self.max_total_num_tokens
|
||||||
self.stats.gen_throughput = gen_throughput
|
self.stats.gen_throughput = gen_throughput
|
||||||
self.stats.num_queue_reqs = len(self.waiting_queue)
|
self.stats.num_queue_reqs = len(self.waiting_queue)
|
||||||
|
self.stats.spec_accept_length = spec_accept_length
|
||||||
self.metrics_collector.log_stats(self.stats)
|
self.metrics_collector.log_stats(self.stats)
|
||||||
|
|
||||||
def check_memory(self):
|
def check_memory(self):
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ class SchedulerStats:
|
|||||||
gen_throughput: float = 0.0
|
gen_throughput: float = 0.0
|
||||||
num_queue_reqs: int = 0
|
num_queue_reqs: int = 0
|
||||||
cache_hit_rate: float = 0.0
|
cache_hit_rate: float = 0.0
|
||||||
|
spec_accept_length: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
class SchedulerMetricsCollector:
|
class SchedulerMetricsCollector:
|
||||||
@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
|
|||||||
|
|
||||||
self.num_running_reqs = Gauge(
|
self.num_running_reqs = Gauge(
|
||||||
name="sglang:num_running_reqs",
|
name="sglang:num_running_reqs",
|
||||||
documentation="The number of running requests",
|
documentation="The number of running requests.",
|
||||||
labelnames=labels.keys(),
|
labelnames=labels.keys(),
|
||||||
multiprocess_mode="sum",
|
multiprocess_mode="sum",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.num_used_tokens = Gauge(
|
self.num_used_tokens = Gauge(
|
||||||
name="sglang:num_used_tokens",
|
name="sglang:num_used_tokens",
|
||||||
documentation="The number of used tokens",
|
documentation="The number of used tokens.",
|
||||||
labelnames=labels.keys(),
|
labelnames=labels.keys(),
|
||||||
multiprocess_mode="sum",
|
multiprocess_mode="sum",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.token_usage = Gauge(
|
self.token_usage = Gauge(
|
||||||
name="sglang:token_usage",
|
name="sglang:token_usage",
|
||||||
documentation="The token usage",
|
documentation="The token usage.",
|
||||||
labelnames=labels.keys(),
|
labelnames=labels.keys(),
|
||||||
multiprocess_mode="mostrecent",
|
multiprocess_mode="mostrecent",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.gen_throughput = Gauge(
|
self.gen_throughput = Gauge(
|
||||||
name="sglang:gen_throughput",
|
name="sglang:gen_throughput",
|
||||||
documentation="The generate throughput (token/s)",
|
documentation="The generation throughput (token/s).",
|
||||||
labelnames=labels.keys(),
|
labelnames=labels.keys(),
|
||||||
multiprocess_mode="sum",
|
multiprocess_mode="sum",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.num_queue_reqs = Gauge(
|
self.num_queue_reqs = Gauge(
|
||||||
name="sglang:num_queue_reqs",
|
name="sglang:num_queue_reqs",
|
||||||
documentation="The number of requests in the waiting queue",
|
documentation="The number of requests in the waiting queue.",
|
||||||
labelnames=labels.keys(),
|
labelnames=labels.keys(),
|
||||||
multiprocess_mode="sum",
|
multiprocess_mode="sum",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.cache_hit_rate = Gauge(
|
self.cache_hit_rate = Gauge(
|
||||||
name="sglang:cache_hit_rate",
|
name="sglang:cache_hit_rate",
|
||||||
documentation="The cache hit rate",
|
documentation="The prefix cache hit rate.",
|
||||||
|
labelnames=labels.keys(),
|
||||||
|
multiprocess_mode="mostrecent",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.spec_accept_length = Gauge(
|
||||||
|
name="sglang:spec_accept_length",
|
||||||
|
documentation="The average acceptance length of speculative decoding.",
|
||||||
labelnames=labels.keys(),
|
labelnames=labels.keys(),
|
||||||
multiprocess_mode="mostrecent",
|
multiprocess_mode="mostrecent",
|
||||||
)
|
)
|
||||||
@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
|
|||||||
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
||||||
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
||||||
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
||||||
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
||||||
|
|
||||||
|
|
||||||
class TokenizerMetricsCollector:
|
class TokenizerMetricsCollector:
|
||||||
|
|||||||
@@ -1,11 +1,7 @@
|
|||||||
# a lightweight wrapper on router with argument type and comments
|
# a lightweight wrapper on router with argument type and comments
|
||||||
|
# no wrapper on policy type => direct export
|
||||||
|
from sglang_router.router import Router
|
||||||
|
from sglang_router.version import __version__
|
||||||
from sglang_router_rs import PolicyType
|
from sglang_router_rs import PolicyType
|
||||||
|
|
||||||
# no wrapper on policy type => direct export
|
__all__ = ["Router", "PolicyType", "__version__"]
|
||||||
from .router import Router
|
|
||||||
|
|
||||||
__all__ = ["Router", "PolicyType"]
|
|
||||||
|
|
||||||
from sglang_router.version import __version__
|
|
||||||
|
|
||||||
__all__ += ["__version__"]
|
|
||||||
|
|||||||
@@ -42,8 +42,7 @@ suites = {
|
|||||||
"test_srt_endpoint.py",
|
"test_srt_endpoint.py",
|
||||||
"test_torch_compile.py",
|
"test_torch_compile.py",
|
||||||
"test_torch_compile_moe.py",
|
"test_torch_compile_moe.py",
|
||||||
# Temporarily disable this because it requires PyTorch >= 2.5
|
"test_torch_native_attention_backend.py",
|
||||||
# "test_torch_native_attention_backend.py",
|
|
||||||
"test_torchao.py",
|
"test_torchao.py",
|
||||||
"test_triton_attention_kernels.py",
|
"test_triton_attention_kernels.py",
|
||||||
"test_triton_attention_backend.py",
|
"test_triton_attention_backend.py",
|
||||||
|
|||||||
Reference in New Issue
Block a user