Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
Lianmin Zheng
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions


@@ -13,6 +13,7 @@
 # ==============================================================================
 """Utilities for Prometheus Metrics Collection."""
 
+import time
 from dataclasses import dataclass
 from typing import Dict, Union
@@ -35,19 +36,20 @@ class SchedulerMetricsCollector:
         from prometheus_client import Gauge
 
         self.labels = labels
+        self.last_log_time = time.time()
 
         self.num_running_reqs = Gauge(
             name="sglang:num_running_reqs",
             documentation="The number of running requests.",
             labelnames=labels.keys(),
-            multiprocess_mode="sum",
+            multiprocess_mode="mostrecent",
         )
 
         self.num_used_tokens = Gauge(
             name="sglang:num_used_tokens",
             documentation="The number of used tokens.",
             labelnames=labels.keys(),
-            multiprocess_mode="sum",
+            multiprocess_mode="mostrecent",
         )
 
         self.token_usage = Gauge(
@@ -61,14 +63,14 @@ class SchedulerMetricsCollector:
             name="sglang:gen_throughput",
             documentation="The generation throughput (token/s).",
             labelnames=labels.keys(),
-            multiprocess_mode="sum",
+            multiprocess_mode="mostrecent",
         )
 
         self.num_queue_reqs = Gauge(
             name="sglang:num_queue_reqs",
             documentation="The number of requests in the waiting queue.",
             labelnames=labels.keys(),
-            multiprocess_mode="sum",
+            multiprocess_mode="mostrecent",
         )
 
         self.cache_hit_rate = Gauge(
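
For context: these gauges are collected with prometheus_client's multiprocess mode, where every worker process writes its own samples and the exporter merges them according to multiprocess_mode. "sum" adds the per-process values, which over-counts point-in-time scheduler state owned by a single process; "mostrecent" (available in prometheus_client >= 0.20) exposes only the last value written. A minimal standalone sketch, with a hypothetical label value:

    from prometheus_client import Gauge

    num_running_reqs = Gauge(
        name="sglang:num_running_reqs",
        documentation="The number of running requests.",
        labelnames=["model_name"],
        multiprocess_mode="mostrecent",  # only takes effect when PROMETHEUS_MULTIPROC_DIR is set
    )
    num_running_reqs.labels(model_name="demo").set(3)
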
@@ -97,6 +99,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
+        self.last_log_time = time.time()
 
 
 class TokenizerMetricsCollector:
@@ -130,12 +133,15 @@ class TokenizerMetricsCollector:
             labelnames=labels.keys(),
             buckets=[
                 0.1,
-                0.25,
+                0.3,
                 0.5,
-                0.75,
+                0.7,
+                0.9,
                 1,
                 2,
-                5,
+                4,
+                6,
+                8,
                 10,
                 20,
                 40,
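
For context: Prometheus's histogram_quantile() interpolates linearly inside the bucket that contains the target rank, so quantile error is bounded by bucket width; the finer bounds added here (0.3, 0.7, 0.9, 4, 6, 8) tighten that error around common time-to-first-token values. A standalone sketch of the interpolation (editorial example, not part of the commit):

    def estimate_quantile(q, bounds, cum_counts):
        # bounds: ascending bucket upper bounds; cum_counts: cumulative counts per bucket.
        rank = q * cum_counts[-1]
        prev_bound, prev_count = 0.0, 0
        for bound, count in zip(bounds, cum_counts):
            if count >= rank:
                # Linear interpolation within the bucket, like histogram_quantile().
                return prev_bound + (bound - prev_bound) * (rank - prev_count) / (count - prev_count)
            prev_bound, prev_count = bound, count
        return bounds[-1]

    # 100 requests, each with a true TTFT of 0.65 s.
    old = estimate_quantile(0.9, [0.1, 0.25, 0.5, 0.75, 1], [0, 0, 0, 100, 100])
    new = estimate_quantile(0.9, [0.1, 0.3, 0.5, 0.7, 0.9, 1], [0, 0, 0, 100, 100, 100])
    print(round(old, 3), round(new, 3))  # 0.725 0.68 -- narrower bucket, tighter estimate
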
@@ -151,24 +157,56 @@ class TokenizerMetricsCollector:
             documentation="Histogram of time per output token in seconds.",
             labelnames=labels.keys(),
             buckets=[
                 0.002,
                 0.005,
-                0.01,
-                0.02,
-                0.025,
-                0.03,
-                0.04,
-                0.05,
-                0.075,
-                0.1,
-                0.15,
-                0.2,
-                0.3,
-                0.4,
-                0.5,
-                0.75,
-                1.0,
-                2.5,
+                0.010,
+                0.020,
+                0.030,
+                0.040,
+                0.050,
+                0.060,
+                0.070,
+                0.080,
+                0.090,
+                0.100,
+                0.150,
+                0.200,
+                0.300,
+                0.400,
+                0.600,
+                0.800,
+                1.000,
+                2.000,
             ],
         )
 
+        self.histogram_inter_token_latency_seconds = Histogram(
+            name="sglang:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.002,
+                0.004,
+                0.006,
+                0.008,
+                0.010,
+                0.015,
+                0.020,
+                0.025,
+                0.030,
+                0.035,
+                0.040,
+                0.050,
+                0.075,
+                0.100,
+                0.150,
+                0.200,
+                0.300,
+                0.400,
+                0.500,
+                0.750,
+                1.000,
+                2.000,
+            ],
+        )
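
For context: the new inter-token-latency histogram is fed an interval that may cover several tokens at once (see observe_inter_token_latency at the end of this diff); the interval is first normalized to a per-token value, which selects a single bucket that is then incremented by the token count. A standalone sketch of that bucket selection:

    import bisect

    buckets = [0.002, 0.004, 0.006, 0.008, 0.010, 0.015, 0.020, 0.025, 0.030,
               0.035, 0.040, 0.050, 0.075, 0.100, 0.150, 0.200, 0.300, 0.400,
               0.500, 0.750, 1.000, 2.000]
    interval, num_new_tokens = 0.03, 2         # e.g. 2 tokens emitted in one step
    adjusted = interval / num_new_tokens       # 0.015 s per token
    i = bisect.bisect_left(buckets, adjusted)  # first upper bound >= adjusted
    print(buckets[i])                          # 0.015 -> that bucket is incremented by 2
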
@@ -178,8 +216,9 @@ class TokenizerMetricsCollector:
             labelnames=labels.keys(),
             buckets=[
                 0.1,
-                0.25,
-                0.5,
+                0.2,
+                0.4,
+                0.8,
                 1,
                 2,
                 5,
@@ -188,28 +227,161 @@ class TokenizerMetricsCollector:
                 40,
                 60,
                 80,
+                100,
+                150,
+                200,
+                250,
+                300,
+                350,
+                500,
+                1000,
             ],
         )
+
+        self.histogram_prefill_prealloc_duration = Histogram(
+            name="sglang:prefill_prealloc_duration_seconds",
+            documentation="Histogram of prefill prealloc duration in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.1,
+                0.3,
+                0.5,
+                0.7,
+                0.9,
+                1,
+                2,
+                4,
+                6,
+                8,
+                10,
+                20,
+                40,
+                60,
+                80,
+                120,
+                160,
+            ],
+        )
+
+        self.histogram_prefill_queue_duration = Histogram(
+            name="sglang:prefill_queue_duration_seconds",
+            documentation="Histogram of prefill queue duration in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.1,
+                0.3,
+                0.5,
+                0.7,
+                0.9,
+                2,
+                4,
+                8,
+                16,
+                64,
+            ],
+        )
+
+        self.histogram_prefill_forward_duration = Histogram(
+            name="sglang:prefill_forward_duration_seconds",
+            documentation="Histogram of prefill forward duration in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.1,
+                0.3,
+                0.5,
+                0.7,
+                0.9,
+                2,
+                4,
+                8,
+                16,
+                64,
+            ],
+        )
+
+        self.histogram_prefill_transfer_duration = Histogram(
+            name="sglang:prefill_transfer_duration_seconds",
+            documentation="Histogram of prefill transfer duration in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.050,
+                0.100,
+                0.150,
+                0.200,
+                0.300,
+                0.400,
+                0.500,
+                1.000,
+                2.000,
+            ],
+        )
+
+        self.histogram_decode_prealloc_duration = Histogram(
+            name="sglang:decode_prealloc_duration_seconds",
+            documentation="Histogram of decode prealloc duration in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.1,
+                0.3,
+                0.5,
+                0.7,
+                0.9,
+                2,
+                4,
+                8,
+                16,
+                64,
+            ],
+        )
+
+        self.histogram_decode_queue_duration = Histogram(
+            name="sglang:decode_queue_duration_seconds",
+            documentation="Histogram of decode queue duration in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.1,
+                0.3,
+                0.5,
+                0.7,
+                0.9,
+                2,
+                4,
+                8,
+                16,
+                64,
+            ],
+        )
 
     def _log_histogram(self, histogram, data: Union[int, float]) -> None:
         histogram.labels(**self.labels).observe(data)
 
-    def _log_counter(self, counter, data: Union[int, float]) -> None:
-        # Convenience function for logging to counter.
-        counter.labels(**self.labels).inc(data)
-
-    def observe_one_finished_request(self, prompt_tokens: int, generation_tokens: int):
+    def observe_one_finished_request(
+        self,
+        prompt_tokens: int,
+        generation_tokens: int,
+        e2e_latency: float,
+    ):
         self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
         self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
         self.num_requests_total.labels(**self.labels).inc(1)
+        self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
+        if generation_tokens >= 1:
+            self.histogram_time_per_output_token.labels(**self.labels).observe(
+                e2e_latency / generation_tokens
+            )
 
-    def observe_time_to_first_token(self, value: Union[float, int]):
-        self._log_histogram(self.histogram_time_to_first_token, value)
+    def observe_time_to_first_token(self, value: float):
+        self.histogram_time_to_first_token.labels(**self.labels).observe(value)
 
-    def observe_time_per_output_token(self, value: Union[float, int]):
-        self._log_histogram(self.histogram_time_per_output_token, value)
-
-    def observe_e2e_request_latency(self, value: Union[float, int]):
-        self._log_histogram(self.histogram_e2e_request_latency, value)
+    def observe_inter_token_latency(self, interval: float, num_new_tokens: int):
+        adjusted_interval = interval / num_new_tokens
+
+        # A faster version of Histogram.observe() that records multiple values at once.
+        # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
+        his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
+        his._sum.inc(interval)
+
+        for i, bound in enumerate(his._upper_bounds):
+            if adjusted_interval <= bound:
+                his._buckets[i].inc(num_new_tokens)
+                break
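
Editorial aside: the fast path above relies on prometheus_client internals (_sum, _upper_bounds, _buckets, present as of v0.21). A standalone sanity check, with hypothetical metric names, that a single batched update matches num_new_tokens separate observe() calls:

    from math import isclose
    from prometheus_client import CollectorRegistry, Histogram

    BUCKETS = (0.010, 0.015, 0.020)
    fast = Histogram("itl_fast", "fast path", buckets=BUCKETS, registry=CollectorRegistry())
    slow = Histogram("itl_slow", "slow path", buckets=BUCKETS, registry=CollectorRegistry())

    interval, num_new_tokens = 0.03, 2
    adjusted = interval / num_new_tokens  # 0.015 s per token

    # Fast path: one sum increment plus one bucket increment.
    fast._sum.inc(interval)
    for i, bound in enumerate(fast._upper_bounds):
        if adjusted <= bound:
            fast._buckets[i].inc(num_new_tokens)
            break

    # Slow path: one observe() per token.
    for _ in range(num_new_tokens):
        slow.observe(adjusted)

    assert [b.get() for b in fast._buckets] == [b.get() for b in slow._buckets]
    assert isclose(fast._sum.get(), slow._sum.get())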