From b32ab0705edb8401d93b47e28dc5fc4d37b2e39b Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Thu, 4 Sep 2025 22:22:08 +0800 Subject: [PATCH] metrics: support customer buckets for prompt/generation_tokens_histogram (#9634) --- docs/advanced_features/server_arguments.md | 32 ++-- .../sglang/srt/managers/tokenizer_manager.py | 1 + python/sglang/srt/metrics/collector.py | 16 +- python/sglang/srt/metrics/utils.py | 48 ++++++ python/sglang/srt/server_args.py | 76 ++++++++++ test/srt/run_suite.py | 2 + test/srt/test_metrics_utils.py | 137 ++++++++++++++++++ 7 files changed, 293 insertions(+), 19 deletions(-) create mode 100644 python/sglang/srt/metrics/utils.py create mode 100644 test/srt/test_metrics_utils.py diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index e8caddd77..04e3b962d 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -121,21 +121,23 @@ Please consult the documentation below and [server_args.py](https://github.com/s ## Logging -| Arguments | Description | Defaults | -|-----------|-------------|----------| -| `--log-level` | The logging level of all loggers. | info | -| `--log-level-http` | The logging level of HTTP server. If not set, reuse --log-level by default. | None | -| `--log-requests` | Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level. | False | -| `--log-requests-level` | 0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output. | 0 | -| `--show-time-cost` | Show time cost of custom marks. | False | -| `--enable-metrics` | Enable log prometheus metrics. | False | -| `--bucket-time-to-first-token` | The buckets of time to first token, specified as a list of floats. | None | -| `--bucket-inter-token-latency` | The buckets of inter-token latency, specified as a list of floats. | None | -| `--bucket-e2e-request-latency` | The buckets of end-to-end request latency, specified as a list of floats. | None | -| `--collect-tokens-histogram` | Collect prompt/generation tokens histogram. | False | -| `--kv-events-config` | Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used. | None | -| `--decode-log-interval` | The log interval of decode batch. | 40 | -| `--enable-request-time-stats-logging` | Enable per request time stats logging. | False | +| Arguments | Description | Defaults | +|---------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| +| `--log-level` | The logging level of all loggers. | info | +| `--log-level-http` | The logging level of HTTP server. If not set, reuse --log-level by default. | None | +| `--log-requests` | Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level. | False | +| `--log-requests-level` | 0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output. | 0 | +| `--show-time-cost` | Show time cost of custom marks. | False | +| `--enable-metrics` | Enable log prometheus metrics. | False | +| `--bucket-time-to-first-token` | The buckets of time to first token, specified as a list of floats. | None | +| `--bucket-inter-token-latency` | The buckets of inter-token latency, specified as a list of floats. | None | +| `--bucket-e2e-request-latency` | The buckets of end-to-end request latency, specified as a list of floats. | None | +| `--collect-tokens-histogram` | Collect prompt/generation tokens histogram. | False | +| `--kv-events-config` | Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used. | None | +| `--decode-log-interval` | The log interval of decode batch. | 40 | +| `--enable-request-time-stats-logging` | Enable per request time stats logging. | False | +| `--prompt-tokens-buckets` | The buckets rule of prompt tokens. Supports 3 rule types: 'default' uses predefined buckets; 'tse ' generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer ...' uses custom bucket values (e.g., 'customer 10 50 100 500'). | None | +| `--generation-tokens-buckets` | The buckets rule of prompt tokens. Supports 3 rule types: 'default' uses predefined buckets; 'tse ' generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer ...' uses custom bucket values (e.g., 'customer 10 50 100 500'). | None | ## API related diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 53c6a8036..129bf4a3a 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -329,6 +329,7 @@ class TokenizerManager: # Metrics if self.enable_metrics: self.metrics_collector = TokenizerMetricsCollector( + server_args=server_args, labels={ "model_name": self.server_args.served_model_name, # TODO: Add lora name/path in the future, diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index cfb90aa0a..f1bb74689 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -18,6 +18,8 @@ from dataclasses import dataclass from enum import Enum from typing import Dict, List, Optional, Union +from sglang.srt.metrics.utils import generate_buckets +from sglang.srt.server_args import ServerArgs from sglang.srt.utils import get_bool_env_var SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS") @@ -309,6 +311,7 @@ class SchedulerMetricsCollector: class TokenizerMetricsCollector: def __init__( self, + server_args: ServerArgs, labels: Dict[str, str], bucket_time_to_first_token: Optional[List[float]] = None, bucket_inter_token_latency: Optional[List[float]] = None, @@ -334,7 +337,7 @@ class TokenizerMetricsCollector: ) if collect_tokens_histogram: - bucket_prompt_tokens = [ + default_bucket_prompt_tokens = [ 100, 300, 500, @@ -363,9 +366,11 @@ class TokenizerMetricsCollector: name="sglang:prompt_tokens_histogram", documentation="Histogram of prompt token length.", labelnames=labels.keys(), - buckets=bucket_prompt_tokens, + buckets=generate_buckets( + server_args.prompt_tokens_buckets, default_bucket_prompt_tokens + ), ) - bucket_generation_tokens = [ + default_bucket_generation_tokens = [ 100, 300, 500, @@ -390,7 +395,10 @@ class TokenizerMetricsCollector: name="sglang:generation_tokens_histogram", documentation="Histogram of generation token length.", labelnames=labels.keys(), - buckets=bucket_generation_tokens, + buckets=generate_buckets( + server_args.generation_tokens_buckets, + default_bucket_generation_tokens, + ), ) self.cached_tokens_total = Counter( diff --git a/python/sglang/srt/metrics/utils.py b/python/sglang/srt/metrics/utils.py new file mode 100644 index 000000000..ffc7e1066 --- /dev/null +++ b/python/sglang/srt/metrics/utils.py @@ -0,0 +1,48 @@ +# Copyright 2023-2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for Prometheus Metrics.""" +import math +from typing import List + + +def two_sides_exponential_buckets( + middle: float, base: float, count: int +) -> List[float]: + buckets = [] + half_count = math.ceil(count / 2) + distance = 1 + buckets.append(middle) + for i in range(half_count): + distance *= base + buckets.append(middle + distance) + buckets.append(max(0, middle - distance)) + return sorted(set(buckets)) + + +def generate_buckets( + buckets_rule: List[str], default_buckets: List[float] +) -> List[float]: + if not buckets_rule: + buckets_rule = ["default"] + + assert len(buckets_rule) > 0 + rule = buckets_rule[0] + if rule == "tse": + middle, base, count = buckets_rule[1:] + assert float(base) > 1.0, "Base must be greater than 1.0" + return two_sides_exponential_buckets(float(middle), float(base), int(count)) + if rule == "default": + return sorted(set(default_buckets)) + assert rule == "customer" + return sorted(set([float(x) for x in buckets_rule[1:]])) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 86b0f1c18..9466f02ce 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -195,6 +195,8 @@ class ServerArgs: bucket_inter_token_latency: Optional[List[float]] = None bucket_e2e_request_latency: Optional[List[float]] = None collect_tokens_histogram: bool = False + prompt_tokens_buckets: Optional[List[str]] = None + generation_tokens_buckets: Optional[List[str]] = None decode_log_interval: int = 40 enable_request_time_stats_logging: bool = False kv_events_config: Optional[str] = None @@ -1234,6 +1236,26 @@ class ServerArgs: default=ServerArgs.collect_tokens_histogram, help="Collect prompt/generation tokens histogram.", ) + bucket_rule = ( + "Supports 3 rule types: 'default' uses predefined buckets; 'tse ' " + "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets " + "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer " + " ...' uses custom bucket values (e.g., 'customer 10 50 100 500')." + ) + parser.add_argument( + "--prompt-tokens-buckets", + type=str, + nargs="+", + default=ServerArgs.prompt_tokens_buckets, + help=f"The buckets rule of prompt tokens. {bucket_rule}", + ) + parser.add_argument( + "--generation-tokens-buckets", + type=str, + nargs="+", + default=ServerArgs.generation_tokens_buckets, + help=f"The buckets rule for generation tokens histogram. {bucket_rule}", + ) parser.add_argument( "--gc-warning-threshold-secs", type=float, @@ -2185,6 +2207,12 @@ class ServerArgs: # Check multi tokenizer assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1" + self.validate_buckets_rule( + "--prompt-tokens-buckets", self.prompt_tokens_buckets + ) + self.validate_buckets_rule( + "--generation-tokens-buckets", self.generation_tokens_buckets + ) def check_lora_server_args(self): assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive" @@ -2277,6 +2305,54 @@ class ServerArgs: f"decode_tp={decode_tp}, prefill_tp={prefill_tp}" ) + def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]): + if not buckets_rule: + return + + assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list" + rule = buckets_rule[0] + assert rule in [ + "tse", + "default", + "customer", + ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'" + + if rule == "tse": + assert ( + len(buckets_rule) == 4 + ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}" + try: + middle = float(buckets_rule[1]) + base = float(buckets_rule[2]) + count = int(buckets_rule[3]) + except (ValueError, IndexError): + assert ( + False + ), f"{arg_name} TSE rule parameters must be: ['tse', , , ]" + assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}" + assert count > 0, f"{arg_name} TSE count must be positive, got: {count}" + assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}" + + elif rule == "default": + assert ( + len(buckets_rule) == 1 + ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}" + + elif rule == "customer": + assert ( + len(buckets_rule) >= 2 + ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]" + try: + bucket_values = [float(x) for x in buckets_rule[1:]] + except ValueError: + assert False, f"{arg_name} customer rule bucket values must be numeric" + assert len(set(bucket_values)) == len( + bucket_values + ), f"{arg_name} customer rule bucket values should not contain duplicates" + assert all( + val >= 0 for val in bucket_values + ), f"{arg_name} customer rule bucket values should be non-negative" + def model_specific_adjustments(self): hf_config = self.get_hf_config() model_arch = hf_config.architectures[0] diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 047410fe2..f417af1bc 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -80,6 +80,7 @@ suites = { TestFile("test_io_struct.py", 8), TestFile("test_jinja_template_utils.py", 1), TestFile("test_metrics.py", 32), + TestFile("test_metrics_utils.py", 1), TestFile("test_mla.py", 167), TestFile("test_mla_deepseek_v3.py", 700), TestFile("test_mla_int8_deepseek_v3.py", 429), @@ -214,6 +215,7 @@ suite_amd = { TestFile("test_io_struct.py", 8), TestFile("test_jinja_template_utils.py", 1), TestFile("test_metrics.py", 32), + TestFile("test_metrics_utils.py", 1), TestFile("test_mla.py", 242), TestFile("test_mla_deepseek_v3.py", 221), TestFile("test_no_chunked_prefill.py", 108), diff --git a/test/srt/test_metrics_utils.py b/test/srt/test_metrics_utils.py new file mode 100644 index 000000000..4d33ad950 --- /dev/null +++ b/test/srt/test_metrics_utils.py @@ -0,0 +1,137 @@ +import unittest + +from sglang.srt.metrics.utils import generate_buckets, two_sides_exponential_buckets + + +class TestMetricsUtils(unittest.TestCase): + """Test cases for metrics utility functions.""" + + def test_two_sides_exponential_buckets_basic(self): + """Test basic functionality of two_sides_exponential_buckets.""" + # Test with simple parameters + count = 5 + buckets = two_sides_exponential_buckets(middle=10.0, base=2.0, count=count) + + # Should contain the middle value + self.assertIn(10.0, buckets) + + # Should be sorted + self.assertEqual(buckets, sorted(buckets)) + + # Should have unique values (no duplicates) + self.assertEqual(len(buckets), len(set(buckets))) + + # Should have reasonable number of buckets (not exactly count due to ceiling and deduplication) + self.assertGreaterEqual(len(buckets), 3) + self.assertLessEqual(len(buckets), count + 2) + + def test_two_sides_exponential_buckets_specific_values(self): + """Test specific values for two_sides_exponential_buckets.""" + buckets = two_sides_exponential_buckets(middle=100.0, base=2.0, count=4) + expected_values = [96.0, 98.0, 100.0, 102.0, 104.0] + self.assertEqual(buckets, expected_values) + + def test_two_sides_exponential_buckets_negative_values(self): + """Test two_sides_exponential_buckets with values that could go negative.""" + buckets = two_sides_exponential_buckets(middle=5.0, base=3.0, count=4) + + # Should not contain negative values (max(0, middle - distance)) + for bucket in buckets: + self.assertGreaterEqual(bucket, 0.0) + + # Should contain the middle value + self.assertIn(5.0, buckets) + + def test_two_sides_exponential_buckets_edge_cases(self): + """Test edge cases for two_sides_exponential_buckets.""" + # Count = 1 + buckets = two_sides_exponential_buckets(middle=10.0, base=2.0, count=1) + self.assertIn(10.0, buckets) + + # Very small middle value + buckets = two_sides_exponential_buckets(middle=0.1, base=2.0, count=2) + self.assertIn(0.1, buckets) + for bucket in buckets: + self.assertGreaterEqual(bucket, 0.0) + + def test_generate_buckets_default(self): + """Test generate_buckets with default rule.""" + default_buckets = [1.0, 5.0, 10.0, 50.0, 100.0] + + # Test with "default" rule + result = generate_buckets(["default"], default_buckets) + self.assertEqual(result, default_buckets) + + # Test with None (should default to "default") + result = generate_buckets(None, default_buckets) + self.assertEqual(result, default_buckets) + + # Test with empty (should default to "default") + result = generate_buckets(None, default_buckets) + self.assertEqual(result, default_buckets) + + def test_generate_buckets_tse(self): + """Test generate_buckets with tse (two sides exponential) rule.""" + default_buckets = [1.0, 5.0, 10.0] + + # Test with "tse" rule + result = generate_buckets(["tse", "10", "2.0", "4"], default_buckets) + + # Should return the same as calling two_sides_exponential_buckets directly + expected = two_sides_exponential_buckets(10.0, 2.0, 4) + self.assertEqual(result, expected) + + def test_generate_buckets_customer(self): + """Test generate_buckets with customer rule.""" + default_buckets = [1.0, 5.0, 10.0] + + # Test with "customer" rule + result = generate_buckets( + ["customer", "1.5", "3.2", "7.8", "15.6"], default_buckets + ) + expected = [1.5, 3.2, 7.8, 15.6] + self.assertEqual(result, expected) + + def test_generate_buckets_customer_with_integers(self): + """Test generate_buckets with customer rule using integer strings.""" + default_buckets = [1.0, 5.0, 10.0] + + # Test with integer strings + result = generate_buckets(["customer", "1", "5", "10", "50"], default_buckets) + expected = [1.0, 5.0, 10.0, 50.0] + self.assertEqual(result, expected) + + def test_generate_buckets_preserves_order_and_type(self): + """Test that generate_buckets preserves order and returns floats.""" + default_buckets = [1, 5, 10, 50, 100] # integers + + # Test default rule + result = generate_buckets(["default"], default_buckets) + self.assertEqual(result, default_buckets) + self.assertIsInstance(result, list) + + # Test customer rule with proper float conversion + result = generate_buckets( + ["customer", "100", "50", "10", "5", "1"], default_buckets + ) + expected = [1.0, 5.0, 10.0, 50.0, 100.0] + self.assertEqual(result, expected) + + # All values should be floats + for value in result: + self.assertIsInstance(value, float) + + def test_integration_tse_through_generate_buckets(self): + """Test integration of TSE buckets through generate_buckets function.""" + default_buckets = [1.0, 10.0, 100.0] + + # Generate buckets using both methods + direct_result = two_sides_exponential_buckets(50.0, 1.5, 6) + indirect_result = generate_buckets(["tse", "50.0", "1.5", "6"], default_buckets) + + # Results should be identical + self.assertEqual(direct_result, indirect_result) + + +if __name__ == "__main__": + unittest.main()