Misc fix for min_p_sampling, --cuda-graph-bs (#2761)

This commit is contained in:
Lianmin Zheng
2025-01-07 02:52:53 -08:00
committed by GitHub
parent 6d08ce2aa9
commit bdc1acf6cd
17 changed files with 135 additions and 63 deletions

View File

@@ -114,26 +114,20 @@ class TokenizerMetricsCollector:
documentation="Histogram of time to first token in seconds.",
labelnames=labels.keys(),
buckets=[
0.001,
0.005,
0.01,
0.02,
0.04,
0.06,
0.08,
0.1,
0.25,
0.5,
0.75,
1.0,
2.5,
5.0,
7.5,
10.0,
15.0,
20.0,
25.0,
30.0,
1,
2,
5,
10,
20,
40,
60,
80,
120,
160,
],
)
@@ -168,21 +162,19 @@ class TokenizerMetricsCollector:
documentation="Histogram of End-to-end request latency in seconds",
labelnames=labels.keys(),
buckets=[
0.3,
0.1,
0.25,
0.5,
0.8,
1.0,
1.5,
2.0,
2.5,
5.0,
10.0,
15.0,
20.0,
30.0,
40.0,
50.0,
60.0,
1,
2,
5,
10,
20,
40,
60,
80,
120,
160,
],
)