Sync from v0.13
tests/v1/metrics/test_engine_logger_apis.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy

import pytest

from tests.plugins.vllm_add_dummy_stat_logger.dummy_stat_logger.dummy_stat_logger import (  # noqa E501
    DummyStatLogger,
)
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger


@pytest.fixture
def log_stats_enabled_engine_args():
    """
    Shared fixture providing common AsyncEngineArgs configuration
    used across multiple tests.
    """
    return AsyncEngineArgs(
        model="distilbert/distilgpt2",
        dtype="half",
        disable_log_stats=False,
        enforce_eager=True,
    )


@pytest.mark.asyncio
async def test_async_llm_replace_default_loggers(log_stats_enabled_engine_args):
    """
    RayPrometheusStatLogger should replace the default PrometheusStatLogger.
    """

    engine = AsyncLLM.from_engine_args(
        log_stats_enabled_engine_args, stat_loggers=[RayPrometheusStatLogger]
    )
    assert isinstance(engine.logger_manager.stat_loggers[0], RayPrometheusStatLogger)
    engine.shutdown()


@pytest.mark.asyncio
async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
    """
    It's still possible to use custom stat loggers exclusively by passing
    disable_log_stats=True in addition to a list of custom stat loggers.
    """
    # Create engine_args with disable_log_stats=True for this test
    disabled_log_engine_args = copy.deepcopy(log_stats_enabled_engine_args)
    disabled_log_engine_args.disable_log_stats = True

    # Disable default loggers; pass custom stat logger to the constructor
    engine = AsyncLLM.from_engine_args(
        disabled_log_engine_args, stat_loggers=[DummyStatLogger]
    )

    assert len(engine.logger_manager.stat_loggers) == 2
    assert len(engine.logger_manager.stat_loggers[0].per_engine_stat_loggers) == 1
    assert isinstance(
        engine.logger_manager.stat_loggers[0].per_engine_stat_loggers[0],
        DummyStatLogger,
    )

    # log_stats is still True, since custom stat loggers are used
    assert engine.log_stats

    engine.shutdown()
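Note on the DummyStatLogger plugin exercised above: a custom stat logger is a class implementing vLLM's v1 StatLoggerBase interface, passed via stat_loggers=[...] exactly as in these tests. The outline below is a hedged sketch only; the constructor and hook signatures (record, log_engine_initialized) are assumptions to check against vllm/v1/metrics/loggers.py for your vLLM version, not the plugin's actual code.

# Hypothetical sketch of a minimal custom stat logger; hook names and
# signatures are assumptions modeled on vLLM's v1 StatLoggerBase.
from vllm.v1.metrics.loggers import StatLoggerBase


class SketchStatLogger(StatLoggerBase):
    def __init__(self, vllm_config, engine_index: int = 0):
        self.engine_index = engine_index
        self.num_iterations = 0

    def record(self, scheduler_stats, iteration_stats, engine_idx: int = 0):
        # Invoked each engine step with the latest stats snapshot.
        if iteration_stats is not None:
            self.num_iterations += 1

    def log_engine_initialized(self):
        # Invoked once after engine startup.
        pass

A class like this would then be passed as AsyncLLM.from_engine_args(engine_args, stat_loggers=[SketchStatLogger]), matching the usage in the tests above.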
tests/v1/metrics/test_metrics_reader.py (new file, 127 lines)
@@ -0,0 +1,127 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import prometheus_client
import pytest

from vllm.v1.metrics.reader import (
    Counter,
    Gauge,
    Histogram,
    Vector,
    get_metrics_snapshot,
)

pytestmark = pytest.mark.cpu_test


@pytest.fixture(autouse=True)
def test_registry(monkeypatch):
    # Use a custom registry for tests
    test_registry = prometheus_client.CollectorRegistry(auto_describe=True)
    monkeypatch.setattr("vllm.v1.metrics.reader.REGISTRY", test_registry)
    return test_registry


@pytest.mark.parametrize("num_engines", [1, 4])
def test_gauge_metric(test_registry, num_engines):
    g = prometheus_client.Gauge(
        "vllm:test_gauge",
        "Test gauge metric",
        labelnames=["model", "engine_index"],
        registry=test_registry,
    )
    for i in range(num_engines):
        g.labels(model="foo", engine_index=str(i)).set(98.5)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Gauge)
        assert m.name == "vllm:test_gauge"
        assert m.value == 98.5
        assert m.labels["model"] == "foo"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_counter_metric(test_registry, num_engines):
    c = prometheus_client.Counter(
        "vllm:test_counter",
        "Test counter metric",
        labelnames=["model", "engine_index"],
        registry=test_registry,
    )
    for i in range(num_engines):
        c.labels(model="bar", engine_index=str(i)).inc(19)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Counter)
        assert m.name == "vllm:test_counter"
        assert m.value == 19
        assert m.labels["model"] == "bar"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_histogram_metric(test_registry, num_engines):
    h = prometheus_client.Histogram(
        "vllm:test_histogram",
        "Test histogram metric",
        labelnames=["model", "engine_index"],
        buckets=[10, 20, 30, 40, 50],
        registry=test_registry,
    )
    for i in range(num_engines):
        hist = h.labels(model="blaa", engine_index=str(i))
        hist.observe(42)
        hist.observe(21)
        hist.observe(7)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Histogram)
        assert m.name == "vllm:test_histogram"
        assert m.count == 3
        assert m.sum == 70
        assert m.buckets["10.0"] == 1
        assert m.buckets["20.0"] == 1
        assert m.buckets["30.0"] == 2
        assert m.buckets["40.0"] == 2
        assert m.buckets["50.0"] == 3
        assert m.labels["model"] == "blaa"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_vector_metric(test_registry, num_engines):
    c = prometheus_client.Counter(
        "vllm:spec_decode_num_accepted_tokens_per_pos",
        "Vector-like counter metric",
        labelnames=["position", "model", "engine_index"],
        registry=test_registry,
    )
    for i in range(num_engines):
        c.labels(position="0", model="llama", engine_index=str(i)).inc(10)
        c.labels(position="1", model="llama", engine_index=str(i)).inc(5)
        c.labels(position="2", model="llama", engine_index=str(i)).inc(1)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Vector)
        assert m.name == "vllm:spec_decode_num_accepted_tokens_per_pos"
        assert m.values == [10, 5, 1]
        assert m.labels["model"] == "llama"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])
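As a usage note, the same snapshot API these tests exercise can be consumed programmatically. A minimal sketch, assuming only the Counter type and get_metrics_snapshot() imported above; the metric name is the test fixture's, not a real vLLM metric.

# Minimal sketch: aggregate one metric across engines from a snapshot.
from vllm.v1.metrics.reader import Counter, get_metrics_snapshot


def sum_counter(name: str) -> float:
    total = 0.0
    for metric in get_metrics_snapshot():
        if isinstance(metric, Counter) and metric.name == name:
            # One Counter per engine_index label; sum across engines.
            total += metric.value
    return total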
tests/v1/metrics/test_ray_metrics.py (new file, 96 lines)
@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import ray

from vllm.config.model import ModelDType
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger

MODELS = [
    "distilbert/distilgpt2",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
def test_engine_log_metrics_ray(
    example_prompts,
    model: str,
    dtype: ModelDType,
    max_tokens: int,
) -> None:
    """Simple smoke test, verifying this can be used without exceptions.
    Need to start a Ray cluster in order to verify outputs."""

    @ray.remote(num_gpus=1)
    class EngineTestActor:
        async def run(self):
            engine_args = AsyncEngineArgs(
                model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
            )

            engine = AsyncLLM.from_engine_args(
                engine_args, stat_loggers=[RayPrometheusStatLogger]
            )

            for i, prompt in enumerate(example_prompts):
                results = engine.generate(
                    request_id=f"request-id-{i}",
                    prompt=prompt,
                    sampling_params=SamplingParams(max_tokens=max_tokens),
                )

                async for _ in results:
                    pass

    # Create the actor and call the async method
    actor = EngineTestActor.remote()  # type: ignore[attr-defined]
    ray.get(actor.run.remote())


def test_sanitized_opentelemetry_name():
    """Test the metric name sanitization logic for Ray."""

    # Only a-z, A-Z, 0-9, and _ are valid; valid characters are preserved
    valid_name = "valid_metric_123_abcDEF"
    assert (
        RayPrometheusMetric._get_sanitized_opentelemetry_name(valid_name) == valid_name
    )

    # Dash and dot are replaced
    name_with_dash_dot = "metric-name.test"
    expected = "metric_name_test"
    assert (
        RayPrometheusMetric._get_sanitized_opentelemetry_name(name_with_dash_dot)
        == expected
    )

    # Colon is replaced with underscore
    name_with_colon = "metric:name"
    expected = "metric_name"
    assert (
        RayPrometheusMetric._get_sanitized_opentelemetry_name(name_with_colon)
        == expected
    )

    # Multiple invalid characters are replaced
    name_with_invalid = "metric:name@with#special%chars"
    expected = "metric_name_with_special_chars"
    assert (
        RayPrometheusMetric._get_sanitized_opentelemetry_name(name_with_invalid)
        == expected
    )

    # Mixed valid and invalid characters
    complex_name = "vllm:engine_stats/time.latency_ms-99p"
    expected = "vllm_engine_stats_time_latency_ms_99p"
    assert (
        RayPrometheusMetric._get_sanitized_opentelemetry_name(complex_name) == expected
    )

    # Empty string passes through unchanged
    assert RayPrometheusMetric._get_sanitized_opentelemetry_name("") == ""
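The assertions above pin the sanitization rule down completely: every character outside [a-zA-Z0-9_] becomes an underscore. A standalone sketch of an equivalent rule, for illustration rather than vLLM's actual implementation:

import re


# Sketch of the rule the assertions describe: any character outside
# [a-zA-Z0-9_] is replaced by an underscore. Not vLLM's actual code.
def sanitize_opentelemetry_name(name: str) -> str:
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)


assert sanitize_opentelemetry_name("vllm:engine_stats/time.latency_ms-99p") == (
    "vllm_engine_stats_time_latency_ms_99p"
)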
tests/v1/metrics/test_stats.py (new file, 109 lines)
@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.engine import FinishReason
from vllm.v1.metrics.stats import IterationStats, RequestStateStats


def test_iteration_stats_repr():
    iteration_stats = IterationStats()
    assert repr(iteration_stats).startswith("IterationStats(")


def test_prefill_kv_computed_with_cache():
    """Test that prefill KV compute correctly excludes cached tokens."""
    iteration_stats = IterationStats()
    req_stats = RequestStateStats(arrival_time=0.0)
    req_stats.scheduled_ts = 0.1
    req_stats.first_token_ts = 0.5
    req_stats.last_token_ts = 5.0
    req_stats.num_generation_tokens = 50

    # Case 1: With prefix cache (1200 tokens cached)
    iteration_stats.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=10000,
        max_tokens_param=100,
        req_stats=req_stats,
        num_cached_tokens=1200,
    )

    finished_req = iteration_stats.finished_requests[0]
    assert finished_req.num_prompt_tokens == 10000
    assert finished_req.num_cached_tokens == 1200

    # Verify calculation: prefill KV = prompt tokens - cached tokens
    prefill_kv_computed = finished_req.num_prompt_tokens - max(
        finished_req.num_cached_tokens, 0
    )
    assert prefill_kv_computed == 8800  # 10000 - 1200


def test_prefill_kv_computed_no_cache():
    """Test prefill KV compute without prefix caching."""
    iteration_stats = IterationStats()
    req_stats = RequestStateStats(arrival_time=0.0)
    req_stats.scheduled_ts = 0.1
    req_stats.first_token_ts = 0.5
    req_stats.last_token_ts = 2.0
    req_stats.num_generation_tokens = 10

    # Case 2: No prefix cache
    iteration_stats.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=2000,
        max_tokens_param=100,
        req_stats=req_stats,
        num_cached_tokens=0,
    )

    finished_req = iteration_stats.finished_requests[0]
    assert finished_req.num_prompt_tokens == 2000
    assert finished_req.num_cached_tokens == 0

    # Verify calculation: prefill KV = full prompt when no cache
    prefill_kv_computed = finished_req.num_prompt_tokens - max(
        finished_req.num_cached_tokens, 0
    )
    assert prefill_kv_computed == 2000


def test_prefill_kv_computed_edge_cases():
    """Test edge cases for prefill KV compute calculation."""
    iteration_stats = IterationStats()
    req_stats = RequestStateStats(arrival_time=0.0)
    req_stats.scheduled_ts = 0.1
    req_stats.first_token_ts = 0.5
    req_stats.last_token_ts = 1.0
    req_stats.num_generation_tokens = 1

    # Case 3: Negative num_cached_tokens (shouldn't happen, but handle gracefully)
    iteration_stats.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=100,
        max_tokens_param=10,
        req_stats=req_stats,
        num_cached_tokens=-1,
    )

    finished_req = iteration_stats.finished_requests[0]
    # max() should handle negative values
    prefill_kv_computed = finished_req.num_prompt_tokens - max(
        finished_req.num_cached_tokens, 0
    )
    assert prefill_kv_computed == 100  # Should treat negative as 0

    # Case 4: All tokens cached (shouldn't happen in practice)
    iteration_stats2 = IterationStats()
    iteration_stats2.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=100,
        max_tokens_param=10,
        req_stats=req_stats,
        num_cached_tokens=100,
    )

    finished_req2 = iteration_stats2.finished_requests[0]
    prefill_kv_computed2 = finished_req2.num_prompt_tokens - max(
        finished_req2.num_cached_tokens, 0
    )
    assert prefill_kv_computed2 == 0  # All cached, nothing computed
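All three tests verify the same clamped subtraction: prefill KV computed = num_prompt_tokens - max(num_cached_tokens, 0), so 10000 prompt tokens with 1200 cached leaves 8800 to compute. A hypothetical helper expressing the rule (mirroring the calculation in the tests, not part of vllm.v1.metrics.stats itself):

# Hypothetical helper mirroring the calculation the tests verify.
def prefill_kv_computed(num_prompt_tokens: int, num_cached_tokens: int) -> int:
    # Clamp negative cache counts to zero before subtracting.
    return num_prompt_tokens - max(num_cached_tokens, 0)


assert prefill_kv_computed(10000, 1200) == 8800  # partial prefix cache hit
assert prefill_kv_computed(100, -1) == 100       # negative count treated as 0
assert prefill_kv_computed(100, 100) == 0        # fully cached prompt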