Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import pytest
from tests.plugins.vllm_add_dummy_stat_logger.dummy_stat_logger.dummy_stat_logger import ( # noqa E501
DummyStatLogger,
)
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
@pytest.fixture
def log_stats_enabled_engine_args():
    """Provide the baseline ``AsyncEngineArgs`` shared by the tests here.

    Stat logging is left enabled (``disable_log_stats=False``); a test that
    needs a different setting should modify a copy rather than this object.
    """
    shared_args = AsyncEngineArgs(
        model="distilbert/distilgpt2",
        dtype="half",
        enforce_eager=True,
        disable_log_stats=False,
    )
    return shared_args
@pytest.mark.asyncio
async def test_async_llm_replace_default_loggers(log_stats_enabled_engine_args):
    """
    RayPrometheusStatLogger should replace the default PrometheusStatLogger
    """
    engine = AsyncLLM.from_engine_args(
        log_stats_enabled_engine_args, stat_loggers=[RayPrometheusStatLogger]
    )
    try:
        # The first registered stat logger must be the Ray-backed one.
        assert isinstance(
            engine.logger_manager.stat_loggers[0], RayPrometheusStatLogger
        )
    finally:
        # Shut the engine down even when the assertion fails, so a failing
        # test does not leave the engine running for the tests that follow.
        engine.shutdown()
@pytest.mark.asyncio
async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
    """
    It's still possible to use custom stat loggers exclusively by passing
    disable_log_stats=True in addition to a list of custom stat loggers.
    """
    # Create engine_args with disable_log_stats=True for this test; work on a
    # deep copy so the object returned by the shared fixture is not mutated.
    disabled_log_engine_args = copy.deepcopy(log_stats_enabled_engine_args)
    disabled_log_engine_args.disable_log_stats = True

    # Disable default loggers; pass custom stat logger to the constructor
    engine = AsyncLLM.from_engine_args(
        disabled_log_engine_args, stat_loggers=[DummyStatLogger]
    )
    try:
        stat_loggers = engine.logger_manager.stat_loggers
        assert len(stat_loggers) == 2

        # The first entry wraps the per-engine loggers; exactly one
        # DummyStatLogger is expected inside it.
        assert len(stat_loggers[0].per_engine_stat_loggers) == 1
        assert isinstance(
            stat_loggers[0].per_engine_stat_loggers[0],
            DummyStatLogger,
        )

        # log_stats is still True, since custom stat loggers are used
        assert engine.log_stats
    finally:
        # Shut the engine down even when an assertion fails, so a failing
        # test does not leave the engine running for the tests that follow.
        engine.shutdown()

View File

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import prometheus_client
import pytest
from vllm.v1.metrics.reader import (
Counter,
Gauge,
Histogram,
Vector,
get_metrics_snapshot,
)
# Module-level pytest mark: every test collected from this file is tagged
# with the ``cpu_test`` marker.
pytestmark = pytest.mark.cpu_test
@pytest.fixture(autouse=True)
def test_registry(monkeypatch):
    """Point the metrics reader at a fresh, private Prometheus registry.

    The fixture is autouse, so every test in this module gets the patched
    ``vllm.v1.metrics.reader.REGISTRY``. The new registry is returned so
    tests can register their own collectors on it.
    """
    isolated_registry = prometheus_client.CollectorRegistry(auto_describe=True)
    monkeypatch.setattr("vllm.v1.metrics.reader.REGISTRY", isolated_registry)
    return isolated_registry
@pytest.mark.parametrize("num_engines", [1, 4])
def test_gauge_metric(test_registry, num_engines):
    """A labelled gauge yields one ``Gauge`` snapshot entry per engine."""
    gauge = prometheus_client.Gauge(
        "vllm:test_gauge",
        "Test gauge metric",
        labelnames=["model", "engine_index"],
        registry=test_registry,
    )
    for engine_idx in range(num_engines):
        gauge.labels(model="foo", engine_index=str(engine_idx)).set(98.5)

    snapshot = get_metrics_snapshot()
    assert len(snapshot) == num_engines

    # Each engine index must show up exactly once across the snapshot.
    pending_indices = list(map(str, range(num_engines)))
    for metric in snapshot:
        assert isinstance(metric, Gauge)
        assert metric.name == "vllm:test_gauge"
        assert metric.value == 98.5
        assert metric.labels["model"] == "foo"
        assert metric.labels["engine_index"] in pending_indices
        pending_indices.remove(metric.labels["engine_index"])
@pytest.mark.parametrize("num_engines", [1, 4])
def test_counter_metric(test_registry, num_engines):
    """A labelled counter yields one ``Counter`` snapshot entry per engine."""
    counter = prometheus_client.Counter(
        "vllm:test_counter",
        "Test counter metric",
        labelnames=["model", "engine_index"],
        registry=test_registry,
    )
    for engine_idx in range(num_engines):
        counter.labels(model="bar", engine_index=str(engine_idx)).inc(19)

    snapshot = get_metrics_snapshot()
    assert len(snapshot) == num_engines

    # Each engine index must show up exactly once across the snapshot.
    pending_indices = list(map(str, range(num_engines)))
    for metric in snapshot:
        assert isinstance(metric, Counter)
        assert metric.name == "vllm:test_counter"
        assert metric.value == 19
        assert metric.labels["model"] == "bar"
        assert metric.labels["engine_index"] in pending_indices
        pending_indices.remove(metric.labels["engine_index"])
@pytest.mark.parametrize("num_engines", [1, 4])
def test_histogram_metric(test_registry, num_engines):
    """A labelled histogram yields one ``Histogram`` entry per engine.

    Three samples (42, 21, 7) are observed per engine; the snapshot must
    report their count, their sum and the cumulative bucket counts.
    """
    histogram = prometheus_client.Histogram(
        "vllm:test_histogram",
        "Test histogram metric",
        labelnames=["model", "engine_index"],
        buckets=[10, 20, 30, 40, 50],
        registry=test_registry,
    )
    for engine_idx in range(num_engines):
        child = histogram.labels(model="blaa", engine_index=str(engine_idx))
        for sample in (42, 21, 7):
            child.observe(sample)

    snapshot = get_metrics_snapshot()
    assert len(snapshot) == num_engines

    # Buckets are cumulative: 7 falls in <=10, 21 in <=30 and 42 in <=50.
    expected_buckets = (
        ("10.0", 1),
        ("20.0", 1),
        ("30.0", 2),
        ("40.0", 2),
        ("50.0", 3),
    )
    # Each engine index must show up exactly once across the snapshot.
    pending_indices = list(map(str, range(num_engines)))
    for metric in snapshot:
        assert isinstance(metric, Histogram)
        assert metric.name == "vllm:test_histogram"
        assert metric.count == 3
        assert metric.sum == 70
        for upper_bound, cumulative_count in expected_buckets:
            assert metric.buckets[upper_bound] == cumulative_count
        assert metric.labels["model"] == "blaa"
        assert metric.labels["engine_index"] in pending_indices
        pending_indices.remove(metric.labels["engine_index"])
@pytest.mark.parametrize("num_engines", [1, 4])
def test_vector_metric(test_registry, num_engines):
    """A counter with a ``position`` label is read back as one ``Vector``.

    Positions 0, 1 and 2 are incremented by 10, 5 and 1 for every engine;
    the snapshot must contain a single ``Vector`` per engine whose values
    are ``[10, 5, 1]``.
    """
    counter = prometheus_client.Counter(
        "vllm:spec_decode_num_accepted_tokens_per_pos",
        "Vector-like counter metric",
        labelnames=["position", "model", "engine_index"],
        registry=test_registry,
    )
    per_position_increments = (10, 5, 1)
    for engine_idx in range(num_engines):
        for position, amount in enumerate(per_position_increments):
            counter.labels(
                position=str(position),
                model="llama",
                engine_index=str(engine_idx),
            ).inc(amount)

    snapshot = get_metrics_snapshot()
    assert len(snapshot) == num_engines

    # Each engine index must show up exactly once across the snapshot.
    pending_indices = list(map(str, range(num_engines)))
    for metric in snapshot:
        assert isinstance(metric, Vector)
        assert metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos"
        assert metric.values == [10, 5, 1]
        assert metric.labels["model"] == "llama"
        assert metric.labels["engine_index"] in pending_indices
        pending_indices.remove(metric.labels["engine_index"])

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import ray
from vllm.config.model import ModelDType
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
# Model identifiers the Ray metrics smoke test is parametrized over.
MODELS = ["distilbert/distilgpt2"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
def test_engine_log_metrics_ray(
    example_prompts,
    model: str,
    dtype: ModelDType,
    max_tokens: int,
) -> None:
    """Simple smoke test, verifying this can be used without exceptions.
    Need to start a Ray cluster in order to verify outputs."""

    @ray.remote(num_gpus=1)
    class EngineTestActor:
        async def run(self):
            """Build an engine with the Ray stat logger and drain each prompt."""
            engine_args = AsyncEngineArgs(
                model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
            )

            engine = AsyncLLM.from_engine_args(
                engine_args, stat_loggers=[RayPrometheusStatLogger]
            )

            try:
                for i, prompt in enumerate(example_prompts):
                    results = engine.generate(
                        request_id=f"request-id-{i}",
                        prompt=prompt,
                        sampling_params=SamplingParams(max_tokens=max_tokens),
                    )

                    # Only exhaust the stream; outputs are not inspected.
                    async for _ in results:
                        pass
            finally:
                # Release the engine whether generation succeeded or raised,
                # instead of leaving it alive inside the actor.
                engine.shutdown()

    # Create the actor and call the async method
    actor = EngineTestActor.remote()  # type: ignore[attr-defined]
    ray.get(actor.run.remote())
def test_sanitized_opentelemetry_name():
    """Test the metric name sanitization logic for Ray."""
    sanitize = RayPrometheusMetric._get_sanitized_opentelemetry_name

    # (raw name, expected sanitized name)
    cases = (
        # Letters, digits and underscores are preserved unchanged.
        ("valid_metric_123_abcDEF", "valid_metric_123_abcDEF"),
        # Dash and dot are replaced with underscores.
        ("metric-name.test", "metric_name_test"),
        # Colon is replaced with an underscore.
        ("metric:name", "metric_name"),
        # Several different invalid characters are each replaced.
        ("metric:name@with#special%chars", "metric_name_with_special_chars"),
        # Mixed valid and invalid characters.
        (
            "vllm:engine_stats/time.latency_ms-99p",
            "vllm_engine_stats_time_latency_ms_99p",
        ),
        # The empty string stays empty.
        ("", ""),
    )
    for raw_name, sanitized_name in cases:
        assert sanitize(raw_name) == sanitized_name

View File

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.engine import FinishReason
from vllm.v1.metrics.stats import IterationStats, RequestStateStats
def test_iteration_stats_repr():
    """The repr of a fresh ``IterationStats`` starts with its class name."""
    rendered = repr(IterationStats())
    assert rendered.startswith("IterationStats(")
def test_prefill_kv_computed_with_cache():
    """Test that prefill KV compute correctly excludes cached tokens."""
    stats = IterationStats()

    request = RequestStateStats(arrival_time=0.0)
    request.scheduled_ts = 0.1
    request.first_token_ts = 0.5
    request.last_token_ts = 5.0
    request.num_generation_tokens = 50

    # Case 1: With prefix cache (1200 tokens cached)
    stats.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=10000,
        max_tokens_param=100,
        req_stats=request,
        num_cached_tokens=1200,
    )

    finished = stats.finished_requests[0]
    assert finished.num_prompt_tokens == 10000
    assert finished.num_cached_tokens == 1200

    # Verify calculation: prefill KV = prompt tokens - cached tokens
    cached = max(finished.num_cached_tokens, 0)
    assert finished.num_prompt_tokens - cached == 8800  # 10000 - 1200
def test_prefill_kv_computed_no_cache():
    """Test prefill KV compute without prefix caching."""
    stats = IterationStats()

    request = RequestStateStats(arrival_time=0.0)
    request.scheduled_ts = 0.1
    request.first_token_ts = 0.5
    request.last_token_ts = 2.0
    request.num_generation_tokens = 10

    # Case 2: No prefix cache
    stats.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=2000,
        max_tokens_param=100,
        req_stats=request,
        num_cached_tokens=0,
    )

    finished = stats.finished_requests[0]
    assert finished.num_prompt_tokens == 2000
    assert finished.num_cached_tokens == 0

    # Verify calculation: prefill KV = full prompt when no cache
    cached = max(finished.num_cached_tokens, 0)
    assert finished.num_prompt_tokens - cached == 2000
def test_prefill_kv_computed_edge_cases():
    """Test edge cases for prefill KV compute calculation."""

    def prefill_kv(finished):
        # Prompt tokens minus cached tokens, with negatives clamped to 0.
        return finished.num_prompt_tokens - max(finished.num_cached_tokens, 0)

    negative_cache_stats = IterationStats()

    request = RequestStateStats(arrival_time=0.0)
    request.scheduled_ts = 0.1
    request.first_token_ts = 0.5
    request.last_token_ts = 1.0
    request.num_generation_tokens = 1

    # Case 3: Negative num_cached_tokens (shouldn't happen, but handle gracefully)
    negative_cache_stats.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=100,
        max_tokens_param=10,
        req_stats=request,
        num_cached_tokens=-1,
    )

    # max() should handle negative values
    negative_case = negative_cache_stats.finished_requests[0]
    assert prefill_kv(negative_case) == 100  # Should treat negative as 0

    # Case 4: All tokens cached (shouldn't happen in practice)
    fully_cached_stats = IterationStats()
    fully_cached_stats.update_from_finished_request(
        finish_reason=FinishReason.STOP,
        num_prompt_tokens=100,
        max_tokens_param=10,
        req_stats=request,
        num_cached_tokens=100,
    )

    fully_cached_case = fully_cached_stats.finished_requests[0]
    assert prefill_kv(fully_cached_case) == 0  # All cached, nothing computed