Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file test accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
from vllm.platforms import current_platform
from ....utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.54
DEFAULT_ARGS = ["--max-model-len", "4096"]
MORE_ARGS_LIST = [
[], # Default
["--enable-chunked-prefill"], # Chunked
]
MAX_WAIT_SECONDS = None
if current_platform.is_tpu():
MORE_ARGS_LIST = [
[], # Default
]
MAX_WAIT_SECONDS = 600
def run_test(more_args):
"""Run the end to end accuracy test."""
args = list(DEFAULT_ARGS)
args.extend(more_args)
print(f"Running with: {args}")
with RemoteOpenAIServer(
MODEL_NAME, args, max_wait_seconds=MAX_WAIT_SECONDS
) as remote_server:
url = f"{remote_server.url_for('v1')}/completions"
model_args = (
f"model={MODEL_NAME},"
f"base_url={url},"
f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
)
results = lm_eval.simple_evaluate(
model="local-completions",
model_args=model_args,
tasks=TASK,
)
measured_value = results["results"][TASK][FILTER]
assert (
measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
def test_lm_eval_accuracy_v1_engine():
"""Run with the V1 Engine."""
more_args = []
# Limit compilation time for V1
if current_platform.is_tpu():
more_args = ["--max-num-seqs", "64"]
run_test(more_args)

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Evaluate Transcription API correctness by computing Word Error Rate (WER)
on a given ASR dataset. When provided, it will also compare the WER against
a baseline.
This simulates real work usage of the API and makes sure that the frontend and
AsyncLLMEngine are working correctly.
"""
import asyncio
import io
import time
from statistics import mean, median
import librosa
import pytest
import soundfile
import torch
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer
from ....utils import RemoteOpenAIServer
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
async def transcribe_audio(client, tokenizer, y, sr):
# Send loaded audio directly instead of loading from disk,
# don't account for that time though
with to_bytes(y, sr) as f:
start_time = time.perf_counter()
transcription = await client.audio.transcriptions.create(
file=f,
model=tokenizer.name_or_path,
language="en",
temperature=0.0,
)
end_time = time.perf_counter()
# NOTE there's no streaming in transcriptions, can't measure ttft
latency = end_time - start_time
num_output_tokens = len(
tokenizer(transcription.text, add_special_tokens=False).input_ids
)
return latency, num_output_tokens, transcription.text
async def bound_transcribe(sem, client, tokenizer, audio, reference):
# Use semaphore to limit concurrent requests.
async with sem:
result = await transcribe_audio(client, tokenizer, *audio)
# Normalize *english* output/reference for evaluation.
out = tokenizer.normalize(result[2])
ref = tokenizer.normalize(reference)
return result[:2] + (out, ref)
async def process_dataset(model, client, data, concurrent_request):
sem = asyncio.Semaphore(concurrent_request)
# Load tokenizer once outside the loop
tokenizer = AutoTokenizer.from_pretrained(model)
# Warmup call as the first `librosa.load` server-side is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")
tasks: list[asyncio.Task] = []
for sample in data:
audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
task = asyncio.create_task(
bound_transcribe(sem, client, tokenizer, (audio, sr), sample["text"])
)
tasks.append(task)
return await asyncio.gather(*tasks)
def print_performance_metrics(results, total_time):
latencies = [res[0] for res in results]
total_tokens = sum([res[1] for res in results])
total = len(results)
print(f"Total Requests: {total}")
print(f"Successful Requests: {len(latencies)}")
print(f"Average Latency: {mean(latencies):.4f} seconds")
print(f"Median Latency: {median(latencies):.4f} seconds")
perc = sorted(latencies)[int(len(latencies) * 0.95) - 1]
print(f"95th Percentile Latency: {perc:.4f} seconds")
# Throughput
req_throughput = len(latencies) / total_time
print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s")
throughput = total_tokens / total_time
print(f"Estimated Throughput: {throughput:.2f} tok/s")
def add_duration(sample):
y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000
return sample
def load_hf_dataset(dataset_repo: str, split="validation", **hf_kwargs):
## Load and filter the dataset
dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
if "duration_ms" not in dataset[0]:
# compute duration to filter
dataset = dataset.map(add_duration)
# Whisper max supported duration
dataset = dataset.filter(lambda example: example["duration_ms"] < 30000)
return dataset
def run_evaluation(
model: str,
client,
dataset,
max_concurrent_reqs: int,
n_examples: int = -1,
print_metrics: bool = True,
):
if n_examples > 0:
dataset = dataset.select(range(n_examples))
start = time.perf_counter()
results = asyncio.run(process_dataset(model, client, dataset, max_concurrent_reqs))
end = time.perf_counter()
total_time = end - start
print(f"Total Test Time: {total_time:.4f} seconds")
if print_metrics:
print_performance_metrics(results, total_time)
# Compute WER
predictions = [res[2] for res in results]
references = [res[3] for res in results]
wer = load("wer")
wer_score = 100 * wer.compute(references=references, predictions=predictions)
print("WER:", wer_score)
return wer_score
# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
"dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]
)
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(
model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None
):
# TODO refactor to use `ASRDataset`
with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
dataset = load_hf_dataset(dataset_repo)
if not max_concurrent_request:
# No max concurrency
max_concurrent_request = n_examples if n_examples > 0 else len(dataset)
client = remote_server.get_async_client()
wer = run_evaluation(
model_name, client, dataset, max_concurrent_request, n_examples
)
if expected_wer:
torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)