Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,203 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
@pytest.fixture
def sample_prompts():
    """Short text prompts shared across structured-output tests."""
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    return prompts
@pytest.fixture
def sample_token_ids():
    """Prompt token-id lists of increasing length (1 to 4 tokens)."""
    token_ids = [
        [0],
        [0, 1],
        [0, 2, 1],
        [0, 3, 1, 2],
    ]
    return token_ids
@pytest.fixture
def sample_regex():
    """Regex matching a dotted-quad IPv4 address (each octet 0-255)."""
    octet = r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
    # Three dot-terminated octets followed by a final bare octet.
    return rf"({octet}\.){{3}}{octet}"
@pytest.fixture
def sample_json_schema():
    """JSON schema for a person record; exercises nested objects, arrays,
    string length limits, and required-field constraints."""
    return {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
            "skills": {
                "type": "array",
                # Each skill is a short string; at least three are required.
                "items": {"type": "string", "maxLength": 10},
                "minItems": 3,
            },
            "work_history": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "company": {"type": "string"},
                        "duration": {"type": "number"},
                        "position": {"type": "string"},
                    },
                    # "duration" is intentionally optional here.
                    "required": ["company", "position"],
                },
            },
        },
        "required": ["name", "age", "skills", "work_history"],
    }
@pytest.fixture
def sample_complex_json_schema():
    """JSON schema combining numeric ranges, regex patterns, and per-item
    constraints inside arrays."""
    return {
        "type": "object",
        "properties": {
            "score": {
                "type": "integer",
                "minimum": 0,
                "maximum": 100,  # Numeric range
            },
            "grade": {
                "type": "string",
                "pattern": "^[A-D]$",  # Regex pattern
            },
            "email": {
                "type": "string",
                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
            },
            "tags": {
                "type": "array",
                "items": {
                    "type": "string",
                    # Combining length and pattern restrictions
                    "pattern": "^[a-z]{1,10}$",
                },
            },
        },
        "required": ["score", "grade", "email", "tags"],
    }
@pytest.fixture
def sample_definition_json_schema():
    """JSON schema using $defs / $ref indirection (pydantic-style
    "MathReasoning" schema with a reusable Step definition)."""
    return {
        "$defs": {
            "Step": {
                "properties": {
                    "explanation": {"title": "Explanation", "type": "string"},
                    "output": {"title": "Output", "type": "string"},
                },
                "required": ["explanation", "output"],
                "title": "Step",
                "type": "object",
            }
        },
        "properties": {
            "steps": {
                # Each array element resolves through the $defs reference.
                "items": {"$ref": "#/$defs/Step"},
                "title": "Steps",
                "type": "array",
            },
            "final_answer": {"title": "Final Answer", "type": "string"},
        },
        "required": ["steps", "final_answer"],
        "title": "MathReasoning",
        "type": "object",
    }
@pytest.fixture
def sample_enum_json_schema():
    """JSON schema exercising enum constraints: top-level, nested-object,
    numeric, and array-item enums."""
    return {
        "type": "object",
        "properties": {
            "status": {
                "type": "string",
                "enum": ["active", "inactive", "pending"],  # Literal values using enum
            },
            "priority": {
                "type": "string",
                "enum": ["low", "medium", "high", "critical"],
            },
            "category": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "enum": ["bug", "feature", "improvement"],
                    },
                    "severity": {
                        "type": "integer",
                        "enum": [1, 2, 3, 4, 5],  # Enum can also contain numbers
                    },
                },
                "required": ["type", "severity"],
            },
            "flags": {
                "type": "array",
                "items": {
                    "type": "string",
                    "enum": ["urgent", "blocked", "needs_review", "approved"],
                },
            },
        },
        "required": ["status", "priority", "category", "flags"],
    }
@pytest.fixture
def sample_structured_outputs_choices():
    """Programming-language names used as a closed choice set."""
    return [
        "Python",
        "Java",
        "JavaScript",
        "C++",
        "C#",
        "PHP",
        "TypeScript",
        "Ruby",
        "Swift",
        "Kotlin",
    ]
@pytest.fixture
def sample_sql_statements():
    # Lark grammar describing a tiny SQL SELECT subset, used for
    # grammar-constrained decoding tests.
    return """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
@pytest.fixture(scope="session")
def qwen3_lora_files():
    """Download Qwen3 LoRA files once per test session."""
    # Imported lazily so collecting tests does not require huggingface_hub.
    from huggingface_hub import snapshot_download

    repo_id = "charent/self_cognition_Alice"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def opt125_lora_files() -> str:
    """Download opt-125m LoRA files once per test session."""
    # Imported lazily so collecting tests does not require huggingface_hub.
    from huggingface_hub import snapshot_download

    repo_id = "peft-internal-testing/opt-125m-dummy-lora"
    return snapshot_download(repo_id=repo_id)

View File

View File

@@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
import pytest
from vllm.platforms import current_platform
# Models whose gsm8k accuracy is compared against EXPECTED_VALUES.
MODEL_NAMES = [
    "Qwen/Qwen3-1.7B",
    "google/gemma-3-1b-it",
]
# Subset of models additionally exercised with an fp8 KV cache.
FP8_KV_MODEL_NAMES = [
    "Qwen/Qwen3-1.7B",
]
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# NOTE(review): despite the name, used as an *absolute* tolerance below.
RTOL = 0.03
# Reference gsm8k scores per model.
EXPECTED_VALUES = {
    "Qwen/Qwen3-1.7B": 0.68,
    "google/gemma-3-1b-it": 0.25,
}
def run_test(model_name, more_args=None):
    """Run the end-to-end accuracy test for ``model_name``.

    Evaluates the model on gsm8k via lm_eval and asserts the measured
    score is within ``RTOL`` of the reference in ``EXPECTED_VALUES``.

    Args:
        model_name: HF model id; must have an entry in ``EXPECTED_VALUES``.
        more_args: optional extra ``key=value`` engine args, comma-joined
            onto the base model args.
    """
    # Fail fast *before* the expensive evaluation if there is no reference.
    assert model_name in EXPECTED_VALUES, (
        f"Cannot find the expected value for the model {model_name=}"
    )
    expected_value = EXPECTED_VALUES[model_name]

    model_args = f"pretrained={model_name},max_model_len=4096"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Was hard-coded "gsm8k"; keep in sync with the TASK constant used
        # to read the results back out below.
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = results["results"][TASK][FILTER]
    # Equivalent to the original two-sided comparison.
    assert abs(measured_value - expected_value) < RTOL, (
        f"Expected: {expected_value} | Measured: {measured_value}"
    )
# TODO: [AlexM] Fix it with new CI/CD tests
# Extra engine arg enabling a tensor-parallel run; empty string disables it.
TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model):
    """Run with the V1 Engine."""
    more_args = None
    if current_platform.is_tpu():
        # Keep TPU compilation time in check.
        more_args = "max_model_len=2048,max_num_seqs=64"
        # Append the TP configuration when one is provided.
        if TPU_TP_TEST_STR:
            more_args = f"{more_args},{TPU_TP_TEST_STR}"
    run_test(model, more_args)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
    """Run with the V1 Engine."""
    more_args = None
    if current_platform.is_tpu():
        # Keep TPU compilation time in check; enable the fp8 KV cache.
        more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
        # Append the TP configuration when one is provided.
        if TPU_TP_TEST_STR:
            more_args = f"{more_args},{TPU_TP_TEST_STR}"
    run_test(model, more_args)

View File

@@ -0,0 +1,212 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.sampling_params import SamplingParams
from ..openai.test_vision import TEST_IMAGE_ASSETS
@pytest.fixture(scope="function")
def text_llm():
    """Small text-only Llama engine, rebuilt for every test."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0)
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.fixture(scope="function")
def llm_for_failure_test():
    """
    Fixture for testing issue #26081.
    Uses a small max_model_len to easily trigger length errors.
    """
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        seed=0,
        # Tiny context window so an over-long prompt reliably fails.
        max_model_len=128,
        disable_log_stats=True,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
def test_chat(text_llm):
    """A single conversation produces exactly one output."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Explain the concept of entropy."},
    ]
    outputs = text_llm.chat(conversation)
    assert len(outputs) == 1
def test_multi_chat(text_llm):
    """A batched chat call yields one output per conversation."""
    conversation1 = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Explain the concept of entropy."},
    ]
    conversation2 = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Explain what among us is."},
    ]
    outputs = text_llm.chat([conversation1, conversation2])
    assert len(outputs) == 2
@pytest.fixture(scope="function")
def vision_llm():
    """Multimodal Phi-3.5-vision engine allowing up to two images per prompt."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
        seed=0,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.parametrize(
    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """A multi-image chat request completes and yields one output.

    Note: the original assertion ``len(outputs) >= 0`` was vacuous (always
    true); a single conversation should produce exactly one output.
    """
    messages = [
        {
            "role": "user",
            "content": [
                *(
                    {"type": "image_url", "image_url": {"url": image_url}}
                    for image_url in image_urls
                ),
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]
    outputs = vision_llm.chat(messages)
    assert len(outputs) == 1
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    LLM.chat() should not add special tokens when using chat templates.
    Check we get a single BOS token for llama chat.
    """
    outputs = text_llm.chat(
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": "Hello!"},
        ]
    )
    assert len(outputs) == 1

    token_ids = outputs[0].prompt_token_ids
    assert token_ids is not None

    bos = text_llm.get_tokenizer().bos_token_id
    # The template supplies BOS itself; tokenization must not add a second one.
    assert token_ids[0] == bos
    assert token_ids[1] != bos, "Double BOS"
@pytest.fixture(scope="function")
def thinking_llm():
    """Qwen3 engine whose chat template supports the enable_thinking flag."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="Qwen/Qwen3-0.6B",
        max_model_len=4096,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """chat_template_kwargs are forwarded to the chat template."""
    outputs = thinking_llm.chat(
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": "What is 1+1?"},
        ],
        chat_template_kwargs={"enable_thinking": enable_thinking},
    )
    assert len(outputs) == 1

    token_ids = outputs[0].prompt_token_ids
    assert token_ids is not None

    think_id = thinking_llm.get_tokenizer().get_vocab()["<think>"]
    if enable_thinking:
        assert think_id not in token_ids
    else:
        # When thinking is disabled the template inserts a dummy think block,
        # so the token does appear in the prompt.
        assert think_id in token_ids
def test_chat_batch_failure_cleanup(llm_for_failure_test):
    """
    Tests that if a batch call to llm.chat() fails mid-way
    (e.g., due to one invalid prompt), the requests that
    were already enqueued are properly aborted and do not
    pollute the queue for subsequent calls.
    (Fixes Issue #26081)
    """
    llm = llm_for_failure_test
    ok_msg = [{"role": "user", "content": "Hello"}]
    too_long_msg = [
        {"role": "user", "content": "This is a very long text to test the error " * 50}
    ]
    params = SamplingParams(temperature=0, max_tokens=10)

    # First batch: two valid conversations followed by one that exceeds
    # the (tiny) max_model_len — the whole call must raise.
    with pytest.raises(ValueError, match="longer than the maximum model length"):
        llm.chat([ok_msg, ok_msg, too_long_msg], sampling_params=params)

    # Second batch: must succeed cleanly and leave no stale requests behind.
    second_batch = [ok_msg, ok_msg]
    outputs = llm.chat(second_batch, sampling_params=params)
    assert len(outputs) == len(second_batch)
    assert llm.llm_engine.get_num_unfinished_requests() == 0

View File

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import LLM
from ...utils import create_new_process_for_each_test
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
    """collective_rpc runs a caller-defined function on every worker."""
    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    if tp_size == 1:
        if backend == "ray":
            pytest.skip("Skip duplicate test case")
        backend = None

    # Defined inside the test on purpose: verifies that locally-defined
    # callables can be serialized and shipped to the workers.
    def echo_rank(self):
        return self.rank

    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    llm = LLM(
        model="hmellor/tiny-random-LlamaForCausalLM",
        enforce_eager=True,
        load_format="dummy",
        tensor_parallel_size=tp_size,
        distributed_executor_backend=backend,
    )
    assert llm.collective_rpc(echo_rank) == list(range(tp_size))

View File

@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
# Small model used by every test in this module.
MODEL_NAME = "distilbert/distilgpt2"
# Shared text prompts for generate() calls.
PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Prompt token-id lists of increasing length (1 to 4 tokens).
TOKEN_IDS = [
    [0],
    [0, 1],
    [0, 2, 1],
    [0, 3, 1, 2],
]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped distilgpt2 engine with a small GPU-memory footprint."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
    """Per-prompt SamplingParams: full list, mismatched list, single, None."""
    params_list = [
        SamplingParams(temperature=t, top_p=0.95) for t in (0.01, 0.3, 0.7, 0.99)
    ]

    # A params list is matched one-to-one with the prompts.
    outputs = llm.generate(PROMPTS, sampling_params=params_list)
    assert len(PROMPTS) == len(outputs)

    # A list whose size differs from the prompt count must raise.
    with pytest.raises(ValueError):
        outputs = llm.generate(PROMPTS, sampling_params=params_list[:3])

    # A single SamplingParams is broadcast to every prompt.
    outputs = llm.generate(
        PROMPTS, sampling_params=SamplingParams(temperature=0.3, top_p=0.95)
    )
    assert len(PROMPTS) == len(outputs)

    # None falls back to the default params.
    outputs = llm.generate(PROMPTS, sampling_params=None)
    assert len(PROMPTS) == len(outputs)
def test_multiple_priority(llm: LLM):
    """priority=None, a full-length list, a short list, and an empty list."""
    n = len(PROMPTS)

    # None disables priority scheduling but generation still works.
    outputs = llm.generate(PROMPTS, sampling_params=None, priority=None)
    assert n == len(outputs)

    # One priority per prompt is accepted.
    outputs = llm.generate(PROMPTS, sampling_params=None, priority=[0] * n)
    assert n == len(outputs)

    # Too few priorities must raise.
    with pytest.raises(ValueError):
        outputs = llm.generate(PROMPTS, sampling_params=None, priority=[0] * (n - 1))

    # An empty priority list must raise as well.
    with pytest.raises(ValueError):
        outputs = llm.generate(PROMPTS, sampling_params=None, priority=[])
def test_max_model_len():
    """Generation never exceeds max_model_len total tokens."""
    max_model_len = 20
    llm = LLM(
        model=MODEL_NAME,
        max_model_len=max_model_len,
        gpu_memory_utilization=0.10,
        enforce_eager=True,  # reduce test time
    )
    # Deliberately request more tokens than the model length allows.
    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
    for output in llm.generate(PROMPTS, sampling_params):
        total_tokens = len(output.prompt_token_ids) + len(output.outputs[0].token_ids)
        # The engine must cap generation at max_model_len; fewer tokens are
        # fine if decoding stopped early for another reason (e.g. EOS).
        assert total_tokens <= max_model_len
def test_log_stats():
    """Every output carries metrics when stats logging is enabled."""
    llm = LLM(
        model=MODEL_NAME,
        disable_log_stats=False,
        gpu_memory_utilization=0.10,
        enforce_eager=True,  # reduce test time
    )
    for output in llm.generate(PROMPTS, sampling_params=None):
        # disable_log_stats is False, so metrics must be populated.
        assert output.metrics is not None

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
def test_gpu_memory_utilization():
    """gpu_memory_utilization caps each instance, not the whole GPU."""
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Three instances at 0.3 utilization each should coexist on one device,
    # proving the limit is per-instance rather than global.
    llms = [
        LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True)
        for _ in range(3)
    ]
    for llm in llms:
        for output in llm.generate(prompts, sampling_params):
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import pytest
import regex as re
from vllm import LLM
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.metrics import loggers as stat_loggers
from vllm.v1.metrics.reader import Counter, Metric
from ..openai.test_vision import TEST_IMAGE_ASSETS
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
    """Build a single-turn user conversation containing one image URL."""
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_url},
    }
    return [{"role": "user", "content": [image_part]}]
def _get_counter_value(metrics: list[Metric], name: str):
    """Return the value of the Counter metric called ``name``.

    Raises StopIteration if no metric with that name exists.
    """
    matches = (m for m in metrics if m.name == name)
    metric = next(matches)
    assert isinstance(metric, Counter)
    return metric.value
def _get_mm_cache_stats(metrics: list[Metric]):
    """Return (queries, hits) counters of the multi-modal processor cache."""
    queries = _get_counter_value(metrics, "vllm:mm_cache_queries")
    hits = _get_counter_value(metrics, "vllm:mm_cache_hits")
    return queries, hits
def _get_mm_cache_log(llm: LLM, caplog_vllm: pytest.LogCaptureFixture) -> float:
    """Trigger a stats log and parse the MM cache hit rate (in percent)."""
    caplog_vllm.clear()
    with caplog_vllm.at_level(logging.INFO, logger=stat_loggers.__name__):
        llm.llm_engine.do_log_stats()

    # Exactly one stats record is expected per do_log_stats() call.
    assert len(caplog_vllm.records) == 1
    msg = caplog_vllm.records[0].getMessage()
    assert "MM cache hit rate" in msg

    match = re.search(r"MM cache hit rate: ([0-9.]+)%", msg)
    assert match is not None
    return float(match.group(1))
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
def test_mm_cache_stats(
    num_gpus_available,
    image_urls,
    mm_processor_cache_type,
    caplog_vllm,
):
    """MM-cache query/hit counters and logged hit rate track repeated images."""
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        mm_processor_cache_type=mm_processor_cache_type,
        disable_log_stats=False,
        limit_mm_per_prompt={"image": 2},
    )

    # First image: one query, no hit yet.
    llm.chat(_make_messages(image_urls[0]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

    # A different image: still no hit.
    llm.chat(_make_messages(image_urls[1]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

    # Repeat of the first image: first hit (1 of 3 queries -> ~33.3%).
    llm.chat(_make_messages(image_urls[0]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(33.3)

    # NOTE: This only resets hit rate stats in CachingMetrics
    # The raw queries and hits counts remain unaffected
    llm.reset_mm_cache()

    # After the reset the cache misses again, so hits stay at 1.
    llm.chat(_make_messages(image_urls[0]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

    llm.chat(_make_messages(image_urls[1]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import LLM
def test_empty_prompt():
    """An empty string is rejected as a decoder prompt."""
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
    with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
        llm.generate([""])
def test_out_of_vocab_token():
    """Token ids beyond the vocabulary size are rejected."""
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
    bad_prompt = {"prompt_token_ids": [999999]}
    with pytest.raises(ValueError, match="out of vocabulary"):
        llm.generate(bad_prompt)
def test_require_mm_embeds():
    """Passing raw multi-modal embeddings requires --enable-mm-embeds."""
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        enforce_eager=True,
        enable_mm_embeds=False,
    )
    embed_prompt = {
        "prompt": "<image>",
        "multi_modal_data": {"image": torch.empty(1, 1, 1)},
    }
    with pytest.raises(ValueError, match="--enable-mm-embeds"):
        llm.generate(embed_prompt)

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode"""
import dataclasses
import importlib
import sys
import pytest
import urllib3
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EngineArgs
# Engine configurations exercised in offline mode. Kept deliberately small
# (tiny context, few sequences) so several engines fit in CI memory; the
# Qwen entry also checks a tokenizer different from the model, and the
# Mistral entry checks the "mistral" tokenizer mode.
MODEL_CONFIGS = [
    {
        "model": "facebook/opt-125m",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.20,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
    },
    {
        "model": "Qwen/Qwen3-0.6B",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.50,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
        "tokenizer": "Qwen/Qwen3-4B",
    },
    {
        "model": "mistralai/Mistral-7B-Instruct-v0.1",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.95,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
        "tokenizer_mode": "mistral",
    },
    # TODO: re-enable once these tests are run with V1
    # {
    #     "model": "sentence-transformers/all-MiniLM-L12-v2",
    #     "enforce_eager": True,
    #     "gpu_memory_utilization": 0.20,
    #     "max_model_len": 64,
    #     "max_num_batched_tokens": 64,
    #     "max_num_seqs": 64,
    #     "tensor_parallel_size": 1,
    # },
]
@pytest.fixture(scope="module")
def cache_models():
    """Populate the local HF cache with every model in MODEL_CONFIGS."""
    # Cache model files first
    for model_config in MODEL_CONFIGS:
        LLM(**model_config)
        # Free the engine before constructing the next one.
        cleanup_dist_env_and_memory()
    yield
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
    with monkeypatch.context() as m:
        try:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            # Fail loudly on any attempted network access.
            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
            # Cached model files should be used in offline mode
            for model_config in MODEL_CONFIGS:
                LLM(**model_config)
        finally:
            # Reset the environment after the test
            # NB: Assuming tests are run in online mode
            _re_import_modules()
def _re_import_modules():
hf_hub_module_names = [k for k in sys.modules if k.startswith("huggingface_hub")]
transformers_module_names = [
k
for k in sys.modules
if k.startswith("transformers") and not k.startswith("transformers_modules")
]
reload_exception = None
for module_name in hf_hub_module_names + transformers_module_names:
try:
importlib.reload(sys.modules[module_name])
except Exception as e:
reload_exception = e
# Try to continue clean up so that other tests are less likely to
# be affected
# Error this test if reloading a module failed
if reload_exception is not None:
raise reload_exception
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
    with monkeypatch.context() as m:
        try:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            # Fail loudly on any attempted network access.
            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
            # Construction via EngineArgs must also work fully offline.
            engine_args = EngineArgs(model="facebook/opt-125m")
            LLM(**dataclasses.asdict(engine_args))
        finally:
            # Reset the environment after the test
            # NB: Assuming tests are run in online mode
            _re_import_modules()

View File

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.audio import AudioAsset
@pytest.fixture
def mary_had_lamb():
    """Yield an open binary handle to the 'mary_had_lamb' audio asset."""
    local_path = AudioAsset("mary_had_lamb").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file
@pytest.fixture
def winning_call():
    """Yield an open binary handle to the 'winning_call' audio asset."""
    local_path = AudioAsset("winning_call").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file
@pytest.fixture
def foscolo():
    """Italian audio asset used to test it->en translation."""
    local_path = AudioAsset("azacinto_foscolo").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
from vllm.platforms import current_platform
from ....utils import RemoteOpenAIServer
# Model served for the accuracy run.
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# NOTE(review): despite the name, used as an *absolute* tolerance below.
RTOL = 0.03
# Reference gsm8k score for MODEL_NAME.
EXPECTED_VALUE = 0.54
DEFAULT_ARGS = ["--max-model-len", "4096"]
MORE_ARGS_LIST = [
    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
]
# None means RemoteOpenAIServer's default startup timeout.
MAX_WAIT_SECONDS = None
if current_platform.is_tpu():
    # TPU compilation is slow: trim the arg matrix and wait longer.
    MORE_ARGS_LIST = [
        [],  # Default
    ]
    MAX_WAIT_SECONDS = 600
def run_test(more_args):
    """Serve MODEL_NAME, evaluate gsm8k over the OpenAI API, and check
    the score against EXPECTED_VALUE."""
    args = [*DEFAULT_ARGS, *more_args]
    print(f"Running with: {args}")

    with RemoteOpenAIServer(MODEL_NAME, args, max_wait_seconds=MAX_WAIT_SECONDS) as server:
        url = f"{server.url_for('v1')}/completions"
        model_args = (
            f"model={MODEL_NAME},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
        )
        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )
        measured_value = results["results"][TASK][FILTER]
        # Equivalent to the original two-sided tolerance check.
        assert abs(measured_value - EXPECTED_VALUE) < RTOL, (
            f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
        )
def test_lm_eval_accuracy_v1_engine():
    """Run with the V1 Engine."""
    if current_platform.is_tpu():
        # Limit compilation time for V1 on TPU.
        run_test(["--max-num-seqs", "64"])
    else:
        run_test([])

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Evaluate Transcription API correctness by computing Word Error Rate (WER)
on a given ASR dataset. When provided, it will also compare the WER against
a baseline.
This simulates real-world usage of the API and makes sure that the frontend and
AsyncLLMEngine are working correctly.
"""
import asyncio
import io
import time
from statistics import mean, median
import librosa
import pytest
import soundfile
import torch
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer
from ....utils import RemoteOpenAIServer
def to_bytes(y, sr):
    """Encode waveform ``y`` at sample rate ``sr`` as an in-memory WAV file,
    rewound and ready for reading."""
    wav_buffer = io.BytesIO()
    soundfile.write(wav_buffer, y, sr, format="WAV")
    wav_buffer.seek(0)
    return wav_buffer
async def transcribe_audio(client, tokenizer, y, sr):
    """Send one transcription request; return (latency_s, n_out_tokens, text)."""
    # Send loaded audio directly instead of loading from disk,
    # don't account for that time though
    with to_bytes(y, sr) as f:
        start_time = time.perf_counter()
        transcription = await client.audio.transcriptions.create(
            file=f,
            model=tokenizer.name_or_path,
            language="en",
            temperature=0.0,
        )
        end_time = time.perf_counter()
    # NOTE there's no streaming in transcriptions, can't measure ttft
    latency = end_time - start_time
    # Output length measured in the served model's own tokens.
    num_output_tokens = len(
        tokenizer(transcription.text, add_special_tokens=False).input_ids
    )
    return latency, num_output_tokens, transcription.text
async def bound_transcribe(sem, client, tokenizer, audio, reference):
    """Transcribe one sample under the concurrency semaphore.

    Returns (latency_s, n_out_tokens, normalized_output, normalized_reference).
    """
    async with sem:
        latency, num_tokens, text = await transcribe_audio(
            client, tokenizer, *audio
        )
        # Normalize *english* output/reference for evaluation.
        out = tokenizer.normalize(text)
        ref = tokenizer.normalize(reference)
    return latency, num_tokens, out, ref
async def process_dataset(model, client, data, concurrent_request):
    """Transcribe every sample in ``data`` with at most ``concurrent_request``
    in-flight requests; returns the gathered per-sample result tuples."""
    sem = asyncio.Semaphore(concurrent_request)

    # Load tokenizer once outside the loop
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Warmup call as the first `librosa.load` server-side is quite slow.
    audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
    _ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")

    tasks: list[asyncio.Task] = []
    for sample in data:
        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        task = asyncio.create_task(
            bound_transcribe(sem, client, tokenizer, (audio, sr), sample["text"])
        )
        tasks.append(task)
    # Results come back in task-creation (dataset) order.
    return await asyncio.gather(*tasks)
def print_performance_metrics(results, total_time):
    """Print latency and throughput statistics.

    Args:
        results: per-request tuples whose first element is the latency in
            seconds and second element the number of output tokens.
        total_time: wall-clock duration of the whole run, in seconds.
    """
    latencies = [res[0] for res in results]
    # Generator expression: no throwaway intermediate list.
    total_tokens = sum(res[1] for res in results)

    total = len(results)
    print(f"Total Requests: {total}")
    print(f"Successful Requests: {len(latencies)}")
    print(f"Average Latency: {mean(latencies):.4f} seconds")
    print(f"Median Latency: {median(latencies):.4f} seconds")
    # Nearest-rank 95th percentile; clamped so a single-element list
    # indexes element 0 instead of relying on a negative index.
    perc_idx = max(int(len(latencies) * 0.95) - 1, 0)
    perc = sorted(latencies)[perc_idx]
    print(f"95th Percentile Latency: {perc:.4f} seconds")

    # Throughput
    req_throughput = len(latencies) / total_time
    print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s")
    throughput = total_tokens / total_time
    print(f"Estimated Throughput: {throughput:.2f} tok/s")
def add_duration(sample):
    """Attach a ``duration_ms`` field computed from the sample's audio."""
    audio = sample["audio"]
    sample["duration_ms"] = (
        librosa.get_duration(y=audio["array"], sr=audio["sampling_rate"]) * 1000
    )
    return sample
def load_hf_dataset(dataset_repo: str, split="validation", **hf_kwargs):
    """Load a HF audio dataset and drop samples longer than Whisper's
    30-second input window."""
    ## Load and filter the dataset
    dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
    if "duration_ms" not in dataset[0]:
        # compute duration to filter
        dataset = dataset.map(add_duration)

    # Whisper max supported duration
    dataset = dataset.filter(lambda example: example["duration_ms"] < 30000)
    return dataset
def run_evaluation(
    model: str,
    client,
    dataset,
    max_concurrent_reqs: int,
    n_examples: int = -1,  # -1 means use the whole dataset
    print_metrics: bool = True,
):
    """Transcribe ``dataset`` against the server and return the WER in percent."""
    if n_examples > 0:
        dataset = dataset.select(range(n_examples))

    start = time.perf_counter()
    results = asyncio.run(process_dataset(model, client, dataset, max_concurrent_reqs))
    end = time.perf_counter()
    total_time = end - start
    print(f"Total Test Time: {total_time:.4f} seconds")
    if print_metrics:
        print_performance_metrics(results, total_time)

    # Compute WER from the (normalized) prediction/reference pairs.
    predictions = [res[2] for res in results]
    references = [res[3] for res in results]
    wer = load("wer")
    wer_score = 100 * wer.compute(references=references, predictions=predictions)
    print("WER:", wer_score)
    return wer_score
# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]
)
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(
    model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None
):
    """End-to-end WER of the transcription API must match the HF baseline."""
    # TODO refactor to use `ASRDataset`
    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
        dataset = load_hf_dataset(dataset_repo)

        if not max_concurrent_request:
            # No max concurrency
            max_concurrent_request = n_examples if n_examples > 0 else len(dataset)

        client = remote_server.get_async_client()
        wer = run_evaluation(
            model_name, client, dataset, max_concurrent_request, n_examples
        )
        if expected_wer:
            torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import random
from collections.abc import Callable
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server(): # noqa: F811
    """Spin up one shared OpenAI-compatible vLLM server for the module."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # dummy weights suffice: these tests only exercise request handling
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ids=["completion", "chat"],
    argnames=["create_func_gen", "content_body"],
    argvalues=[
        (lambda x: x.completions.create, {"prompt": " ".join(["A"] * 10_000)}),
        (
            lambda x: x.chat.completions.create,
            {"messages": [{"role": "user", "content": " ".join(["A"] * 10_000)}]},
        ),
    ],
)
async def test_with_and_without_truncate(
    server: RemoteOpenAIServer,
    client: openai.AsyncOpenAI,
    create_func_gen: Callable,
    content_body: dict,
):
    """Interleave truncating and non-truncating oversized requests; none may 500."""
    create_func = create_func_gen(client)
    base_body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}

    # Half of the requests truncate the oversized prompt, half leave it as is.
    num_requests = 10
    truncation_options = [1000] * (num_requests // 2)
    truncation_options += [None] * (num_requests - num_requests // 2)
    random.shuffle(truncation_options)

    async def issue_request(**kwargs):
        # Map API errors to their HTTP status so the test can assert on codes.
        try:
            await create_func(**kwargs)
        except openai.APIStatusError as e:
            return e.status_code
        return 200

    request_bodies = [
        {**base_body, "extra_body": {"truncate_prompt_tokens": option}}
        for option in truncation_options
    ]
    status_codes = await asyncio.gather(
        *(issue_request(**body) for body in request_bodies)
    )
    assert 500 not in status_codes

View File

@@ -0,0 +1,392 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
from ...utils import RemoteOpenAIServer
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
# Audio clips exercised by every test in this file.
TEST_AUDIO_URLS = [
    AudioAsset("winning_call").url,
    AudioAsset("mary_had_lamb").url,
]
# Must match the "--limit-mm-per-prompt" audio cap used when starting the server.
MAXIMUM_AUDIOS = 2
@pytest.fixture(scope="module")
def server():
    """Start a shared Ultravox server that accepts up to MAXIMUM_AUDIOS audios."""
    args = [
        "--dtype",
        "float32",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        # cap audios per prompt so the over-limit test can trigger a 400
        "--limit-mm-per-prompt",
        json.dumps({"audio": MAXIMUM_AUDIOS}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_audio() -> dict[str, str]:
    """Map each test audio URL to its base64-encoded payload (fetched once)."""
    return {
        audio_url: encode_audio_base64(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }
def dummy_messages_from_audio_url(
audio_urls: str | list[str],
content_text: str = "What's happening in this audio?",
):
if isinstance(audio_urls, str):
audio_urls = [audio_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "audio_url", "audio_url": {"url": audio_url}}
for audio_url in audio_urls
),
{"type": "text", "text": content_text},
],
}
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Single-turn then multi-turn chat with an ``audio_url`` content part."""
    messages = dummy_messages_from_audio_url(audio_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )
    # (previously `message` was immediately reassigned from
    # chat_completion.choices[0].message — same object; dead store removed)
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_error_on_invalid_audio_url_type(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """A bare-string audio_url (not a ``{"url": ...}`` dict) must be a 400."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio_url", "audio_url": audio_url},
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]
    # audio_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Chat with the audio supplied as a base64 data URL instead of a link."""
    messages = dummy_messages_from_audio_url(
        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )
    # (duplicate reassignment of `message` from choices[0] removed — same object)
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Chat using the OpenAI-style ``input_audio`` content part."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )
    # (duplicate reassignment of `message` from choices[0] removed — same object)
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Streamed chunks must reassemble to the non-streamed completion."""
    messages = dummy_messages_from_audio_url(
        audio_url, "What's a short title for this audio?"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Streaming variant of the input_audio test; chunks must match non-streamed output."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]
)
async def test_multi_audio_input(
    client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
):
    """At the audio cap the request succeeds; above it the server returns 400."""
    messages = dummy_messages_from_audio_url(audio_urls)
    if len(audio_urls) > MAXIMUM_AUDIOS:
        with pytest.raises(openai.BadRequestError):  # test multi-audio input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )
        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0

View File

@@ -0,0 +1,250 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from http import HTTPStatus
from unittest.mock import AsyncMock, Mock
import openai
import pytest
import pytest_asyncio
import requests
from fastapi import Request
from vllm.v1.engine.exceptions import EngineDeadError
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server_args(request: pytest.FixtureRequest) -> list[str]:
    """Provide extra arguments to the server via indirect parametrization.

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This will run `test_foo` twice with servers with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
    """
    # Not parametrized indirectly -> no extra server arguments.
    if not hasattr(request, "param"):
        return []
    val = request.param
    # A lone string is treated as a single extra argument.
    if isinstance(val, str):
        return [val]
    # Fix: return the already-bound `val` instead of re-reading
    # request.param, keeping both branches consistent.
    return val
@pytest.fixture(scope="module")
def server(server_args):
    """Module-scoped server; extra CLI flags come from the server_args fixture."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        *server_args,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    """/version must report the running vLLM version."""
    response = requests.get(server.url_for("version"))
    response.raise_for_status()
    assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
    """/health must return 200 while the engine is alive."""
    response = requests.get(server.url_for("health"))
    assert response.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param(
            ["--max-model-len", "10100"], id="default-frontend-multiprocessing"
        ),
        pytest.param(
            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_request_cancellation(server: RemoteOpenAIServer):
    """Client timeouts must cancel server-side work so later requests stay fast."""
    # clunky test: send an ungodly amount of load in with short timeouts
    # then ensure that it still responds quickly afterwards
    chat_input = [{"role": "user", "content": "Write a long story"}]
    client = server.get_async_client(timeout=0.5)
    tasks = []
    # Request about 2 million tokens
    for _ in range(200):
        task = asyncio.create_task(
            client.chat.completions.create(
                messages=chat_input,
                model=MODEL_NAME,
                max_tokens=10000,
                extra_body={"min_tokens": 10000},
            )
        )
        tasks.append(task)
    done, pending = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
    # Make sure all requests were sent to the server and timed out
    # (We don't want to hide other errors like 400s that would invalidate this
    # test)
    assert len(pending) == 0
    for d in done:
        with pytest.raises(openai.APITimeoutError):
            d.result()
    # If the server had not cancelled all the other requests, then it would not
    # be able to respond to this one within the timeout
    client = server.get_async_client(timeout=5)
    response = await client.chat.completions.create(
        messages=chat_input, model=MODEL_NAME, max_tokens=10
    )
    assert len(response.choices) == 1
@pytest.mark.asyncio
async def test_request_wrong_content_type(server: RemoteOpenAIServer):
    """A non-JSON Content-Type header must be rejected with an API error."""
    chat_input = [{"role": "user", "content": "Write a long story"}]
    client = server.get_async_client()
    with pytest.raises(openai.APIStatusError):
        await client.chat.completions.create(
            messages=chat_input,
            model=MODEL_NAME,
            max_tokens=10000,
            extra_headers={"Content-Type": "application/x-www-form-urlencoded"},
        )
@pytest.mark.parametrize(
    "server_args",
    [pytest.param(["--enable-server-load-tracking"], id="enable-server-load-tracking")],
    indirect=True,
)
@pytest.mark.asyncio
async def test_server_load(server: RemoteOpenAIServer):
    """/load must report 0 when idle and 1 while a request is in flight."""
    # Check initial server load
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 0
    def make_long_completion_request():
        # Blocking request; executed in a worker thread via asyncio.to_thread.
        return requests.post(
            server.url_for("v1/completions"),
            headers={"Content-Type": "application/json"},
            json={
                "prompt": "Give me a long story",
                "max_tokens": 1000,
                "temperature": 0,
            },
        )
    # Start the completion request in a background thread.
    completion_future = asyncio.create_task(
        asyncio.to_thread(make_long_completion_request)
    )
    # Give a short delay to ensure the request has started.
    await asyncio.sleep(0.1)
    # Check server load while the completion request is running.
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 1
    # Wait for the completion request to finish.
    await completion_future
    await asyncio.sleep(0.1)
    # Check server load after the completion request has finished.
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 0
@pytest.mark.asyncio
async def test_health_check_engine_dead_error():
    """The /health handler must answer 503 when the engine raises EngineDeadError."""
    # Import the health function directly to test it in isolation
    from vllm.entrypoints.serve.instrumentator.health import health
    # Create a mock request that simulates what FastAPI would provide
    mock_request = Mock(spec=Request)
    mock_app_state = Mock()
    mock_engine_client = AsyncMock()
    mock_engine_client.check_health.side_effect = EngineDeadError()
    mock_app_state.engine_client = mock_engine_client
    mock_request.app.state = mock_app_state
    # Test the health function directly with our mocked request
    # This simulates what would happen if the engine dies
    response = await health(mock_request)
    # Assert that it returns 503 Service Unavailable
    assert response.status_code == 503

View File

@@ -0,0 +1,798 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import json
import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import regex as re
import requests
import torch
from openai import BadRequestError
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Download zephyr LoRA files once per test session."""
    # Local import keeps huggingface_hub off the module import path.
    from huggingface_hub import snapshot_download
    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
@pytest.fixture(scope="module")
def server(zephyr_lora_files): # noqa: F811
    """Shared zephyr server with one LoRA adapter ("zephyr-lora") registered."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """Requesting logprobs=False must yield a choice without logprobs."""
    conversation = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    response = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=False,
    )
    first_choice = response.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """logprobs=True with top_logprobs=0 yields empty per-token top lists."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=0,
    )
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
    assert len(choice.logprobs.content[0].top_logprobs) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """top_logprobs=5 must return exactly five alternatives per token."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=5,
    )
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
    assert len(choice.logprobs.content[0].top_logprobs) == 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """top_logprobs above the server max must fail without killing the server."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # Default max_logprobs is 20, so this should raise an error
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        stream = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            logprobs=True,
            top_logprobs=21,
            stream=True,
        )
        async for chunk in stream:
            ...
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            logprobs=True,
            top_logprobs=30,
            stream=False,
        )
    # the server should still work afterwards
    chat_completion = await client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=10, stream=False
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, prompt_logprobs",
    [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_chat(
    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
):
    """Negative prompt_logprobs is a 400; otherwise presence tracks the flag."""
    params: dict = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"},
            {
                "role": "assistant",
                "content": "The Los Angeles Dodgers won the World Series in 2020.",
            },
            {"role": "user", "content": "Where was it played?"},
        ],
        "model": model_name,
    }
    if prompt_logprobs is not None:
        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
    if prompt_logprobs is not None and prompt_logprobs < 0:
        with pytest.raises(BadRequestError):
            await client.chat.completions.create(**params)
    else:
        completion = await client.chat.completions.create(**params)
        if prompt_logprobs is not None:
            assert completion.prompt_logprobs is not None
            assert len(completion.prompt_logprobs) > 0
        else:
            assert completion.prompt_logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_more_than_one_prompt_logprobs_chat(
    client: openai.AsyncOpenAI, model_name: str
):
    """prompt_logprobs=k must return exactly k alternatives per prompt token."""
    params: dict = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"},
            {
                "role": "assistant",
                "content": "The Los Angeles Dodgers won the World Series in 2020.",
            },
            {"role": "user", "content": "Where was it played?"},
        ],
        "model": model_name,
        "extra_body": {"prompt_logprobs": 1},
    }
    completion_1 = await client.chat.completions.create(**params)
    params["extra_body"] = {"prompt_logprobs": 2}
    completion_2 = await client.chat.completions.create(**params)
    assert len(completion_1.prompt_logprobs[3]) == 1
    assert len(completion_2.prompt_logprobs[3]) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
    """Basic single-turn then multi-turn chat round-trip with usage checks."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=37, total_tokens=47
    )
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Streamed chunks must reassemble to the non-streamed completion."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(
    client: openai.AsyncOpenAI, model_name: str
):
    """Exercise all valid and invalid combinations of stream_options."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]
    # Test stream=True, stream_options={"include_usage": False}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": False},
    )
    async for chunk in stream:
        assert chunk.usage is None
    # Test stream=True, stream_options={"include_usage": True,
    # "continuous_usage_stats": False}}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": True, "continuous_usage_stats": False},
    )
    async for chunk in stream:
        if chunk.choices[0].finish_reason is None:
            assert chunk.usage is None
        else:
            # The chunk carrying finish_reason still has usage=None; usage
            # arrives in one extra, choice-less chunk right after it. Fetch
            # it here, while the stream is still live — awaiting anext()
            # after the async-for would raise StopAsyncIteration because
            # the loop has already exhausted the stream.
            assert chunk.usage is None
            final_chunk = await anext(stream)
            assert final_chunk.usage is not None
            assert final_chunk.usage.prompt_tokens > 0
            assert final_chunk.usage.completion_tokens > 0
            assert final_chunk.usage.total_tokens == (
                final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens
            )
            assert final_chunk.choices == []
    # Test stream=False, stream_options={"include_usage": None}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": None},
        )
    # Test stream=False, stream_options={"include_usage": True}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": True},
        )
    # Test stream=True, stream_options={"include_usage": True,
    # "continuous_usage_stats": True}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
    )
    last_completion_tokens = 0
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
        # Completion tokens grow monotonically; the trailing usage-only
        # chunk (no choices) repeats the final count.
        assert (
            last_completion_tokens == 0
            or chunk.usage.completion_tokens > last_completion_tokens
            or (
                not chunk.choices
                and chunk.usage.completion_tokens == last_completion_tokens
            )
        )
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )
        last_completion_tokens = chunk.usage.completion_tokens
    assert last_completion_tokens == 10
@pytest.mark.asyncio
async def test_structured_outputs_choice_chat(
    client: openai.AsyncOpenAI,
    sample_structured_outputs_choices,
):
    """Choice-constrained outputs must come from the allowed set and vary."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    choice1 = chat_completion.choices[0].message.content
    assert choice1 in sample_structured_outputs_choices
    messages.append({"role": "assistant", "content": choice1})
    messages.append({"role": "user", "content": "I disagree, pick another one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    choice2 = chat_completion.choices[0].message.content
    assert choice2 in sample_structured_outputs_choices
    assert choice1 != choice2
@pytest.mark.asyncio
async def test_structured_outputs_json_chat(
    client: openai.AsyncOpenAI,
    sample_json_schema,
):
    """JSON structured outputs: responses must validate against the schema."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example JSON for an employee profile that "
            f"fits this schema: {sample_json_schema}",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        extra_body=dict(structured_outputs={"json": sample_json_schema}),
    )
    message = chat_completion.choices[0].message
    assert message.content is not None
    json1 = json.loads(message.content)
    jsonschema.validate(instance=json1, schema=sample_json_schema)
    messages.append({"role": "assistant", "content": message.content})
    messages.append(
        {"role": "user", "content": "Give me another one with a different name and age"}
    )
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        extra_body=dict(structured_outputs={"json": sample_json_schema}),
    )
    message = chat_completion.choices[0].message
    assert message.content is not None
    json2 = json.loads(message.content)
    jsonschema.validate(instance=json2, schema=sample_json_schema)
    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]
@pytest.mark.asyncio
async def test_structured_outputs_regex_chat(
    client: openai.AsyncOpenAI,
    sample_regex,
):
    """Two-turn chat constrained by a regex (an IPv4 pattern).

    Each reply must fully match the regex, and the second turn must
    produce a different string than the first.
    """
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example IP address with this regex: {sample_regex}",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
        extra_body=dict(structured_outputs={"regex": sample_regex}),
    )
    ip1 = chat_completion.choices[0].message.content
    assert ip1 is not None
    assert re.fullmatch(sample_regex, ip1) is not None
    messages.append({"role": "assistant", "content": ip1})
    messages.append({"role": "user", "content": "Give me a different one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
        extra_body=dict(structured_outputs={"regex": sample_regex}),
    )
    ip2 = chat_completion.choices[0].message.content
    assert ip2 is not None
    assert re.fullmatch(sample_regex, ip2) is not None
    assert ip1 != ip2
@pytest.mark.asyncio
async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
    """A non-string `regex` value (here a dict) must be rejected with 400."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            extra_body=dict(structured_outputs={"regex": {1: "Python", 2: "C++"}}),
        )
@pytest.mark.asyncio
async def test_structured_outputs_choice_chat_logprobs(
    client: openai.AsyncOpenAI, sample_structured_outputs_choices
):
    """Choice-constrained chat must still return well-formed logprobs."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.content is not None
    top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
    # -9999.0 is the minimum logprob returned by OpenAI
    for item in top_logprobs:
        assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
@pytest.mark.asyncio
async def test_response_format_json_object(client: openai.AsyncOpenAI):
    """`response_format={"type": "json_object"}` must yield parseable JSON.

    Runs twice to reduce flakiness from sampling.
    """
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": (
                        "what is 1+1? please respond with a JSON object, "
                        'the format is {"result": 2}'
                    ),
                }
            ],
            response_format={"type": "json_object"},
        )
        content = resp.choices[0].message.content
        assert content is not None
        loaded = json.loads(content)
        assert loaded == {"result": 2}, loaded
@pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
    """`json_schema` response format must force schema-conforming output.

    First establishes a negative baseline: without the schema the prompt
    does not produce the expected JSON; then verifies it does with it.
    """
    prompt = 'what is 1+1? The format is "result": 2'
    # Check that this prompt cannot lead to a valid JSON without json_schema
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
        )
        content = resp.choices[0].message.content
        assert content is not None
        # Either json.loads fails or the parsed value is wrong.
        with pytest.raises((json.JSONDecodeError, AssertionError)):
            loaded = json.loads(content)
            assert loaded == {"result": 2}, loaded
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "foo_test",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "result": {"type": "integer"},
                        },
                    },
                },
            },
        )
        content = resp.choices[0].message.content
        assert content is not None
        loaded = json.loads(content)
        assert loaded == {"result": 2}, loaded
@pytest.mark.asyncio
async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
    """Unknown message fields must be tolerated, not rejected."""
    resp = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": "what is 1+1?",
                "extra_field": "0",
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )
    content = resp.choices[0].message.content
    assert content is not None
@pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI):
    """Structured (list-of-parts) message content must be accepted."""
    content = [
        {
            "type": "text",
            "text": "what is 1+1? please provide the result without any other text.",
        }
    ]
    resp = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": content,
            }
        ],
        temperature=0,
        seed=0,
    )
    # Greedy sampling (temperature=0, fixed seed) makes "2" deterministic.
    content = resp.choices[0].message.content
    assert content == "2"
@pytest.mark.asyncio
async def test_custom_role(client: openai.AsyncOpenAI):
    """String and parts-list content must render identically for custom roles."""
    # Not sure how the model handles custom roles so we just check that
    # both string and complex message content are handled in the same way
    resp1 = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "my-custom-role",
                "content": "what is 1+1?",
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )
    resp2 = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "my-custom-role",
                "content": [{"type": "text", "text": "what is 1+1?"}],
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )
    content1 = resp1.choices[0].message.content
    content2 = resp2.choices[0].message.content
    assert content1 == content2
@pytest.mark.asyncio
async def test_long_seed(client: openai.AsyncOpenAI):
    """Seeds outside the int64 range must be rejected with a validation error."""
    for seed in [torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).max + 1]:
        with pytest.raises(BadRequestError) as exc_info:
            await client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    }
                ],
                temperature=0,
                seed=seed,
            )
        # Pydantic emits greater_than_equal / less_than_equal constraint names.
        assert (
            "greater_than_equal" in exc_info.value.message
            or "less_than_equal" in exc_info.value.message
        )
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The /invocations endpoint must mirror /chat/completions output."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_completion_tokens": 5,
        "temperature": 0.0,
        "logprobs": False,
    }
    chat_completion = await client.chat.completions.create(**request_args)
    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()
    chat_output = chat_completion.model_dump()
    invocation_output = invocation_response.json()
    # Same schema and, with temperature 0, identical generated choices.
    assert chat_output.keys() == invocation_output.keys()
    assert chat_output["choices"] == invocation_output["choices"]

View File

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import NamedTuple
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from vllm.config import ModelConfig
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
def get_vocab_size(model_name):
    """Return the vocabulary size reported by the model's config."""
    model_config = ModelConfig(model=model_name, seed=0, dtype="float16")
    return model_config.get_vocab_size()
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server for these tests."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        "--max-logprobs",  # test prompt_logprobs equal to -1
        # 151936 matches the Qwen2 vocab size so -1 can expand to full vocab.
        "151936",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
class TestCase(NamedTuple):
    """Parameter bundle: model under test and whether to echo the prompt."""

    model_name: str
    echo: bool
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(model_name=MODEL_NAME, echo=True),
        TestCase(model_name=MODEL_NAME, echo=False),
    ],
)
async def test_chat_session_with_echo_and_continue_final_message(
    client: openai.AsyncOpenAI, test_case: TestCase
):
    """continue_final_message resumes the partial assistant turn; `echo`
    controls whether that partial prefix appears in the returned content."""
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    # test echo with continue_final_message parameter
    chat_completion = await client.chat.completions.create(
        model=test_case.model_name,
        messages=[
            {"role": "user", "content": "tell me a common saying"},
            {"role": "assistant", "content": saying},
        ],
        extra_body={
            "echo": test_case.echo,
            "continue_final_message": True,
            "add_generation_prompt": False,
        },
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "stop"
    message = choice.message
    if test_case.echo:
        assert message.content is not None and saying in message.content
    else:
        assert message.content is not None and saying not in message.content
    assert message.role == "assistant"
@pytest.mark.asyncio
async def test_prompt_logprobs(client: openai.AsyncOpenAI):
    """prompt_logprobs=-1 must return logprobs for the prompt tokens."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Beijing is the capital of which country?"},
    ]
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        extra_body={"prompt_logprobs": -1},
    )
    assert completion.prompt_logprobs is not None
    assert len(completion.prompt_logprobs) > 0
@pytest.mark.asyncio
async def test_top_logprobs(client: openai.AsyncOpenAI):
    """top_logprobs=-1 must expand to the full vocabulary of the model."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Beijing is the capital of which country?"},
    ]
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=1,
        extra_body={
            "top_logprobs": -1,
            # NOTE(review): "true" is a string, not a bool — presumably the
            # server coerces it; confirm, or pass logprobs=True directly.
            "logprobs": "true",
        },
    )
    assert completion.choices[0].logprobs is not None
    assert completion.choices[0].logprobs.content is not None
    assert len(completion.choices[0].logprobs.content) > 0
    assert len(
        completion.choices[0].logprobs.content[0].top_logprobs
    ) == get_vocab_size(MODEL_NAME)

View File

@@ -0,0 +1,227 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any
from unittest.mock import AsyncMock, MagicMock
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
BASE_MODEL_PATHS = [
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]
@dataclass
class MockHFConfig:
    """Minimal stand-in for a HuggingFace model config."""

    model_type: str = "any"
@dataclass
class MockModelConfig:
    """Minimal stand-in for vLLM's ModelConfig used by OpenAIServingChat."""

    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        # Mirror ModelConfig's contract: never return None, always a dict.
        return self.diff_sampling_param or {}
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    """Build an OpenAIServingChat around *engine* with input processing mocked.

    The preprocessing hooks are replaced with AsyncMocks so tests can drive
    `create_chat_completion` without a tokenizer/chat-template round trip.
    """
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
    )

    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        # Pass the prompt through unchanged; no trace/lora handling needed.
        return dict(engine_prompt), {}

    async def _fake_preprocess_chat(*args, **kwargs):
        # return conversation, engine_prompts
        return (
            [{"role": "user", "content": "Test"}],
            [{"prompt_token_ids": [1, 2, 3]}],
        )

    serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
    return serving_chat
@pytest.mark.asyncio
async def test_chat_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()

    serving_chat = _build_serving_chat(mock_engine)

    # A single output whose finish_reason signals an engine-side failure.
    completion_output = CompletionOutput(
        index=0,
        text="",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=False,
    )
    response = await serving_chat.create_chat_completion(request)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InternalServerError"
    assert response.error.message == "Internal server error"
    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
async def test_chat_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()

    serving_chat = _build_serving_chat(mock_engine)

    # First chunk: normal token, stream still open.
    completion_output_1 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=None,
    )
    request_output_1 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_1],
        finished=False,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    # Second chunk: engine reports an error as the finish reason.
    completion_output_2 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output_2 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_2],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output_1
        yield request_output_2

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=True,
    )
    response = await serving_chat.create_chat_completion(request)
    chunks = []
    async for chunk in response:
        chunks.append(chunk)
    # The error must surface mid-stream and the SSE stream must still
    # terminate with the [DONE] sentinel.
    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    assert chunks[-1] == "data: [DONE]\n\n"

View File

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio
from vllm.config import ModelConfig
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
def get_vocab_size(model_name):
    """Return the vocabulary size reported by the model's config."""
    model_config = ModelConfig(model=model_name, seed=0, dtype="bfloat16")
    return model_config.get_vocab_size()
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server for the logit-bias tests."""
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_chat_logit_bias_valid(client):
    """Test that valid logit_bias values are accepted in chat completions."""
    vocab_size = get_vocab_size(MODEL_NAME)
    # Highest valid token id (ids are 0-based).
    valid_token_id = vocab_size - 1
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Testing valid logit bias"}],
        max_tokens=5,
        logit_bias={str(valid_token_id): 1.0},
    )
    assert completion.choices[0].message.content is not None
@pytest.mark.asyncio
async def test_chat_logit_bias_invalid(client):
    """Test that invalid logit_bias values are rejected in chat completions."""
    vocab_size = get_vocab_size(MODEL_NAME)
    # First id past the valid range.
    invalid_token_id = vocab_size + 1
    with pytest.raises(openai.BadRequestError) as excinfo:
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": "Testing invalid logit bias"}],
            max_tokens=5,
            logit_bias={str(invalid_token_id): 1.0},
        )
    error = excinfo.value
    error_message = str(error)
    # The 400 error should identify the offending id and the vocab bound.
    assert error.status_code == 400
    assert str(invalid_token_id) in error_message
    assert str(vocab_size) in error_message

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import get_tokenizer
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import VLLM_PATH
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATION_OUTPUT = [
(
"facebook/opt-125m",
chatml_jinja_path,
True,
False,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
""",
),
(
"facebook/opt-125m",
chatml_jinja_path,
False,
False,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""",
),
(
"facebook/opt-125m",
chatml_jinja_path,
False,
True,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of""",
),
]
TEST_MESSAGES = [
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hi there!"},
{"role": "user", "content": "What is the capital of"},
]
ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"}
def test_load_chat_template():
    """Loading the bundled chatml template returns its exact file contents."""
    # Testing chatml template
    template_content = load_chat_template(chat_template=chatml_jinja_path)

    # Test assertions
    assert template_content is not None
    # Hard coded value for template_chatml.jinja
    assert (
        template_content
        == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
    )
def test_no_load_chat_template_filelike():
    """A path-like template string that does not exist must be rejected."""
    missing_path = "../../examples/does_not_exist"
    with pytest.raises(ValueError, match="looks like a file path"):
        load_chat_template(chat_template=missing_path)
def test_no_load_chat_template_literallike():
    """An inline Jinja literal is returned unchanged."""
    literal = "{{ messages }}"
    assert load_chat_template(chat_template=literal) == literal
@pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATION_OUTPUT,
)
def test_get_gen_prompt(
    model, template, add_generation_prompt, continue_final_message, expected_output
):
    """Rendering TEST_MESSAGES through the chatml template must produce the
    expected prompt for each add_generation_prompt / continue_final_message
    combination."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        trust_remote_code=model_info.trust_remote_code,
        revision=model_info.revision,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Initialize the tokenizer
    tokenizer = get_tokenizer(
        tokenizer_name=model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )
    template_content = load_chat_template(chat_template=template)

    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
        model=model,
        # continue_final_message needs a trailing assistant turn to resume.
        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
        if continue_final_message
        else TEST_MESSAGES,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # Call the function and get the result
    result = apply_hf_chat_template(
        tokenizer=tokenizer,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
        model_config=model_config,
        tools=None,
        add_generation_prompt=mock_request.add_generation_prompt,
        continue_final_message=mock_request.continue_final_message,
    )

    # Test assertion
    assert result == expected_output, (
        f"The generated prompt does not match the expected output for "
        f"model {model} and template {template}"
    )

View File

@@ -0,0 +1,141 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# a reasoning and tool calling model
MODEL_NAME = "Qwen/QwQ-32B"
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Server with reasoning parsing and auto tool choice enabled."""
    args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--reasoning-parser",
        "deepseek_r1",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. "
"'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that "
"the city is in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}
]
MESSAGES = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the temperate will be in Dallas, "
"in fahrenheit?",
},
]
FUNC_NAME = "get_current_weather"
FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
def extract_reasoning_and_calls(chunks: list):
    """Fold streamed chat chunks into reasoning text and tool calls.

    Args:
        chunks: streamed chat-completion chunks; each is expected to have
            `choices[0].delta` with optional `tool_calls` / `reasoning`.

    Returns:
        Tuple of (reasoning, arguments, function_names) where
        ``arguments[i]`` is the concatenated argument string and
        ``function_names[i]`` the name of the i-th streamed tool call.
    """
    reasoning = ""
    tool_call_idx = -1
    arguments = []
    function_names = []
    for chunk in chunks:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            tool_call = delta.tool_calls[0]
            if tool_call.index != tool_call_idx:
                # A new tool call started: open fresh accumulators for it.
                tool_call_idx = tool_call.index
                arguments.append("")
                function_names.append("")
            if tool_call.function:
                if tool_call.function.name:
                    function_names[tool_call_idx] = tool_call.function.name
                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments
        # Guard against deltas where `reasoning` is missing OR None (e.g.
        # final chunks) — `reasoning += None` would raise TypeError.
        elif getattr(delta, "reasoning", None):
            reasoning += delta.reasoning
    return reasoning, arguments, function_names
# test streaming
@pytest.mark.asyncio
async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Streamed chunks must carry both reasoning text and the tool call."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )
    chunks = []
    async for chunk in stream:
        chunks.append(chunk)

    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
    assert len(reasoning) > 0
    assert len(function_names) > 0 and function_names[0] == FUNC_NAME
    assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
# test full generate
@pytest.mark.asyncio
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Non-streaming response must carry reasoning plus the parsed tool call."""
    tool_calls = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    assert len(tool_calls.choices[0].message.reasoning) > 0
    assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
    assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS

View File

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server():
    """Server with chunked prefill enabled for long-prompt streaming tests."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
    client: openai.AsyncOpenAI,
):
    """Continuous usage stats must stay consistent on a chunked-prefill prompt."""
    # Test stream with long prompt
    prompt = "What is the capital of France?" * 400
    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )
    tokens_received = 0
    finished = False
    async for chunk in stream:
        # Every chunk carries usage; totals must always reconcile.
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )

        if not finished:
            tokens_received += 1
            assert chunk.choices[0].text

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
    client: openai.AsyncOpenAI,
):
    """Chat variant: usage stats and logprobs stay consistent per chunk,
    allowing at most one empty role-only chunk at stream start."""
    # Test stream with long prompt
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?" * 400},
    ]
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )
    tokens_received = 0
    empty_chunks_received = 0
    finished = False
    async for chunk in stream:
        # Every chunk carries usage; totals must always reconcile.
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )

        if not finished:
            if chunk.choices[0].delta.content == "":
                # when there is no tokens generated
                assert chunk.usage.completion_tokens == 0
                assert chunk.choices[0].logprobs is None
                empty_chunks_received += 1
            else:
                tokens_received += 1

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received

    assert empty_chunks_received <= 1

View File

@@ -0,0 +1,210 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.utils.argparse_utils import FlexibleArgumentParser
from ...utils import VLLM_PATH
LORA_MODULE = {
"name": "module2",
"path": "/path/to/module2",
"base_model_name": "llama",
}
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
@pytest.fixture
def serve_parser():
    """Fresh CLI parser configured with vLLM's serve arguments."""
    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
    return make_arg_parser(parser)
### Test config parsing
def test_config_arg_parsing(serve_parser, cli_config_file):
    """Explicit CLI flags override --config values regardless of order."""
    args = serve_parser.parse_args([])
    assert args.port == 8000
    args = serve_parser.parse_args(["--config", cli_config_file])
    assert args.port == 12312
    # CLI flag wins whether it comes before or after --config.
    args = serve_parser.parse_args(
        [
            "--config",
            cli_config_file,
            "--port",
            "9000",
        ]
    )
    assert args.port == 9000
    args = serve_parser.parse_args(
        [
            "--port",
            "9000",
            "--config",
            cli_config_file,
        ]
    )
    assert args.port == 9000
### Tests for LoRA module parsing
def test_valid_key_value_format(serve_parser):
    """Legacy `name=path` syntax parses into a LoRAModulePath."""
    # Test old format: name=path
    args = serve_parser.parse_args(
        [
            "--lora-modules",
            "module1=/path/to/module1",
        ]
    )
    expected = [LoRAModulePath(name="module1", path="/path/to/module1")]
    assert args.lora_modules == expected
def test_valid_json_format(serve_parser):
    """JSON object syntax parses into a LoRAModulePath with base model."""
    # Test valid JSON format input
    args = serve_parser.parse_args(
        [
            "--lora-modules",
            json.dumps(LORA_MODULE),
        ]
    )
    expected = [
        LoRAModulePath(name="module2", path="/path/to/module2", base_model_name="llama")
    ]
    assert args.lora_modules == expected
def test_invalid_json_format(serve_parser):
    """Malformed JSON (missing closing brace) makes argparse exit."""
    # Test invalid JSON format input, missing closing brace
    with pytest.raises(SystemExit):
        serve_parser.parse_args(
            ["--lora-modules", '{"name": "module3", "path": "/path/to/module3"']
        )
def test_invalid_type_error(serve_parser):
    """A value that is neither JSON nor key=value makes argparse exit."""
    # Test type error when values are not JSON or key=value
    with pytest.raises(SystemExit):
        serve_parser.parse_args(
            [
                "--lora-modules",
                "invalid_format",  # This is not JSON or key=value format
            ]
        )
def test_invalid_json_field(serve_parser):
    """Well-formed JSON missing a required field makes argparse exit."""
    # Test valid JSON format but missing required fields
    with pytest.raises(SystemExit):
        serve_parser.parse_args(
            [
                "--lora-modules",
                '{"name": "module4"}',  # Missing required 'path' field
            ]
        )
def test_empty_values(serve_parser):
    """An empty --lora-modules value parses to an empty module list."""
    parsed = serve_parser.parse_args(["--lora-modules", ""])
    assert parsed.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
    """Old key=value and JSON syntaxes can be mixed in one invocation."""
    # Test multiple valid inputs (both old and JSON format)
    args = serve_parser.parse_args(
        [
            "--lora-modules",
            "module1=/path/to/module1",
            json.dumps(LORA_MODULE),
        ]
    )
    expected = [
        LoRAModulePath(name="module1", path="/path/to/module1"),
        LoRAModulePath(
            name="module2", path="/path/to/module2", base_model_name="llama"
        ),
    ]
    assert args.lora_modules == expected
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser"""
    # NOTE(review): the name says "passes" but the test expects a TypeError —
    # consider renaming to *_fails_* to match the docstring and assertion.
    # If we enable-auto-tool-choice, explode with no tool-call-parser
    args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(args)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser"""
    args = serve_parser.parse_args(
        args=[
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "mistral",
        ]
    )
    # Should not raise.
    validate_parsed_serve_args(args)
def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
    """Auto tool choice combined with a reasoning parser must be rejected."""
    cli = ["--enable-auto-tool-choice", "--reasoning-parser", "deepseek_r1"]
    parsed = serve_parser.parse_args(args=cli)
    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
def test_passes_with_reasoning_parser(serve_parser):
    """A reasoning parser on its own is a valid configuration."""
    parsed = serve_parser.parse_args(args=["--reasoning-parser", "deepseek_r1"])
    # Should not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_happy_paths(serve_parser):
    """An existing chat-template file passes validation."""
    template_path = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_path])
    # Should not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_sad_paths(serve_parser):
    """A nonexistent chat-template path must fail validation."""
    parsed = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
@pytest.mark.parametrize(
    "cli_args, expected_middleware",
    [
        (
            ["--middleware", "middleware1", "--middleware", "middleware2"],
            ["middleware1", "middleware2"],
        ),
        ([], []),
    ],
)
def test_middleware(serve_parser, cli_args, expected_middleware):
    """Repeated --middleware flags accumulate in order; none yields []."""
    parsed = serve_parser.parse_args(args=cli_args)
    assert parsed.middleware == expected_middleware

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
class TestWorkerExtension:
    """Worker-side extension exercised through the collective_rpc endpoint."""

    def get_model_name(self) -> str:
        """Return a plain string (exercises a non-pydantic return type)."""
        return MODEL_NAME

    def echo_args_kwargs(self, *args, **kwargs) -> dict[str, Any]:
        """Return positional and keyword arguments verbatim, plus a count."""
        return {
            "args": list(args),
            "kwargs": kwargs,
            "total_items": len(args) + len(kwargs),
        }

    def return_none(self, *args, **kwargs) -> None:
        """Return nothing, to exercise a None RPC result."""
        return None
@pytest.fixture(scope="module")
def server():
    """Spin up a dev-mode server exposing the test worker extension."""
    cli_args = [
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--worker-extension-cls",
        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
    ]
    env = {"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"}
    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=env) as remote:
        yield remote
def test_get_model_name(server):
    """The endpoint returns one result per worker holding the model name."""
    url = server.url_for("collective_rpc")
    resp = requests.post(url, json={"method": "get_model_name"})
    assert resp.status_code == 200
    body = resp.json()
    assert "results" in body
    assert body["results"] == [MODEL_NAME]
def test_return_none(server):
    """A method returning nothing surfaces as a single null result."""
    resp = requests.post(
        server.url_for("collective_rpc"), json={"method": "return_none"}
    )
    assert resp.status_code == 200
    assert resp.json()["results"] == [None]
def test_echo_args_kwargs(server):
    """args/kwargs round-trip through the RPC and come back in a dict."""
    call_args = ["arg1", "arg2"]
    call_kwargs = {"key1": "value1", "key2": "value2"}
    payload = {
        "method": "echo_args_kwargs",
        "args": call_args,
        "kwargs": call_kwargs,
    }
    resp = requests.post(server.url_for("collective_rpc"), json=payload)
    assert resp.status_code == 200
    echoed = resp.json()["results"][0]
    assert echoed["args"] == call_args
    assert echoed["kwargs"] == call_kwargs
    assert echoed["total_items"] == len(call_args) + len(call_kwargs)

View File

@@ -0,0 +1,216 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any
from unittest.mock import AsyncMock, MagicMock
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
BASE_MODEL_PATHS = [
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]
@dataclass
class MockHFConfig:
    # Minimal stand-in for a HuggingFace config; only model_type is provided.
    model_type: str = "any"
@dataclass
class MockModelConfig:
    """Lightweight stand-in for vLLM's ModelConfig used by these tests.

    NOTE(review): the attributes below without type annotations are plain
    class attributes shared across instances, not dataclass fields; only the
    annotated ones participate in the generated __init__.
    """

    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        # Returns {} when no per-request sampling overrides were configured.
        return self.diff_sampling_param or {}
def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
    """Build an OpenAIServingCompletion over *engine* with input processing stubbed.

    _process_inputs is replaced by an AsyncMock that returns the engine prompt
    unchanged (and no extra metadata), so tests can drive create_completion
    without a real tokenizer/processing pipeline.
    """
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_completion = OpenAIServingCompletion(
        engine,
        models,
        request_logger=None,
    )

    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        # Pass the prompt through untouched; no extra request metadata.
        return dict(engine_prompt), {}

    serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    return serving_completion
@pytest.mark.asyncio
async def test_completion_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_completion = _build_serving_completion(mock_engine)

    # One already-finished output whose finish_reason signals a failure.
    errored_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[
            CompletionOutput(
                index=0,
                text="",
                token_ids=[],
                cumulative_logprob=None,
                logprobs=None,
                finish_reason="error",
            )
        ],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield errored_output

    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=False,
    )

    response = await serving_completion.create_completion(request)

    # The errored output must surface as a 500 ErrorResponse.
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InternalServerError"
    assert response.error.message == "Internal server error"
    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
async def test_completion_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_completion = _build_serving_completion(mock_engine)

    def _streamed_output(finish_reason, finished):
        # One streamed RequestOutput carrying a single completion delta.
        return RequestOutput(
            request_id="test-id",
            prompt="Test prompt",
            prompt_token_ids=[1, 2, 3],
            prompt_logprobs=None,
            outputs=[
                CompletionOutput(
                    index=0,
                    text="Hello",
                    token_ids=[100],
                    cumulative_logprob=None,
                    logprobs=None,
                    finish_reason=finish_reason,
                )
            ],
            finished=finished,
            metrics=None,
            lora_request=None,
            encoder_prompt=None,
            encoder_prompt_token_ids=None,
        )

    async def mock_generate(*args, **kwargs):
        # A normal chunk followed by a terminal errored chunk.
        yield _streamed_output(None, False)
        yield _streamed_output("error", True)

    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=True,
    )

    response = await serving_completion.create_completion(request)
    chunks = [chunk async for chunk in response]

    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    # The SSE stream must still be terminated properly.
    assert chunks[-1] == "data: [DONE]\n\n"

View File

@@ -0,0 +1,486 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json
import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
"options": {
"$ref": "#/$defs/WeatherOptions",
"description": "Optional parameters for weather query",
},
},
"required": ["country", "unit"],
"$defs": {
"WeatherOptions": {
"title": "WeatherOptions",
"type": "object",
"additionalProperties": False,
"properties": {
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"default": "celsius",
"description": "Temperature unit",
"title": "Temperature Unit",
},
"include_forecast": {
"type": "boolean",
"default": False,
"description": "Whether to include a 24-hour forecast",
"title": "Include Forecast",
},
"language": {
"type": "string",
"default": "zh-CN",
"description": "Language of the response",
"title": "Language",
"enum": ["zh-CN", "en-US", "ja-JP"],
},
},
},
},
},
},
},
{
"type": "function",
"function": {
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"days": {
"type": "integer",
"description": "Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["country", "days", "unit"],
},
},
},
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the current weather is in Berlin and the "
"forecast for the next 5 days, in fahrenheit?",
},
]
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Qwen3 server with hermes tool parsing and xgrammar structured output."""
    cli_args = [
        # half precision keeps CI memory and latency low
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
        "--reasoning-parser",
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote:
        yield remote
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize(
    "tool_choice",
    [
        "auto",
        "required",
        {"type": "function", "function": {"name": "get_current_weather"}},
    ],
)
@pytest.mark.parametrize("enable_thinking", [True, False])
async def test_function_tool_use(
    client: openai.AsyncOpenAI,
    model_name: str,
    stream: bool,
    tool_choice: str | dict,
    enable_thinking: bool,
):
    """Tool calls are produced for every tool_choice mode, streaming and not,
    and reasoning content is emitted when thinking is enabled."""
    if not stream:
        # Non-streaming test
        chat_completion = await client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
        )
        if enable_thinking:
            # Reasoning text must be present and non-empty when thinking is on.
            assert chat_completion.choices[0].message.reasoning is not None
            assert chat_completion.choices[0].message.reasoning != ""
        assert chat_completion.choices[0].message.tool_calls is not None
        assert len(chat_completion.choices[0].message.tool_calls) > 0
    else:
        # Streaming test
        output_stream = await client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            stream=True,
            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
        )
        # Accumulate tool-call and reasoning deltas across the whole stream.
        output = []
        reasoning = []
        async for chunk in output_stream:
            if chunk.choices:
                if enable_thinking and getattr(
                    chunk.choices[0].delta, "reasoning", None
                ):
                    reasoning.append(chunk.choices[0].delta.reasoning)
                if chunk.choices[0].delta.tool_calls:
                    output.extend(chunk.choices[0].delta.tool_calls)
        assert len(output) > 0
        if enable_thinking:
            assert len(reasoning) > 0
@pytest.fixture(scope="module")
def k2_server():  # noqa: F811
    """Same server as `server`, but with the HF config overridden to kimi_k2.

    kv_lora_rank is nulled out to avoid an error in the is_deepseek_mla check
    when model_type is forced to kimi_k2 (hack to test the kimi_k2 tool-id
    format on a small model).
    """
    cli_args = [
        # half precision keeps CI memory and latency low
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
        "--reasoning-parser",
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
    ]
    overrides = {"model_type": "kimi_k2", "kv_lora_rank": None}
    with RemoteOpenAIServer(
        MODEL_NAME, cli_args, override_hf_configs=overrides
    ) as remote:
        yield remote
@pytest_asyncio.fixture
async def k2_client(k2_server):
    """Async client bound to the kimi_k2-flavoured server."""
    async with k2_server.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("tool_choice", ["required"])
async def test_tool_id_kimi_k2(
    k2_client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: str
):
    """kimi_k2 tool-call ids follow the 'functions.<name>:<index>' format."""
    if not stream:
        # Non-streaming test
        chat_completion = await k2_client.chat.completions.create(
            messages=messages, model=model_name, tools=tools, tool_choice=tool_choice
        )
        assert chat_completion.choices[0].message.tool_calls is not None
        assert len(chat_completion.choices[0].message.tool_calls) > 0
        assert chat_completion.choices[0].message.tool_calls[0].id in [
            "functions.get_current_weather:0",
            "functions.get_forecast:1",
        ]
    else:
        # Streaming test
        output_stream = await k2_client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            stream=True,
        )
        output = []
        async for chunk in output_stream:
            if chunk.choices and chunk.choices[0].delta.tool_calls:
                output.extend(chunk.choices[0].delta.tool_calls)
        for o in output:
            # Deltas may omit the id; when present it must match the format.
            assert o.id is None or o.id in [
                "functions.get_current_weather:0",
                "functions.get_forecast:1",
            ]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("arguments", ["{}", ""])
async def test_no_args_tool_call(
    client: openai.AsyncOpenAI, model_name: str, arguments: str
):
    """A parameterless tool round-trips with both '' and '{}' as arguments."""
    # Step 1: Define a tool that requires no parameters
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_time",
                "description": "Get the current date and time. No parameters needed.",
                "parameters": {
                    "type": "object",
                    "properties": {},  # No parameters
                    "required": [],  # No required fields
                },
            },
        }
    ]
    messages = [{"role": "user", "content": "What time is it now?"}]
    # Step 2: Send user message and let model decide whether to call the tool
    response = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        tools=tools,
        tool_choice="auto",  # Let model choose automatically
    )
    # Step 3: Check if model wants to call a tool
    message = response.choices[0].message
    if message.tool_calls:
        # Get the first tool call
        tool_call = message.tool_calls[0]
        tool_name = tool_call.function.name
        # Step 4: Execute the tool locally (no parameters)
        if tool_name == "get_current_time":
            # Test both empty string and "{}" for no-arg tool calls
            tool_call.function.arguments = arguments
            messages.append(message)
            current_time = datetime.datetime.now()
            result = current_time.isoformat()
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": result,
                }
            )
            # Step 5: Send tool result back to model to continue conversation
            final_response = await client.chat.completions.create(
                model=model_name,
                messages=messages,
            )
            # Output final natural language response
            assert final_response.choices[0].message.content is not None
    else:
        # No tool called — just print model's direct reply
        assert message.content is not None
@pytest.mark.asyncio
async def test_named_tool_use(
    client: openai.AsyncOpenAI,
    sample_json_schema,
):
    """Forcing a named tool returns schema-valid arguments, both non-streaming
    and streaming; a follow-up request must yield different field values."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": (
                "Give an example JSON for an employee profile using the specified tool."
            ),
        },
    ]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "dummy_function_name",
                "description": "This is a dummy function",
                "parameters": sample_json_schema,
            },
        }
    ]
    tool_choice = {"type": "function", "function": {"name": "dummy_function_name"}}
    # non-streaming
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        tools=tools,
        temperature=0.0,
        tool_choice=tool_choice,
    )
    message = chat_completion.choices[0].message
    # With a forced tool, no plain text content should be produced.
    assert len(message.content) == 0
    json_string = message.tool_calls[0].function.arguments
    json1 = json.loads(json_string)
    jsonschema.validate(instance=json1, schema=sample_json_schema)
    messages.append({"role": "assistant", "content": json_string})
    messages.append(
        {"role": "user", "content": "Give me another one with a different name and age"}
    )
    # streaming
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        tools=tools,
        tool_choice=tool_choice,
        temperature=0.0,
        stream=True,
    )
    output = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        assert delta.content is None or len(delta.content) == 0
        if delta.tool_calls:
            output.append(delta.tool_calls[0].function.arguments)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    json2 = json.loads("".join(output))
    jsonschema.validate(instance=json2, schema=sample_json_schema)
    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]
@pytest.mark.asyncio
async def test_inconsistent_tool_choice_and_tools(
    client: openai.AsyncOpenAI, sample_json_schema
):
    """Mismatched tool_choice/tools combinations must be rejected with a 400."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example JSON for an employee profile that "
            f"fits this schema: {sample_json_schema}",
        },
    ]
    # Case 1: tool_choice names a function but no tools are supplied.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tool_choice={
                "type": "function",
                "function": {"name": "dummy_function_name"},
            },
        )
    # Case 2: tool_choice names a function that is not among the tools.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "dummy_function_name",
                        "description": "This is a dummy function",
                        "parameters": sample_json_schema,
                    },
                }
            ],
            tool_choice={
                "type": "function",
                "function": {"name": "nondefined_function_name"},
            },
        )
    # Case 3: tool_choice is a malformed (empty) object.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "dummy_function_name",
                        "description": "This is a dummy function",
                        "parameters": sample_json_schema,
                    },
                }
            ],
            tool_choice={},
        )

View File

@@ -0,0 +1,307 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import json
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import torch
# downloading lora to test lora requests
from openai import BadRequestError
from transformers import AutoConfig
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
LORA_SERVING_MODEL_NAME = "opt125m-lora"
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
@pytest.fixture(scope="module", params=["use-lora"])
def default_server_args(
    request: pytest.FixtureRequest, opt125_lora_files: str
) -> list[str]:
    """Base CLI args for the prompt-embeds server; optionally adds LoRA."""
    server_args = [
        # half precision for CI speed/memory savings
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        # Prompt Embeds server args
        "--enable-prompt-embeds",
    ]
    if request.param == "use-lora":
        lora_spec = json.dumps(
            {
                "name": LORA_SERVING_MODEL_NAME,
                "path": opt125_lora_files,
                "base_model_name": MODEL_NAME,
            }
        )
        server_args += [
            "--enable-lora",
            "--lora-module",
            lora_spec,
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]
    return server_args
EXAMPLE_PROMPTS = [
"Hello, my name is",
"What is an LLM?",
]
def _encode_embeds(embeds: torch.Tensor):
buffer = io.BytesIO()
torch.save(embeds, buffer)
return base64.b64encode(buffer.getvalue()).decode("utf-8")
@pytest.fixture(scope="module")
def example_prompt_embeds(hf_runner):
    """Create example embeddings and return them as base64 encoded strings."""
    with hf_runner(MODEL_NAME) as hf_model:
        embeddings = hf_model.get_prompt_embeddings(EXAMPLE_PROMPTS)
    return [_encode_embeds(e) for e in embeddings]
@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request):
    """Server fixture, parametrized with/without frontend multiprocessing."""
    extra_flag = request.param
    if extra_flag:
        default_server_args.append(extra_flag)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote:
        yield remote
@pytest_asyncio.fixture
async def client_with_prompt_embeds(server_with_prompt_embeds):
    """Async client bound to the prompt-embeds server."""
    async with server_with_prompt_embeds.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_prompt_embeds(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
    model_name: str,
):
    """End-to-end /completions coverage for prompt_embeds: single input,
    batch, streaming, batch streaming, and mixed text + embeds requests.
    """
    encoded_embeds, encoded_embeds2 = example_prompt_embeds
    # Test case: Single prompt embeds input
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    assert len(completion.choices[0].text) >= 1
    assert completion.choices[0].prompt_logprobs is None
    # Test case: batch completion with prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    assert len(completion.choices) == 2
    assert len(completion.choices[0].text) >= 1
    assert len(completion.choices[1].text) >= 1
    # Test case: streaming with prompt_embeds
    single_completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    single_output = single_completion.choices[0].text
    stream = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        stream=True,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    # Streamed chunks must concatenate to the non-streamed result.
    assert "".join(chunks) == single_output
    # Test case: batch streaming with prompt_embeds
    stream = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        stream=True,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    chunks_stream_embeds: list[list[str]] = [[], []]
    finish_reason_count = 0
    async for chunk in stream:
        chunks_stream_embeds[chunk.choices[0].index].append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    assert finish_reason_count == 2
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    assert len(chunks_stream_embeds[0]) > 0
    assert len(chunks_stream_embeds[1]) > 0
    # Test case: mixed text and prompt_embeds
    completion_mixed = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="This is a prompt",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    # BUG FIX: this previously asserted on the stale `completion` object from
    # the batch case above; the mixed request is the one that must yield two
    # choices (one for the embeds input, one for the text prompt).
    assert len(completion_mixed.choices) == 2
    completion_text_only = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="This is a prompt",
        max_tokens=5,
        temperature=0.0,
    )
    completion_embeds_only = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    # Embeddings responses should be handled first
    assert completion_mixed.choices[0].text == completion_embeds_only.choices[0].text
    assert completion_mixed.choices[1].text == completion_text_only.choices[0].text
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_errors_with_prompt_embeds(
    client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str
):
    """Garbage base64 in prompt_embeds must be rejected with a 400."""
    with pytest.raises(BadRequestError):
        await client_with_prompt_embeds.completions.create(
            prompt="",
            model=model_name,
            max_tokens=5,
            temperature=0.0,
            extra_body={"prompt_embeds": "invalid_base64"},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("logprobs_arg", [1, 0])
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_logprobs_and_prompt_embeds(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
    logprobs_arg: int,
    model_name: str,
):
    """Logprobs are returned for prompt_embeds requests, single and batched."""
    encoded_embeds, encoded_embeds2 = example_prompt_embeds
    # Test case: Logprobs using prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        echo=False,
        logprobs=logprobs_arg,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    logprobs = completion.choices[0].logprobs
    assert logprobs is not None
    # All per-token arrays must match the 5 generated tokens.
    assert len(logprobs.text_offset) == 5
    assert len(logprobs.token_logprobs) == 5
    assert len(logprobs.top_logprobs) == 5
    for top_logprobs in logprobs.top_logprobs[1:]:
        # The sampled token may add one entry beyond the requested top-k.
        assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
    assert len(logprobs.tokens) == 5
    # Test case: Log probs with batch completion and prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        echo=False,
        logprobs=logprobs_arg,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    assert len(completion.choices) == 2
    for choice in completion.choices:
        logprobs = choice.logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) == 5
        assert len(logprobs.token_logprobs) == 5
        assert len(logprobs.top_logprobs) == 5
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) == 5
@pytest.mark.asyncio
async def test_prompt_logprobs_raises_error(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
):
    """prompt_logprobs together with prompt_embeds is rejected as incompatible."""
    encoded, _ = example_prompt_embeds
    with pytest.raises(BadRequestError, match="not compatible"):
        await client_with_prompt_embeds.completions.create(
            model=MODEL_NAME,
            prompt="",
            max_tokens=5,
            temperature=0.0,
            extra_body={"prompt_embeds": encoded, "prompt_logprobs": True},
        )
@pytest.mark.asyncio
async def test_empty_prompt_embeds(
    client_with_prompt_embeds: openai.AsyncOpenAI,
) -> None:
    """An empty prompt_embeds list with a text prompt must be accepted."""
    # Should not raise.
    await client_with_prompt_embeds.completions.create(
        model=MODEL_NAME,
        prompt="Hello",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": []},
    )

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download
from ...conftest import AudioTestAssets
from ...utils import RemoteOpenAIServer
# NOTE - the tests in this module are currently analogous to test_chat, but are
# separated to avoid OOM killing due to module-scoped servers, since we
# need a multimodal model for these tests.
# Contains a modality specific lora alongside the base model
MULTIMODAL_MODEL_NAME = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MULTIMODAL_MODEL_NAME, "speech-lora")
ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
@pytest.fixture(scope="module")
def multimodal_server():  # noqa: F811
    """Phi-4-multimodal server with a speech LoRA mapped as the audio default."""
    cli_args = [
        # half precision for CI speed/memory savings
        "--dtype",
        "half",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"speech={AUDIO_LORA_PATH}",
        "--max-lora-rank",
        "320",
        "--max-num-seqs",
        "2",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.8",
        # apply the speech LoRA automatically to audio inputs
        "--default-mm-loras",
        f'{{"audio": "{AUDIO_LORA_PATH}"}}',
    ]
    with RemoteOpenAIServer(
        MULTIMODAL_MODEL_NAME, cli_args, max_wait_seconds=480
    ) as remote:
        yield remote
@pytest_asyncio.fixture
async def multi_modal_client(multimodal_server):
    """Async client bound to the multimodal server."""
    async with multimodal_server.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # base model with default lora should give the same response as lora model
    "model_name",
    [MULTIMODAL_MODEL_NAME, "speech"],
)
async def test_default_mm_lora_chat_completions(
    model_name: str,
    multi_modal_client: openai.AsyncOpenAI,
    audio_assets: AudioTestAssets,
):
    """The default audio LoRA is applied automatically, so both the base model
    name and the explicit 'speech' LoRA must produce the LoRA transcript.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you transcribe this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_assets[0].url},
                },
            ],
        }
    ]
    chat_completion = await multi_modal_client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=128, temperature=0.0
    )
    assert len(chat_completion.choices) > 0
    message = chat_completion.choices[0].message
    # BUG FIX: `len(message.content) >= 0` was vacuously true; require a
    # non-empty transcription before comparing it to the expected output.
    assert message.content is not None and len(message.content) > 0
    assert message.content == ACTIVE_MM_LORA_RESPONSE

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request):  # noqa: F811
    """Qwen3 chat server started with --enable-force-include-usage.

    Uses a fixed port (auto_port=False), so this fixture cannot run twice
    concurrently on the same host.
    """
    cli_args = [
        # half precision for CI speed/memory savings
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "128",
        "--enforce-eager",
        "--max-num-seqs",
        "4",
        "--enable-force-include-usage",
        "--port",
        "55857",
        "--gpu-memory-utilization",
        "0.2",
    ]
    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", cli_args, auto_port=False) as remote:
        yield remote
@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
    """Async OpenAI client bound to the force-include-usage chat server."""
    async with chat_server_with_force_include_usage.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
    chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
    """Usage is attached only to choice-less chunks when forced server-side.

    Also verifies completion_tokens grows monotonically across usage-bearing
    chunks and that total_tokens == prompt_tokens + completion_tokens.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]
    stream = await chat_client_with_force_include_usage.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
    )
    last_completion_tokens = 0
    async for chunk in stream:
        if not len(chunk.choices):
            # Choice-less chunk: this is where the forced usage must appear.
            assert chunk.usage.prompt_tokens >= 0
            assert (
                last_completion_tokens == 0
                or chunk.usage.completion_tokens > last_completion_tokens
                or (
                    not chunk.choices
                    and chunk.usage.completion_tokens == last_completion_tokens
                )
            )
            assert chunk.usage.total_tokens == (
                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
            )
            # Track progress; without this update the monotonicity assertion
            # above was vacuous (last_completion_tokens stayed 0 forever).
            last_completion_tokens = chunk.usage.completion_tokens
        else:
            # Content chunks must not carry usage.
            assert chunk.usage is None
@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
    """Whisper transcription server started with --enable-force-include-usage."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-num-seqs",
        "4",
        "--enforce-eager",
        "--enable-force-include-usage",
        "--gpu-memory-utilization",
        "0.2",
    ]
    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def transcription_client_with_force_include_usage(
    transcription_server_with_force_include_usage,
):
    """Async OpenAI client bound to the force-include-usage Whisper server."""
    async with (
        transcription_server_with_force_include_usage.get_async_client() as async_client
    ):
        yield async_client
@pytest.mark.asyncio
async def test_transcription_with_enable_force_include_usage(
    transcription_client_with_force_include_usage, winning_call
):
    """A streamed transcription must finish with a choice-less usage chunk;
    content-bearing chunks must not expose usage at all."""
    stream = await (
        transcription_client_with_force_include_usage.audio.transcriptions.create(
            model="openai/whisper-large-v3-turbo",
            file=winning_call,
            language="en",
            temperature=0.0,
            stream=True,
            timeout=30,
        )
    )
    async for chunk in stream:
        if chunk.choices:
            # Regular content chunk: no usage attribute expected.
            assert not hasattr(chunk, "usage")
        else:
            # Final choice-less chunk carries the aggregated usage stats.
            stats = chunk.usage
            assert isinstance(stats, dict)
            for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
                assert stats[key] > 0

View File

@@ -0,0 +1,280 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for GPT-OSS structural tags functionality (PR #25515)."""
import json
from unittest.mock import Mock
import pytest
from vllm.entrypoints.openai.protocol import (
StructuredOutputsParams,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.reasoning.gptoss_reasoning_parser import (
GptOssReasoningParser,
)
class TestGptOssStructuralTagsIntegration:
    """Integration tests for structural tags in GPT-OSS tool calls.

    Each test builds the structural-tag JSON via
    GptOssReasoningParser.prepare_structured_tag and inspects the parsed
    result; tool availability is simulated with Mock ToolServer objects.
    """
    @pytest.fixture
    def mock_tokenizer(self):
        """Create a mock tokenizer."""
        tokenizer = Mock()
        tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
        return tokenizer
    @pytest.fixture
    def gptoss_parser(self, mock_tokenizer):
        """Create a real GptOssReasoningParser instance."""
        return GptOssReasoningParser(mock_tokenizer)
    @pytest.fixture
    def tool_server_with_python(self):
        """Create a tool server with Python tool enabled."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
        return tool_server
    @pytest.fixture
    def tool_server_empty(self):
        """Create a tool server with no tools."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(return_value=False)
        return tool_server
    def test_end_to_end_no_tools(self, gptoss_parser):
        """Test end-to-end flow when no tools are available."""
        # Test the parser directly
        result = gptoss_parser.prepare_structured_tag(None, None)
        parsed_result = json.loads(result)
        # Verify basic structure
        assert parsed_result["type"] == "structural_tag"
        assert parsed_result["format"]["type"] == "triggered_tags"
        assert len(parsed_result["format"]["tags"]) == 1
        # Verify only analysis channel is allowed
        analysis_tag = parsed_result["format"]["tags"][0]
        assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
        assert analysis_tag["content"]["type"] == "any_text"
        assert analysis_tag["end"] == "<|end|>"
        # Verify triggers
        assert parsed_result["format"]["triggers"] == ["<|channel|>analysis"]
        assert parsed_result["format"]["stop_after_first"] is False
    def test_end_to_end_with_python_tool(self, gptoss_parser, tool_server_with_python):
        """Test end-to-end flow with Python tool enabled."""
        result = gptoss_parser.prepare_structured_tag(None, tool_server_with_python)
        parsed_result = json.loads(result)
        # Should have analysis tag + 2 python tags
        assert len(parsed_result["format"]["tags"]) == 3
        # Verify all expected tags are present
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        expected_begins = [
            "<|channel|>analysis<|message|>",
            "<|channel|>commentary to=python",
            "<|channel|>analysis to=python",
        ]
        for expected in expected_begins:
            assert expected in tag_begins
        # Verify triggers include commentary
        assert "<|channel|>analysis" in parsed_result["format"]["triggers"]
        assert "<|channel|>commentary to=" in parsed_result["format"]["triggers"]
    def test_structured_outputs_params_integration(
        self, gptoss_parser, tool_server_with_python
    ):
        """Test integration with StructuredOutputsParams."""
        # Generate structural tag
        structural_tag = gptoss_parser.prepare_structured_tag(
            None, tool_server_with_python
        )
        # Create StructuredOutputsParams
        params = StructuredOutputsParams(structural_tag=structural_tag)
        # Verify the tag is properly stored and accessible
        assert params.structural_tag == structural_tag
        # Verify the tag is valid JSON
        parsed_tag = json.loads(params.structural_tag)
        assert parsed_tag["type"] == "structural_tag"
    @pytest.mark.parametrize(
        "browser, python, container, expected_tags",
        [
            # No tools
            (False, False, False, 1),
            # Single tool
            (True, False, False, 3),
            # Multiple tools
            (True, True, False, 5),
            # All tools
            (True, True, True, 7),
        ],
    )
    def test_tool_server_interaction_flow(
        self, gptoss_parser, browser, python, container, expected_tags
    ):
        """Test the complete tool server interaction flow."""
        # Create a mock ToolServer
        tool_server = Mock(spec=ToolServer)
        # Simulate tool availability based on parameters
        tool_server.has_tool = Mock(
            side_effect=lambda tool: {
                "browser": browser,
                "python": python,
                "container": container,
            }.get(tool, False)
        )
        # Run the parser and verify results
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)
        # Validate number of tags: 1 analysis tag + 2 per enabled tool
        assert len(parsed_result["format"]["tags"]) == expected_tags
        # Verify tool-specific tags exist for enabled tools
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        for tool, enabled in {
            "browser": browser,
            "python": python,
            "container": container,
        }.items():
            if enabled:
                assert f"<|channel|>commentary to={tool}" in tag_begins
                assert f"<|channel|>analysis to={tool}" in tag_begins
    def test_original_tag_preservation(self, gptoss_parser, tool_server_with_python):
        """Test that original tags are preserved when provided."""
        original_tag = '{"type": "custom_tag", "data": "preserved"}'
        result = gptoss_parser.prepare_structured_tag(
            original_tag, tool_server_with_python
        )
        # Should return original tag unchanged
        assert result == original_tag
    @pytest.mark.parametrize(
        "tools",
        [
            [],
            ["browser"],
            ["python"],
            ["container"],
            ["browser", "python"],
            ["browser", "container"],
            ["python", "container"],
            ["browser", "python", "container"],
        ],
    )
    def test_json_validity_comprehensive(self, gptoss_parser, tools):
        """Test JSON validity across all possible tool combinations."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool in tools)
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        # Should be valid JSON
        parsed_result = json.loads(result)
        # Should have correct structure
        assert parsed_result["type"] == "structural_tag"
        assert "format" in parsed_result
        assert "tags" in parsed_result["format"]
        assert "triggers" in parsed_result["format"]
        # Tag count should be: 1 (analysis) + 2 * len(tools)
        expected_tag_count = 1 + (2 * len(tools))
        assert len(parsed_result["format"]["tags"]) == expected_tag_count
    def test_error_handling_invalid_tool_server(self, gptoss_parser):
        """Test error handling with invalid tool server."""
        # Tool server that raises exceptions
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=Exception("Tool server error"))
        # The parser does not swallow tool-server failures: the exception
        # propagates to the caller (no fallback tag is produced).
        with pytest.raises(Exception, match="Tool server error"):
            gptoss_parser.prepare_structured_tag(None, tool_server)
    def test_concurrent_requests_isolation(self, gptoss_parser):
        """Test that concurrent requests don't interfere with each other."""
        # Simulate concurrent requests with different tool servers
        tool_server_1 = Mock(spec=ToolServer)
        tool_server_1.has_tool = Mock(side_effect=lambda tool: tool == "python")
        tool_server_2 = Mock(spec=ToolServer)
        tool_server_2.has_tool = Mock(side_effect=lambda tool: tool == "browser")
        # Generate tags for both servers (sequential calls; "concurrent" here
        # means independent requests sharing a single parser instance)
        result_1 = gptoss_parser.prepare_structured_tag(None, tool_server_1)
        result_2 = gptoss_parser.prepare_structured_tag(None, tool_server_2)
        # Parse results
        parsed_1 = json.loads(result_1)
        parsed_2 = json.loads(result_2)
        # Verify they have different tool configurations
        tags_1 = [tag["begin"] for tag in parsed_1["format"]["tags"]]
        tags_2 = [tag["begin"] for tag in parsed_2["format"]["tags"]]
        # Result 1 should have python tags
        assert "<|channel|>commentary to=python" in tags_1
        assert "<|channel|>commentary to=browser" not in tags_1
        # Result 2 should have browser tags
        assert "<|channel|>commentary to=browser" in tags_2
        assert "<|channel|>commentary to=python" not in tags_2
    def test_tag_format_consistency(self, gptoss_parser):
        """Test that all generated tags follow consistent format."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(
            side_effect=lambda tool: tool in ["python", "browser"]
        )
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)
        # Verify all tags have required fields
        for tag in parsed_result["format"]["tags"]:
            assert "begin" in tag
            assert "content" in tag
            assert "end" in tag
            assert tag["content"]["type"] == "any_text"
            assert tag["end"] == "<|end|>"
            # Verify begin format
            assert tag["begin"].startswith("<|channel|>")
    def test_trigger_configuration(self, gptoss_parser):
        """Test trigger configuration for different tool setups."""
        # Test with no tools
        result_no_tools = gptoss_parser.prepare_structured_tag(None, None)
        parsed_no_tools = json.loads(result_no_tools)
        assert parsed_no_tools["format"]["triggers"] == ["<|channel|>analysis"]
        # Test with tools
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
        result_with_tools = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_with_tools = json.loads(result_with_tools)
        expected_triggers = ["<|channel|>analysis", "<|channel|>commentary to="]
        assert set(parsed_with_tools["format"]["triggers"]) == set(expected_triggers)

View File

@@ -0,0 +1,294 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import json
import shutil
from contextlib import suppress
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
# Each tuple: (test name / adapter dir name, adapter_config.json override,
# substring expected in the resulting 400 error message).
BADREQUEST_CASES = [
    (
        "test_rank",
        {"r": 1024},
        "is greater than max_lora_rank",
    ),
    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
    (
        "test_modules_to_save",
        {"modules_to_save": ["lm_head"]},
        "only supports modules_to_save being None",
    ),
]
@pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, qwen3_lora_files):
    """vLLM server with one statically registered LoRA (JSON --lora-modules
    syntax) and runtime LoRA loading enabled via environment variable."""
    # Define the json format LoRA module configurations
    lora_module_1 = {
        "name": "qwen3-lora",
        "path": qwen3_lora_files,
        "base_model_name": MODEL_NAME,
    }
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        json.dumps(lora_module_1),
        "--max-lora-rank",
        "64",
        # small CPU cache so concurrent-load tests exercise LRU eviction
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]
    # Enable the /v1/load_lora_adapter endpoint
    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server_with_lora_modules_json):
    """Async OpenAI client bound to the LoRA-enabled server."""
    async with server_with_lora_modules_json.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
    """The statically configured LoRA must report correct lineage in /models."""
    listing = await client.models.list()
    base, *adapters = listing.data
    # The first entry is always the served base model.
    assert base.id == MODEL_NAME
    assert base.root == MODEL_NAME
    assert base.parent is None
    # Every remaining entry is a LoRA rooted at the adapter path.
    for adapter in adapters:
        assert adapter.root == qwen3_lora_files
        assert adapter.parent == MODEL_NAME
    assert adapters[0].id == "qwen3-lora"
@pytest.mark.asyncio
async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
    """A runtime-loaded adapter must appear in /models with correct lineage."""
    load_response = await client.post(
        "load_lora_adapter",
        cast_to=str,
        body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files},
    )
    # Ensure adapter loads before querying /models
    assert "success" in load_response
    listing = await client.models.list()
    # The freshly loaded adapter is appended last.
    newest = listing.data[-1]
    assert newest.root == qwen3_lora_files
    assert newest.parent == MODEL_NAME
    assert newest.id == "qwen3-lora-3"
@pytest.mark.asyncio
async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
    """Loading an adapter from a nonexistent path must surface a 404."""
    request_body = {"lora_name": "notfound", "lora_path": "/not/an/adapter"}
    with pytest.raises(openai.NotFoundError):
        await client.post("load_lora_adapter", cast_to=str, body=request_body)
@pytest.mark.asyncio
async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
    """A directory with a corrupt adapter_config.json must yield a 400."""
    bad_adapter_dir = tmp_path / "invalid_files"
    bad_adapter_dir.mkdir()
    (bad_adapter_dir / "adapter_config.json").write_text("this is not json")
    with pytest.raises(openai.BadRequestError):
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": "invalid-json", "lora_path": str(bad_adapter_dir)},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("test_name,config_change,expected_error", BADREQUEST_CASES)
async def test_dynamic_lora_badrequests(
    client: openai.AsyncOpenAI,
    tmp_path,
    qwen3_lora_files,
    test_name: str,
    config_change: dict,
    expected_error: str,
):
    """Unsupported adapter configurations are rejected with a descriptive 400."""
    # Work on a private copy of the adapter so mutations don't leak.
    adapter_dir = tmp_path / test_name
    shutil.copytree(qwen3_lora_files, adapter_dir)
    # Patch the adapter config with the unsupported setting.
    config_path = adapter_dir / "adapter_config.json"
    adapter_config = json.loads(config_path.read_text())
    adapter_config.update(config_change)
    config_path.write_text(json.dumps(adapter_config))
    # The server must refuse to load the modified adapter.
    with pytest.raises(openai.BadRequestError, match=expected_error):
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": test_name, "lora_path": str(adapter_dir)},
        )
@pytest.mark.asyncio
async def test_multiple_lora_adapters(
    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
    """Validate that many loras can be dynamically registered and inferenced
    with concurrently"""
    # This test file configures the server with --max-cpu-loras=2 and this test
    # will concurrently load 10 adapters, so it should flex the LRU cache
    async def load_and_run_adapter(adapter_name: str):
        # Register the adapter, then run a few completions against it.
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
        )
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
            )
    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}")) for i in range(10)
    ]
    # asyncio.wait() returns the Task objects themselves, so the previous
    # `isinstance(r, Exception)` check over its result set could never fail,
    # silently hiding task errors. gather(..., return_exceptions=True) yields
    # the actual results or raised exceptions, surfacing failures.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"
@pytest.mark.asyncio
async def test_loading_invalid_adapters_does_not_break_others(
    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
    """Repeated failed adapter loads must not disturb in-flight requests that
    use an already-registered adapter, nor block later successful loads."""
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    (invalid_files / "adapter_config.json").write_text("this is not json")
    stop_good_requests_event = asyncio.Event()
    async def run_good_requests(client):
        # Run chat completions requests until event set
        results = []
        while not stop_good_requests_event.is_set():
            try:
                batch = await client.completions.create(
                    model="qwen3-lora",
                    prompt=["Hello there", "Foo bar bazz buzz"],
                    max_tokens=5,
                )
                results.append(batch)
            except Exception as e:
                # Collect failures instead of raising so the main body can
                # assert on every outcome after stopping the loop.
                results.append(e)
        return results
    # Create task to run good requests
    good_task = asyncio.create_task(run_good_requests(client))
    # Run a bunch of bad adapter loads
    for _ in range(25):
        with suppress(openai.NotFoundError):
            await client.post(
                "load_lora_adapter",
                cast_to=str,
                body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
            )
    for _ in range(25):
        with suppress(openai.BadRequestError):
            await client.post(
                "load_lora_adapter",
                cast_to=str,
                body={"lora_name": "invalid", "lora_path": str(invalid_files)},
            )
    # Ensure all the running requests with lora adapters succeeded
    stop_good_requests_event.set()
    results = await good_task
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"
    # Ensure we can load another adapter and run it
    await client.post(
        "load_lora_adapter",
        cast_to=str,
        body={"lora_name": "valid", "lora_path": qwen3_lora_files},
    )
    await client.completions.create(
        model="valid",
        prompt=["Hello there", "Foo bar bazz buzz"],
        max_tokens=5,
    )
@pytest.mark.asyncio
async def test_beam_search_with_lora_adapters(
    client: openai.AsyncOpenAI,
    tmp_path,
    qwen3_lora_files,
):
    """Validate that async beam search can be used with lora."""
    async def load_and_run_adapter(adapter_name: str):
        # Register the adapter, then run a few beam-search completions on it.
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
        )
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
                extra_body=dict(use_beam_search=True),
            )
    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}")) for i in range(3)
    ]
    # gather(..., return_exceptions=True) yields results or the raised
    # exceptions; asyncio.wait() returned Task objects, so the previous
    # `isinstance(r, Exception)` check could never fire and failures passed.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"

View File

@@ -0,0 +1,230 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
from unittest.mock import AsyncMock, MagicMock
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
# Single base model exposed by the mocked OpenAIServingModels instance.
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
# Registry key under which the mock resolver is (un)registered per test.
MOCK_RESOLVER_NAME = "mock_test_resolver"
@dataclass
class MockHFConfig:
    # Stand-in for a HuggingFace model config; only model_type is consumed here.
    model_type: str = "any"
@dataclass
class MockModelConfig:
    """Minimal mock ModelConfig for testing."""
    model: str = MODEL_NAME
    tokenizer: str = MODEL_NAME
    trust_remote_code: bool = False
    tokenizer_mode: str = "auto"
    max_model_len: int = 100
    tokenizer_revision: str | None = None
    multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)
    logits_processors: list[str] | None = None
    logits_processor_pattern: str | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    # Plain class attribute (unannotated, so NOT a dataclass field).
    encoder_config = None
    generation_config: str = "auto"
    skip_tokenizer_init: bool = False
    def get_diff_sampling_param(self):
        # Mirror the real ModelConfig API: empty dict when no overrides set.
        return self.diff_sampling_param or {}
class MockLoRAResolver(LoRAResolver):
    """Resolver stub: knows exactly two adapter names and rejects the rest."""
    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        # Map of resolvable adapter names to their fake integer ids.
        known_adapters = {"test-lora": 1, "invalid-lora": 2}
        int_id = known_adapters.get(lora_name)
        if int_id is None:
            # Unknown adapters resolve to nothing (caller returns 404).
            return None
        return LoRARequest(
            lora_name=lora_name,
            lora_int_id=int_id,
            lora_local_path=f"/fake/path/{lora_name}",
        )
@pytest.fixture(autouse=True)
def register_mock_resolver():
    """Fixture to register and unregister the mock LoRA resolver.

    autouse=True means every test in this module runs with the resolver
    installed in the global LoRAResolverRegistry.
    """
    resolver = MockLoRAResolver()
    LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, resolver)
    yield
    # Cleanup: remove the resolver after the test runs
    if MOCK_RESOLVER_NAME in LoRAResolverRegistry.resolvers:
        del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME]
@pytest.fixture
def mock_serving_setup():
    """Provides a mocked engine and serving completion instance."""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False
    tokenizer = get_tokenizer(MODEL_NAME)
    mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer)
    async def mock_add_lora_side_effect(lora_request: LoRARequest):
        """Simulate engine behavior when adding LoRAs."""
        if lora_request.lora_name == "test-lora":
            # Simulate successful addition
            return True
        if lora_request.lora_name == "invalid-lora":
            # Simulate failure during addition (e.g. invalid format)
            raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}")
        return True
    mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)
    async def mock_generate(*args, **kwargs):
        # Empty async generator: generation "completes" with no outputs,
        # which is enough for the call-tracking assertions in these tests.
        for _ in []:
            yield _
    mock_engine.generate = MagicMock(spec=AsyncLLM.generate, side_effect=mock_generate)
    # Reset call counters so tests observe only their own interactions.
    mock_engine.generate.reset_mock()
    mock_engine.add_lora.reset_mock()
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    models = OpenAIServingModels(
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_completion = OpenAIServingCompletion(
        mock_engine, models, request_logger=None
    )
    # Bypass real input processing; tests only inspect engine interactions.
    serving_completion._process_inputs = AsyncMock(
        return_value=(MagicMock(name="engine_request"), {})
    )
    return mock_engine, serving_completion
@pytest.mark.asyncio
async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch):
    """A resolvable adapter name is added to the engine and used to generate."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup
    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )
    # Suppress potential errors during the mocked generate call,
    # as we are primarily checking for add_lora and generate calls
    with suppress(Exception):
        await serving_completion.create_completion(req_found)
    # The resolved adapter must have been registered with the engine exactly once.
    mock_engine.add_lora.assert_awaited_once()
    called_lora_request = mock_engine.add_lora.call_args[0][0]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == lora_model_name
    # ...and then forwarded to generate() as the lora_request kwarg.
    mock_engine.generate.assert_called_once()
    called_lora_request = mock_engine.generate.call_args[1]["lora_request"]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == lora_model_name
@pytest.mark.asyncio
async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch):
    """An unresolvable adapter name yields a 404 without touching the engine."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup
    missing_model = "non-existent-lora-adapter"
    request = CompletionRequest(
        model=missing_model,
        prompt="what is 1+1?",
    )
    response = await serving_completion.create_completion(request)
    # The resolver returned None, so the engine must never be invoked.
    mock_engine.add_lora.assert_not_awaited()
    mock_engine.generate.assert_not_called()
    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.NOT_FOUND.value
    assert missing_model in response.error.message
@pytest.mark.asyncio
async def test_serving_completion_resolver_add_lora_fails(
    mock_serving_setup, monkeypatch
):
    """If the engine rejects the resolved adapter, the request returns a 400."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup
    # "invalid-lora" resolves, but the mocked engine raises on add_lora.
    invalid_model = "invalid-lora"
    req = CompletionRequest(
        model=invalid_model,
        prompt="what is 1+1?",
    )
    response = await serving_completion.create_completion(req)
    # Assert add_lora was called before the failure
    mock_engine.add_lora.assert_awaited_once()
    called_lora_request = mock_engine.add_lora.call_args[0][0]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == invalid_model
    # Assert generate was *not* called due to the failure
    mock_engine.generate.assert_not_called()
    # Assert the correct error response
    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.BAD_REQUEST.value
    assert invalid_model in response.error.message
@pytest.mark.asyncio
async def test_serving_completion_flag_not_set(mock_serving_setup):
    """Without VLLM_ALLOW_RUNTIME_LORA_UPDATING, no resolution is attempted."""
    mock_engine, serving_completion = mock_serving_setup
    request = CompletionRequest(
        model="test-lora",
        prompt="Generate with LoRA",
    )
    await serving_completion.create_completion(request)
    # Runtime LoRA updating is disabled by default, so the engine stays idle.
    mock_engine.add_lora.assert_not_called()
    mock_engine.generate.assert_not_called()

View File

@@ -0,0 +1,155 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import anthropic
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# Underlying model; it is served to clients under an Anthropic-style alias.
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server(): # noqa: F811
    """Qwen3 server exposed under the Anthropic-style model name
    claude-3-7-sonnet-latest, with hermes tool-call parsing enabled."""
    args = [
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--served-model-name",
        "claude-3-7-sonnet-latest",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async Anthropic client pointed at the vLLM server."""
    async with server.get_async_client_anthropic() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_simple_messages(client: anthropic.AsyncAnthropic):
    """A plain user message completes as an assistant turn with end_turn."""
    response = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert response.stop_reason == "end_turn"
    assert response.role == "assistant"
    print(f"Anthropic response: {response.model_dump_json()}")
@pytest.mark.asyncio
async def test_system_message(client: anthropic.AsyncAnthropic):
    """A system prompt plus user message still ends as a normal turn."""
    request_kwargs = {
        "model": "claude-3-7-sonnet-latest",
        "max_tokens": 1024,
        "system": "you are a helpful assistant",
        "messages": [{"role": "user", "content": "how are you!"}],
    }
    response = await client.messages.create(**request_kwargs)
    assert response.stop_reason == "end_turn"
    assert response.role == "assistant"
    print(f"Anthropic response: {response.model_dump_json()}")
@pytest.mark.asyncio
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
    """Streaming starts with a message_start event that carries usage stats."""
    stream = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )
    start_event = None
    total_events = 0
    async for event in stream:
        total_events += 1
        # Remember only the first message_start event.
        if event.type == "message_start" and start_event is None:
            start_event = event
        print(event.model_dump_json())
    assert total_events > 0
    assert start_event is not None, "message_start chunk was never observed"
    assert start_event.message is not None, "first chunk should include message"
    assert start_event.message.usage is not None, (
        "first chunk should include usage stats"
    )
    # Nothing generated yet at stream start, but the prompt is tokenized.
    assert start_event.message.usage.output_tokens == 0
    assert start_event.message.usage.input_tokens > 5
@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
    """A weather question with a matching tool must stop with tool_use."""
    weather_tool = {
        "name": "get_current_weather",
        "description": "Useful for querying the weather in a specified city.",
        "input_schema": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City or region, for example: "
                    "New York, London, Tokyo, etc.",
                }
            },
            "required": ["location"],
        },
    }
    response = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": "What's the weather like in New York today?"}
        ],
        tools=[weather_tool],
        stream=False,
    )
    assert response.stop_reason == "tool_use"
    assert response.role == "assistant"
    print(f"Anthropic response: {response.model_dump_json()}")
@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
    """Streaming tool-call requests should yield a non-empty event stream.

    Previously this test only printed chunks and asserted nothing, so it
    could pass even if the stream produced no events at all.
    """
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in New York today?",
            }
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=True,
    )
    chunk_count = 0
    async for chunk in resp:
        chunk_count += 1
        print(chunk.model_dump_json())
    # The stream must actually deliver events.
    assert chunk_count > 0

View File

@@ -0,0 +1,454 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import subprocess
import sys
import tempfile
import time
from http import HTTPStatus
import openai
import pytest
import pytest_asyncio
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer
from vllm import version
from ...conftest import LocalAssetServer
from ...utils import RemoteOpenAIServer
# Models exercised by the metrics tests, keyed by modality.
MODELS = {
    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
}
# Previous minor vLLM version; used for --show-hidden-metrics-for-version.
PREV_MINOR_VERSION = version._prev_minor_version()
@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
    # Parametrizes the module over both the text and multimodal models.
    yield request.param
@pytest.fixture(scope="module")
def default_server_args():
    """Baseline CLI flags shared by every server parametrization.

    NOTE(review): module-scoped, so this exact list object is cached and
    reused; consumers must not mutate it in place (copy before appending).
    """
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]
@pytest.fixture(
    scope="module",
    params=[
        "",
        "--enable-chunked-prefill",
        "--disable-frontend-multiprocessing",
        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
    ],
)
def server(model_key, default_server_args, request):
    """Start a RemoteOpenAIServer for each (model, extra-flag) combination.

    Copy the arg list before appending: `default_server_args` is a
    module-scoped fixture whose list is cached, so appending to it in place
    (as the previous code did) leaked each param's extra flag into every
    subsequent server instantiation in this module.
    """
    args = list(default_server_args)
    if request.param:
        args.append(request.param)
    model_name = MODELS[model_key]
    with RemoteOpenAIServer(model_name, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the metrics server under test."""
    async with server.get_async_client() as cl:
        yield cl
# Fixed prompt reused by every request so token counts are predictable.
_PROMPT = "Hello my name is Robert and I love magic"
def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
num_prompt_tokens = len(prompt_ids)
# {metric_family: [(suffix, expected_value)]}
return {
"vllm:time_to_first_token_seconds": [("_count", num_requests)],
"vllm:time_per_output_token_seconds": [
("_count", num_requests * (max_tokens - 1))
],
"vllm:e2e_request_latency_seconds": [("_count", num_requests)],
"vllm:request_queue_time_seconds": [("_count", num_requests)],
"vllm:request_inference_time_seconds": [("_count", num_requests)],
"vllm:request_prefill_time_seconds": [("_count", num_requests)],
"vllm:request_decode_time_seconds": [("_count", num_requests)],
"vllm:request_prompt_tokens": [
("_sum", num_requests * num_prompt_tokens),
("_count", num_requests),
],
"vllm:request_generation_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:request_params_n": [("_count", num_requests)],
"vllm:request_params_max_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:iteration_tokens_total": [
(
"_sum",
num_requests * (num_prompt_tokens + max_tokens),
),
("_count", num_requests * max_tokens),
],
"vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
"vllm:generation_tokens": [("_total", num_requests * max_tokens)],
"vllm:request_success": [("_total", num_requests)],
}
@pytest.mark.asyncio
async def test_metrics_counts(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Check that request/token counters add up after N identical requests."""
    if model_key == "multimodal":
        pytest.skip("Unnecessary test")
    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)
    num_requests = 10
    max_tokens = 10
    for _ in range(num_requests):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
            model=model_name,
            prompt=prompt_ids,
            max_tokens=max_tokens,
        )
    response = requests.get(server.url_for("metrics"))
    print(response.text)
    assert response.status_code == HTTPStatus.OK
    # Loop over all expected metric_families
    expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
    for metric_family, suffix_values_list in expected_values.items():
        # NOTE(review): this membership test compares bare family names (e.g.
        # "vllm:time_to_first_token_seconds") against EXPECTED_METRICS_V1 /
        # HIDDEN_DEPRECATED_METRICS, which mostly list suffixed sample names
        # ("..._sum"/"..._count"), so many families appear to be skipped here.
        # Confirm whether that filtering is intentional.
        if metric_family not in EXPECTED_METRICS_V1 or (
            not server.show_hidden_metrics
            and metric_family in HIDDEN_DEPRECATED_METRICS
        ):
            continue
        found_metric = False
        # Check to see if the metric_family is found in the prom endpoint.
        for family in text_string_to_metric_families(response.text):
            if family.name == metric_family:
                found_metric = True
                # Check that each suffix is found in the prom endpoint.
                for suffix, expected_value in suffix_values_list:
                    metric_name_w_suffix = f"{metric_family}{suffix}"
                    found_suffix = False
                    for sample in family.samples:
                        if sample.name == metric_name_w_suffix:
                            found_suffix = True
                            # For each suffix, make sure the value matches
                            # what we expect.
                            assert sample.value == expected_value, (
                                f"{metric_name_w_suffix} expected value of "
                                f"{expected_value} did not match found value "
                                f"{sample.value}"
                            )
                            break
                    assert found_suffix, (
                        f"Did not find {metric_name_w_suffix} in prom endpoint"
                    )
                break
        assert found_metric, f"Did not find {metric_family} in prom endpoint"
# Metric names (including histogram sample suffixes) that /metrics is
# expected to expose for every served model.
EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:kv_cache_usage_perc",
    "vllm:prefix_cache_queries",
    "vllm:prefix_cache_hits",
    "vllm:num_preemptions_total",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:iteration_tokens_total",
    "vllm:cache_config_info",
    "vllm:request_success_total",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
    "vllm:request_params_n_sum",
    "vllm:request_params_n_bucket",
    "vllm:request_params_n_count",
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
    "vllm:inter_token_latency_seconds_sum",
    "vllm:inter_token_latency_seconds_bucket",
    "vllm:inter_token_latency_seconds_count",
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
    "vllm:request_queue_time_seconds_sum",
    "vllm:request_queue_time_seconds_bucket",
    "vllm:request_queue_time_seconds_count",
    "vllm:request_inference_time_seconds_sum",
    "vllm:request_inference_time_seconds_bucket",
    "vllm:request_inference_time_seconds_count",
    "vllm:request_prefill_time_seconds_sum",
    "vllm:request_prefill_time_seconds_bucket",
    "vllm:request_prefill_time_seconds_count",
    "vllm:request_decode_time_seconds_sum",
    "vllm:request_decode_time_seconds_bucket",
    "vllm:request_decode_time_seconds_count",
]
# Extra metrics expected only when serving a multimodal model.
EXPECTED_METRICS_MM = [
    "vllm:mm_cache_queries",
    "vllm:mm_cache_hits",
]
# Deprecated metrics that are hidden unless the server was started with
# --show-hidden-metrics-for-version.
HIDDEN_DEPRECATED_METRICS: list[str] = [
    "vllm:gpu_cache_usage_perc",
    "vllm:gpu_prefix_cache_queries",
    "vllm:gpu_prefix_cache_hits",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
]
@pytest.mark.asyncio
async def test_metrics_exist(
    local_asset_server: LocalAssetServer,
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Smoke-check that every expected metric name appears in /metrics."""
    model_name = MODELS[model_key]
    # sending a request triggers the metrics to be logged.
    if model_key == "text":
        await client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=5,
            temperature=0.0,
        )
    else:
        # Multimodal path: send one image (served locally) plus text.
        # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
        await client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": local_asset_server.url_for(
                                    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                                ),
                            },
                        },
                        {"type": "text", "text": "What's in this image?"},
                    ],
                }
            ],
            max_tokens=5,
            temperature=0.0,
        )
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK
    expected_metrics = EXPECTED_METRICS_V1
    if model_key == "multimodal":
        # NOTE: Don't use in-place assignment
        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
    for metric in expected_metrics:
        # Hidden deprecated metrics are only present when explicitly enabled.
        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
            continue
        assert metric in response.text
@pytest.mark.asyncio
async def test_abort_metrics_reset(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Aborted requests must leave the gauges (running/waiting/KV usage) at zero."""
    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
        server,
    )
    # Expect no running requests or kvcache usage
    assert running_requests == 0
    assert waiting_requests == 0
    assert kv_cache_usage == 0.0
    # Start some long-running requests that we can abort
    tasks = []
    for _ in range(3):
        task = asyncio.create_task(
            client.completions.create(
                model=model_name,
                prompt=prompt_ids,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0,
            )
        )
        tasks.append(task)
    # Wait a bit for requests to start processing
    await asyncio.sleep(0.5)
    # Check that we have running requests
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
        server,
    )
    # Expect running requests and kvcache usage
    # NOTE(review): timing-sensitive — assumes 0.5s is enough for requests to
    # be scheduled; may be flaky on a slow machine.
    assert running_requests > 0
    assert kv_cache_usage > 0
    # Cancel all tasks to abort the requests
    for task in tasks:
        task.cancel()
    # Wait for cancellations to be processed
    await asyncio.sleep(1.0)
    # Check that metrics have reset to zero
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK
    # Verify running and waiting requests counts and KV cache usage are zero
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
        _get_running_metrics_from_api(server)
    )
    assert running_requests_after == 0, (
        f"Expected 0 running requests after abort, got {running_requests_after}"
    )
    assert waiting_requests_after == 0, (
        f"Expected 0 waiting requests after abort, got {waiting_requests_after}"
    )
    assert kv_cache_usage_after == 0, (
        f"Expected 0% KV cache usage after abort, got {kv_cache_usage_after}"
    )
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Scrape /metrics and return (running_count, waiting_count, kv_cache_usage)."""
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK
    # Collect the single sample matching each wanted family name.
    wanted = {
        "vllm:num_requests_running": None,
        "vllm:num_requests_waiting": None,
        "vllm:kv_cache_usage_perc": None,
    }
    for family in text_string_to_metric_families(response.text):
        if family.name not in wanted:
            continue
        for sample in family.samples:
            if sample.name == family.name:
                wanted[family.name] = sample.value
                break
    running_requests = wanted["vllm:num_requests_running"]
    waiting_requests = wanted["vllm:num_requests_waiting"]
    kv_cache_usage = wanted["vllm:kv_cache_usage_perc"]
    # All three gauges must be present in the scrape.
    assert running_requests is not None
    assert waiting_requests is not None
    assert kv_cache_usage is not None
    return running_requests, waiting_requests, kv_cache_usage
def test_metrics_exist_run_batch():
    """Verify the run_batch entrypoint exposes a /metrics endpoint.

    Starts `vllm.entrypoints.openai.run_batch` as a subprocess with
    --enable-metrics, polls until its HTTP server answers, then checks
    /metrics returns 200.

    Fix: the original polled `is_server_up` in an unbounded loop and never
    reaped the child on failure — if the batch server crashed at startup the
    test hung forever and leaked the subprocess. We now bound the wait with a
    deadline, fail fast if the child exits early, and terminate the child on
    any error path.
    """
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
    base_url = "0.0.0.0"
    port = "8001"
    server_url = f"http://{base_url}:{port}"

    def is_server_up(url):
        # Poll helper: True once the metrics HTTP server answers with 200.
        try:
            response = requests.get(url)
            return response.status_code == 200
        except requests.ConnectionError:
            return False

    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(input_batch)
        input_file.flush()
        proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.openai.run_batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "intfloat/multilingual-e5-small",
                "--enable-metrics",
                "--url",
                base_url,
                "--port",
                port,
            ],
        )
        try:
            # Bounded startup wait: fail instead of hanging CI forever.
            deadline = time.monotonic() + 300
            while not is_server_up(server_url):
                assert proc.poll() is None, (
                    "run_batch exited before its metrics server came up"
                )
                assert time.monotonic() < deadline, (
                    "Timed out waiting for run_batch metrics server"
                )
                time.sleep(1)
            response = requests.get(server_url + "/metrics")
            assert response.status_code == HTTPStatus.OK
            proc.wait()
        except BaseException:
            # Don't leak the child process on any failure path.
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()
            raise

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
# NOTE(review): the LoRA adapter fixture (qwen3_lora_files) presumably targets
# this base model, but we're not testing generation quality here — only that
# the adapter is listed by the /models endpoint.
@pytest.fixture(scope="module")
def server(qwen3_lora_files):
    """Server with one LoRA adapter registered under the name qwen3-lora."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"qwen3-lora={qwen3_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async OpenAI client bound to the spawned server; closed on teardown.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files):
    """The base model is listed first; LoRA adapters follow with their root
    pointing at the adapter files."""
    model_list = (await client.models.list()).data
    base_model, *adapters = model_list
    assert base_model.id == MODEL_NAME
    assert base_model.root == MODEL_NAME
    for adapter in adapters:
        assert adapter.root == qwen3_lora_files
    assert adapters[0].id == "qwen3-lora"

View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from ...utils import VLLM_PATH, RemoteOpenAIServer
# Chat template shipped with the repo; fail fast at import time if missing.
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
def run_and_test_dummy_opt_api_server(model, tp=1):
    """Serve the plugin-registered dummy OPT model and check that a chat
    completion produces only the first token (empty once "<s>" is stripped)."""
    # the model is registered through the plugin
    launch_args = [
        "--gpu-memory-utilization",
        "0.10",
        "--dtype",
        "float32",
        "--chat-template",
        str(chatml_jinja_path),
        "--load-format",
        "dummy",
        "-tp",
        f"{tp}",
    ]
    with RemoteOpenAIServer(model, launch_args) as api_server:
        chat_client = api_server.get_client()
        chat_response = chat_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hello!"},
            ],
            temperature=0,
        )
        text = chat_response.choices[0].message.content
        assert text is not None
        # make sure only the first token is generated
        assert text.replace("<s>", "") == ""
def test_oot_registration_for_api_server(dummy_opt_path: str):
    # dummy_opt_path fixture provides the out-of-tree registered model path.
    run_and_test_dummy_opt_api_server(dummy_opt_path)

View File

@@ -0,0 +1,145 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Final
import pytest
import schemathesis
from hypothesis import settings
from schemathesis import GenerationConfig
from ...utils import RemoteOpenAIServer
# Enable draft OpenAPI 3.1 support in schemathesis.
schemathesis.experimental.OPEN_API_3_1.enable()
MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
# Cap on images per prompt, mirrored in --limit-mm-per-prompt below.
MAXIMUM_IMAGES = 2
# Per-request timeouts (seconds) for schemathesis-generated calls.
DEFAULT_TIMEOUT_SECONDS: Final[int] = 10
LONG_TIMEOUT_SECONDS: Final[int] = 60
@pytest.fixture(scope="module")
def server():
    """Small multimodal server used as the target of fuzzed OpenAPI calls."""
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def get_schema(server):
    """Fetch the live OpenAPI schema from the running server."""
    # avoid generating null (\x00) bytes in strings during test case generation
    return schemathesis.openapi.from_uri(
        f"{server.url_root}/openapi.json",
        generation_config=GenerationConfig(allow_x00=False),
    )
# Pytest-integrated schema object driving @schema.parametrize below.
schema = schemathesis.from_pytest_fixture("get_schema")
@schemathesis.hook
def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
    """Attach a filter that drops generated cases known to be invalid.

    Fix: the content scan previously called `item.get("type")` on every list
    item, which raised AttributeError when schemathesis generated a non-dict
    item (e.g. a bare string) inside the content list; items are now guarded
    with isinstance before inspection.
    """
    op = context.operation
    assert op is not None

    def no_invalid_types(case: schemathesis.models.Case):
        """
        This filter skips test cases with invalid data that schemathesis
        incorrectly generates due to permissive schema configurations.
        1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
        message content, which isn't implemented.
        2. Skips tool_calls with `"type": "custom"` which schemathesis
        incorrectly generates instead of the valid `"type": "function"`.
        Example test cases that are skipped:
        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize
        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
            http://localhost:8000/v1/chat/completions
        """  # noqa: E501
        if hasattr(case, "body") and isinstance(case.body, dict):
            messages = case.body.get("messages")
            if isinstance(messages, list) and len(messages) > 0:
                for message in messages:
                    if not isinstance(message, dict):
                        continue
                    # Check for invalid file type in tokenize endpoint
                    if op.method.lower() == "post" and op.path == "/tokenize":
                        content = message.get("content", [])
                        # Guard against non-dict items: schemathesis may also
                        # generate plain strings inside the content list.
                        if isinstance(content, list) and any(
                            isinstance(item, dict) and item.get("type") == "file"
                            for item in content
                        ):
                            return False
                    # Check for invalid tool_calls with non-function types
                    tool_calls = message.get("tool_calls", [])
                    if isinstance(tool_calls, list):
                        for tool_call in tool_calls:
                            if isinstance(tool_call, dict):
                                if tool_call.get("type") != "function":
                                    return False
                                if "custom" in tool_call:
                                    return False
            # Sometimes structured_outputs.grammar is generated to be empty
            # Causing a server error in EBNF grammar parsing
            # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
            structured_outputs = case.body.get("structured_outputs", {})
            grammar = (
                structured_outputs.get("grammar")
                if isinstance(structured_outputs, dict)
                else None
            )
            if grammar == "":
                # Allow None (will be handled as no grammar)
                # But skip empty strings
                return False
        return True

    return strategy.filter(no_invalid_types)
@schema.parametrize()
@schema.override(headers={"Content-Type": "application/json"})
@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
def test_openapi_stateless(case: schemathesis.Case):
    """Fuzz every stateless endpoint and validate responses against the schema."""
    key = (
        case.operation.method.upper(),
        case.operation.path,
    )
    if case.operation.path.startswith("/v1/responses"):
        # Skip responses API as it is meant to be stateful.
        return
    timeout = {
        # requires a longer timeout
        ("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
    }.get(key, DEFAULT_TIMEOUT_SECONDS)
    # No need to verify SSL certificate for localhost
    case.call_and_validate(verify=False, timeout=timeout)

View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for middleware that's off by default and can be toggled through
server arguments, mainly --api-key and --enable-request-id-headers.
"""
from http import HTTPStatus
import pytest
import requests
from ...utils import RemoteOpenAIServer
# Use a small embeddings model for faster startup and smaller memory footprint.
# Since we are not testing any chat functionality,
# using a chat capable model is overkill.
# Served model under test for the middleware toggles below.
MODEL_NAME = "intfloat/multilingual-e5-small"
@pytest.fixture(scope="module")
def server(request: pytest.FixtureRequest):
    """Start the server, appending any extra CLI params passed indirectly
    via @pytest.mark.parametrize(..., indirect=True)."""
    extra = getattr(request, "param", [])
    # A bare string param is treated as a single extra argument.
    if isinstance(extra, str):
        extra = [extra]
    launch_args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "512",
        "--enforce-eager",
        "--max-num-seqs",
        "2",
        *extra,
    ]
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_no_api_token(server: RemoteOpenAIServer):
    # Without --api-key, /v1 endpoints are open.
    response = requests.get(server.url_for("v1/models"))
    assert response.status_code == HTTPStatus.OK
@pytest.mark.asyncio
async def test_no_request_id_header(server: RemoteOpenAIServer):
    # Request-id header middleware is off by default.
    response = requests.get(server.url_for("health"))
    assert "X-Request-Id" not in response.headers
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_missing_api_token(server: RemoteOpenAIServer):
    # With --api-key set, requests without credentials are rejected.
    response = requests.get(server.url_for("v1/models"))
    assert response.status_code == HTTPStatus.UNAUTHORIZED
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_passed_api_token(server: RemoteOpenAIServer):
    # Supplying the configured key as a Bearer token grants access.
    response = requests.get(
        server.url_for("v1/models"), headers={"Authorization": "Bearer test"}
    )
    assert response.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_not_v1_api_token(server: RemoteOpenAIServer):
    # Authorization check is skipped for any paths that
    # don't start with /v1 (e.g. /health, as exercised here).
    response = requests.get(server.url_for("health"))
    assert response.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server",
    ["--enable-request-id-headers"],
    indirect=True,
)
@pytest.mark.asyncio
async def test_enable_request_id_header(server: RemoteOpenAIServer):
    # With the flag enabled, responses carry a generated request id.
    response = requests.get(server.url_for("health"))
    assert "X-Request-Id" in response.headers
    # 32 hex chars — presumably a uuid4().hex; confirm against middleware impl.
    assert len(response.headers.get("X-Request-Id", "")) == 32
@pytest.mark.parametrize(
    "server",
    ["--enable-request-id-headers"],
    indirect=True,
)
@pytest.mark.asyncio
async def test_custom_request_id_header(server: RemoteOpenAIServer):
    # A caller-supplied X-Request-Id must be echoed back unchanged.
    response = requests.get(
        server.url_for("health"), headers={"X-Request-Id": "Custom"}
    )
    assert "X-Request-Id" in response.headers
    assert response.headers.get("X-Request-Id") == "Custom"

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def monkeypatch_module():
    # Module-scoped monkeypatch (the builtin fixture is function-scoped).
    from _pytest.monkeypatch import MonkeyPatch
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()
@pytest.fixture(scope="module", params=[True])
def server(request, monkeypatch_module):
    """Server under test for the ORCA endpoint-load-metrics headers."""
    # NOTE(review): both `request.param` and `monkeypatch_module` are unused
    # here — possibly leftovers; confirm whether they can be dropped.
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async OpenAI client bound to the spawned server; closed on teardown.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_chat_completion_with_orca_header(server: RemoteOpenAIServer):
    """Chat completions should return the endpoint-load-metrics header when
    the request asks for the TEXT load-report format."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # NOTE(review): sync client in an async test, and the format header is set
    # both as a default header and per-request below — presumably redundant.
    client = openai.OpenAI(
        api_key="EMPTY",
        base_url=f"http://localhost:{server.port}/v1",
        default_headers={"endpoint-load-metrics-format": "TEXT"},
    )
    # 1. Use raw client to get response headers.
    raw_client = client.with_raw_response
    # 2. Make the API call using the raw_client
    response_with_raw = raw_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        extra_headers={"endpoint-load-metrics-format": "TEXT"},
    )
    # 3. Access the raw httpx.Response object
    raw_http_response = response_with_raw.http_response
    # 4. Get the headers from the httpx.Response object
    response_headers = raw_http_response.headers
    assert "endpoint-load-metrics" in response_headers
@pytest.mark.asyncio
async def test_completion_with_orca_header(client: openai.AsyncOpenAI):
    """Completions should return the endpoint-load-metrics header when the
    request asks for the JSON load-report format."""
    # 1. Use raw client to get response headers.
    raw_client = client.with_raw_response
    # 2. Make the API call using the raw_client
    completion = await raw_client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        extra_headers={"endpoint-load-metrics-format": "JSON"},
    )
    # 3. Access the raw httpx.Response object
    raw_http_response = completion.http_response
    # 4. Get the headers from the httpx.Response object
    response_headers = raw_http_response.headers
    assert "endpoint-load-metrics" in response_headers
@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
    """Basic completion sanity check: the ORCA header request must not break
    normal completion behavior (finish reason, usage accounting)."""
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        extra_headers={"endpoint-load-metrics-format": "JSON"},
        temperature=0.0,
    )
    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1
    choice = completion.choices[0]
    assert len(choice.text) >= 5
    assert choice.finish_reason == "length"
    # When using Qwen3-0.6B, prompt tokens=[9707, 11, 847, 829, 374]
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=5, total_tokens=10
    )
    # test using token IDs
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 1
    assert completion.choices[0].prompt_logprobs is None

View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
from unittest.mock import Mock
# imports for structured outputs tests
import openai
import pybase64
import pytest
import regex as re
import torch
from vllm.config import ModelConfig
from vllm.entrypoints.renderer import CompletionRenderer
from ...utils import RemoteOpenAIServer
@pytest.mark.asyncio
async def test_empty_prompt():
    """An empty prompt with empty prompt_embeds must be rejected with a 400."""
    model_name = "gpt2"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        with pytest.raises(
            openai.BadRequestError,
            match="Either prompt or prompt_embeds must be provided and non-empty.",
        ):
            await client.completions.create(
                model=model_name,
                prompt="",
                max_tokens=5,
                temperature=0.0,
                extra_body={"prompt_embeds": []},
            )
@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
    """Prompt token IDs outside the model vocabulary must be rejected with 400.

    Fix: `re.compile(".*out of vocabulary.*").pattern` was an identity no-op
    (compile-then-.pattern returns the input string). `pytest.raises(match=...)`
    already performs a regex search, so the bare substring is equivalent.
    """
    model_name = "gpt2"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
            await client.completions.create(
                model=model_name, prompt=[999999], max_tokens=5, temperature=0.0
            )
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
@pytest.mark.parametrize(
    "layout", [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]
)
@pytest.mark.parametrize("seq_len", [2, 10])
@pytest.mark.parametrize("hidden_size", [2, 10])
def test_load_prompt_embeds(
    dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
):
    """Round-trip prompt embeddings through base64 serialization and verify
    the renderer always yields a dense (strided) CPU tensor with equal values.

    We cover sparse layouts because users may send sparse tensors to shrink
    the payload; the renderer must densify them before they reach the engine.
    CPU-only on purpose: avoids initializing CUDA in a forking test suite and
    keeps the test independent of available accelerators.
    """
    model_config = Mock(spec=ModelConfig)
    model_config.enable_prompt_embeds = True
    renderer = CompletionRenderer(model_config, tokenizer=None)
    source = torch.randn((seq_len, hidden_size), dtype=dtype)
    # Convert the dense sample into the parametrized layout.
    to_layout = {
        torch.strided: lambda t: t.contiguous(),
        torch.sparse_coo: lambda t: t.to_sparse_coo(),
        torch.sparse_csc: lambda t: t.to_sparse_csc(),
        torch.sparse_csr: lambda t: t.to_sparse_csr(),
    }
    convert = to_layout.get(layout)
    if convert is not None:
        source = convert(source)
    buffer = io.BytesIO()
    torch.save(source, buffer)
    buffer.seek(0)
    encoded_tensor = pybase64.b64encode(buffer.getvalue())
    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
    assert len(loaded_prompt_embeds) == 1
    loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
    assert loaded_tensor.device.type == "cpu"
    assert loaded_tensor.layout == torch.strided
    torch.testing.assert_close(
        loaded_tensor, source.to("cpu").to_dense(), equal_nan=True
    )
@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("seq_len", [2])
@pytest.mark.parametrize("hidden_size", [2])
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
    """Loading prompt embeds must fail when the feature flag is disabled."""
    model_config = Mock(spec=ModelConfig)
    model_config.enable_prompt_embeds = False
    renderer = CompletionRenderer(model_config, tokenizer=None)
    payload = io.BytesIO()
    torch.save(torch.randn((seq_len, hidden_size), dtype=dtype), payload)
    payload.seek(0)
    encoded_tensor = pybase64.b64encode(payload.getvalue())
    # The error message should point the user at the enabling flag.
    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
        renderer.load_prompt_embeds(encoded_tensor)

View File

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai_harmony import (
Message,
)
from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
def test_serialize_message() -> None:
    """Plain dicts pass through untouched; Harmony Messages serialize back to
    the dict they were built from."""
    plain = {"a": 1, "b": "2"}
    assert serialize_message(plain) == plain
    expected = {
        "role": "assistant",
        "name": None,
        "content": [{"type": "text", "text": "Test 1"}],
        "channel": "analysis",
    }
    assert serialize_message(Message.from_dict(expected)) == expected
def test_serialize_messages() -> None:
    """None/empty input collapses to None; mixed lists serialize item-wise."""
    assert serialize_messages(None) is None
    assert serialize_messages([]) is None
    plain = {"a": 3, "b": "4"}
    expected = {
        "role": "assistant",
        "name": None,
        "content": [{"type": "text", "text": "Test 2"}],
        "channel": "analysis",
    }
    harmony_msg = Message.from_dict(expected)
    assert serialize_messages([harmony_msg, plain]) == [expected, plain]

View File

@@ -0,0 +1,261 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI
from openai_harmony import ToolDescription, ToolNamespaceConfig
from vllm.entrypoints.tool_server import MCPToolServer
from ...utils import RemoteOpenAIServer
# Model with built-in tool support, used for the MCP tool-server tests below.
MODEL_NAME = "openai/gpt-oss-20b"
@pytest.fixture(scope="module")
def monkeypatch_module():
    # Module-scoped monkeypatch (the builtin fixture is function-scoped).
    from _pytest.monkeypatch import MonkeyPatch
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()
@pytest.fixture(scope="module")
def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch):
    """Server with the demo tool server but NO MCP labels enabled."""
    args = ["--enforce-eager", "--tool-server", "demo"]
    with monkeypatch_module.context() as m:
        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
        m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
        # Helps the model follow instructions better
        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server
@pytest.fixture(scope="function")
def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch):
    """Server with MCP labels enabled for the code_interpreter/container tools."""
    args = ["--enforce-eager", "--tool-server", "demo"]
    with monkeypatch_module.context() as m:
        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
        m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
        m.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container")
        # Helps the model follow instructions better
        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server
@pytest_asyncio.fixture
async def mcp_disabled_client(mcp_disabled_server):
    # Async client against the server without MCP labels.
    async with mcp_disabled_server.get_async_client() as async_client:
        yield async_client
@pytest_asyncio.fixture
async def mcp_enabled_client(mcp_enabled_server):
    # Async client against the server with MCP labels enabled.
    async with mcp_enabled_server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, model_name: str):
    """With MCP labels enabled, an mcp tool request should produce real Python
    tool calls and responses on the analysis channel."""
    response = await mcp_enabled_client.responses.create(
        model=model_name,
        input=(
            "Execute the following code: "
            "import random; print(random.randint(1, 1000000))"
        ),
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        tools=[
            {
                "type": "mcp",
                "server_label": "code_interpreter",
                # URL unused for DemoToolServer
                "server_url": "http://localhost:8888",
            }
        ],
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Verify output messages: Tool calls and responses on analysis channel
    tool_call_found = False
    tool_response_found = False
    for message in response.output_messages:
        recipient = message.get("recipient")
        if recipient and recipient.startswith("python"):
            tool_call_found = True
            assert message.get("channel") == "analysis", (
                "Tool call should be on analysis channel"
            )
        author = message.get("author", {})
        if (
            author.get("role") == "tool"
            and author.get("name")
            and author.get("name").startswith("python")
        ):
            tool_response_found = True
            assert message.get("channel") == "analysis", (
                "Tool response should be on analysis channel"
            )
    assert tool_call_found, "Should have found at least one Python tool call"
    assert tool_response_found, "Should have found at least one Python tool response"
    # With a valid mcp tool, no fallback developer message should be injected.
    for message in response.input_messages:
        assert message.get("author").get("role") != "developer", (
            "No developer messages should be present with valid mcp tool"
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_with_allowed_tools_star(
    mcp_enabled_client: OpenAI, model_name: str
):
    """Test MCP tool with allowed_tools=['*'] to select all available tools.
    This E2E test verifies that the "*" wildcard works end-to-end.
    See test_serving_responses.py for detailed unit tests of "*" normalization.
    """
    response = await mcp_enabled_client.responses.create(
        model=model_name,
        input=(
            "Execute the following code: "
            "import random; print(random.randint(1, 1000000))"
        ),
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        tools=[
            {
                "type": "mcp",
                "server_label": "code_interpreter",
                "server_url": "http://localhost:8888",
                # Using "*" to allow all tools from this MCP server
                "allowed_tools": ["*"],
            }
        ],
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Verify tool calls work with allowed_tools=["*"]
    tool_call_found = False
    for message in response.output_messages:
        recipient = message.get("recipient")
        if recipient and recipient.startswith("python"):
            tool_call_found = True
            break
    assert tool_call_found, "Should have found at least one Python tool call with '*'"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_name: str):
    """With MCP disabled, no python tool call or tool response may appear in
    the output, and no developer message in the input transcript."""
    response = await mcp_disabled_client.responses.create(
        model=model_name,
        input=(
            "Execute the following code if the tool is present: "
            "import random; print(random.randint(1, 1000000))"
        ),
        tools=[
            {
                "type": "mcp",
                "server_label": "code_interpreter",
                # URL unused for DemoToolServer
                "server_url": "http://localhost:8888",
            }
        ],
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Verify output messages: No tool calls and responses
    tool_call_found = False
    tool_response_found = False
    for message in response.output_messages:
        recipient = message.get("recipient")
        if recipient and recipient.startswith("python"):
            tool_call_found = True
            assert message.get("channel") == "analysis", (
                "Tool call should be on analysis channel"
            )
        author = message.get("author", {})
        if (
            author.get("role") == "tool"
            and author.get("name")
            and author.get("name").startswith("python")
        ):
            tool_response_found = True
            assert message.get("channel") == "analysis", (
                "Tool response should be on analysis channel"
            )
    assert not tool_call_found, "Should not have a python call"
    assert not tool_response_found, "Should not have a tool response"
    for message in response.input_messages:
        assert message.get("author").get("role") != "developer", (
            "No developer messages should be present without a valid tool"
        )
def test_get_tool_description():
    """Test MCPToolServer.get_tool_description filtering logic.
    Note: The wildcard "*" is normalized to None by
    _extract_allowed_tools_from_mcp_requests before reaching this layer,
    so we only test None and specific tool filtering here.
    See test_serving_responses.py for "*" normalization tests.
    """
    pytest.importorskip("mcp")
    # Build a server with three registered tools under a single namespace.
    server = MCPToolServer()
    tool1 = ToolDescription.new(
        name="tool1", description="First", parameters={"type": "object"}
    )
    tool2 = ToolDescription.new(
        name="tool2", description="Second", parameters={"type": "object"}
    )
    tool3 = ToolDescription.new(
        name="tool3", description="Third", parameters={"type": "object"}
    )
    server.harmony_tool_descriptions = {
        "test_server": ToolNamespaceConfig(
            name="test_server", description="test", tools=[tool1, tool2, tool3]
        )
    }
    # Nonexistent server
    assert server.get_tool_description("nonexistent") is None
    # None (no filter) - returns all tools
    result = server.get_tool_description("test_server", allowed_tools=None)
    assert len(result.tools) == 3
    # Filter to specific tools - namespace order is preserved
    result = server.get_tool_description(
        "test_server", allowed_tools=["tool1", "tool3"]
    )
    assert len(result.tools) == 2
    assert result.tools[0].name == "tool1"
    assert result.tools[1].name == "tool3"
    # Single tool
    result = server.get_tool_description("test_server", allowed_tools=["tool2"])
    assert len(result.tools) == 1
    assert result.tools[0].name == "tool2"
    # No matching tools - returns None
    result = server.get_tool_description("test_server", allowed_tools=["nonexistent"])
    assert result is None
    # Empty list - returns None
    assert server.get_tool_description("test_server", allowed_tools=[]) is None

View File

@@ -0,0 +1,180 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import json
import pytest
import pytest_asyncio
from openai import OpenAI
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-8B"
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server: Qwen3 reasoning/tool parsing plus the demo
    tool server and the experimental parser context."""
    assert importlib.util.find_spec("gpt_oss") is not None, (
        "Harmony tests require gpt_oss package to be installed"
    )
    args = [
        "--reasoning-parser",
        "qwen3",
        "--max_model_len",
        "5000",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--tool-server",
        "demo",
    ]
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
    )
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """Smoke test: a plain text prompt completes successfully."""
    resp = await client.responses.create(model=model_name, input="What is 13 * 24?")
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
    """Replaying reasoning, function_call and function_call_output items as
    input is accepted and still yields reasoning + message output."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
            {
                "arguments": '{"location": "Paris", "unit": "celsius"}',
                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
                "name": "get_weather",
                "type": "function_call",
                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
                "status": "completed",
            },
            {
                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
                "output": "The weather in Paris is 20 Celsius",
                "status": "completed",
                "type": "function_call_output",
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    # make sure we get a reasoning and text output
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "message"
    assert type(response.output[1].content[0].text) is str
def get_horoscope(sign):
    """Return a canned horoscope line for *sign* (local function-call stub)."""
    return "{}: Next Tuesday you will befriend a baby otter.".format(sign)
def call_function(name, args):
    """Dispatch a model-issued function call to the matching local stub."""
    if name != "get_horoscope":
        raise ValueError(f"Unknown function: {name}")
    return get_horoscope(**args)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_first_turn(client: OpenAI, model_name: str):
    """The first turn of a tool-augmented request produces reasoning followed
    by a well-formed function_call item with JSON arguments."""
    tools = [
        {
            "type": "function",
            "name": "get_horoscope",
            "description": "Get today's horoscope for an astrological sign.",
            "parameters": {
                "type": "object",
                "properties": {
                    "sign": {"type": "string"},
                },
                "required": ["sign"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    response = await client.responses.create(
        model=model_name,
        input="What is the horoscope for Aquarius today?",
        tools=tools,
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    assert len(response.output) == 2
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "function_call"
    function_call = response.output[1]
    assert function_call.name == "get_horoscope"
    assert function_call.call_id is not None
    # Arguments must be valid JSON containing the required parameter.
    args = json.loads(function_call.arguments)
    assert "sign" in args
# the multi turn function call is tested above in
# test_reasoning_and_function_items
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_call(client: OpenAI, model_name: str):
    """code_interpreter surfaces as an mcp_call item with string
    arguments/output, followed by more reasoning and the final message."""
    response = await client.responses.create(
        model=model_name,
        input="What is 13 * 24? Use python to calculate the result.",
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "mcp_call"
    assert type(response.output[1].arguments) is str
    assert type(response.output[1].output) is str
    assert response.output[2].type == "reasoning"
    # make sure the correct math is in the final output
    assert response.output[3].type == "message"
    assert "312" in response.output[3].content[0].text

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-8B"
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server with the Qwen3 reasoning parser (no tools)."""
    args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        # uncomment for tool calling
        # PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
    )
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """Smoke test: a plain text prompt completes successfully."""
    resp = await client.responses.create(model=model_name, input="What is 13 * 24?")
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_enable_response_messages(client: OpenAI, model_name: str):
    """enable_response_messages exposes raw token-level input/output messages
    (text plus integer token ids) on the response."""
    response = await client.responses.create(
        model=model_name,
        input="Hello?",
        extra_body={"enable_response_messages": True},
    )
    assert response.status == "completed"
    assert response.input_messages[0]["type"] == "raw_message_tokens"
    assert type(response.input_messages[0]["message"]) is str
    assert len(response.input_messages[0]["message"]) > 10
    assert type(response.input_messages[0]["tokens"][0]) is int
    assert type(response.output_messages[0]["message"]) is str
    assert len(response.output_messages[0]["message"]) > 10
    assert type(response.output_messages[0]["tokens"][0]) is int
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """A replayed reasoning input item is accepted, and the response still
    contains reasoning followed by a text message."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    # make sure we get a reasoning and text output
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "message"
    assert type(response.output[1].content[0].text) is str

View File

@@ -0,0 +1,988 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import json
import time
import pytest
import pytest_asyncio
import requests
from openai import BadRequestError, NotFoundError, OpenAI
from openai_harmony import (
Message,
)
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/gpt-oss-20b"
# Strict function-calling tool schema shared by the function-calling tests.
GET_WEATHER_SCHEMA = {
    "type": "function",
    "name": "get_weather",
    "description": "Get current temperature for provided coordinates in celsius.",  # noqa
    "parameters": {
        "type": "object",
        "properties": {
            "latitude": {"type": "number"},
            "longitude": {"type": "number"},
        },
        "required": ["latitude", "longitude"],
        "additionalProperties": False,
    },
    "strict": True,
}
@pytest.fixture(scope="module")
def server():
    """Module-scoped gpt-oss server with the demo tool server enabled."""
    assert importlib.util.find_spec("gpt_oss") is not None, (
        "Harmony tests require gpt_oss package to be installed"
    )
    args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
    )
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """Smoke test: a plain text prompt completes successfully."""
    resp = await client.responses.create(model=model_name, input="What is 13 * 24?")
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_instructions(client: OpenAI, model_name: str):
    """The `instructions` field is accepted and the request completes."""
    resp = await client.responses.create(
        model=model_name, input="What is 13 * 24?", instructions="Respond in Korean."
    )
    assert resp is not None
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
    """A reasoning-effort hint is accepted and the request completes."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the capital of South Korea?",
        reasoning={"effort": "low"},
    )
    assert resp is not None
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A tiny max_output_tokens budget yields an incomplete response."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert resp is not None
    # Truncation surfaces via status + incomplete_details, not an error.
    assert resp.status == "incomplete"
    assert resp.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):
    """Chat-style multi-message input (system/user/assistant) completes."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"role": "system", "content": "Respond in Korean."},
            {"role": "user", "content": "Hello!"},
            {"role": "assistant", "content": "Hello! How can I help you today?"},
            {"role": "user", "content": "What is 13 * 24? Explain your answer."},
        ],
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_with_input_type(client: OpenAI, model_name: str):
    """Structured content parts (input_text) are accepted as input."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {
                "role": "user",
                "content": [{"type": "input_text", "text": "What is 13*24?"}],
            },
        ],
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output(client: OpenAI, model_name: str):
    """A strict json_schema output format is accepted and completes."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"role": "system", "content": "Extract the event information."},
            {
                "role": "user",
                "content": "Alice and Bob are going to a science fair on Friday.",
            },
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "calendar_event",
                "schema": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "date": {"type": "string"},
                        "participants": {"type": "array", "items": {"type": "string"}},
                    },
                    "required": ["name", "date", "participants"],
                    "additionalProperties": False,
                },
                "description": "A calendar event.",
                "strict": True,
            }
        },
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output_with_parse(client: OpenAI, model_name: str):
    """responses.parse accepts a pydantic model as text_format."""
    from pydantic import BaseModel
    class CalendarEvent(BaseModel):
        # Fields mirror the json_schema used in test_structured_output.
        name: str
        date: str
        participants: list[str]
    response = await client.responses.parse(
        model=model_name,
        input="Alice and Bob are going to a science fair on Friday",
        instructions="Extract the event information",
        text_format=CalendarEvent,
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_store(client: OpenAI, model_name: str):
    """A response is retrievable by id iff it was created with store=True."""
    for store in [True, False]:
        resp = await client.responses.create(
            model=model_name,
            input="What is 13 * 24?",
            store=store,
        )
        assert resp is not None
        found = True
        try:
            await client.responses.retrieve(resp.id)
        except NotFoundError:
            found = False
        # store=True -> retrievable; store=False -> 404.
        assert found == store
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background(client: OpenAI, model_name: str):
    """A background request eventually reaches the "completed" status.

    Polls the response by id for up to ``max_retries`` seconds. Uses
    ``asyncio.sleep`` instead of the original ``time.sleep``: a blocking
    sleep inside a coroutine stalls the whole event loop for a second per
    retry, which also delays the server-polling client machinery.
    """
    import asyncio

    response = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
        background=True,
    )
    assert response is not None
    max_retries = 30
    for _ in range(max_retries):
        response = await client.responses.retrieve(response.id)
        if response.status == "completed":
            break
        # Yield to the event loop between polls.
        await asyncio.sleep(1)
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background_cancel(client: OpenAI, model_name: str):
    """A background request can be cancelled after it has started.

    Uses ``asyncio.sleep`` instead of the original blocking ``time.sleep``
    so the wait does not stall the event loop while the background
    generation is running.
    """
    import asyncio

    response = await client.responses.create(
        model=model_name,
        input="Write a long story about a cat.",
        background=True,
    )
    assert response is not None
    # Give the background task a moment to start before cancelling.
    await asyncio.sleep(1)
    cancelled_response = await client.responses.cancel(response.id)
    assert cancelled_response is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
    """Three turns chained via previous_response_id all complete."""
    response1 = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
    )
    assert response1 is not None
    assert response1.status == "completed"
    response2 = await client.responses.create(
        model=model_name,
        input="What if I increase both numbers by 1?",
        previous_response_id=response1.id,
    )
    assert response2 is not None
    assert response2.status == "completed"
    response3 = await client.responses.create(
        model=model_name,
        input="Divide the result by 2.",
        previous_response_id=response2.id,
    )
    assert response3 is not None
    assert response3.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(client: OpenAI, model_name: str):
    """Every terminal stream event must close a matching opener.

    Maintains a stack of "start" event types; consecutive deltas of the
    same type are collapsed, and each done/completed event must match the
    type on top of the stack. The stack must be empty at end of stream.
    """
    prompts = [
        "tell me a story about a cat in 20 words",
    ]
    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.content_part.done": "response.content_part.added",
        "response.output_text.done": "response.output_text.delta",
        "response.web_search_call.done": "response.web_search_call.added",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
    }
    for prompt in prompts:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[],
            stream=True,
            background=False,
        )
        stack_of_event_types = []
        async for event in response:
            if event.type == "response.created":
                stack_of_event_types.append(event.type)
            elif event.type == "response.completed":
                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                stack_of_event_types.pop()
            if event.type.endswith("added"):
                stack_of_event_types.append(event.type)
            elif event.type.endswith("delta"):
                # Collapse runs of identical delta events into one stack entry.
                if stack_of_event_types[-1] == event.type:
                    continue
                stack_of_event_types.append(event.type)
            elif event.type.endswith("done"):
                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                stack_of_event_types.pop()
        assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str):
    """Same open/close event pairing check as test_streaming_types, driven
    through a function-calling request so the function_call_arguments
    delta/done pair is exercised too."""
    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.output_text.done": "response.output_text.delta",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
        "response.function_call_arguments.done": "response.function_call_arguments.delta",  # noqa
    }
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    stack_of_event_types = []
    async for event in stream_response:
        if event.type == "response.created":
            stack_of_event_types.append(event.type)
        elif event.type == "response.completed":
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
        if event.type.endswith("added"):
            stack_of_event_types.append(event.type)
        elif event.type.endswith("delta"):
            # Collapse runs of identical delta events into one stack entry.
            if stack_of_event_types[-1] == event.type:
                continue
            stack_of_event_types.append(event.type)
        elif event.type.endswith("done"):
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
    assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("background", [True, False])
async def test_streaming(client: OpenAI, model_name: str, background: bool):
    """Stream responses (optionally in background mode) and validate events.

    Checks item-id/content-index bookkeeping across delta events, that the
    vLLM extension fields ``input_messages``/``output_messages`` round-trip
    through harmony ``Message.from_dict``, and — in background mode — that
    the stream can be resumed with ``starting_after`` and replays events
    identical to the ones already received.
    """
    # TODO: Add back when web search and code interpreter are available in CI
    prompts = [
        "tell me a story about a cat in 20 words",
        "What is 13 * 24? Use python to calculate the result.",
        # "When did Jensen found NVIDIA? Search it and answer the year only.",
    ]
    for prompt in prompts:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[
                # {
                #     "type": "web_search_preview"
                # },
                {"type": "code_interpreter", "container": {"type": "auto"}},
            ],
            stream=True,
            background=background,
            extra_body={"enable_response_messages": True},
        )
        current_item_id = ""
        current_content_index = -1
        events = []
        current_event_mode = None
        resp_id = None
        checked_response_completed = False
        async for event in response:
            if event.type == "response.created":
                resp_id = event.response.id
            # test vllm custom types are in the response
            if event.type in [
                "response.completed",
                "response.in_progress",
                "response.created",
            ]:
                assert "input_messages" in event.response.model_extra
                assert "output_messages" in event.response.model_extra
                if event.type == "response.completed":
                    # make sure the serialization of content works
                    for msg in event.response.model_extra["output_messages"]:
                        # make sure we can convert the messages back into harmony
                        Message.from_dict(msg)
                    for msg in event.response.model_extra["input_messages"]:
                        # make sure we can convert the messages back into harmony
                        Message.from_dict(msg)
                    checked_response_completed = True
            if current_event_mode != event.type:
                current_event_mode = event.type
                print(f"\n[{event.type}] ", end="", flush=True)
            # verify current_item_id is correct
            if event.type == "response.output_item.added":
                # Every new output item must carry a fresh id.
                assert event.item.id != current_item_id
                current_item_id = event.item.id
            elif event.type in [
                "response.output_text.delta",
                "response.reasoning_text.delta",
            ]:
                # Deltas must reference the most recently added item.
                assert event.item_id == current_item_id
            # verify content_index_id is correct
            if event.type in [
                "response.content_part.added",
                "response.reasoning_part.added",
            ]:
                assert event.content_index != current_content_index
                current_content_index = event.content_index
            elif event.type in [
                "response.output_text.delta",
                "response.reasoning_text.delta",
            ]:
                assert event.content_index == current_content_index
            if "text.delta" in event.type:
                print(event.delta, end="", flush=True)
            elif "reasoning_text.delta" in event.type:
                print(f"{event.delta}", end="", flush=True)
            elif "response.code_interpreter_call_code.done" in event.type:
                print(f"Code: {event.code}", end="", flush=True)
            elif (
                "response.output_item.added" in event.type
                and event.item.type == "web_search_call"
            ):
                print(f"Web search: {event.item.action}", end="", flush=True)
            events.append(event)
        assert len(events) > 0
        response_completed_event = events[-1]
        assert len(response_completed_event.response.output) > 0
        assert checked_response_completed
        if background:
            # Resume the background stream part-way through and verify the
            # replayed events match what we already collected.
            starting_after = 5
            async with await client.responses.retrieve(
                response_id=resp_id, stream=True, starting_after=starting_after
            ) as stream:
                counter = starting_after
                async for event in stream:
                    counter += 1
                    assert event == events[counter]
                assert counter == len(events) - 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
async def test_web_search(client: OpenAI, model_name: str):
    """Web-search tool round trip (skipped until CI provides the tool)."""
    resp = await client.responses.create(
        model=model_name,
        input="Who is the president of South Korea as of now?",
        tools=[{"type": "web_search_preview"}],
    )
    assert resp is not None
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str):
    """The code interpreter runs python and its result reaches the answer."""
    response = await client.responses.create(
        model=model_name,
        # TODO: Ideally should be able to set max tool calls
        # to prevent multi-turn, but it is not currently supported
        # would speed up the test
        input=(
            "What's the first 4 digits after the decimal point of "
            "cube root of `19910212 * 20250910`? "
            "Show only the digits. The python interpreter is not stateful "
            "and you must print to see the output."
        ),
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        temperature=0.0,  # More deterministic output in response
    )
    assert response is not None
    assert response.status == "completed"
    # Nonzero tool-output tokens prove the interpreter actually ran.
    assert response.usage.output_tokens_details.tool_output_tokens > 0
    for item in response.output:
        if item.type == "message":
            output_string = item.content[0].text
            print("output_string: ", output_string, flush=True)
            assert "5846" in output_string
def get_weather(latitude, longitude):
    """Fetch the current temperature (celsius) for the given coordinates
    from the open-meteo API (local stub used to answer function calls)."""
    response = requests.get(
        f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m"  # noqa
    )
    data = response.json()
    return data["current"]["temperature_2m"]
def get_place_to_travel():
    """Return a fixed destination (local stub used to answer function calls)."""
    return "Paris"
def get_horoscope(sign):
    """Return a canned horoscope line for *sign* (local function-call stub)."""
    return "{}: Next Tuesday you will befriend a baby otter.".format(sign)
def call_function(name, args):
    """Dispatch a model-issued function call to the matching local stub."""
    handlers = {
        "get_weather": lambda: get_weather(**args),
        "get_place_to_travel": lambda: get_place_to_travel(),
        "get_horoscope": lambda: get_horoscope(**args),
    }
    if name not in handlers:
        raise ValueError(f"Unknown function: {name}")
    return handlers[name]()
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """A replayed reasoning input item is accepted and the request completes."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling(client: OpenAI, model_name: str):
    """Three-turn function-calling flow chained via previous_response_id."""
    tools = [GET_WEATHER_SCHEMA]
    response = await client.responses.create(
        model=model_name,
        input="What's the weather like in Paris today?",
        tools=tools,
        temperature=0.0,
        extra_body={"request_id": "test_function_calling_non_resp"},
    )
    assert response is not None
    assert response.status == "completed"
    assert len(response.output) == 2
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "function_call"
    tool_call = response.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    # Execute the requested function locally and feed the result back.
    result = call_function(name, args)
    response_2 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response.id,
    )
    assert response_2 is not None
    assert response_2.status == "completed"
    assert response_2.output_text is not None
    # NOTE: chain-of-thought should be removed.
    response_3 = await client.responses.create(
        model=model_name,
        input="What's the weather like in Paris today?",
        tools=tools,
        previous_response_id=response_2.id,
    )
    assert response_3 is not None
    assert response_3.status == "completed"
    assert response_3.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=5)
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
    """Two chained tool calls (pick a place, then its weather) followed by a
    final natural-language answer."""
    tools = [
        {
            "type": "function",
            "name": "get_place_to_travel",
            "description": "Get a random place to travel",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
                "additionalProperties": False,
            },
            "strict": True,
        },
        GET_WEATHER_SCHEMA,
    ]
    response = await client.responses.create(
        model=model_name,
        input="Help me plan a trip to a random place. And tell me the weather there.",
        tools=tools,
    )
    assert response is not None
    assert response.status == "completed"
    assert len(response.output) == 2
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "function_call"
    tool_call = response.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    # First tool round trip.
    result = call_function(name, args)
    response_2 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response.id,
    )
    assert response_2 is not None
    assert response_2.status == "completed"
    assert len(response_2.output) == 2
    assert response_2.output[0].type == "reasoning"
    assert response_2.output[1].type == "function_call"
    tool_call = response_2.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    # Second tool round trip.
    result = call_function(name, args)
    response_3 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response_2.id,
    )
    assert response_3 is not None
    assert response_3.status == "completed"
    assert response_3.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_required(client: OpenAI, model_name: str):
    """tool_choice='required' is rejected with a 400 BadRequestError."""
    with pytest.raises(BadRequestError):
        await client.responses.create(
            model=model_name,
            input="What's the weather like in Paris today?",
            tools=[GET_WEATHER_SCHEMA],
            tool_choice="required",
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_message_with_tools(client: OpenAI, model_name: str):
    """get_system_message only offers the commentary channel when custom
    tools are enabled."""
    from vllm.entrypoints.openai.parser.harmony_utils import get_system_message
    # Test with custom tools enabled - commentary channel should be available
    sys_msg = get_system_message(with_custom_tools=True)
    valid_channels = sys_msg.content[0].channel_config.valid_channels
    assert "commentary" in valid_channels
    # Test with custom tools disabled - commentary channel should be removed
    sys_msg = get_system_message(with_custom_tools=False)
    valid_channels = sys_msg.content[0].channel_config.valid_channels
    assert "commentary" not in valid_channels
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_full_history(client: OpenAI, model_name: str):
    """Function calling where the full transcript (including the model's own
    function_call items) is resent instead of using previous_response_id."""
    tools = [GET_WEATHER_SCHEMA]
    input_messages = [
        {"role": "user", "content": "What's the weather like in Paris today?"}
    ]
    response = await client.responses.create(
        model=model_name,
        input=input_messages,
        tools=tools,
    )
    assert response is not None
    assert response.status == "completed"
    tool_call = response.output[-1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    result = call_function(name, args)
    input_messages.extend(response.output)  # append model's function call message
    input_messages.append(
        {  # append result message
            "type": "function_call_output",
            "call_id": tool_call.call_id,
            "output": str(result),
        }
    )
    response_2 = await client.responses.create(
        model=model_name,
        input=input_messages,
        tools=tools,
    )
    assert response_2 is not None
    assert response_2.status == "completed"
    assert response_2.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_stream(client: OpenAI, model_name: str):
    """Streamed function calling: reassemble the tool call from stream
    events, execute it, then stream the follow-up turn and verify no further
    function-call events appear."""
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    assert stream_response is not None
    # Reassembled tool calls, keyed by output index (for argument deltas)
    # and by name (for cross-checking the ".done" event).
    final_tool_calls = {}
    final_tool_calls_named = {}
    async for event in stream_response:
        if event.type == "response.output_item.added":
            if event.item.type != "function_call":
                continue
            final_tool_calls[event.output_index] = event.item
            final_tool_calls_named[event.item.name] = event.item
        elif event.type == "response.function_call_arguments.delta":
            index = event.output_index
            tool_call = final_tool_calls[index]
            if tool_call:
                tool_call.arguments += event.delta
                final_tool_calls_named[tool_call.name] = tool_call
        elif event.type == "response.function_call_arguments.done":
            # The final arguments must equal what the deltas accumulated.
            assert event.arguments == final_tool_calls_named[event.name].arguments
    # Locate and execute the expected get_weather call. Initialize both to
    # None so a missing call fails the assertions below with a clear message
    # instead of raising NameError on an unbound local.
    tool_call = None
    result = None
    for candidate in final_tool_calls.values():
        if (
            candidate
            and candidate.type == "function_call"
            and candidate.name == "get_weather"
        ):
            tool_call = candidate
            args = json.loads(candidate.arguments)
            result = call_function(candidate.name, args)
            input_list += [candidate]
            break
    assert tool_call is not None
    assert result is not None
    response = await client.responses.create(
        model=model_name,
        input=input_list
        + [
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        stream=True,
    )
    assert response is not None
    async for event in response:
        # check that no function call events in the stream
        assert event.type != "response.function_call_arguments.delta"
        assert event.type != "response.function_call_arguments.done"
        # check that the response contains output text
        if event.type == "response.completed":
            assert len(event.response.output) > 0
            assert event.response.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
    """enable_response_messages populates the raw input/output message lists."""
    response = await client.responses.create(
        model=model_name,
        input="What is the capital of South Korea?",
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Both raw message lists must be non-empty when the flag is set.
    assert len(response.input_messages) > 0
    assert len(response.output_messages) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_with_previous_input_messages(
    client: OpenAI, model_name: str
):
    """Test function calling using previous_input_messages
    for multi-turn conversation with a function call"""
    # Define the get_horoscope tool
    tools = [
        {
            "type": "function",
            "name": "get_horoscope",
            "description": "Get today's horoscope for an astrological sign.",
            "parameters": {
                "type": "object",
                "properties": {
                    "sign": {"type": "string"},
                },
                "required": ["sign"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    # Step 1: First call with the function tool
    stream_response = await client.responses.create(
        model=model_name,
        input="What is the horoscope for Aquarius today?",
        tools=tools,
        extra_body={"enable_response_messages": True},
        stream=True,
    )
    response = None
    async for event in stream_response:
        if event.type == "response.completed":
            response = event.response
    assert response is not None
    assert response.status == "completed"
    # Step 2: Parse the first output to find the function_call type
    function_call = None
    for item in response.output:
        if item.type == "function_call":
            function_call = item
            break
    assert function_call is not None, "Expected a function_call in the output"
    assert function_call.name == "get_horoscope"
    assert function_call.call_id is not None
    # Verify the format matches expectations
    args = json.loads(function_call.arguments)
    assert "sign" in args
    # Step 3: Call the get_horoscope function
    result = call_function(function_call.name, args)
    assert "Aquarius" in result
    assert "baby otter" in result
    # Get the input_messages and output_messages from the first response
    first_input_messages = response.input_messages
    first_output_messages = response.output_messages
    # Construct the full conversation history using previous_input_messages
    previous_messages = (
        first_input_messages
        + first_output_messages
        + [
            {
                "role": "tool",
                "name": "functions.get_horoscope",
                "content": [{"type": "text", "text": str(result)}],
            }
        ]
    )
    # Step 4: Make another responses.create() call with previous_input_messages
    stream_response_2 = await client.responses.create(
        model=model_name,
        tools=tools,
        input="",
        extra_body={
            "previous_input_messages": previous_messages,
            "enable_response_messages": True,
        },
        stream=True,
    )
    # Initialize before the loop (mirrors step 1) so a stream that never
    # emits "response.completed" fails the assertion below with a clear
    # message instead of raising NameError on an unbound local.
    response_2 = None
    async for event in stream_response_2:
        if event.type == "response.completed":
            response_2 = event.response
    assert response_2 is not None
    assert response_2.status == "completed"
    assert response_2.output_text is not None
    # verify only one system message / developer message
    num_system_messages_input = 0
    num_developer_messages_input = 0
    num_function_call_input = 0
    for message_dict in response_2.input_messages:
        message = Message.from_dict(message_dict)
        if message.author.role == "system":
            num_system_messages_input += 1
        elif message.author.role == "developer":
            num_developer_messages_input += 1
        elif message.author.role == "tool":
            num_function_call_input += 1
    assert num_system_messages_input == 1
    assert num_developer_messages_input == 1
    assert num_function_call_input == 1
    # Verify the output makes sense - should contain information about the horoscope
    output_text = response_2.output_text.lower()
    assert (
        "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
    )

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from unittest.mock import MagicMock
import pytest
from vllm.entrypoints.openai.protocol import ErrorResponse
from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
@pytest.mark.asyncio
async def test_raise_if_error_raises_generation_error():
    """_raise_if_error raises GenerationError only for the "error" reason."""
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100
    serving = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    # An "error" finish reason must surface as an internal-server error.
    with pytest.raises(GenerationError) as exc_info:
        serving._raise_if_error("error", "test-request-id")
    err = exc_info.value
    assert str(err) == "Internal server error"
    assert err.status_code == HTTPStatus.INTERNAL_SERVER_ERROR

    # Normal finish reasons (and None) pass through without raising.
    for finish_reason in ("stop", "length", None):
        serving._raise_if_error(finish_reason, "test-request-id")
@pytest.mark.asyncio
async def test_convert_generation_error_to_response():
    """_convert_generation_error_to_response builds a proper ErrorResponse."""
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100
    serving = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    converted = serving._convert_generation_error_to_response(
        GenerationError("Internal server error")
    )

    assert isinstance(converted, ErrorResponse)
    assert converted.error.type == "InternalServerError"
    assert converted.error.message == "Internal server error"
    assert converted.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
async def test_convert_generation_error_to_streaming_response():
    """The streaming conversion yields a JSON string describing the error."""
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100
    serving = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    payload = serving._convert_generation_error_to_streaming_response(
        GenerationError("Internal server error")
    )

    assert isinstance(payload, str)
    # Both the message text and the error type must appear in the payload.
    for fragment in ("Internal server error", "InternalServerError"):
        assert fragment in payload

View File

@@ -0,0 +1,330 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test function call parsing in ResponsesRequest."""
import json
import pytest
from openai.types.responses import ResponseFunctionToolCall
from vllm.entrypoints.openai.protocol import ResponsesRequest
def test_function_call_dict_converted_to_object():
    """A function_call given as a plain dict is parsed into a
    ResponseFunctionToolCall instance."""
    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            {
                "type": "function_call",
                "call_id": "fc_123",
                "name": "get_weather",
                "arguments": '{"location": "Boston", "unit": "celsius"}',
            }
        ],
    )

    # Exactly one item, now a typed object with every field preserved.
    assert len(request.input) == 1
    item = request.input[0]
    assert isinstance(item, ResponseFunctionToolCall)
    assert item.call_id == "fc_123"
    assert item.name == "get_weather"
    assert item.arguments == '{"location": "Boston", "unit": "celsius"}'
def test_direct_function_call_object_preservation():
    """A ResponseFunctionToolCall passed directly survives validation as-is."""
    call = ResponseFunctionToolCall(
        type="function_call",
        call_id="fc_456",
        name="get_stock_price",
        arguments='{"symbol": "AAPL"}',
    )

    request = ResponsesRequest(model="gpt-oss", input=[call])

    # The identical object instance comes back out of validation.
    assert len(request.input) == 1
    assert request.input[0] is call
def test_mixed_input_types_with_function_calls():
    """Message dicts stay dicts while function_call dicts become typed."""
    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            # Valid Message type
            {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "What's the weather?"}],
            },
            # Two function calls that should both be parsed
            {
                "type": "function_call",
                "call_id": "fc_789",
                "name": "check_weather",
                "arguments": '{"location": "NYC"}',
            },
            {
                "type": "function_call",
                "call_id": "fc_790",
                "name": "get_time",
                "arguments": "{}",
            },
        ],
    )

    assert len(request.input) == 3
    message, first_call, second_call = request.input
    # The message is validated but not converted to an object.
    assert message["type"] == "message"
    # Both function_call entries are parsed into typed objects.
    assert isinstance(first_call, ResponseFunctionToolCall)
    assert (first_call.call_id, first_call.name) == ("fc_789", "check_weather")
    assert isinstance(second_call, ResponseFunctionToolCall)
    assert (second_call.call_id, second_call.name) == ("fc_790", "get_time")
def test_function_call_with_complex_arguments():
    """Deeply nested JSON arguments survive the round trip untouched."""
    nested_args = {
        "query": "weather forecast",
        "filters": {
            "location": {"city": "San Francisco", "state": "CA"},
            "timeRange": {"start": "2024-01-01", "end": "2024-01-07"},
            "metrics": ["temperature", "humidity", "precipitation"],
        },
        "options": {"format": "detailed", "includeAlerts": True},
    }

    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            {
                "type": "function_call",
                "call_id": "fc_complex",
                "name": "advanced_weather_query",
                "arguments": json.dumps(nested_args),
            }
        ],
    )

    assert len(request.input) == 1
    call = request.input[0]
    assert isinstance(call, ResponseFunctionToolCall)
    assert call.call_id == "fc_complex"
    assert call.name == "advanced_weather_query"
    # Round-trip the arguments to prove nothing was lost or mangled.
    assert json.loads(call.arguments) == nested_args
def test_invalid_function_call_fallback():
    """A function_call dict missing call_id is rejected by Pydantic."""
    # The validator keeps the malformed dict as-is and lets Pydantic's own
    # validation raise, so model construction must fail here.
    with pytest.raises(ValueError):
        ResponsesRequest(
            model="gpt-oss",
            input=[
                {
                    "type": "function_call",
                    "name": "incomplete_function",
                    "arguments": "{}",
                }
            ],
        )
def test_string_input_not_affected():
    """Plain string input passes through the validator untouched."""
    prompt = "This is a simple string input"
    request = ResponsesRequest(model="gpt-oss", input=prompt)
    assert request.input == prompt
def test_empty_list_input():
    """An empty input list is preserved as an empty list."""
    request = ResponsesRequest(model="gpt-oss", input=[])
    assert request.input == []
def test_function_call_output_not_affected():
    """function_call_output items are left as plain dicts by the parser."""
    output_item = {
        "type": "function_call_output",
        "call_id": "fc_output_123",
        "output": "The weather in Boston is 72°F and sunny.",
    }

    request = ResponsesRequest(model="gpt-oss", input=[output_item])

    # The item must still be a dict, not a converted object.
    assert len(request.input) == 1
    item = request.input[0]
    assert isinstance(item, dict)
    assert item["type"] == "function_call_output"
    assert item["call_id"] == "fc_output_123"
    assert item["output"] == "The weather in Boston is 72°F and sunny."
def test_mixed_function_call_and_output():
    """function_call is converted while its matching output stays a dict."""
    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            # The request half of the pair should become a typed object...
            {
                "type": "function_call",
                "call_id": "fc_call_456",
                "name": "get_weather",
                "arguments": '{"location": "NYC"}',
            },
            # ...while the result half should remain an untouched dict.
            {
                "type": "function_call_output",
                "call_id": "fc_call_456",
                "output": "NYC weather is 68°F with light rain",
            },
        ],
    )

    assert len(request.input) == 2
    call, call_output = request.input
    assert isinstance(call, ResponseFunctionToolCall)
    assert call.call_id == "fc_call_456"
    assert call.name == "get_weather"
    assert isinstance(call_output, dict)
    assert call_output["type"] == "function_call_output"
    assert call_output["call_id"] == "fc_call_456"
    assert call_output["output"] == "NYC weather is 68°F with light rain"
def test_function_call_validation_failure_logs_debug():
    """Parse failures for function_call dicts are logged at DEBUG level.

    The input is missing the required ``call_id`` field, so the validator's
    attempt to build a ``ResponseFunctionToolCall`` fails; that failure must
    be reported via ``logger.debug`` before Pydantic rejects the request.
    (The previous version requested the ``caplog`` fixture but never used
    it — the module logger is patched instead — so the fixture is removed.)
    """
    from unittest.mock import patch

    request_data = {
        "model": "gpt-oss",
        "input": [
            {
                "type": "function_call",
                "name": "incomplete_function",
                "arguments": "{}",  # Missing call_id
            }
        ],
    }
    # Patch the module logger so the debug call can be observed directly.
    with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
        with pytest.raises(ValueError):
            ResponsesRequest(**request_data)
        # Exactly one debug record, mentioning the parse failure.
        mock_logger.debug.assert_called_once()
        call_args = mock_logger.debug.call_args[0][0]
        assert "Failed to parse function_call" in call_args
def test_validator_handles_iterator_input():
    """The input validator accepts an iterator (Pydantic ValidatorIterator).

    Pydantic may hand the validator a lazy iterator rather than a list for
    complex nested structures (e.g. reasoning + function_call items); the
    validator must materialize and process it without error. The previous
    version wrapped the assertions in ``try/except Exception`` ending in
    ``pytest.fail`` — that also swallowed AssertionError from the asserts
    themselves, replacing a precise failure with a generic message, so the
    wrapper is removed and exceptions propagate naturally.
    """
    test_input_items = [
        {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "Test"}],
        },
        {
            "type": "reasoning",
            "id": "rs_1",
            "summary": [{"type": "summary_text", "text": "Test reasoning"}],
            "content": [{"type": "reasoning_text", "text": "Test content"}],
        },
        {
            "type": "function_call",
            "call_id": "call_1",
            "name": "test_function",
            "arguments": '{"test": "value"}',
            "id": "fc_1",
        },
    ]

    # Pass an iterator instead of a list to simulate Pydantic's behavior.
    request = ResponsesRequest(
        model="test-model",
        input=iter(test_input_items),
    )

    # All three items must have been materialized.
    assert len(request.input) == 3
    # The function_call dict must have been converted to a typed object.
    function_call_item = next(
        (
            item
            for item in request.input
            if isinstance(item, ResponseFunctionToolCall)
        ),
        None,
    )
    assert function_call_item is not None
    assert function_call_item.call_id == "call_1"
    assert function_call_item.name == "test_function"
def test_validator_handles_empty_iterator():
    """An empty iterator input normalizes to an empty list."""
    request = ResponsesRequest(model="test-model", input=iter(()))
    assert request.input == []

View File

@@ -0,0 +1,369 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enable-auto-tool-choice",
"--tool-call-parser",
"hermes",
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("return_token_ids", [True, False, None])
async def test_basic_completion_with_emoji(server, return_token_ids: bool | None):
    """Test basic completion with emoji to verify token_ids field.

    Parametrized over return_token_ids True/False/None: when falsy (False,
    or None meaning the flag is omitted entirely), neither token_ids nor
    prompt_token_ids may be populated; when True, both must be lists whose
    decoded text matches the prompt/completion.
    """
    extra_body = None
    if return_token_ids is not None:
        extra_body = {"return_token_ids": return_token_ids}
    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Complete this sentence with emojis: I love coding 🚀",
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body=extra_body,
        )
        # Check the raw response to see the structure
        completion_dict = completion.model_dump()
        # Verify prompt_token_ids field is present in the completion response
        # (the key exists on the choice even when its value is None)
        assert "prompt_token_ids" in completion_dict["choices"][0]
        if not return_token_ids:
            # If return_token_ids is False (or omitted via None), token_ids
            # should not be present
            assert completion_dict["choices"][0].get("token_ids") is None
            assert completion_dict["choices"][0].get("prompt_token_ids") is None
            # Skip further checks
            return
        assert isinstance(completion.choices[0].prompt_token_ids, list)
        # Check against the expected prompt token IDs
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        encoded_tokens = tokenizer.encode(
            "Complete this sentence with emojis: I love coding 🚀"
        )
        # Check that encoded_tokens is a subsequence of prompt_token_ids
        # (the server may wrap the raw prompt with extra special tokens)
        assert any(
            completion.choices[0].prompt_token_ids[i : i + len(encoded_tokens)]
            == encoded_tokens
            for i in range(
                len(completion.choices[0].prompt_token_ids) - len(encoded_tokens) + 1
            )
        )
        # Verify token_ids field is present in the choice
        assert completion.choices[0].token_ids is not None
        assert isinstance(completion.choices[0].token_ids, list)
        assert len(completion.choices[0].token_ids) > 0
        # Verify decoding works correctly
        decoded_text = tokenizer.decode(completion.choices[0].token_ids)
        # The decoded text should contain a <|im_end|> at the end, so only
        # a prefix match against the returned text is asserted
        assert decoded_text.startswith(completion.choices[0].text)
        # Test without return_token_ids (should be None)
        completion_without = await client.completions.create(
            model=MODEL_NAME,
            prompt="Complete this sentence with emojis: I love coding 🚀",
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": False},
        )
        completion_without_dict = completion_without.model_dump()
        assert completion_without_dict["choices"][0].get("token_ids") is None
        assert completion_without_dict.get("prompt_token_ids") is None
@pytest.mark.asyncio
async def test_chat_completion_with_tool_use(server):
    """Test chat completion with tool use (get_weather function).

    With return_token_ids enabled, both prompt_token_ids and per-choice
    token_ids must be returned and decode back to the chat-template text;
    with it disabled, both fields must be None.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "The unit of temperature",
                        },
                    },
                    "required": ["location"],
                },
            },
        }
    ]
    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What's the weather like in Paris?"},
            ],
            tools=tools,
            tool_choice="auto",
            max_tokens=100,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )
        # Verify token_ids field is present in choices
        assert response.choices[0].token_ids is not None
        assert isinstance(response.choices[0].token_ids, list)
        # Verify prompt_token_ids field is present
        assert response.prompt_token_ids is not None
        assert isinstance(response.prompt_token_ids, list)
        # Verify the prompt texts and response texts: the decoded prompt must
        # reflect the chat template's <|im_start|>/<|im_end|> framing
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        prompt_text = tokenizer.decode(response.prompt_token_ids)
        assert prompt_text.startswith(
            "<|im_start|>system\nYou are a helpful assistant."
        )
        assert prompt_text.endswith(
            "What's the weather like in Paris?<|im_end|>\n<|im_start|>assistant\n"
        )
        # With temperature=0 the model is expected to emit a <tool_call> block
        response_text = tokenizer.decode(response.choices[0].token_ids)
        assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
        assert response_text.endswith("</tool_call><|im_end|>")
        # If tool call was made, verify the response structure
        if response.choices[0].message.tool_calls:
            assert len(response.choices[0].message.tool_calls) > 0
            tool_call = response.choices[0].message.tool_calls[0]
            assert tool_call.function.name == "get_weather"
        # Test without return_token_ids
        response_without = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What's the weather like in Paris?"},
            ],
            tools=tools,
            tool_choice="auto",
            max_tokens=100,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": False},
        )
        assert response_without.choices[0].token_ids is None
        assert response_without.prompt_token_ids is None
@pytest.mark.asyncio
async def test_comparison_with_prompt_logprobs_and_logprobs(server):
    """
    Test that token_ids align with prompt_logprobs and
    logprobs when return_tokens_as_token_ids is enabled.

    Covers both the non-streaming path (with echo=True, so logprobs span
    prompt + completion) and the streaming path (prompt_token_ids only on
    the first chunk).
    """
    async with server.get_async_client() as client:
        # Test with both return_token_ids and return_tokens_as_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world! How are you today?",
            max_tokens=20,
            temperature=0,
            echo=True,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True,
                "prompt_logprobs": 1,
            },
        )
        # Verify all fields are present
        assert completion.choices[0].token_ids is not None
        assert completion.choices[0].prompt_token_ids is not None
        assert completion.choices[0].prompt_logprobs is not None
        assert completion.choices[0].logprobs is not None
        # Extract token IDs from logprobs
        # (when return_tokens_as_token_ids is True)
        logprobs_token_ids = []
        for token_str in completion.choices[0].logprobs.tokens:
            # Token format is "token_id:12345" when
            # return_tokens_as_token_ids is True
            if token_str.startswith("token_id:"):
                token_id = int(token_str.removeprefix("token_id:"))
                logprobs_token_ids.append(token_id)
        # When echo=True, the logprobs include both prompt and response tokens
        # The token_ids field should match the suffix of response portion
        # The prompt_token_ids should match the prompt portion
        assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
        response_token_ids_length = len(completion.choices[0].token_ids)
        assert (
            logprobs_token_ids[-response_token_ids_length:]
            == completion.choices[0].token_ids
        )
        # Verify tokenizer consistency
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        # Decode prompt tokens
        if completion.choices[0].prompt_token_ids:
            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
            # The decoded prompt should match or close to original prompt
            # (extra special tokens may surround it, hence substring check)
            assert "Hello, world" in prompt_text
        # Decode response tokens
        if completion.choices[0].token_ids:
            response_text = tokenizer.decode(completion.choices[0].token_ids)
            assert completion.choices[0].text.endswith(response_text)
        # Test streaming mode
        stream = await client.completions.create(
            model=MODEL_NAME,
            prompt="Tell me a short fact about Python:",
            max_tokens=30,
            temperature=0,
            stream=True,
            echo=False,
            logprobs=1,
            extra_body={"return_token_ids": True, "return_tokens_as_token_ids": True},
        )
        # Collect streamed tokens; prompt_token_ids only arrives on the
        # first chunk, while token_ids accumulate across all chunks
        streamed_prompt_token_ids = []
        streamed_token_ids = []
        streamed_logprob_token_ids = []
        first_chunk = True
        async for chunk in stream:
            for token_str in chunk.choices[0].logprobs.tokens:
                # Token format is "token_id:12345" when
                # return_tokens_as_token_ids is True
                if token_str.startswith("token_id:"):
                    token_id = int(token_str.removeprefix("token_id:"))
                    streamed_logprob_token_ids.append(token_id)
            if first_chunk:
                streamed_prompt_token_ids = chunk.choices[0].prompt_token_ids
                first_chunk = False
            streamed_token_ids += chunk.choices[0].token_ids
        # Verify we collected some tokens and first chunk had prompt_token_ids
        assert len(streamed_prompt_token_ids) > 0
        assert streamed_token_ids == streamed_logprob_token_ids
@pytest.mark.asyncio
async def test_chat_completion_with_emoji_and_token_ids(server):
    """Test chat completion with emojis to verify token_ids handling.

    Exercises both the non-streaming path (token_ids must decode to the
    message content plus the <|im_end|> terminator) and the streaming path
    (prompt_token_ids only on the first chunk, token_ids accumulated
    across chunks).
    """
    chat_messages = [
        {"role": "system", "content": "You like to use emojis in your responses."},
        {"role": "user", "content": "Repeat after me: I love cats 🐱"},
    ]
    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )
        # Verify token_ids are present
        response_dict = response.model_dump()
        assert response.choices[0].token_ids is not None
        assert "prompt_token_ids" in response_dict
        # Verify the response contains the expected fields
        assert response.choices[0].message.content is not None
        # Decode token_ids and verify consistency with the chat template's
        # <|im_start|>/<|im_end|> framing
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        decoded_prompt = tokenizer.decode(response.prompt_token_ids)
        assert decoded_prompt.startswith(
            "<|im_start|>system\nYou like to use emojis in your responses."
        )
        assert decoded_prompt.endswith(
            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n"
        )
        decoded_response = tokenizer.decode(response.choices[0].token_ids)
        # The content should match the response text
        # except the ending <|im_end|>
        assert decoded_response == response.choices[0].message.content + "<|im_end|>"
        # Test with streaming
        stream = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            stream=True,
            extra_body={"return_token_ids": True},
        )
        collected_content = ""
        collected_token_ids = []
        first_chunk = True
        async for chunk in stream:
            if first_chunk:
                # Only the first chunk carries prompt_token_ids, and it must
                # decode identically to the non-streaming prompt
                assert chunk.prompt_token_ids is not None
                assert isinstance(chunk.prompt_token_ids, list)
                # Check the prompt_token_ids match the initial prompt
                decoded_prompt_stream = tokenizer.decode(chunk.prompt_token_ids)
                assert decoded_prompt_stream == decoded_prompt
                first_chunk = False
            else:
                chunk_dump = chunk.model_dump()
                assert "prompt_token_ids" not in chunk_dump, (
                    "Subsequent chunks should not have prompt_token_ids"
                )
            if chunk.choices:
                if chunk.choices[0].delta.content:
                    collected_content += chunk.choices[0].delta.content
                # token_ids may not present in all chunks
                choice_dump = chunk.choices[0].model_dump()
                if "token_ids" in choice_dump:
                    collected_token_ids.extend(chunk.choices[0].token_ids)
        # Verify we got response and token_ids
        assert len(collected_content) > 0
        assert len(collected_token_ids) > 0
        # Verify token_ids decode properly
        decoded_response = tokenizer.decode(collected_token_ids)
        assert decoded_response == collected_content + "<|im_end|>"

View File

@@ -0,0 +1,123 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Separate these tests out from test_completion and test_chat, because they
# require launching a second server with a different flag. Running both servers
# at the same time on a single node will OOM.
import pytest
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def default_server_args(qwen3_lora_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--max-num-seqs",
"128",
"--enforce-eager",
# lora config
"--enable-lora",
"--lora-modules",
f"qwen3-lora={qwen3_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
]
@pytest.fixture(scope="module")
def server_fixture(request, default_server_args): # noqa: F811
use_server_flag = request.param
if use_server_flag:
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield (remote_server, True)
else:
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield (remote_server, False)
@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_completion_return_tokens_as_token_ids_completion(server_fixture):
    """Completion tokens are reported as "token_id:<int>" strings whether the
    feature is enabled via the server flag or the per-request option."""
    server, use_server_flag = server_fixture
    request_args = {}
    if not use_server_flag:
        request_args["return_tokens_as_token_ids"] = True
    async with server.get_async_client() as client:
        completion = await client.completions.create(
            model=MODEL_NAME,
            # Include Unicode characters to test for dividing a single
            # character across multiple tokens: 🎉 is [28705, 31862] for the
            # Zephyr tokenizer (NOTE(review): MODEL_NAME here is Qwen3 —
            # the specific ids look stale, but multi-token emoji splitting
            # is still the behavior under test; confirm against tokenizer)
            prompt="Say 'Hello, world! 🎉'",
            echo=True,
            temperature=0,
            max_tokens=10,
            logprobs=1,
            extra_body=request_args,
        )
        text = completion.choices[0].text
        token_strs = completion.choices[0].logprobs.tokens
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        # Check that the token representations are consistent between raw
        # tokens and top_logprobs
        # Slice off the first one, because there's no scoring associated
        # with BOS
        top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
        top_logprob_keys = [
            next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
        ]
        assert token_strs[1:] == top_logprob_keys
        # Check that decoding the tokens gives the expected text
        tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
        assert text == tokenizer.decode(tokens, skip_special_tokens=True)
@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
    """Chat responses should expose tokens in ``token_id:<int>`` form,
    whether enabled via the server flag or the per-request option."""
    server, use_server_flag = server_fixture
    extra_body = {} if use_server_flag else {"return_tokens_as_token_ids": True}
    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            # Include Unicode characters to test for dividing a single
            # character across multiple tokens: 🎉 is [28705, 31862] for the
            # Zephyr tokenizer
            messages=[
                {
                    "role": "system",
                    "content": "You like to respond in only emojis, like 🎉",
                },
                {"role": "user", "content": "Please write some emojis: 🐱🐶🎉"},
            ],
            temperature=0,
            max_tokens=8,
            logprobs=True,
            extra_body=extra_body,
        )
        text = response.choices[0].message.content
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        # Decoding the "token_id:"-prefixed tokens must reproduce the text.
        token_ids = [
            int(entry.token.removeprefix("token_id:"))
            for entry in response.choices[0].logprobs.content
        ]
        assert tokenizer.decode(token_ids, skip_special_tokens=True) == text

View File

@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import os
from typing import Any, NamedTuple
import openai # use the official client for correctness check
import pytest
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
API_KEY = "abc-123"
ERROR_API_KEY = "abc"
ROOT_PATH = "llm"
@pytest.fixture(scope="module")
def server():
    """Launch a server behind a custom root path with API-key auth enabled."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        "--root-path",  # use --root-path=/llm for testing
        "/" + ROOT_PATH,
    ]
    env = dict(os.environ)
    env["VLLM_API_KEY"] = API_KEY
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env) as remote_server:
        yield remote_server
class TestCase(NamedTuple):
    """One auth/root-path scenario for the chat-session test below."""

    model_name: str
    # URL path segments appended to the server root, e.g. ["llm", "v1"].
    base_url: list[str]
    api_key: str
    # Exception type the client is expected to raise, or None for success.
    expected_error: Any
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        # Wrong API key at the default root -> auth error.
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        # Wrong API key behind the custom root path -> auth error.
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        # Correct API key at the default root -> success.
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=API_KEY,
            expected_error=None,
        ),
        # Correct API key behind the custom root path -> success.
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=API_KEY,
            expected_error=None,
        ),
    ],
)
async def test_chat_session_root_path_with_api_key(
    server: RemoteOpenAIServer, test_case: TestCase
):
    """Chat completions must succeed (or fail with an auth error) both at the
    default root and behind --root-path, depending on the API key supplied."""
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    # Expect no exception by default; swap in pytest.raises for error cases.
    ctx = contextlib.nullcontext()
    if test_case.expected_error is not None:
        ctx = pytest.raises(test_case.expected_error)
    with ctx:
        client = openai.AsyncOpenAI(
            api_key=test_case.api_key,
            base_url=server.url_for(*test_case.base_url),
            max_retries=0,
        )
        chat_completion = await client.chat.completions.create(
            model=test_case.model_name,
            messages=[
                {"role": "user", "content": "tell me a common saying"},
                {"role": "assistant", "content": saying},
            ],
            # Continue the assistant's partial message rather than starting a
            # fresh generation turn.
            extra_body={"continue_final_message": True, "add_generation_prompt": False},
        )
        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "stop"
        message = choice.message
        assert len(message.content) > 0
        assert message.role == "assistant"

View File

@@ -0,0 +1,240 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import subprocess
import tempfile
import pytest
from vllm.entrypoints.openai.run_batch import BatchRequestOutput
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
# ruff: noqa: E501
INPUT_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INVALID_INPUT_BATCH = (
'{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INPUT_EMBEDDING_BATCH = (
'{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n'
'{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n'
'{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n'
'{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
)
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
def test_empty_file():
    """An empty input batch should exit cleanly and produce empty output."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write("")
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/multilingual-e5-small",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        assert output_file.read().strip() == ""
def test_completions():
    """A chat-completion batch runs end-to-end and yields schema-valid output."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            MODEL_NAME,
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        # Every output line must conform to the OpenAI batch schema;
        # validation raises if the schema is wrong.
        for line in output_file.read().strip().split("\n"):
            BatchRequestOutput.model_validate_json(line)
def test_completions_invalid_input():
    """
    Ensure that we fail when the input doesn't conform to the openai api.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INVALID_INPUT_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            MODEL_NAME,
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        # Malformed request lines must abort the run with a non-zero exit.
        assert proc.returncode != 0, f"{proc=}"
def test_embeddings():
    """An embeddings batch runs end-to-end and yields schema-valid output."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_EMBEDDING_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/multilingual-e5-small",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        # Every output line must conform to the OpenAI batch schema;
        # validation raises if the schema is wrong.
        for line in output_file.read().strip().split("\n"):
            BatchRequestOutput.model_validate_json(line)
@pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
def test_score(input_batch):
    """Score/rerank batches complete without per-request errors."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(input_batch)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "BAAI/bge-reranker-v2-m3",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        for line in output_file.read().strip().split("\n"):
            # Each output line must conform to the OpenAI batch schema;
            # validation raises if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)
            # ...and each individual response must have succeeded.
            parsed = json.loads(line)
            assert isinstance(parsed, dict)
            assert parsed["error"] is None
def test_reasoning_parser():
    """
    Test that reasoning_parser parameter works correctly in run_batch.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_REASONING_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "Qwen/Qwen3-0.6B",
            "--reasoning-parser",
            "qwen3",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        for line in output_file.read().strip().split("\n"):
            # Each output line must conform to the OpenAI batch schema;
            # validation raises if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)
            # ...and each individual response must have succeeded.
            parsed = json.loads(line)
            assert isinstance(parsed, dict)
            assert parsed["error"] is None
            # The parser must have populated a non-empty reasoning field.
            reasoning = parsed["response"]["body"]["choices"][0]["message"][
                "reasoning"
            ]
            assert reasoning is not None
            assert len(reasoning) > 0

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
from unittest.mock import Mock
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers.mistral import MistralTokenizer
@pytest.fixture()
def serving() -> OpenAIServing:
    """Create a minimal OpenAIServing instance for testing."""
    model_config = Mock(spec=ModelConfig)
    model_config.max_model_len = 32768
    models = Mock(spec=OpenAIServingModels)
    models.model_config = model_config
    models.input_processor = Mock()
    models.io_processor = Mock()
    return OpenAIServing(
        engine_client=Mock(),
        models=models,
        request_logger=None,
    )
@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
    serving: OpenAIServing,
):
    """The async Mistral chat-template wrapper must off-load the blocking
    tokenizer call so the asyncio event loop stays responsive meanwhile."""
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template
    task = serving._apply_mistral_chat_template_async(
        tokenizer=mock_tokenizer, messages=[], chat_template=None, tools=[]
    )
    # Ensure the event loop is not blocked: while the 2s tokenization runs,
    # a bare `await asyncio.sleep(0)` should return almost instantly.
    blocked_count = 0
    for _i in range(20):  # Check over ~2 seconds
        start = time.perf_counter()
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start
        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1
        await asyncio.sleep(0.1)
    # Ensure task completes
    tokens = await task
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, "Event loop blocked during tokenization"

View File

@@ -0,0 +1,129 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from unittest.mock import MagicMock
import pytest
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (
ErrorResponse,
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest,
)
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.lora.request import LoRARequest
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
LORA_UNLOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' removed successfully."
)
async def _async_serving_models_init() -> OpenAIServingModels:
    """Build an OpenAIServingModels instance backed by mocked engine/config."""
    mock_model_config = MagicMock(spec=ModelConfig)
    # Set max_model_len explicitly so attribute access does not fail.
    mock_model_config.max_model_len = 2048
    mock_engine_client = MagicMock(spec=EngineClient)
    mock_engine_client.model_config = mock_model_config
    mock_engine_client.input_processor = MagicMock()
    mock_engine_client.io_processor = MagicMock()
    serving_models = OpenAIServingModels(
        engine_client=mock_engine_client,
        base_model_paths=BASE_MODEL_PATHS,
        lora_modules=None,
    )
    await serving_models.init_static_loras()
    return serving_models
@pytest.mark.asyncio
async def test_serving_model_name():
    """model_name() returns the base model, or the LoRA adapter's name."""
    serving_models = await _async_serving_models_init()
    assert serving_models.model_name(None) == MODEL_NAME
    lora = LoRARequest(
        lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1
    )
    assert serving_models.model_name(lora) == lora.lora_name
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
    """Loading a new adapter succeeds and registers it by name."""
    serving_models = await _async_serving_models_init()
    load_request = LoadLoRAAdapterRequest(
        lora_name="adapter", lora_path="/path/to/adapter2"
    )
    response = await serving_models.load_lora_adapter(load_request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter")
    assert len(serving_models.lora_requests) == 1
    assert "adapter" in serving_models.lora_requests
    assert serving_models.lora_requests["adapter"].lora_name == "adapter"
@pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields():
    """Empty adapter name/path must be rejected as invalid user input."""
    serving_models = await _async_serving_models_init()
    response = await serving_models.load_lora_adapter(
        LoadLoRAAdapterRequest(lora_name="", lora_path="")
    )
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_load_lora_adapter_duplicate():
    """Loading the same adapter name twice must fail the second time."""
    serving_models = await _async_serving_models_init()
    first = LoadLoRAAdapterRequest(
        lora_name="adapter1", lora_path="/path/to/adapter1"
    )
    response = await serving_models.load_lora_adapter(first)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
    assert len(serving_models.lora_requests) == 1

    duplicate = LoadLoRAAdapterRequest(
        lora_name="adapter1", lora_path="/path/to/adapter1"
    )
    response = await serving_models.load_lora_adapter(duplicate)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
    # The registry must be unchanged after the rejected duplicate.
    assert len(serving_models.lora_requests) == 1
@pytest.mark.asyncio
async def test_unload_lora_adapter_success():
    """An adapter that was loaded can be unloaded again."""
    serving_models = await _async_serving_models_init()
    await serving_models.load_lora_adapter(
        LoadLoRAAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1")
    )
    assert len(serving_models.lora_requests) == 1
    response = await serving_models.unload_lora_adapter(
        UnloadLoRAAdapterRequest(lora_name="adapter1")
    )
    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
    assert len(serving_models.lora_requests) == 0
@pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields():
    """Unloading with an empty name must be rejected as invalid input."""
    serving_models = await _async_serving_models_init()
    response = await serving_models.unload_lora_adapter(
        UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
    )
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_unload_lora_adapter_not_found():
    """Unloading an unknown adapter must yield a NotFound error."""
    serving_models = await _async_serving_models_init()
    response = await serving_models.unload_lora_adapter(
        UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
    )
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "NotFoundError"
    assert response.error.code == HTTPStatus.NOT_FOUND

View File

@@ -0,0 +1,352 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import AsyncExitStack
from unittest.mock import MagicMock
import pytest
import pytest_asyncio
from openai.types.responses.tool import (
CodeInterpreterContainerCodeInterpreterToolAuto,
LocalShell,
Mcp,
Tool,
)
from vllm.entrypoints.context import ConversationContext
from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
from vllm.entrypoints.openai.serving_responses import (
OpenAIServingResponses,
_extract_allowed_tools_from_mcp_requests,
extract_tool_types,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt
class MockConversationContext(ConversationContext):
    """Mock conversation context for testing.

    Records whether/how ``init_tool_sessions`` was invoked; every other
    ConversationContext hook is a no-op stub.
    """

    def __init__(self):
        # Set by init_tool_sessions(); inspected by the tests.
        self.init_tool_sessions_called = False
        self.init_tool_sessions_args = None
        self.init_tool_sessions_kwargs = None

    def append_output(self, output) -> None:
        pass

    def append_tool_output(self, output) -> None:
        pass

    async def call_tool(self):
        return []

    def need_builtin_tool_call(self) -> bool:
        return False

    def render_for_completion(self):
        return []

    async def init_tool_sessions(self, tool_server, exit_stack, request_id, mcp_tools):
        # Record the call and its arguments for later assertions.
        self.init_tool_sessions_called = True
        self.init_tool_sessions_args = (tool_server, exit_stack, request_id, mcp_tools)

    async def cleanup_session(self) -> None:
        pass
@pytest.fixture
def mock_serving_responses():
    """Create a mock OpenAIServingResponses instance"""
    responses_mock = MagicMock(spec=OpenAIServingResponses)
    responses_mock.tool_server = MagicMock(spec=ToolServer)
    return responses_mock
@pytest.fixture
def mock_context():
    """Create a mock conversation context (fresh instance per test)."""
    return MockConversationContext()
@pytest.fixture
def mock_exit_stack():
    """Create a mock async exit stack (no real contexts are entered)."""
    return MagicMock(spec=AsyncExitStack)
def test_extract_tool_types(monkeypatch: pytest.MonkeyPatch) -> None:
    """extract_tool_types collects tool `type` strings; MCP tools count only
    when their server_label is allow-listed via the
    VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS environment variable."""
    tools: list[Tool] = []
    assert extract_tool_types(tools) == set()
    tools.append(LocalShell(type="local_shell"))
    assert extract_tool_types(tools) == {"local_shell"}
    tools.append(CodeInterpreterContainerCodeInterpreterToolAuto(type="auto"))
    assert extract_tool_types(tools) == {"local_shell", "auto"}
    tools.extend(
        [
            Mcp(type="mcp", server_label="random", server_url=""),
            Mcp(type="mcp", server_label="container", server_url=""),
            Mcp(type="mcp", server_label="code_interpreter", server_url=""),
            Mcp(type="mcp", server_label="web_search_preview", server_url=""),
        ]
    )
    # When envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS is not set,
    # mcp tool types are all ignored.
    assert extract_tool_types(tools) == {"local_shell", "auto"}
    # container is allowed, it would be extracted
    monkeypatch.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "container")
    assert extract_tool_types(tools) == {"local_shell", "auto", "container"}
    # code_interpreter and web_search_preview are allowed,
    # they would be extracted
    monkeypatch.setenv(
        "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,web_search_preview"
    )
    assert extract_tool_types(tools) == {
        "local_shell",
        "auto",
        "code_interpreter",
        "web_search_preview",
    }
class TestInitializeToolSessions:
    """Test class for _initialize_tool_sessions method"""

    @pytest_asyncio.fixture
    async def serving_responses_instance(self):
        """Create a real OpenAIServingResponses instance for testing"""
        # Create minimal mocks for required dependencies
        engine_client = MagicMock()
        model_config = MagicMock()
        model_config.hf_config.model_type = "test"
        model_config.get_diff_sampling_param.return_value = {}
        engine_client.model_config = model_config
        engine_client.input_processor = MagicMock()
        engine_client.io_processor = MagicMock()
        models = MagicMock()
        tool_server = MagicMock(spec=ToolServer)
        # Create the actual instance
        instance = OpenAIServingResponses(
            engine_client=engine_client,
            models=models,
            request_logger=None,
            chat_template=None,
            chat_template_content_format="auto",
            tool_server=tool_server,
        )
        return instance

    @pytest.mark.asyncio
    async def test_initialize_tool_sessions(
        self, serving_responses_instance, mock_context, mock_exit_stack
    ):
        """Tool sessions are initialized only when the request carries tools."""
        # With no tools, init_tool_sessions must NOT be called.
        request = ResponsesRequest(input="test input", tools=[])
        await serving_responses_instance._initialize_tool_sessions(
            request, mock_context, mock_exit_stack
        )
        assert mock_context.init_tool_sessions_called is False
        # Create only MCP tools
        tools = [
            {"type": "web_search_preview"},
            {"type": "code_interpreter", "container": {"type": "auto"}},
        ]
        request = ResponsesRequest(input="test input", tools=tools)
        # Call the method
        await serving_responses_instance._initialize_tool_sessions(
            request, mock_context, mock_exit_stack
        )
        # Verify that init_tool_sessions was called
        assert mock_context.init_tool_sessions_called

    def test_validate_create_responses_input(
        self, serving_responses_instance, mock_context, mock_exit_stack
    ):
        """Supplying both previous_input_messages and previous_response_id is
        rejected as an invalid request."""
        request = ResponsesRequest(
            input="test input",
            previous_input_messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "What is my horoscope? I am an Aquarius.",
                        }
                    ],
                }
            ],
            previous_response_id="lol",
        )
        error = serving_responses_instance._validate_create_responses_input(request)
        assert error is not None
        assert error.error.type == "invalid_request_error"
class TestValidateGeneratorInput:
    """Test class for _validate_generator_input method"""

    @pytest_asyncio.fixture
    async def serving_responses_instance(self):
        """Create a real OpenAIServingResponses instance for testing"""
        # Create minimal mocks for required dependencies
        engine_client = MagicMock()
        model_config = MagicMock()
        model_config.hf_config.model_type = "test"
        model_config.get_diff_sampling_param.return_value = {}
        engine_client.model_config = model_config
        engine_client.input_processor = MagicMock()
        engine_client.io_processor = MagicMock()
        models = MagicMock()
        # Create the actual instance
        instance = OpenAIServingResponses(
            engine_client=engine_client,
            models=models,
            request_logger=None,
            chat_template=None,
            chat_template_content_format="auto",
        )
        # Set max_model_len for testing
        instance.max_model_len = 100
        return instance

    def test_validate_generator_input(self, serving_responses_instance):
        """Prompts within max_model_len pass; longer prompts are rejected."""
        # Create an engine prompt with valid length (less than max_model_len)
        valid_prompt_token_ids = list(range(5))  # 5 tokens < 100 max_model_len
        engine_prompt = TokensPrompt(prompt_token_ids=valid_prompt_token_ids)
        # Call the method
        result = serving_responses_instance._validate_generator_input(engine_prompt)
        # Should return None for valid input
        assert result is None
        # create an invalid engine prompt
        invalid_prompt_token_ids = list(range(200))  # 200 tokens > 100 max_model_len
        engine_prompt = TokensPrompt(prompt_token_ids=invalid_prompt_token_ids)
        # Call the method
        result = serving_responses_instance._validate_generator_input(engine_prompt)
        # Should return an ErrorResponse
        assert result is not None
        assert isinstance(result, ErrorResponse)
class TestExtractAllowedToolsFromMcpRequests:
    """Test class for _extract_allowed_tools_from_mcp_requests function"""

    def test_extract_allowed_tools_basic_formats(self):
        """Test extraction with list format, object format, and None."""
        from openai.types.responses.tool import McpAllowedToolsMcpToolFilter

        tools = [
            # List format
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["tool1", "tool2"],
            ),
            # Object format
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=McpAllowedToolsMcpToolFilter(
                    tool_names=["tool3", "tool4"]
                ),
            ),
            # None (no filter)
            Mcp(
                type="mcp",
                server_label="server3",
                allowed_tools=None,
            ),
        ]
        result = _extract_allowed_tools_from_mcp_requests(tools)
        # Each server label maps to its (normalized) allow-list.
        assert result == {
            "server1": ["tool1", "tool2"],
            "server2": ["tool3", "tool4"],
            "server3": None,
        }

    def test_extract_allowed_tools_star_normalization(self):
        """Test that '*' wildcard is normalized to None (select all tools).

        This is the key test requested by reviewers to explicitly demonstrate
        that the "*" select-all scenario is handled correctly.
        """
        from openai.types.responses.tool import McpAllowedToolsMcpToolFilter

        tools = [
            # Star in list format
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["*"],
            ),
            # Star mixed with other tools in list
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=["tool1", "*"],
            ),
            # Star in object format
            Mcp(
                type="mcp",
                server_label="server3",
                allowed_tools=McpAllowedToolsMcpToolFilter(tool_names=["*"]),
            ),
        ]
        result = _extract_allowed_tools_from_mcp_requests(tools)
        # All should be normalized to None (allows all tools)
        assert result == {
            "server1": None,
            "server2": None,
            "server3": None,
        }

    def test_extract_allowed_tools_filters_non_mcp(self):
        """Test that non-MCP tools are ignored during extraction."""
        tools = [
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["tool1"],
            ),
            LocalShell(type="local_shell"),  # Non-MCP tool should be ignored
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=["tool2"],
            ),
        ]
        result = _extract_allowed_tools_from_mcp_requests(tools)
        # Non-MCP tools should be ignored
        assert result == {
            "server1": ["tool1"],
            "server2": ["tool2"],
        }

View File

@@ -0,0 +1,262 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
import pytest
import pytest_asyncio
from transformers import AutoTokenizer
from vllm.config import ModelConfig
from vllm.v1.engine.detokenizer import check_stop_strings
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
GEN_ENDPOINT = "/inference/v1/generate"
def get_vocab_size(model_name):
    """Return the vocabulary size reported by the model's config."""
    return ModelConfig(
        model=model_name,
        seed=0,
        dtype="bfloat16",
    ).get_vocab_size()
@pytest.fixture(scope="module")
def tokenizer():
    """HF tokenizer matching the model served in this module."""
    return AutoTokenizer.from_pretrained(MODEL_NAME)
@pytest.fixture(scope="module")
def messages():
    """A simple chat conversation reused across the tests."""
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How many countries are in the EU?"},
    ]
@pytest.fixture(scope="module")
def server(request):
    """Start a server; indirect parametrization may append extra CLI args."""
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]
    extra_args = getattr(request, "param", None)
    if extra_args is not None:
        if isinstance(extra_args, (list, tuple)):
            args = args + list(extra_args)
        else:
            args = args + [str(extra_args)]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Async HTTP client pointed at the server (UDS-aware)."""
    transport = None
    if server.uds:
        transport = httpx.AsyncHTTPTransport(uds=server.uds)
    async with httpx.AsyncClient(
        transport=transport,
        base_url=server.url_root,
        timeout=600,
        headers={"Authorization": f"Bearer {server.DUMMY_API_KEY}"},
    ) as c:
        yield c
@pytest.mark.asyncio
async def test_generate_endpoint(client):
    """The generate endpoint accepts raw token ids and returns choices."""
    resp = await client.post(
        GEN_ENDPOINT,
        json={
            "model": MODEL_NAME,
            "token_ids": [1, 2, 3],
            "sampling_params": {"max_tokens": 5},
            "stream": False,
        },
    )
    resp.raise_for_status()
    assert "choices" in resp.json()
@pytest.mark.asyncio
async def test_same_response_as_chat_completions(client, tokenizer, messages):
    """Generating from pre-tokenized input must produce the same text as
    /v1/chat/completions for the same conversation and sampling settings."""
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    for ignore_eos in [True, False]:
        payload = {
            "model": MODEL_NAME,
            "token_ids": token_ids,
            "sampling_params": {
                "max_tokens": 24,
                "temperature": 0.0,
                # NOTE coordinator will set this to skip detokenization
                "detokenize": False,
                "ignore_eos": ignore_eos,
            },
            "stream": False,
        }
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        generate_data = generate_resp.json()
        # Decode client-side, since the server skipped detokenization.
        generate_res = tokenizer.decode(
            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
        )
        payload = {
            "model": MODEL_NAME,
            "messages": messages,
            "max_tokens": 24,
            "temperature": 0.0,
            "stream": False,
            "ignore_eos": ignore_eos,
            "chat_template_kwargs": dict(enable_thinking=False),
        }
        completions_resp = await client.post("/v1/chat/completions", json=payload)
        completions_data = completions_resp.json()
        completions_res = completions_data["choices"][0]["message"]["content"]
        # Greedy decoding on both paths must yield identical text.
        assert generate_res == completions_res
@pytest.mark.asyncio
async def test_stop_string_workflow(client, tokenizer, messages):
    """Stop strings are rejected in detokenize=False mode; applying them
    client-side must match what /v1/chat/completions returns with `stop`."""
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    payload = {
        "model": MODEL_NAME,
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
            # stop strings are only supported when detokenize is True.
            "stop": ["27 member"],
        },
        # TODO stream test is much more interesting
        "stream": False,
    }
    # Server must reject stop strings when detokenization is disabled.
    with pytest.raises(httpx.HTTPStatusError):
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        generate_resp.raise_for_status()
    payload["sampling_params"]["stop"] = None
    generate_resp = await client.post(
        GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
    )
    generate_data = generate_resp.json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )
    # NOTE This is under the responsibility of the coordinator
    # stop_checker = StopChecker(
    #     max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
    # )
    stop_str, truncate_to = check_stop_strings(
        generate_res, len(generate_res), ["27 member"], False
    )
    assert stop_str == "27 member"
    # abort request that hit stop string (requires tokens-only mode)
    # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]}) # noqa: E501
    # res.raise_for_status()
    # Apply the stop-string truncation client-side.
    generate_res = generate_res[:truncate_to]
    # Get stop_str response from chat completions
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "stop": ["27 member"],
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=payload)
    completions_data = completions_resp.json()
    completions_res = completions_data["choices"][0]["message"]["content"]
    # Client-side truncation must match the server-side stop handling.
    assert generate_res == completions_res
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "server",
    [
        [
            "--enable-lora",
            "--lora-modules",
            "Alice=charent/self_cognition_Alice",
            "Bob=charent/self_cognition_Bob",
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]
    ],
    indirect=True,
)
async def test_generate_with_lora_adapter(client, tokenizer, messages):
    """Generate through a LoRA adapter and cross-check with chat completions."""
    # The adapters configured above must show up in the model list.
    models_resp = await client.get("/v1/models")
    models_resp.raise_for_status()
    listed_ids = {entry["id"] for entry in models_resp.json().get("data", [])}
    assert {"Alice", "Bob"} <= listed_ids

    # Smoke test: address the adapter by name with raw token ids.
    smoke_request = {
        "model": "Alice",
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }
    smoke_resp = await client.post(GEN_ENDPOINT, json=smoke_request)
    smoke_resp.raise_for_status()
    assert "choices" in smoke_resp.json()

    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    gen_request = {
        "model": "Alice",
        "token_ids": prompt_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
        },
        "stream": False,
    }
    gen_resp = await client.post(GEN_ENDPOINT, json=gen_request)
    gen_text = tokenizer.decode(
        gen_resp.json()["choices"][0]["token_ids"], skip_special_tokens=True
    )

    chat_request = {
        "model": "Alice",
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    chat_resp = await client.post("/v1/chat/completions", json=chat_request)
    chat_text = chat_resp.json()["choices"][0]["message"]["content"]
    assert gen_text == chat_text

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import signal
import subprocess
import sys
import time
import openai
import pytest
from vllm.utils.network_utils import get_open_port
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
@pytest.mark.asyncio
async def test_shutdown_on_engine_failure():
    """Verify that API returns connection error when server process is killed.
    Starts a vLLM server, kills it to simulate a crash, then verifies that
    subsequent API calls fail appropriately.
    """
    port = get_open_port()
    proc = subprocess.Popen(
        [
            # dtype, max-len etc set so that this can run in CI
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.api_server",
            "--model",
            MODEL_NAME,
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "128",
            "--enforce-eager",
            "--port",
            str(port),
            "--gpu-memory-utilization",
            "0.05",
            "--max-num-seqs",
            "2",
            "--disable-frontend-multiprocessing",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Shield the child from any Ctrl-C delivered to the test process.
        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
    )
    # Wait for server startup
    start_time = time.time()
    client = openai.AsyncOpenAI(
        base_url=f"http://localhost:{port}/v1",
        api_key="dummy",
        max_retries=0,  # fail fast so the pytest.raises check below is deterministic
        timeout=10,
    )
    # Poll until server is ready
    while time.time() - start_time < 30:
        try:
            await client.completions.create(
                model=MODEL_NAME, prompt="Hello", max_tokens=1
            )
            break
        except Exception:
            # NOTE(review): blocking sleep inside an async test; tolerable
            # here since nothing else runs on this event loop.
            time.sleep(0.5)
            if proc.poll() is not None:
                # Child already exited during startup: surface its output.
                stdout, stderr = proc.communicate(timeout=1)
                pytest.fail(
                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
                )
    else:
        # while-else: the loop timed out without reaching `break`.
        proc.terminate()
        proc.wait(timeout=5)
        pytest.fail("Server failed to start in 30 seconds")
    # Kill server to simulate crash
    proc.terminate()
    time.sleep(1)
    # Verify API calls now fail
    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
        await client.completions.create(
            model=MODEL_NAME, prompt="This should fail", max_tokens=1
        )
    return_code = proc.wait(timeout=5)
    assert return_code is not None

View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import requests
from prometheus_client.parser import text_string_to_metric_families
from ...utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B"
def test_sleep_mode():
    """Exercise the sleep/wake_up dev endpoints and their Prometheus gauge."""
    # dtype, max-len etc set so that this can run in CI
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enable-sleep-mode",
    ]
    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
        env_dict={"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"},
    ) as remote_server:
        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert response.status_code == 200
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is True
        # check sleep metrics: level-1 sleep reports weights as offloaded
        response = requests.get(remote_server.url_for("metrics"))
        assert response.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
        assert awake == 0
        assert weights_offloaded == 1
        assert discard_all == 0
        response = requests.post(remote_server.url_for("wake_up"))
        assert response.status_code == 200
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is False
        # check sleep metrics
        response = requests.get(remote_server.url_for("metrics"))
        assert response.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
        assert awake == 1
        assert weights_offloaded == 0
        assert discard_all == 0
        # test wake up with tags
        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert response.status_code == 200
        response = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["weights"]}
        )
        assert response.status_code == 200
        # Waking only the weights leaves the engine partially asleep, so
        # is_sleeping stays True until every component has been woken.
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is True
        response = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["kv_cache"]}
        )
        assert response.status_code == 200
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is False
        # check sleep metrics
        response = requests.get(remote_server.url_for("metrics"))
        assert response.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
        assert awake == 1
        assert weights_offloaded == 0
        assert discard_all == 0
def _get_sleep_metrics_from_api(response: requests.Response):
    """Return (awake, weights_offloaded, discard_all)"""
    # Collect the gauge value for each sleep-state label; all three must appear.
    gauges = {"awake": None, "weights_offloaded": None, "discard_all": None}
    for family in text_string_to_metric_families(response.text):
        if family.name != "vllm:engine_sleep_state":
            continue
        for sample in family.samples:
            if sample.name != "vllm:engine_sleep_state":
                continue
            for label_value in sample.labels.values():
                if label_value in gauges:
                    gauges[label_value] = sample.value
    assert gauges["awake"] is not None
    assert gauges["weights_offloaded"] is not None
    assert gauges["discard_all"] is not None
    return gauges["awake"], gauges["weights_offloaded"], gauges["discard_all"]

View File

@@ -0,0 +1,342 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Sparse tensor validation in embedding APIs.
Tests verify that malicious sparse tensors are rejected before they can trigger
out-of-bounds memory writes during to_dense() operations.
"""
import base64
import io
import pytest
import torch
from vllm.entrypoints.renderer import CompletionRenderer
from vllm.multimodal.audio import AudioEmbeddingMediaIO
from vllm.multimodal.image import ImageEmbeddingMediaIO
def _encode_tensor(tensor: torch.Tensor) -> bytes:
"""Helper to encode a tensor as base64 bytes."""
buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)
return base64.b64encode(buffer.read())
def _create_malicious_sparse_tensor() -> torch.Tensor:
"""
Create a malicious sparse COO tensor with out-of-bounds indices.
This tensor has indices that point beyond the declared shape, which would
cause an out-of-bounds write when converted to dense format without
validation.
"""
# Create a 3x3 sparse tensor but with indices pointing to (10, 10)
indices = torch.tensor([[10], [10]]) # Out of bounds for 3x3 shape
values = torch.tensor([1.0])
shape = (3, 3)
# Create sparse tensor (this will be invalid)
sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
return sparse_tensor
def _create_valid_sparse_tensor() -> torch.Tensor:
"""Create a valid sparse COO tensor for baseline testing."""
indices = torch.tensor([[0, 1, 2], [0, 1, 2]])
values = torch.tensor([1.0, 2.0, 3.0])
shape = (3, 3)
sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
return sparse_tensor
def _create_valid_dense_tensor() -> torch.Tensor:
"""Create a valid dense tensor for baseline testing."""
return torch.randn(10, 768, dtype=torch.float32) # (seq_len, hidden_size)
class TestPromptEmbedsValidation:
    """Test sparse tensor validation in prompt embeddings (Completions API)."""

    def test_valid_dense_tensor_accepted(self, model_config):
        """Baseline: Valid dense tensors should work normally."""
        renderer = CompletionRenderer(model_config)
        dense = _create_valid_dense_tensor()
        loaded = renderer.load_prompt_embeds(_encode_tensor(dense))
        # Exactly one prompt comes back, with the embedding shape intact.
        assert len(loaded) == 1
        assert loaded[0]["prompt_embeds"].shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should load successfully."""
        media_io = ImageEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()
        # Sparse tensors are loaded as-is (remain sparse).
        loaded = media_io.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.shape == sparse.shape

    def test_malicious_sparse_tensor_rejected(self, model_config):
        """Security: Malicious sparse tensors should be rejected."""
        renderer = CompletionRenderer(model_config)
        hostile = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            renderer.load_prompt_embeds(hostile)
        # The failure must come from sparse-tensor validation.
        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_extremely_large_indices_rejected(self, model_config):
        """Security: Sparse tensors with extremely large indices should be rejected."""
        renderer = CompletionRenderer(model_config)
        huge = torch.sparse_coo_tensor(
            torch.tensor([[999999], [999999]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(huge))

    def test_negative_indices_rejected(self, model_config):
        """Security: Sparse tensors with negative indices should be rejected."""
        renderer = CompletionRenderer(model_config)
        negative = torch.sparse_coo_tensor(
            torch.tensor([[-1], [-1]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(negative))
class TestImageEmbedsValidation:
    """Test sparse tensor validation in image embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: Valid dense tensors should work normally."""
        media_io = ImageEmbeddingMediaIO()
        dense = _create_valid_dense_tensor()
        loaded = media_io.load_base64("", _encode_tensor(dense).decode("utf-8"))
        assert loaded.shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should load successfully."""
        # NOTE(review): this case goes through the audio loader — looks
        # copy-pasted from the audio class; confirm intent.
        media_io = AudioEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()
        loaded = media_io.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.shape == sparse.shape

    def test_malicious_sparse_tensor_rejected(self):
        """Security: Malicious sparse tensors should be rejected."""
        media_io = ImageEmbeddingMediaIO()
        hostile = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            media_io.load_base64("", hostile.decode("utf-8"))
        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_load_bytes_validates(self):
        """Security: Validation should also work for load_bytes method."""
        media_io = ImageEmbeddingMediaIO()
        raw = io.BytesIO()
        torch.save(_create_malicious_sparse_tensor(), raw)
        with pytest.raises((RuntimeError, ValueError)):
            media_io.load_bytes(raw.getvalue())
class TestAudioEmbedsValidation:
    """Test sparse tensor validation in audio embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: Valid dense tensors should work normally."""
        media_io = AudioEmbeddingMediaIO()
        dense = _create_valid_dense_tensor()
        loaded = media_io.load_base64("", _encode_tensor(dense).decode("utf-8"))
        assert loaded.shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should be converted successfully."""
        media_io = AudioEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()
        loaded = media_io.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        # The audio loader densifies valid sparse input.
        assert loaded.is_sparse is False

    def test_malicious_sparse_tensor_rejected(self):
        """Security: Malicious sparse tensors should be rejected."""
        media_io = AudioEmbeddingMediaIO()
        hostile = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            media_io.load_base64("", hostile.decode("utf-8"))
        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_load_bytes_validates(self):
        """Security: Validation should also work for load_bytes method."""
        media_io = AudioEmbeddingMediaIO()
        raw = io.BytesIO()
        torch.save(_create_malicious_sparse_tensor(), raw)
        with pytest.raises((RuntimeError, ValueError)):
            media_io.load_bytes(raw.getvalue())
class TestSparseTensorValidationIntegration:
    """
    These tests verify the complete attack chain is blocked at all entry points.
    """

    def test_attack_scenario_completions_api(self, model_config):
        """
        Simulate a complete attack through the Completions API.
        Attack scenario:
        1. Attacker crafts malicious sparse tensor
        2. Encodes it as base64
        3. Sends to /v1/completions with prompt_embeds parameter
        4. Server should reject before memory corruption occurs
        """
        renderer = CompletionRenderer(model_config)
        hostile_payload = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(hostile_payload)

    def test_attack_scenario_chat_api_image(self):
        """
        Simulate attack through Chat API with image_embeds.
        Verifies the image embeddings path is protected.
        """
        hostile_payload = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)):
            ImageEmbeddingMediaIO().load_base64("", hostile_payload.decode("utf-8"))

    def test_attack_scenario_chat_api_audio(self):
        """
        Simulate attack through Chat API with audio_embeds.
        Verifies the audio embeddings path is protected.
        """
        hostile_payload = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)):
            AudioEmbeddingMediaIO().load_base64("", hostile_payload.decode("utf-8"))

    def test_multiple_valid_embeddings_in_batch(self, model_config):
        """
        Regression test: Multiple valid embeddings should still work.
        Ensures the fix doesn't break legitimate batch processing.
        """
        renderer = CompletionRenderer(model_config)
        batch = [_encode_tensor(_create_valid_dense_tensor()) for _ in range(3)]
        assert len(renderer.load_prompt_embeds(batch)) == 3

    def test_mixed_valid_and_malicious_rejected(self, model_config):
        """
        Security: Batch with one malicious tensor should be rejected.
        Even if most tensors are valid, a single malicious one should
        cause rejection of the entire batch.
        """
        renderer = CompletionRenderer(model_config)
        tainted_batch = [
            _encode_tensor(_create_valid_dense_tensor()),
            _encode_tensor(_create_malicious_sparse_tensor()),  # Malicious
            _encode_tensor(_create_valid_dense_tensor()),
        ]
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(tainted_batch)
# Pytest fixtures
@pytest.fixture
def model_config():
    """Build a minimal ModelConfig suitable for the prompt-embeds tests."""
    from vllm.config import ModelConfig

    config_kwargs = {
        "model": "facebook/opt-125m",
        "tokenizer": "facebook/opt-125m",
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "dtype": "float32",
        "seed": 0,
        # Required for prompt embeds tests
        "enable_prompt_embeds": True,
    }
    return ModelConfig(**config_kwargs)

View File

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import os
import tempfile
import openai
import pytest
import pytest_asyncio
import torch.cuda
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig,
tensorize_lora_adapter,
tensorize_vllm_model,
)
from ...utils import RemoteOpenAIServer
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
LORA_PATH = "davzoku/finqa_adapter_1b"
def _cleanup():
    """Force a GC pass and release cached CUDA memory between tests."""
    gc.collect()
    torch.cuda.empty_cache()
@pytest.fixture(autouse=True)
def cleanup():
    # Runs before every test in this module to keep GPU memory headroom.
    _cleanup()
@pytest.fixture(scope="module")
def tmp_dir():
    """Module-scoped scratch directory, removed when the module's tests finish."""
    with tempfile.TemporaryDirectory() as path:
        yield path
@pytest.fixture(scope="module")
def model_uri(tmp_dir):
    """Path of the serialized model tensors inside the scratch directory."""
    yield f"{tmp_dir}/model.tensors"
@pytest.fixture(scope="module")
def tensorize_model_and_lora(tmp_dir, model_uri):
    """Serialize the model and its LoRA adapter once for the whole module."""
    tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
    args = EngineArgs(model=MODEL_NAME)
    tensorize_lora_adapter(LORA_PATH, tensorizer_config)
    tensorize_vllm_model(args, tensorizer_config)
    # Manually invoke a _cleanup() here, as the cleanup()
    # fixture won't be guaranteed to be called after this
    # when this fixture is used for a test
    _cleanup()
    yield
@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
    """Serve the tensorized model (with LoRA enabled) over the OpenAI API."""
    # In this case, model_uri is a directory with a model.tensors
    # file and all necessary model artifacts, particularly a
    # HF `config.json` file. In this case, Tensorizer can infer the
    # `TensorizerConfig` so --model-loader-extra-config can be completely
    # omitted.
    ## Start OpenAI API server
    args = [
        "--load-format",
        "tensorizer",
        "--served-model-name",
        MODEL_NAME,
        "--enable-lora",
    ]
    model_dir = os.path.dirname(model_uri)
    with RemoteOpenAIServer(model_dir, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI-compatible client for the tensorizer-loaded server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """End-to-end completion against the tensorizer-loaded model.

    Checks the basic response contract: exactly one choice, at least five
    characters of generated text, a "length" finish reason, and exact
    token-usage accounting.
    """
    _cleanup()
    completion = await client.completions.create(
        model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
    )
    assert completion.id is not None
    # Exactly one choice (previously asserted twice; once is enough).
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.model == MODEL_NAME
    assert len(completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11
    )

View File

@@ -0,0 +1,74 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import pytest
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
@pytest.fixture(scope="module")
def server():
    """Download config-only model artifacts and launch a tokens-only server.

    NOTE(review): rebinds the module-level MODEL_PATH to the directory
    returned by download_weights_from_hf; the test below relies on that
    rebound value as the served model name.
    """
    global MODEL_PATH
    MODEL_PATH = download_weights_from_hf(
        MODEL_NAME,
        allow_patterns=["*"],
        cache_dir=MODEL_PATH,
        # Tokenizer/vocab/weights are skipped: the server runs with
        # --skip-tokenizer-init and --load-format dummy below.
        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"],
    )
    args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        "--skip-tokenizer-init",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
    """
    Token-in-token-out round trip: send raw token ids as the prompt with
    return_token_ids enabled, and verify that the echoed prompt token ids
    decode back to the original text.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    text = "Hello, world! How are you today?"
    token_ids = tokenizer.encode(text)
    async with server.get_async_client() as client:
        # Request with return_token_ids enabled; echo=True returns the prompt too.
        completion = await client.completions.create(
            model=MODEL_PATH,
            prompt=token_ids,
            max_tokens=20,
            temperature=0,
            echo=True,
            extra_body={
                "return_token_ids": True,
            },
        )
        # Verify all fields are present
        assert (
            completion.choices[0].token_ids is not None
            and 0 < len(completion.choices[0].token_ids) <= 20
        )
        assert completion.choices[0].prompt_token_ids is not None
        # Decode prompt tokens
        if completion.choices[0].prompt_token_ids:
            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
            # The decoded prompt should match or close to original prompt
            assert prompt_text == text

View File

@@ -0,0 +1,355 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
import requests
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
    """Spin up one shared vLLM OpenAI server for this test module."""
    server_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        "--enable-tokenizer-info-endpoint",
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote:
        yield remote
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str):
    # NOTE(review): with indirect=["tokenizer_name"] the value would normally
    # arrive via request.param; here it is taken from the parametrized
    # model_name instead (the two are always equal in this module) — confirm.
    return model_name
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI-compatible client for the module-scoped server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """The /tokenize endpoint must agree with the HF tokenizer for plain prompts."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    prompt = "vllm1 This is a test prompt."
    for add_special in (False, True):
        expected_ids = tokenizer.encode(prompt, add_special_tokens=add_special)
        response = requests.post(
            server.url_for("tokenize"),
            json={
                "add_special_tokens": add_special,
                "model": model_name,
                "prompt": prompt,
            },
        )
        response.raise_for_status()
        body = response.json()
        assert body["tokens"] == expected_ids
        assert body["count"] == len(expected_ids)
        assert body["max_model_len"] == 8192
        assert body["token_strs"] is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """/tokenize on chat messages must match apply_chat_template + encode
    across every combination of the generation-prompt, special-token and
    continue-final-message flags."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    for add_generation in [False, True]:
        for add_special in [False, True]:
            conversation = [
                {"role": "user", "content": "Hi there!"},
                {"role": "assistant", "content": "Nice to meet you!"},
                {"role": "user", "content": "Can I ask a question? vllm1"},
            ]
            for continue_final in [False, True]:
                if add_generation and continue_final:
                    # Mutually exclusive options; skip the invalid combination.
                    continue
                if continue_final:
                    # NOTE: mutates `conversation` in place; safe only because
                    # True is the final value of the innermost loop.
                    conversation.append({"role": "assistant", "content": "Sure,"})
                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                    },
                )
                response.raise_for_status()
                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                assert result["max_model_len"] == 8192
                assert result["token_strs"] is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat_with_tools(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """/tokenize with tool definitions must match
    apply_chat_template(tools=...) + encode for every flag combination."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    for add_generation in [False, True]:
        for add_special in [False, True]:
            conversation = [
                {
                    "role": "user",
                    "content": "What's the weather like in Paris today?",
                }
            ]
            tools = [
                {
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "parameters": {
                            "type": "object",
                            "properties": {"location": {"type": "string"}},
                        },
                    },
                }
            ]
            for continue_final in [False, True]:
                if add_generation and continue_final:
                    # Mutually exclusive options; skip the invalid combination.
                    continue
                if continue_final:
                    # NOTE: in-place append; safe only because True is the
                    # final value of the innermost loop.
                    conversation.append({"role": "assistant", "content": "Sure,"})
                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tools=tools,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                        "tools": tools,
                    },
                )
                response.raise_for_status()
                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                assert result["max_model_len"] == 8192
                assert result["token_strs"] is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_with_return_token_strs(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """return_token_strs=True must include per-token strings in the response."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    prompt = "This is a token_strs test prompt! vllm1"
    response = requests.post(
        server.url_for("tokenize"),
        json={"prompt": prompt, "model": model_name, "return_token_strs": True},
    )
    response.raise_for_status()
    expected_ids = tokenizer.encode(prompt, add_special_tokens=True)
    expected_strs = tokenizer.convert_ids_to_tokens(expected_ids)
    body = response.json()
    assert body["tokens"] == expected_ids
    assert body["count"] == len(expected_ids)
    assert body["max_model_len"] == 8192
    assert body["token_strs"] == expected_strs
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_detokenize(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """/detokenize must reconstruct the exact original prompt text."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    original_prompt = "This is a test prompt. vllm1"
    prompt_ids = tokenizer.encode(original_prompt, add_special_tokens=False)
    response = requests.post(
        server.url_for("detokenize"), json={"model": model_name, "tokens": prompt_ids}
    )
    response.raise_for_status()
    assert response.json() == {"prompt": original_prompt}
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenizer_info_basic(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Test basic tokenizer info endpoint functionality."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    info = response.json()
    # tokenizer_class must be present, a string, and non-empty.
    assert "tokenizer_class" in info
    assert isinstance(info["tokenizer_class"], str)
    assert info["tokenizer_class"]
@pytest.mark.asyncio
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
    """Test that the response matches expected schema types."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    info = response.json()
    expected_types = {
        "add_bos_token": bool,
        "add_prefix_space": bool,
        "clean_up_tokenization_spaces": bool,
        "split_special_tokens": bool,
        "bos_token": str,
        "eos_token": str,
        "pad_token": str,
        "unk_token": str,
        "chat_template": str,
        "errors": str,
        "model_max_length": int,
        "additional_special_tokens": list,
        "added_tokens_decoder": dict,
    }
    for field, expected_type in expected_types.items():
        value = info.get(field)
        if value is None:
            # Fields are optional; only type-check the ones present and non-null.
            continue
        assert isinstance(value, expected_type), (
            f"{field} should be {expected_type.__name__}"
        )
@pytest.mark.asyncio
async def test_tokenizer_info_added_tokens_structure(
    server: RemoteOpenAIServer,
):
    """Test added_tokens_decoder structure if present."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    decoder = response.json().get("added_tokens_decoder")
    if not decoder:
        # Absent or empty map is acceptable; nothing further to check.
        return
    for token_id, token_info in decoder.items():
        assert isinstance(token_id, str), "Token IDs should be strings"
        assert isinstance(token_info, dict), "Token info should be a dict"
        assert "content" in token_info, "Token info should have content"
        assert "special" in token_info, "Token info should have special flag"
        assert isinstance(token_info["special"], bool), (
            "Special flag should be boolean"
        )
@pytest.mark.asyncio
async def test_tokenizer_info_consistency_with_tokenize(
    server: RemoteOpenAIServer,
):
    """The advertised model_max_length must cover the tokenize endpoint's max."""
    info_resp = requests.get(server.url_for("tokenizer_info"))
    info_resp.raise_for_status()
    tokenize_resp = requests.post(
        server.url_for("tokenize"),
        json={"model": MODEL_NAME, "prompt": "Hello world!"},
    )
    tokenize_resp.raise_for_status()
    info_max_len = info_resp.json().get("model_max_length")
    tokenize_max_len = tokenize_resp.json().get("max_model_len")
    # Only comparable when both endpoints report a limit.
    if info_max_len and tokenize_max_len:
        assert info_max_len >= tokenize_max_len, (
            "Info max length should be >= tokenize max length"
        )
@pytest.mark.asyncio
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
    """If a chat template is reported, it must be a non-blank string."""
    resp = requests.get(server.url_for("tokenizer_info"))
    resp.raise_for_status()
    template = resp.json().get("chat_template")
    if template:
        assert isinstance(template, str), "Chat template should be a string"
        assert template.strip(), "Chat template should not be empty"

View File

@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import json
import pytest
from ...utils import RemoteOpenAIServer
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
    """Transcribe a short clip and check both the text and usage accounting."""
    server_args = ["--enforce-eager"]
    # Mistral-family checkpoints need the mistral tokenizer/config/load formats.
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )
    result = json.loads(transcription)
    assert "Mary had a little lamb," in result["text"]
    usage = result["usage"]
    assert usage["seconds"] == 16, usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
    model_name = "ibm-granite/granite-speech-3.3-2b"
    # Served adapter name; requests use this instead of the base model name.
    lora_model_name = "speech"
    server_args = [
        "--enforce-eager",
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "1",
    ]
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        # Requesting the adapter name exercises the LoRA path end-to-end.
        transcription = await client.audio.transcriptions.create(
            model=lora_model_name,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(transcription)
        out_text = out["text"]
        out_usage = out["usage"]
        assert "mary had a little lamb" in out_text
        assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo):
    """Transcribe an Italian clip with Gemma and check a key phrase appears."""
    # Gemma accuracy on some of the audio samples we use is particularly bad,
    # hence we use a different one here. WER is evaluated separately.
    model_name = "google/gemma-3n-E2B-it"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(
        model_name, server_args, max_wait_seconds=480
    ) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=foscolo,
            language="it",
            response_format="text",
            temperature=0.0,
        )
    text = json.loads(transcription)["text"]
    assert "da cui vergine nacque Venere" in text

View File

@@ -0,0 +1,246 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import asyncio
import io
import json
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-large-v3-turbo"
SERVER_ARGS = ["--enforce-eager"]
@pytest.fixture(scope="module")
def server():
    # Module-scoped Whisper server shared by every test in this file.
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def whisper_client(server):
    # Fresh async OpenAI client per test; closed automatically on teardown.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_basic_audio(whisper_client, mary_had_lamb):
    """Short-clip transcription returns the expected text and usage seconds."""
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    result = json.loads(transcription)
    assert "Mary had a little lamb," in result["text"]
    seconds = result["usage"]["seconds"]
    assert seconds == 16, seconds
@pytest.mark.asyncio
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
    """Two concurrent transcription requests are both served correctly."""

    def make_request(audio_file):
        # Returns an un-awaited coroutine; both are gathered below so the
        # server handles the requests concurrently.
        return whisper_client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=audio_file,
            language="en",
            response_format="text",
            temperature=0.0,
        )

    results = await asyncio.gather(
        make_request(mary_had_lamb), make_request(winning_call)
    )
    first_text, second_text = (json.loads(r)["text"] for r in results)
    assert "Mary had a little lamb," in first_text
    assert "Edgar Martinez" in second_text
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, whisper_client):
    """An unknown language code must be rejected with a 400."""
    with pytest.raises(openai.BadRequestError):
        # "hh" is not a valid language code.
        await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
        )
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client):
    """A clip tiled 10x must be transcribed 10x with matching usage seconds."""
    mary_had_lamb.seek(0)
    audio, sr = librosa.load(mary_had_lamb)
    # Append a short silence so the audio-splitting points are repeatable.
    padded = np.pad(audio, (0, 1600))
    buffer = io.BytesIO()
    sf.write(buffer, np.tile(padded, 10), sr, format="WAV")
    buffer.seek(0)
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=buffer,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    result = json.loads(transcription)
    occurrences = result["text"].count("Mary had a little lamb")
    assert occurrences == 10, occurrences
    seconds = result["usage"]["seconds"]
    assert seconds == 161, seconds
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
    """Chat and text completion APIs are rejected for an ASR-only model."""
    chat_res = await whisper_client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "system", "content": "You are a helpful assistant."}],
    )
    chat_err = chat_res.error
    assert chat_err["code"] == 400
    assert chat_err["message"] == "The model does not support Chat Completions API"
    text_res = await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello")
    text_err = text_res.error
    assert text_err["code"] == 400
    assert text_err["message"] == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call, whisper_client):
    """Streamed transcription, reassembled, must match the non-streamed text."""
    transcription = ""
    # Reference result: identical request without streaming.
    res_no_stream = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        response_format="json",
        language="en",
        temperature=0.0,
    )
    res = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        timeout=30,
    )
    # Reconstruct from chunks and validate
    async for chunk in res:
        # NOTE(review): dict-style indexing here, unlike the attribute access
        # used elsewhere — presumably the stream yields raw dict deltas; verify.
        text = chunk.choices[0]["delta"]["content"]
        transcription += text
    assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, whisper_client):
    """Streaming usage options: per-chunk usage plus a final usage-only chunk."""
    res = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
        timeout=30,
    )
    saw_final_usage = False
    every_chunk_has_usage = True
    async for chunk in res:
        if len(chunk.choices) == 0:
            # An empty choices list marks the final usage-only message.
            saw_final_usage = True
        else:
            every_chunk_has_usage = every_chunk_has_usage and hasattr(chunk, "usage")
    assert saw_final_usage and every_chunk_has_usage
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, whisper_client):
    """
    Compare sampling with params and greedy sampling to assert results
    are different when extreme sampling parameters values are picked.
    """
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.8,
        # Deliberately extreme values so the output diverges from greedy.
        extra_body=dict(
            seed=42,
            repetition_penalty=1.9,
            top_k=12,
            top_p=0.4,
            min_p=0.5,
            frequency_penalty=1.8,
            presence_penalty=2.0,
        ),
    )
    # Greedy baseline with the same seed.
    greedy_transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.0,
        extra_body=dict(seed=42),
    )
    assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, whisper_client):
    """A conditioning prompt must not drop the original transcribed content."""
    prompt = "This is a speech, recorded in a phonograph."
    # This prefix from the source audio must survive with and without a prompt.
    prefix = "The first words I spoke in the original phonograph"

    async def transcribe(**extra_kwargs):
        # One call per configuration; all other parameters held constant.
        result = await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
            **extra_kwargs,
        )
        return json.loads(result)["text"]

    assert prefix in await transcribe()
    assert prefix in await transcribe(prompt=prompt)
@pytest.mark.asyncio
async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
    """verbose_json responses must carry at least one timestamped segment."""
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="verbose_json",
        temperature=0.0,
    )
    segments = transcription.segments
    assert segments is not None
    assert len(segments) > 0

View File

@@ -0,0 +1,229 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
# imports for structured outputs tests
import json
import httpx
import librosa
import numpy as np
import pytest
import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
SERVER_ARGS = ["--enforce-eager"]
@pytest.fixture(
    scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request):
    # Parametrized over model name; yields (server, model_name) so each test
    # knows which model is currently being served.
    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
        yield remote_server, request.param
@pytest_asyncio.fixture
async def client_and_model(server):
    # Unpack the (server, model_name) tuple from the parametrized fixture
    # and hand tests an async client together with the live model's name.
    server, model_name = server
    async with server.get_async_client() as async_client:
        yield async_client, model_name
@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    """Translation requests against a text-only model return a 400 error."""
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(
            model=model_name, file=foscolo, temperature=0.0
        )
    error = res.error
    assert error["code"] == 400
    assert not res.text
    assert error["message"] == "The model does not support Translations API"
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
    """Ensure STT (translate) requests can pass LoRA through to generate."""
    # NOTE - careful to call this test before the module scoped server
    # fixture, otherwise it'll OOMkill the CI
    model_name = "ibm-granite/granite-speech-3.3-2b"
    # Served adapter name; requests use this instead of the base model name.
    lora_model_name = "speech"
    server_args = [
        "--enforce-eager",
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "1",
    ]
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        # English -> Spanish translation through the LoRA adapter.
        translation = await client.audio.translations.create(
            model=lora_model_name,
            file=mary_had_lamb,
            extra_body=dict(language="en", to_language="es"),
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(translation)["text"].strip().lower()
        assert "pequeño" in out.split(" ")
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client_and_model):
    """Italian audio is translated into English text."""
    client, model_name = client_and_model
    translation = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="text",
        # TODO remove `language="it"` once language detection is implemented
        extra_body=dict(language="it", to_language="en"),
        temperature=0.0,
    )
    text = json.loads(translation)["text"].strip().lower()
    assert "greek sea" in text
@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client_and_model):
    """Conditioning prompt text must not leak into the translation output."""
    client, model_name = client_and_model
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    transcription = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        prompt=prompt,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0,
    )
    text = json.loads(transcription)["text"]
    assert "Nor will I ever touch the sacred" not in text
    assert prompt not in text
@pytest.mark.asyncio
async def test_streaming_response(foscolo, client_and_model, server):
    """Streamed translation must be ~equal (>=90% token match) to non-streamed."""
    client, model_name = client_and_model
    translation = ""
    res_no_stream = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="json",
        extra_body=dict(language="it", to_language="en", seed=42),
        temperature=0.0,
    )
    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "temperature": 0.0,
        "seed": 42,
    }
    foscolo.seek(0)
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream(
            "POST", url, headers=headers, data=data, files=files
        ) as response:
            # Parse the SSE stream by hand: strip the "data: " framing,
            # stop at the [DONE] sentinel, and accumulate delta content.
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: ") :]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                text = chunk["choices"][0].get("delta", {}).get("content")
                translation += text or ""
    res_stream = translation.split()
    # NOTE There's a small non-deterministic issue here, likely in the attn
    # computation, which will cause a few tokens to be different, while still
    # being very close semantically.
    assert (
        sum([x == y for x, y in zip(res_stream, res_no_stream.text.split())])
        >= len(res_stream) * 0.9
    )
@pytest.mark.asyncio
async def test_stream_options(foscolo, server):
    """Streaming usage stats: every chunk carries usage, plus a final summary."""
    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
        "temperature": 0.0,
    }
    foscolo.seek(0)
    final = False
    continuous = True
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream(
            "POST", url, headers=headers, data=data, files=files
        ) as response:
            # Parse the SSE stream by hand (same framing as the OpenAI API).
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: ") :]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                choices = chunk.get("choices", [])
                if not choices:
                    # final usage sent
                    final = True
                else:
                    continuous = continuous and ("usage" in chunk)
    assert final and continuous
@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client_and_model):
    """A clip tiled twice should produce the key phrase twice."""
    client, model_name = client_and_model
    if model_name == "google/gemma-3n-E2B-it":
        pytest.skip("Gemma3n does not support long audio requests")
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    # Write the doubled audio into an in-memory WAV file.
    buffer = io.BytesIO()
    sf.write(buffer, np.tile(audio, 2), sr, format="WAV")
    buffer.seek(0)
    translation = await client.audio.translations.create(
        model=model_name,
        file=buffer,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0,
    )
    text = json.loads(translation)["text"].strip().lower()
    assert text.count("greek sea") == 2

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from tempfile import TemporaryDirectory
import httpx
import pytest
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server():
    # The server listens on a Unix domain socket inside a temp dir, so tests
    # can exercise UDS transport; the socket is cleaned up with the directory.
    with TemporaryDirectory() as tmpdir:
        args = [
            # use half precision for speed and memory savings in CI environment
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "8192",
            "--enforce-eager",
            "--max-num-seqs",
            "128",
            "--uds",
            f"{tmpdir}/vllm.sock",
        ]
        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    """Query /version over the server's Unix domain socket."""
    transport = httpx.HTTPTransport(uds=server.uds)
    # Use the client as a context manager so the UDS connection is closed
    # instead of leaked when the test finishes.
    with httpx.Client(transport=transport) as client:
        response = client.get(server.url_for("version"))
        response.raise_for_status()
        assert response.json() == {"version": VLLM_VERSION}

View File

@@ -0,0 +1,325 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video
from ...utils import RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
MAXIMUM_VIDEOS = 4
TEST_VIDEO_URLS = [
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4",
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4",
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4",
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4",
]
@pytest.fixture(scope="module")
def server():
    # Small limits (2 concurrent seqs, eager mode) keep this within CI memory;
    # --limit-mm-per-prompt caps videos per request at MAXIMUM_VIDEOS.
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "2",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"video": MAXIMUM_VIDEOS}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async client bound to the module-scoped server; closed after each test.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_video() -> dict[str, str]:
    # Fetch and base64-encode every test video once per session (network I/O).
    return {
        video_url: encode_video_base64(fetch_video(video_url)[0])
        for video_url in TEST_VIDEO_URLS
    }
def dummy_messages_from_video_url(
video_urls: str | list[str],
content_text: str = "What's in this video?",
):
if isinstance(video_urls, str):
video_urls = [video_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "video_url", "video_url": {"url": video_url}}
for video_url in video_urls
),
{"type": "text", "text": content_text},
],
}
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Single-turn video chat: finish reason, usage, and multi-turn follow-up."""
    messages = dummy_messages_from_video_url(video_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Prompt token count is fixed by this model's video placeholder expansion.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )
    # (fix) removed a redundant duplicate assignment of `message` here.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_error_on_invalid_video_url_type(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """A bare string video_url (instead of {"url": ...}) must be rejected."""
    bad_messages = [
        {
            "role": "user",
            "content": [
                # Deliberately malformed: the API expects a dict payload here.
                {"type": "video_url", "video_url": video_url},
                {"type": "text", "text": "What's in this video?"},
            ],
        }
    ]
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=bad_messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Beam search with n=2 must yield two distinct completions."""
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_video_url(video_url),
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    assert first.message.content != second.message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    base64_encoded_video: dict[str, str],
):
    """Same as the URL test, but the video is sent inline as a base64 data URI."""
    messages = dummy_messages_from_video_url(
        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Prompt token count is fixed by this model's video placeholder expansion.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )
    # (fix) removed a redundant duplicate assignment of `message` here.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    base64_encoded_video: dict[str, str],
):
    """Beam search over a base64 data-URI video must yield two distinct outputs."""
    data_uri = f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_video_url(data_uri),
        n=2,
        max_completion_tokens=10,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    assert first.message.content != second.message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Streamed video chat must reassemble to the non-streamed completion."""
    messages = dummy_messages_from_video_url(video_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    # `chunk`/`delta` intentionally refer to the final iteration of the loop.
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "video_urls",
    # Include the full list and an over-limit case. The previous
    # range(2, len(TEST_VIDEO_URLS)) never exceeded MAXIMUM_VIDEOS, which
    # left the BadRequestError branch below unreachable.
    [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS) + 1)]
    + [TEST_VIDEO_URLS * 2],
)
async def test_multi_video_input(
    client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
):
    """Multi-video prompts up to the limit succeed; over-limit requests 400."""
    messages = dummy_messages_from_video_url(video_urls)
    if len(video_urls) > MAXIMUM_VIDEOS:
        with pytest.raises(openai.BadRequestError):  # test multi-video input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )
        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0

View File

@@ -0,0 +1,567 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import RemoteOpenAIServer
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
EXPECTED_MM_BEAM_SEARCH_RES = [
[
"The image shows a wooden boardwalk leading through a",
"The image shows a wooden boardwalk extending into a",
],
[
"The image shows two parrots perched on",
"The image shows two birds perched on a cur",
],
[
"The image shows a Venn diagram with three over",
"The image shows a colorful Venn diagram with",
],
[
"This image displays a gradient of colors ranging from",
"This image displays a gradient of colors forming a spectrum",
],
]
@pytest.fixture(scope="module")
def server():
    # Small limits (5 concurrent seqs, eager mode) keep this within CI memory;
    # --limit-mm-per-prompt caps images per request at MAXIMUM_IMAGES.
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async client bound to the module-scoped server; closed after each test.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
    # Base64-encode each local test image once per session.
    return {
        image_asset: encode_image_base64(
            local_asset_server.get_image_asset(image_asset)
        )
        for image_asset in TEST_IMAGE_ASSETS
    }
def dummy_messages_from_image_url(
image_urls: str | list[str],
content_text: str = "What's in this image?",
):
if isinstance(image_urls, str):
image_urls = [image_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "image_url", "image_url": {"url": image_url}}
for image_url in image_urls
),
{"type": "text", "text": content_text},
],
}
]
def get_hf_prompt_tokens(model_name, content, image_url):
    """Compute the HF-reference prompt token count for one image + text prompt.

    Used to cross-check the server's reported prompt_tokens usage against what
    the HuggingFace processor produces for the same input.
    """
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
    )
    # Phi-3.5-vision image placeholder token for the first image.
    placeholder = "<|image_1|>\n"
    messages = [
        {
            "role": "user",
            "content": f"{placeholder}{content}",
        }
    ]
    image = fetch_image(image_url)
    # Unwrap MediaWithBytes if present
    if isinstance(image, MediaWithBytes):
        image = image.media
    images = [image]
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(prompt, images, return_tensors="pt")
    # Sequence length of the tokenized prompt (batch dim is 1).
    return inputs.input_ids.shape[1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Single-turn image chat: finish reason, HF-verified usage, follow-up turn."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(image_url, content_text)
    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Cross-check usage accounting against the HuggingFace processor.
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )
    # (fix) removed a redundant duplicate assignment of `message` here.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_error_on_invalid_image_url_type(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """A bare string image_url (instead of {"url": ...}) must be rejected."""
    content_text = "What's in this image?"
    bad_messages = [
        {
            "role": "user",
            "content": [
                # Deliberately malformed: the API expects a dict payload here.
                {"type": "image_url", "image_url": image_url},
                {"type": "text", "text": content_text},
            ],
        }
    ]
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=bad_messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Beam search with n=2 must yield two distinct completions."""
    content_text = "What's in this image?"
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_image_url(image_url, content_text),
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    assert first.message.content != second.message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Single-turn and multi-turn chat with a base64-encoded image payload."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(
        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
        content_text,
    )
    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )
    # Fix: the original assigned `message` twice in a row with equivalent
    # expressions; `choice` already is `chat_completion.choices[0]`.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
async def test_single_chat_session_image_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_idx: int,
    base64_encoded_image: dict[str, str],
):
    # NOTE: This test also validates that we pass MM data through beam search
    raw_image_url = TEST_IMAGE_ASSETS[image_idx]
    expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
    data_url = f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_image_url(data_url),
        n=2,
        max_completion_tokens=10,
        temperature=0.0,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    # Each beam candidate must match its recorded expected output.
    for choice, expected_str in zip(chat_completion.choices, expected_res):
        assert choice.message.content == expected_str
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_chat_streaming_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Streamed image-chat output must exactly match the non-streamed output."""
    messages = dummy_messages_from_image_url(image_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    # Reference output and finish reason from the non-streaming request;
    # temperature=0.0 keeps both runs deterministic.
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    # NOTE: `chunk`/`delta` deliberately refer to the last iteration's values.
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_multi_image_input(
    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
):
    """Over-limit image counts are rejected, and the server stays healthy."""
    messages = dummy_messages_from_image_url(image_urls)
    if len(image_urls) <= MAXIMUM_IMAGES:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
    else:
        # Too many images: the request must be rejected up front.
        with pytest.raises(openai.BadRequestError):
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )
        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion_text = completion.choices[0].text
        assert completion_text is not None and len(completion_text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Each image URL should produce a non-empty chat completion."""
    system_msg = {"role": "system", "content": "You are a helpful assistant."}
    for image_url in image_urls:
        user_msg = {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        chat_completion = await client.chat.completions.create(
            messages=[system_msg, user_msg],
            model=model_name,
        )
        content = chat_completion.choices[0].message.content
        assert content is not None
        assert isinstance(content, str)
        assert len(content) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_uuid(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Registering an image under a `uuid` lets a later request reference the
    cached entry by uuid alone (empty `image_url` dict)."""
    for image_url in image_urls:
        chat_completion = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                            },
                            # The url doubles as the uuid for this test.
                            "uuid": image_url,
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0
        # Second request, with empty image but the same uuid.
        chat_completion_with_empty_image = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {"type": "image_url", "image_url": {}, "uuid": image_url},
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion_with_empty_image.choices[0].message.content is not None
        assert isinstance(
            chat_completion_with_empty_image.choices[0].message.content, str
        )
        assert len(chat_completion_with_empty_image.choices[0].message.content) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_empty_image_with_uuid_without_cache_hit(
    client: openai.AsyncOpenAI,
    model_name: str,
):
    """An empty image with a never-seen uuid cannot be resolved and must 400."""
    request_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe this image.",
                },
                {
                    "type": "image_url",
                    "image_url": {},
                    "uuid": "uuid_not_previously_seen",
                },
            ],
        },
    ]
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            messages=request_messages,
            model=model_name,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_incorrect_uuid_format(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """uuid-like values under unrecognized keys must not break the request;
    the server should serve it as a normal image request."""
    for image_url in image_urls:
        chat_completion = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                                # Deliberately misplaced: a real uuid belongs on
                                # the content part under the "uuid" key.
                                "incorrect_uuid_key": image_url,
                            },
                            "also_incorrect_uuid_key": image_url,
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0

View File

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import numpy as np
import pytest
import requests
import torch
from vllm.utils.serial_utils import tensor2base64
from ...utils import RemoteOpenAIServer
def _terratorch_dummy_messages():
    """Build a single user turn carrying dummy Terratorch tensor embeddings."""
    pixel_values = torch.ones((6, 512, 512), dtype=torch.float16)
    location_coords = torch.ones((1, 2), dtype=torch.float16)
    embeds_part = {
        "type": "image_embeds",
        "image_embeds": {
            "pixel_values": tensor2base64(pixel_values),
            "location_coords": tensor2base64(location_coords),
        },
    }
    return [{"role": "user", "content": [embeds_part]}]
@pytest.mark.parametrize(
    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)
def test_single_request(model_name: str):
    """End-to-end pooling request against a Terratorch geospatial model."""
    args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--trust-remote-code",
        "--max-num-seqs",
        "32",
        "--model-impl",
        "terratorch",
        "--skip-tokenizer-init",
        "--enable-mm-embeds",
    ]
    with RemoteOpenAIServer(model_name, args) as server:
        response = requests.post(
            server.url_for("pooling"),
            json={
                "model": model_name,
                "messages": _terratorch_dummy_messages(),
                "encoding_format": "base64",
            },
        )
        response.raise_for_status()
        # Pooling output is base64-encoded; decode into a float32 vector.
        output = response.json()["data"][0]["data"]
        np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
        # 524288 == 2 * 512 * 512 — presumably per-pixel model output for the
        # 512x512 dummy input; TODO(review): confirm expected output shape.
        assert len(np_response) == 524288

View File

@@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from vllm.tokenizers import TokenizerLike
@pytest.fixture(scope="function")
def default_tokenizer() -> TokenizerLike:
return AutoTokenizer.from_pretrained("gpt2")

View File

@@ -0,0 +1,176 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# Fixture payloads for the "gigachat3" tool parser tests below.
# Each *_OUTPUT string mimics raw model output: the literal "function call"
# marker followed by the serialized function JSON.
SIMPLE_ARGS_DICT = {
    "action": "create",
    "id": "preferences",
}
SIMPLE_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": SIMPLE_ARGS_DICT,
    },
    ensure_ascii=False,
)
SIMPLE_FUNCTION_OUTPUT = "function call" + SIMPLE_FUNCTION_JSON
# Expected parser result for the simple case.
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(SIMPLE_ARGS_DICT, ensure_ascii=False),
)
# Tool call with an empty arguments object.
PARAMETERLESS_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": {},
    },
    ensure_ascii=False,
)
PARAMETERLESS_FUNCTION_OUTPUT = "function call" + PARAMETERLESS_FUNCTION_JSON
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps({}, ensure_ascii=False),
)
# Tool call with a nested arguments object.
COMPLEX_ARGS_DICT = {
    "action": "create",
    "id": "preferences",
    "content": {
        "short_answers": True,
        "hate_emojis": True,
        "english_ui": False,
        "russian_math_explanations": True,
    },
}
COMPLEX_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": COMPLEX_ARGS_DICT,
    },
    ensure_ascii=False,
)
COMPLEX_FUNCTION_OUTPUT = "function call" + COMPLEX_FUNCTION_JSON
COMPLEX_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(COMPLEX_ARGS_DICT, ensure_ascii=False),
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
default_tokenizer
)
model_output = "How can I help you today?"
content, tool_calls = run_tool_extraction(
tool_parser, model_output, streaming=streaming
)
assert content == model_output
assert len(tool_calls) == 0
# (streaming, model_output, expected_tool_calls, expected_content) tuples
# covering simple, parameterless, and nested-argument tool calls in both
# streaming and non-streaming modes.
TEST_CASES = [
    pytest.param(
        True,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        None,
        id="simple_streaming",
    ),
    pytest.param(
        False,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        None,
        id="simple_nonstreaming",
    ),
    pytest.param(
        True,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        COMPLEX_FUNCTION_OUTPUT,
        [COMPLEX_FUNCTION_CALL],
        None,
        id="complex_streaming",
    ),
    pytest.param(
        False,
        COMPLEX_FUNCTION_OUTPUT,
        [COMPLEX_FUNCTION_CALL],
        None,
        id="complex_nonstreaming",
    ),
]
@pytest.mark.parametrize(
    "streaming, model_output, expected_tool_calls, expected_content", TEST_CASES
)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    expected_content: str | None,
    default_tokenizer: TokenizerLike,
):
    """Extracted tool calls must match the expected name/arguments pairs."""
    parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
        default_tokenizer
    )
    content, tool_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    assert content == expected_content
    assert len(tool_calls) == len(expected_tool_calls)
    for got, want in zip(tool_calls, expected_tool_calls):
        assert got.type == "function"
        assert got.function.name == want.name
        # Compare parsed JSON so formatting differences do not matter.
        assert json.loads(got.function.arguments) == json.loads(want.arguments)
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """A tool call split across a few large deltas must still reassemble."""
    parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
        default_tokenizer
    )
    deltas = [
        "function call",
        COMPLEX_FUNCTION_JSON[:40],
        COMPLEX_FUNCTION_JSON[40:],
    ]
    reconstructor = run_tool_extraction_streaming(
        parser,
        deltas,
        assert_one_tool_per_delta=False,
    )
    assert len(reconstructor.tool_calls) == 1
    tool_call = reconstructor.tool_calls[0]
    assert tool_call.type == "function"
    assert tool_call.function.name == "manage_user_memory"
    assert json.loads(tool_call.function.arguments) == COMPLEX_ARGS_DICT

View File

@@ -0,0 +1,460 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from ....utils import RemoteOpenAIServer
# Base model served by vLLM; the LoRA adapter below is what the tests query.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci"
# Server flags: hermes tool parsing with the LoRA adapter registered and its
# tokenizer used for chat templating.
SERVER_ARGS = [
    "--enforce-eager",
    "--enable-auto-tool-choice",
    "--tool-call-parser",
    "hermes",
    "--enable-lora",
    "--lora-modules",
    f"{LORA_MODEL}={LORA_MODEL}",
    "--tokenizer",
    f"{LORA_MODEL}",
]
# Single weather tool used by the basic tool-choice tests.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["location"],
            },
        },
    }
]
# Tool with integer and boolean parameters, used to verify type fidelity.
PRODUCT_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_product_info",
            "description": "Get detailed information of a product based on its "
            "product ID.",
            "parameters": {
                "type": "object",
                "properties": {
                    "inserted": {
                        "type": "boolean",
                        "description": "inserted.",
                    },
                    "product_id": {
                        "type": "integer",
                        "description": "The product ID of the product.",
                    },
                },
                "required": ["product_id", "inserted"],
            },
        },
    }
]
# Prompts crafted to elicit calls to the corresponding tools above.
MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]
PRODUCT_MESSAGES = [
    {
        "role": "user",
        "content": "Hi! Do you have any detailed information about the product id "
        "7355608 and inserted true?",
    }
]
@pytest.mark.asyncio
async def test_non_streaming_tool_call():
    """Test tool call in non-streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        response = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=MESSAGES,
            tools=TOOLS,
            tool_choice="auto",
            temperature=0.0,
        )
        assert response.choices
        first_choice = response.choices[0]
        assert first_choice.finish_reason == "tool_calls"
        assert first_choice.message.tool_calls is not None
        tool_call = first_choice.message.tool_calls[0]
        assert tool_call.type == "function"
        assert tool_call.function.name == "get_current_weather"
        arguments = json.loads(tool_call.function.arguments)
        assert "location" in arguments
        assert "Boston" in arguments["location"]
        print("\n[Non-Streaming Test Passed]")
        print(f"Tool Call: {tool_call.function.name}")
        print(f"Arguments: {arguments}")
@pytest.mark.asyncio
async def test_streaming_tool_call():
    """Test tool call in streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        stream = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=MESSAGES,
            tools=TOOLS,
            tool_choice="auto",
            temperature=0.0,
            stream=True,
        )
        # Accumulates per-index {"name", "arguments"} fragments from the stream.
        tool_call_chunks = {}
        async for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if not delta or not delta.tool_calls:
                continue
            for tool_chunk in delta.tool_calls:
                index = tool_chunk.index
                if index not in tool_call_chunks:
                    tool_call_chunks[index] = {"name": "", "arguments": ""}
                if tool_chunk.function.name:
                    tool_call_chunks[index]["name"] += tool_chunk.function.name
                if tool_chunk.function.arguments:
                    tool_call_chunks[index]["arguments"] += (
                        tool_chunk.function.arguments
                    )
        # Exactly one tool call should have been streamed.
        assert len(tool_call_chunks) == 1
        reconstructed_tool_call = tool_call_chunks[0]
        assert reconstructed_tool_call["name"] == "get_current_weather"
        arguments = json.loads(reconstructed_tool_call["arguments"])
        assert "location" in arguments
        assert "Boston" in arguments["location"]
        print("\n[Streaming Test Passed]")
        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
        print(f"Reconstructed Arguments: {arguments}")
@pytest.mark.asyncio
async def test_non_streaming_product_tool_call():
    """Test tool call integer and boolean parameters in non-streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        response = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=PRODUCT_MESSAGES,
            tools=PRODUCT_TOOLS,
            tool_choice="auto",
            temperature=0.66,
        )
        assert response.choices
        first_choice = response.choices[0]
        assert first_choice.finish_reason == "tool_calls"
        assert first_choice.message.tool_calls is not None
        tool_call = first_choice.message.tool_calls[0]
        assert tool_call.type == "function"
        assert tool_call.function.name == "get_product_info"
        arguments = json.loads(tool_call.function.arguments)
        assert "product_id" in arguments
        assert "inserted" in arguments
        # Parameter types must survive parsing: int stays int, bool stays bool.
        product_id = arguments.get("product_id")
        inserted = arguments.get("inserted")
        assert isinstance(product_id, int)
        assert product_id == 7355608
        assert isinstance(inserted, bool)
        assert inserted is True
        print("\n[Non-Streaming Product Test Passed]")
        print(f"Tool Call: {tool_call.function.name}")
        print(f"Arguments: {arguments}")
@pytest.mark.asyncio
async def test_streaming_product_tool_call():
    """Test tool call integer and boolean parameters in streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        stream = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=PRODUCT_MESSAGES,
            tools=PRODUCT_TOOLS,
            tool_choice="auto",
            temperature=0.66,
            stream=True,
        )
        # Accumulates per-index {"name", "arguments"} fragments from the stream.
        tool_call_chunks = {}
        async for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if not delta or not delta.tool_calls:
                continue
            for tool_chunk in delta.tool_calls:
                index = tool_chunk.index
                if index not in tool_call_chunks:
                    tool_call_chunks[index] = {"name": "", "arguments": ""}
                if tool_chunk.function.name:
                    tool_call_chunks[index]["name"] += tool_chunk.function.name
                if tool_chunk.function.arguments:
                    tool_call_chunks[index]["arguments"] += (
                        tool_chunk.function.arguments
                    )
        assert len(tool_call_chunks) == 1
        reconstructed_tool_call = tool_call_chunks[0]
        assert reconstructed_tool_call["name"] == "get_product_info"
        arguments = json.loads(reconstructed_tool_call["arguments"])
        assert "product_id" in arguments
        assert "inserted" in arguments
        # Handle type coercion for streaming test as well
        product_id = arguments.get("product_id")
        inserted = arguments.get("inserted")
        assert isinstance(product_id, int)
        assert product_id == 7355608
        assert isinstance(inserted, bool)
        assert inserted is True
        print("\n[Streaming Product Test Passed]")
        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
        print(f"Reconstructed Arguments: {arguments}")
@pytest.fixture
def qwen_tokenizer() -> TokenizerLike:
    """Load the Qwen3-32B tokenizer used throughout the Hermes parser tests."""
    from vllm.tokenizers import get_tokenizer

    tokenizer = get_tokenizer("Qwen/Qwen3-32B")
    return tokenizer
@pytest.fixture
def hermes_parser(qwen_tokenizer: TokenizerLike) -> Hermes2ProToolParser:
    """Hermes 2 Pro tool parser wired to the Qwen tokenizer fixture."""
    tool_parser = Hermes2ProToolParser(qwen_tokenizer)
    return tool_parser
@pytest.fixture
def any_chat_request() -> ChatCompletionRequest:
    """Minimal deterministic request object passed to the parser hooks."""
    request = ChatCompletionRequest(
        model="Qwen/Qwen3-32B",
        messages=[],
        seed=42,
    )
    return request
def test_hermes_parser_streaming_just_forward_text(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Token-by-token streaming of plain text must be forwarded verbatim."""
    text = """This is some prior text that has nothing to do with tool calling."""
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    # Feed the parser one decoded token at a time, mimicking the server's
    # incremental streaming path.
    for token in tokens:
        delta_text = qwen_tokenizer.decode([token])
        current_text = previous_text + delta_text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        delta_messages.append(delta)
    for delta in delta_messages:
        assert delta is not None
        assert not delta.tool_calls
    print(delta_messages)
    # Concatenated deltas must reproduce the original text exactly.
    assert "".join([delta.content for delta in delta_messages]) == text
def test_hermes_parser_streaming_failure_case_bug_19056(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Regression test for bug #19056: stream a tool call one token at a time."""
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    token_ids = qwen_tokenizer.encode(model_output)
    previous_text = ""
    delta_messages = []
    for token_id in token_ids:
        delta_text = qwen_tokenizer.decode([token_id])
        current_text = previous_text + delta_text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        if delta is not None:
            delta_messages.append(delta)
    assert delta_messages[0].tool_calls[0].function.name == "final_answer"
    reassembled_args = "".join(
        d.tool_calls[0].function.arguments or "" for d in delta_messages
    )
    assert reassembled_args == '{"trigger": true}'
def test_hermes_parser_streaming(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Token-by-token streaming of a complete tagged tool call."""
    text = '<tool_call>\
{"name": "get_current_temperature",\
"arguments": {"location":\
"San Francisco, California, United States", "unit": "celsius"}}\
</tool_call>'
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    # NOTE: `text` is rebound to the per-token delta inside this loop; the
    # original model output is no longer needed after `tokens` is computed.
    for token in tokens:
        text = qwen_tokenizer.decode([token])
        current_text = previous_text + text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        if delta is not None:
            delta_messages.append(delta)
    print(delta_messages)
    assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature"
    tool_call_args = "".join(
        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
    )
    assert tool_call_args == (
        '{"location":"San Francisco, California, United States", "unit": "celsius"}'
    )
def test_hermes_parser_non_streaming_no_tool_call(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Plain text must be reported as containing no tool calls."""
    model_output = """This is not a tool call."""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert not extraction.tools_called
def test_hermes_parser_non_streaming_tool_call_between_tags(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """A tool call wrapped in opening and closing tags is extracted whole."""
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert extraction.tools_called
    extracted_function = extraction.tool_calls[0].function
    assert extracted_function.name == "final_answer"
    assert extracted_function.arguments == '{"trigger": true}'
def test_hermes_parser_non_streaming_tool_call_until_eos(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """A tool call whose closing tag was cut off at EOS is still recovered."""
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}"""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert extraction.tools_called
    extracted_function = extraction.tool_calls[0].function
    assert extracted_function.name == "final_answer"
    assert extracted_function.arguments == '{"trigger": true}'
def test_hermes_parser_non_streaming_tool_call_invalid_json(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Malformed JSON inside the tag must not raise; no tools are reported."""
    # Missing closing brace to trigger exception
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}"""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert not extraction.tools_called

View File

@@ -0,0 +1,179 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import json
from unittest.mock import MagicMock
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.tool_parsers import ToolParser, ToolParserManager
def make_tool_call(name, arguments):
    """Build a function-type ToolCall with JSON-serialized arguments."""
    serialized_args = json.dumps(arguments)
    function = FunctionCall(name=name, arguments=serialized_args)
    return ToolCall(type="function", function=function)
# TODO: add reason prefix and suffix.
@pytest.mark.parametrize(
    "model_output,expected_tool_calls,expected_content",
    [
        # No tool call
        ("How can I help you today?", [], "How can I help you today?"),
        # Single tool call, no content
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}]</tool_calls>',  # noqa: E501
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                )
            ],
            None,
        ),
        # Multiple tool calls
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}, {"name": "register_user", "arguments": {"name": "John Doe", "age": 37, "address": {"city": "San Francisco", "state": "CA"}, "role": null, "passed_test": true, "aliases": ["John", "Johnny"]}}]</tool_calls>',  # noqa: E501
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                ),
                make_tool_call(
                    "register_user",
                    {
                        "name": "John Doe",
                        "age": 37,
                        "address": {"city": "San Francisco", "state": "CA"},
                        "role": None,
                        "passed_test": True,
                        "aliases": ["John", "Johnny"],
                    },
                ),
            ],
            None,
        ),
        # Content before tool call
        (
            'I will call the tool now. <tool_calls>[{"name": "get_weather", "arguments": {"city": "Boston"}}]</tool_calls>',  # noqa: E501
            [make_tool_call("get_weather", {"city": "Boston"})],
            "I will call the tool now. ",
        ),
        # Content after tool call (should be stripped)
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "Seattle"}}]</tool_calls>\nThank you!',  # noqa: E501
            [make_tool_call("get_weather", {"city": "Seattle"})],
            None,
        ),
        # Deeply nested argument object
        (
            '<tool_calls>[{"name": "complex_tool", "arguments": {"level1": {"level2": {"level3": {"value": 123}}}}}]</tool_calls>',
            [
                make_tool_call(
                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
                )
            ],
            None,
        ),
    ],
)
def test_hunyuan_a13b_tool_parser_extract(
    model_output, expected_tool_calls, expected_content
):
    """Non-streaming tool-call extraction for the hunyuan_a13b parser."""
    mock_tokenizer = MagicMock()
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
        mock_tokenizer
    )
    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=False
    )
    # align the random id.
    for idx in range(len(tool_calls)):
        tool_calls[idx].id = expected_tool_calls[idx].id
    assert tool_calls == expected_tool_calls
    assert content == expected_content
# Streaming test: simulate incremental output
@pytest.mark.parametrize(
    "model_deltas,expected_tool_calls",
    [
        # Tool call split across a few argument-level deltas
        (
            [
                '<tool_calls>[{"name": "get_weather", ',
                '"arguments": {"city": "San Francisco", ',
                '"metric": "celsius"}}]',
                "</tool_calls>",
            ],
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                )
            ],
        ),
        # Finer-grained split
        (
            [
                '<tool_calls>[{"name":',
                ' "get_weather",',
                ' "arguments":',
                ' {"city": "Boston"}',
                "}]",
                "</tool_calls>",
            ],
            [make_tool_call("get_weather", {"city": "Boston"})],
        ),
        # Leading empty delta plus trailing text after the closing tag
        (
            [
                "",
                '<tool_calls>[{"name":',
                ' "get_weather",',
                ' "arguments":',
                ' {"city": "Boston"}',
                "}]",
                "</tool_calls>",
                "\n</answer>",
            ],
            [make_tool_call("get_weather", {"city": "Boston"})],
        ),
        pytest.param(
            [
                '<tool_calls>[{"name": "complex_tool",',
                ' "arguments": ',
                ' {"level1": {"level2": ',
                '{"level3": {"value": 123}}}}}',
                "]</tool_calls>",
            ],
            [
                make_tool_call(
                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
                )
            ],
            marks=pytest.mark.xfail(
                reason="stream parsing not support nested json yet."
            ),
        ),
    ],
)
def test_hunyuan_a13b_tool_parser_streaming(model_deltas, expected_tool_calls):
    """Streaming extraction: deltas must reassemble into the expected calls."""
    mock_tokenizer = MagicMock()
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
        mock_tokenizer
    )
    reconstructor = run_tool_extraction_streaming(
        tool_parser, model_deltas, assert_one_tool_per_delta=False
    )
    # align the random id.
    for idx in range(len(reconstructor.tool_calls)):
        reconstructor.tool_calls[idx].id = expected_tool_calls[idx].id
    assert reconstructor.tool_calls == expected_tool_calls

View File

@@ -0,0 +1,262 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser
@pytest.fixture
def parser(default_tokenizer: TokenizerLike):
    """Llama3 JSON tool parser built on the shared default tokenizer."""
    tool_parser = Llama3JsonToolParser(default_tokenizer)
    return tool_parser
def test_extract_tool_calls_simple(parser):
    """A single tool-call JSON embedded in surrounding text is extracted."""
    text = (
        'Here is the result: {"name": "getOpenIncidentsTool", '
        '"parameters": {}} Would you like to know more?'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert isinstance(extraction, ExtractedToolCallInformation)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 1
    call = extraction.tool_calls[0]
    assert call.type == "function"
    assert call.function.name == "getOpenIncidentsTool"
    assert call.function.arguments == "{}"
    # All surrounding prose is consumed when a tool call is found.
    assert extraction.content is None
def test_extract_tool_calls_with_arguments(parser):
    """Tool-call argument values survive extraction intact."""
    text = (
        '{"name": "searchTool", "parameters": {"query": "test query", "limit": 10}}'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 1
    function = extraction.tool_calls[0].function
    assert function.name == "searchTool"
    for fragment in ('"query": "test query"', '"limit": 10'):
        assert fragment in function.arguments
def test_extract_tool_calls_no_json(parser):
    """Plain text with no JSON yields no tool calls; content passes through."""
    text = "This is just some text without any tool calls"
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_extract_tool_calls_invalid_json(parser):
    """Malformed JSON is treated as plain content, not a tool call."""
    text = '{"name": "invalidTool", "parameters": {invalid json}'
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_extract_tool_calls_with_arguments_key(parser):
    """The "arguments" key is accepted as an alias for "parameters"."""
    extraction = parser.extract_tool_calls(
        '{"name": "searchTool", "arguments": {"query": "test"}}', None
    )
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 1
    function = extraction.tool_calls[0].function
    assert function.name == "searchTool"
    assert '"query": "test"' in function.arguments
def test_extract_tool_calls_multiple_json(parser):
    """Semicolon-separated JSON objects become separate tool calls, in order."""
    text = (
        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
        '{"name": "searchTool", "parameters": {"query": "test2"}}'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 3
    first, second, third = extraction.tool_calls
    assert first.function.name == "searchTool"
    assert '"query": "test1"' in first.function.arguments
    assert second.function.name == "getOpenIncidentsTool"
    assert second.function.arguments == "{}"
    assert third.function.name == "searchTool"
    assert '"query": "test2"' in third.function.arguments
def test_extract_tool_calls_multiple_json_with_whitespace(parser):
    """Extra whitespace around the semicolon separators is tolerated."""
    text = (
        '{"name": "searchTool", "parameters": {"query": "test1"}} ; '
        '{"name": "getOpenIncidentsTool", "parameters": {}} ; '
        '{"name": "searchTool", "parameters": {"query": "test2"}}'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 3
    extracted_names = [call.function.name for call in extraction.tool_calls]
    assert extracted_names == ["searchTool", "getOpenIncidentsTool", "searchTool"]
def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
    """Multiple calls are still found when prose surrounds the JSON list."""
    text = (
        "Here are the results: "
        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
        '{"name": "searchTool", "parameters": {"query": "test2"}} '
        "Would you like to know more?"
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 3
    extracted_names = [call.function.name for call in extraction.tool_calls]
    assert extracted_names == ["searchTool", "getOpenIncidentsTool", "searchTool"]
def test_extract_tool_calls_deeply_nested_json(parser):
    """Deeply nested (5-level) parameters are preserved verbatim.

    Uses the module-level ``json`` import instead of the previous
    function-local ``import json``, which was repeated in several tests.
    """
    model_output = (
        '{"name": "complexTool", '
        '"parameters": {'
        '"level1": {'
        '"level2": {'
        '"level3": {'
        '"level4": {'
        '"value": "deep"'
        "}}}}}}"
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "complexTool"
    # Round-trip through json to verify the nested structure is intact.
    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
def test_extract_tool_calls_multiple_with_deep_nesting(parser):
    """Multiple calls are split correctly even when one is deeply nested.

    Uses the module-level ``json`` import instead of a function-local one.
    """
    model_output = (
        '{"name": "simpleTool", "parameters": {"value": "test"}}; '
        '{"name": "complexTool", "parameters": '
        '{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 2
    # Check first tool call
    assert result.tool_calls[0].function.name == "simpleTool"
    args0 = json.loads(result.tool_calls[0].function.arguments)
    assert args0["value"] == "test"
    # Check second tool call with deep nesting
    assert result.tool_calls[1].function.name == "complexTool"
    args1 = json.loads(result.tool_calls[1].function.arguments)
    assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
    """Braces and brackets inside quoted string values do not confuse the
    brace-matching extraction.

    Uses the module-level ``json`` import instead of a function-local one.
    """
    model_output = (
        '{"name": "searchTool", '
        '"parameters": {'
        '"query": "test {value} [complex]",'
        '"nested": {"inner": "more {brackets}"}'
        "}}"
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "searchTool"
    # Verify the string values are preserved including brackets and quotes
    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["query"] == "test {value} [complex]"
    assert args["nested"]["inner"] == "more {brackets}"
def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
    """Escaped quotes inside string values survive extraction.

    Uses the module-level ``json`` import instead of a function-local one.
    """
    model_output = (
        '{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "parserTool"
    # Verify escaped quotes are preserved
    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["text"] == 'He said "Hello {world}"'
def test_extract_tool_calls_missing_name_key(parser):
    """JSON lacking a "name" key is not a tool call; content passes through."""
    text = '{"parameters": {}}'
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
    """JSON with neither "parameters" nor "arguments" is not a tool call."""
    text = '{"name": "toolWithoutParams"}'
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_regex_timeout_handling(parser):
    """A regex TimeoutError downgrades the output to plain content."""
    problematic = "{hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stub regex whose finditer always raises, simulating a timeout.
    timing_out_regex = MagicMock()
    timing_out_regex.finditer.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "tool_call_start_regex", timing_out_regex):
        extraction = parser.extract_tool_calls(problematic, None)
    # should treat as regular text when regex times out
    assert extraction.content == problematic
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    timing_out_regex.finditer.assert_called_once()

View File

@@ -0,0 +1,269 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# Test cases similar to pythonic parser but with Llama4 specific format
# Each *_FUNCTION_OUTPUT is raw model text (pythonic call syntax); the paired
# *_FUNCTION_CALL is the parse the tool parser is expected to produce from it.
SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "LA", "metric": "C"}',
)
# Covers str, int, dict, None, bool, and list argument types.
MORE_TYPES_FUNCTION_OUTPUT = (
    "[register_user(name='Doe', "
    "age=9, "
    "address={'city': 'LA', 'state': 'CA'}, "
    "role=None, "
    "passed_test=True, "
    "aliases=['John', 'Johnny'])]"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
    name="register_user",
    arguments='{"name": "Doe", '
    '"age": 9, '
    '"address": {"city": "LA", "state": "CA"}, '
    '"role": null, '
    '"passed_test": true, '
    '"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments="{}",
)
EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"steps": []}',
)
# Escaped quotes inside pythonic string literals.
ESCAPED_STRING_FUNCTION_OUTPUT = (
    r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
# Llama4 wraps pythonic calls in <|python_start|>/<|python_end|> tags.
PYTHON_TAG_FUNCTION_OUTPUT = (
    "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>"
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain conversational text passes through without any tool calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"
    extracted_content, extracted_calls = run_tool_extraction(
        parser, plain_text, streaming=streaming
    )
    assert extracted_content == plain_text
    assert not extracted_calls
# Streamed python-tag parallel-call output, built up separately so the
# parametrize table below stays readable.
test_str = "<|python_start|>"
test_str += "[get_weather(city='LA', metric='C'),"
test_str += "register_user(name='Doe', age=9)]"
# (streaming, model_output, expected_tool_calls) triples for test_tool_call.
# Fixes over the previous revision:
#  * "simple_streaming" now uses the simple fixtures (it previously duplicated
#    the escaped-string case, leaving the simple streaming path untested),
#  * the trailing python-tag parallel-call cases have unique ids instead of
#    reusing "parallel_calls_streaming"/"parallel_calls_nonstreaming".
TEST_CASES = [
    pytest.param(
        True,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="simple_streaming",
    ),
    pytest.param(
        False, SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], id="simple_nonstreaming"
    ),
    pytest.param(
        True,
        MORE_TYPES_FUNCTION_OUTPUT,
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming",
    ),
    pytest.param(
        False,
        MORE_TYPES_FUNCTION_OUTPUT,
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming",
    ),
    pytest.param(
        True,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        EMPTY_DICT_FUNCTION_OUTPUT,
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_streaming",
    ),
    pytest.param(
        False,
        EMPTY_DICT_FUNCTION_OUTPUT,
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_nonstreaming",
    ),
    pytest.param(
        True,
        EMPTY_LIST_FUNCTION_OUTPUT,
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_streaming",
    ),
    pytest.param(
        False,
        EMPTY_LIST_FUNCTION_OUTPUT,
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_nonstreaming",
    ),
    pytest.param(
        True,
        ESCAPED_STRING_FUNCTION_OUTPUT,
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_streaming",
    ),
    pytest.param(
        False,
        ESCAPED_STRING_FUNCTION_OUTPUT,
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_nonstreaming",
    ),
    pytest.param(
        True,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="parallel_calls_streaming",
    ),
    pytest.param(
        False,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="parallel_calls_nonstreaming",
    ),
    pytest.param(
        True,
        PYTHON_TAG_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="python_tag_streaming",
    ),
    pytest.param(
        False,
        PYTHON_TAG_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="python_tag_nonstreaming",
    ),
    pytest.param(
        True,
        test_str,
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="python_tag_parallel_calls_streaming",
    ),
    pytest.param(
        False,
        "<|python_start|>[get_weather(city='LA', metric='C'), "
        + "register_user(name='Doe', age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="python_tag_parallel_calls_nonstreaming",
    ),
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Each TEST_CASES entry parses to exactly the expected function calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    _, extracted_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    assert len(extracted_calls) == len(expected_tool_calls)
    for actual_call, expected_call in zip(extracted_calls, expected_tool_calls):
        assert actual_call.type == "function"
        assert actual_call.function == expected_call
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """One huge delta containing several calls still yields all of them."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    deltas = [
        "<|python_start|>[get_weather(city='LA', metric='C'), "
        "get_weather(), "
        "do_something_cool(steps=[])]<|python_end|>",
    ]
    reconstructor = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )
    assert reconstructor.other_content == ""
    expected_functions = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    assert [call.function for call in reconstructor.tool_calls] == expected_functions
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A regex TimeoutError downgrades the output to plain content."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    problematic = "hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stub regex whose match always raises, simulating a timeout.
    timing_out_regex = MagicMock()
    timing_out_regex.match.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "TOOL_CALL_REGEX", timing_out_regex):
        extracted_content, extracted_calls = run_tool_extraction(
            parser, problematic, streaming=streaming
        )
    # should treat as regular text when regex times out
    assert extracted_content == problematic
    assert not extracted_calls
    timing_out_regex.match.assert_called_once()

View File

@@ -0,0 +1,251 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
# Each *_FUNCTION_OUTPUT is raw model text (pythonic call syntax); the paired
# *_FUNCTION_CALL is the parse the olmo3 tool parser is expected to produce.
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "San Francisco", "metric": "celsius"}',
)
# Covers str, int, dict, None, bool, and list argument types.
MORE_TYPES_FUNCTION_OUTPUT = (
    "register_user(name='John Doe', "
    "age=37, "
    "address={'city': 'San Francisco', 'state': 'CA'}, "
    "role=None, "
    "passed_test=True, "
    "aliases=['John', 'Johnny'])"
)
# Same call, but with JSON-style null/true literals instead of Python's.
MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
    "register_user(name='John Doe', "
    "age=37, "
    "address={'city': 'San Francisco', 'state': 'CA'}, "
    "role=null, "
    "passed_test=true, "
    "aliases=['John', 'Johnny'])"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
    name="register_user",
    arguments='{"name": "John Doe", '
    '"age": 37, '
    '"address": {"city": "San Francisco", "state": "CA"}, '
    '"role": null, '
    '"passed_test": true, '
    '"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments="{}",
)
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"steps": []}',
)
# Escaped quotes inside pythonic string literals.
ESCAPED_STRING_FUNCTION_OUTPUT = (
    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain conversational text passes through without any tool calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"
    extracted_content, extracted_calls = run_tool_extraction(
        parser, plain_text, streaming=streaming
    )
    assert extracted_content == plain_text
    assert not extracted_calls
# (streaming, model_output, expected_tool_calls) triples consumed by
# test_tool_call below.  Every fixture is exercised both streaming and
# non-streaming; olmo3 wraps calls in <function_calls> tags.
TEST_CASES = [
    pytest.param(
        True,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL],
        id="simple_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL],
        id="simple_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming_json_literals",
    ),
    pytest.param(
        False,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming_json_literals",
    ),
    pytest.param(
        True,
        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_nonstreaming",
    ),
    # Parallel calls are newline-separated inside a single tag pair.
    pytest.param(
        True,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_nonstreaming",
    ),
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Each TEST_CASES entry parses to exactly the expected function calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    extracted_content, extracted_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    # Tool-only outputs leave no residual content.
    assert extracted_content is None
    assert len(extracted_calls) == len(expected_tool_calls)
    for actual_call, expected_call in zip(extracted_calls, expected_tool_calls):
        assert actual_call.type == "function"
        assert actual_call.function == expected_call
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """Large deltas containing several calls still yield all of them."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    deltas = [
        "<function_calls>get_weather(city='San",
        " Francisco', metric='celsius')\n"
        f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
        f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
    ]
    reconstructor = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )
    assert reconstructor.other_content == ""
    expected_functions = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    assert [call.function for call in reconstructor.tool_calls] == expected_functions
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A regex TimeoutError downgrades the output to plain content."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    problematic = "hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stub regex whose match always raises, simulating a timeout.
    timing_out_regex = MagicMock()
    timing_out_regex.match.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "TOOL_CALL_REGEX", timing_out_regex):
        extracted_content, extracted_calls = run_tool_extraction(
            parser, problematic, streaming=streaming
        )
    # should treat as regular text when regex times out
    assert extracted_content == problematic
    assert not extracted_calls
    timing_out_regex.match.assert_called_once()

View File

@@ -0,0 +1,359 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import jsonschema
import openai
import pytest
import pytest_asyncio
from rapidfuzz import fuzz
from ....utils import RemoteOpenAIServer
# Model under test; served by the RemoteOpenAIServer fixture below.
MODEL_NAME = "openai/gpt-oss-20b"
@pytest.fixture(scope="module")
def server():
    """Launch a vLLM OpenAI-compatible server with auto tool choice enabled."""
    launch_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "openai",
    ]
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the running vLLM server."""
    async with server.get_async_client() as api_client:
        yield api_client
# ==========================================================
# Tool Definitions
# ==========================================================
# OpenAI-format tool schemas advertised to the model in every test request.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "calculator",
            "description": "Performs basic arithmetic calculations.",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": (
                            "Arithmetic expression to evaluate, e.g. '123 + 456'."
                        ),
                    }
                },
                "required": ["expression"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_time",
            "description": "Retrieves the current local time for a given city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "City name, e.g. 'New York'.",
                    }
                },
                "required": ["city"],
            },
        },
    },
]
# ==========================================================
# Message Examples
# ==========================================================
# Prompt expected to trigger a single calculator call.
MESSAGES_CALC = [
    {"role": "user", "content": "Calculate 123 + 456 using the calculator."}
]
# Prompt expected to trigger a single get_time call.
MESSAGES_GET_TIME = [
    {"role": "user", "content": "What is the current time in New York?"}
]
# Prompt expected to trigger both tools in one response.
MESSAGES_MULTIPLE_CALLS = [
    {
        "role": "system",
        "content": (
            "You can call multiple tools. "
            "When using more than one, return single JSON object with tool_calls array"
            "containing each tool call with its function name and arguments. "
            "Do not output multiple JSON objects separately."
        ),
    },
    {
        "role": "user",
        "content": "First, calculate 7 * 8 using the calculator. "
        "Then, use get_time to tell me the current time in New York.",
    },
]
# Prompt that should NOT trigger any tool call.
MESSAGES_INVALID_CALL = [
    {
        "role": "user",
        "content": "Can you help with something, "
        "but dont actually perform any calculation?",
    }
]
# Expected outputs
FUNC_CALC = "calculator"
FUNC_ARGS_CALC = '{"expression":"123 + 456"}'
FUNC_TIME = "get_time"
FUNC_ARGS_TIME = '{"city": "New York"}'
# ==========================================================
# Utility to extract reasoning and tool calls
# ==========================================================
def extract_reasoning_and_calls(chunks: list) -> tuple[str, list[str], list[str]]:
    """Accumulate reasoning text and per-call fragments from streaming chunks.

    Returns ``(reasoning_text, arguments_per_call, names_per_call)``, with
    tool calls ordered by their streamed index.
    """
    reasoning_parts: list[str] = []
    calls_by_index: dict[int, dict[str, str]] = {}
    for chunk in chunks:
        delta = getattr(chunk.choices[0], "delta", None)
        if not delta:
            continue
        if getattr(delta, "reasoning_content", None):
            reasoning_parts.append(delta.reasoning_content)
        for tool_call in getattr(delta, "tool_calls", []) or []:
            slot = calls_by_index.setdefault(
                getattr(tool_call, "index", 0), {"name": "", "arguments": ""}
            )
            function = getattr(tool_call, "function", None)
            if function:
                # The name arrives once; arguments arrive as partial fragments.
                if getattr(function, "name", None):
                    slot["name"] = function.name
                if getattr(function, "arguments", None):
                    slot["arguments"] += function.arguments
    ordered = [calls_by_index[index] for index in sorted(calls_by_index)]
    reasoning_text = "".join(reasoning_parts)
    return (
        reasoning_text,
        [entry["arguments"] for entry in ordered],
        [entry["name"] for entry in ordered],
    )
# ==========================================================
# Test Scenarios
# ==========================================================
@pytest.mark.asyncio
async def test_calculator_tool_call_and_argument_accuracy(client: openai.AsyncOpenAI):
    """Verify calculator tool call is made and arguments are accurate."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    message = response.choices[0].message
    calls = getattr(message, "tool_calls", [])
    assert calls, "No tool calls detected"
    calculator_call = next((c for c in calls if c.function.name == FUNC_CALC), None)
    assert calculator_call, "Calculator function not called"
    raw_args = calculator_call.function.arguments
    assert raw_args, "Calculator arguments missing"
    # Both operands must survive into the raw argument payload.
    assert "123" in raw_args and "456" in raw_args, (
        f"Expected values not in raw arguments: {raw_args}"
    )
    try:
        parsed_args = json.loads(raw_args)
    except json.JSONDecodeError:
        pytest.fail(f"Invalid JSON in calculator arguments: {raw_args}")
    expected_expr = "123 + 456"
    actual_expr = parsed_args.get("expression", "")
    # Fuzzy match tolerates cosmetic whitespace differences only.
    similarity = fuzz.ratio(actual_expr, expected_expr)
    assert similarity > 90, (
        f"Expression mismatch: expected '{expected_expr}' "
        f"got '{actual_expr}' (similarity={similarity}%)"
    )
@pytest.mark.asyncio
async def test_streaming_tool_call_get_time_with_reasoning(client: openai.AsyncOpenAI):
    """Verify streamed reasoning and tool call behavior for get_time."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_GET_TIME,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )
    collected = [chunk async for chunk in stream]
    reasoning, arguments, function_names = extract_reasoning_and_calls(collected)
    assert FUNC_TIME in function_names, "get_time function not called"
    assert any("New York" in arg for arg in arguments), (
        f"Expected get_time arguments for New York not found in {arguments}"
    )
    assert len(reasoning) > 0, "Expected reasoning content missing"
    relevant_keywords = ("New York", "time", "current")
    assert any(keyword in reasoning for keyword in relevant_keywords), (
        f"Reasoning is not relevant to the request: {reasoning}"
    )
@pytest.mark.asyncio
async def test_streaming_multiple_tools(client: openai.AsyncOpenAI):
    """Test streamed multi-tool response with reasoning.

    The previous version wrapped its assertions in ``try/except
    AssertionError`` and only printed the failure, so this test could never
    actually fail; assertion errors now propagate to pytest.
    """
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )
    chunks = [chunk async for chunk in stream]
    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
    assert FUNC_CALC in function_names, (
        f"Calculator tool missing — found {function_names}"
    )
    assert FUNC_TIME in function_names, f"Time tool missing — found {function_names}"
    assert len(reasoning) > 0, "Expected reasoning content in streamed response"
@pytest.mark.asyncio
async def test_invalid_tool_call(client: openai.AsyncOpenAI):
    """Ambiguous instructions that should not trigger a tool produce none."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_INVALID_CALL,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    message = response.choices[0].message
    assert message is not None, "Expected message in response"
    assert hasattr(message, "content"), "Expected 'content' field in message"
    tool_calls = getattr(message, "tool_calls", [])
    assert not tool_calls, (
        f"Model unexpectedly attempted a tool call on invalid input: {tool_calls}"
    )
@pytest.mark.asyncio
async def test_tool_call_with_temperature(client: openai.AsyncOpenAI):
    """Model produces valid tool or text output under non-zero temperature."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.7,
        stream=False,
    )
    message = response.choices[0].message
    assert message is not None, "Expected non-empty message in response"
    produced_output = bool(message.tool_calls) or bool(message.content)
    assert produced_output, "Response missing both text and tool calls"
    print(f"\nTool calls: {message.tool_calls}")
    print(f"Text: {message.content}")
@pytest.mark.asyncio
async def test_tool_response_schema_accuracy(client: openai.AsyncOpenAI):
    """Validate that tool call arguments adhere to their declared JSON schema."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
    )
    tool_calls = response.choices[0].message.tool_calls
    assert tool_calls, "No tool calls produced"
    # Index the declared parameter schemas by function name once up front.
    schemas_by_name: dict[str, object] = {
        entry["function"].get("name"): entry["function"].get("parameters")
        for entry in TOOLS
        if isinstance(entry.get("function"), dict)
    }
    for tool_call in tool_calls:
        func_name = tool_call.function.name
        parsed_args = json.loads(tool_call.function.arguments)
        schema = schemas_by_name.get(func_name)
        assert schema is not None, f"No matching tool schema found for {func_name}"
        jsonschema.validate(instance=parsed_args, schema=schema)
@pytest.mark.asyncio
async def test_semantic_consistency_with_temperature(client: openai.AsyncOpenAI):
    """Test that temperature variation doesn't cause contradictory reasoning."""
    texts: list[str] = []
    for temperature in (0.0, 0.5, 1.0):
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=MESSAGES_CALC,
            tools=TOOLS,
            temperature=temperature,
        )
        texts.append((response.choices[0].message.content or "").strip())
    # Compare fuzzy similarity between low- and mid-temperature outputs
    low_mid_sim = fuzz.ratio(texts[0], texts[1])
    assert low_mid_sim > 60, (
        f"Semantic drift too large between T=0.0 and T=0.5 ({low_mid_sim}%)"
    )

View File

@@ -0,0 +1,231 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "San Francisco", "metric": "celsius"}',
)
MORE_TYPES_FUNCTION_OUTPUT = (
"register_user(name='John Doe', "
"age=37, "
"address={'city': 'San Francisco', 'state': 'CA'}, "
"role=None, "
"passed_test=True, "
"aliases=['John', 'Johnny'])"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
name="register_user",
arguments='{"name": "John Doe", '
'"age": 37, '
'"address": {"city": "San Francisco", "state": "CA"}, '
'"role": null, '
'"passed_test": true, '
'"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments="{}",
)
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"steps": []}',
)
ESCAPED_STRING_FUNCTION_OUTPUT = (
r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain text containing no tool syntax must pass through as content."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"
    content, calls = run_tool_extraction(parser, plain_text, streaming=streaming)
    assert content == plain_text
    assert len(calls) == 0
# (name, model_output, expected_calls) specs; each is expanded below into a
# streaming and a non-streaming pytest.param with the original ids preserved
# (`<name>_streaming` / `<name>_nonstreaming`, streaming first).
_CASE_SPECS = [
    ("simple", f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL]),
    ("more_types", f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL]),
    (
        "parameterless",
        f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
        [PARAMETERLESS_FUNCTION_CALL],
    ),
    ("empty_dict", f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL]),
    ("empty_list", f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL]),
    (
        "escaped_string",
        f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
        [ESCAPED_STRING_FUNCTION_CALL],
    ),
    (
        "parallel_calls",
        f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
    ),
]
TEST_CASES = [
    pytest.param(
        streaming,
        model_output,
        expected_calls,
        id=f"{case_name}_{'streaming' if streaming else 'nonstreaming'}",
    )
    for case_name, model_output, expected_calls in _CASE_SPECS
    for streaming in (True, False)
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Extracted tool calls must exactly match the expected parses."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    content, parsed_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    # A pure tool-call output carries no text content.
    assert content is None
    assert len(parsed_calls) == len(expected_tool_calls)
    for parsed, expected in zip(parsed_calls, expected_tool_calls):
        assert parsed.type == "function"
        assert parsed.function == expected
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """Parsing must still work when one delta completes several whole calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    # Two large deltas: the second finishes the first call and carries two
    # more complete calls in a single chunk.
    deltas = [
        "[get_weather(city='San",
        (
            " Francisco', metric='celsius'), "
            + f"{PARAMETERLESS_FUNCTION_OUTPUT}, "
            + f"{EMPTY_LIST_FUNCTION_OUTPUT}]"
        ),
    ]
    reconstructor = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )
    assert reconstructor.other_content == ""
    assert len(reconstructor.tool_calls) == 3
    expected = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    for reconstructed, expected_call in zip(reconstructor.tool_calls, expected):
        assert reconstructed.function == expected_call
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A regex TimeoutError must degrade to plain-text output, not crash."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    pathological_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stand-in regex object whose match() always times out.
    timeout_regex = MagicMock()
    timeout_regex.match.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "TOOL_CALL_REGEX", timeout_regex):
        content, extracted_calls = run_tool_extraction(
            parser, pathological_input, streaming=streaming
        )
    # On timeout the whole input is treated as ordinary text.
    assert content == pathological_input
    assert len(extracted_calls) == 0
    timeout_regex.match.assert_called_once()

View File

@@ -0,0 +1,167 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ExtractedToolCallInformation,
FunctionCall,
ToolCall,
)
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser
class StreamingToolReconstructor:
    """Accumulates streamed DeltaMessages back into complete tool calls.

    While accumulating, it asserts the streaming protocol invariants: the
    function name and id are emitted exactly once per call, indices grow
    monotonically, and (optionally) each delta carries at most one tool call.
    """

    def __init__(self, assert_one_tool_per_delta: bool = True):
        # Reconstructed tool calls, positioned by their stream `index`.
        self.tool_calls: list[ToolCall] = []
        # Plain-text content accumulated across all deltas.
        self.other_content: str = ""
        self._assert_one_tool_per_delta = assert_one_tool_per_delta

    def append_delta(self, delta: DeltaMessage) -> None:
        """Fold one streamed delta into the reconstruction state."""
        if delta.content is not None:
            self.other_content += delta.content
        else:
            # A delta with no content must carry at least one tool call.
            assert delta.tool_calls, (
                "Streaming results should have either content or tool calls (or both)"
            )
        if self._assert_one_tool_per_delta:
            # Note: This isn't strictly required by the API and may not be
            # possible to adhere to depending on the token space and number of
            # tokens per streamed response from the model, but it is required
            # by tool_use tests, so we enforce it here by default also.
            assert len(delta.tool_calls) < 2, (
                "Streaming should include only one tool call per update."
            )
        for call_delta in delta.tool_calls:
            assert call_delta.type is None or call_delta.type == "function", (
                "Streaming tool calls should only emit function calls. Got "
                f"{call_delta.type}"
            )
            # Either this delta extends the call already at its index, or it
            # starts a new call appended at the end of the list.
            current_tool_call = (
                self.tool_calls[call_delta.index]
                if call_delta.index < len(self.tool_calls)
                else None
            )
            if current_tool_call:
                # Continuation: only argument text may be appended.
                assert not call_delta.function.name, (
                    "Streaming tool calls should emit the full function name "
                    f"exactly once. Got {call_delta.function.name}"
                )
                assert not call_delta.id, (
                    "Streaming tool calls must emit function id only once. Got "
                    f"{call_delta.id}"
                )
                assert call_delta.index == len(self.tool_calls) - 1, (
                    f"Incorrect index for tool delta. Got {call_delta.index}, "
                    f"expected {len(self.tool_calls) - 1}"
                )
                current_tool_call.function.arguments += call_delta.function.arguments
            else:
                # First appearance: id and name are mandatory.
                assert call_delta.id is not None, (
                    "Streaming tool calls must have an id on first appearance"
                )
                assert call_delta.function.name is not None, (
                    "Streaming tool calls must have a function name on first appearance"
                )
                assert call_delta.index == len(self.tool_calls), (
                    f"Incorrect index for tool delta. Got {call_delta.index}, "
                    f"expected {len(self.tool_calls)}"
                )
                self.tool_calls.append(
                    ToolCall(
                        id=call_delta.id,
                        function=FunctionCall(
                            name=call_delta.function.name,
                            arguments=call_delta.function.arguments or "",
                        ),
                    )
                )
def run_tool_extraction(
    tool_parser: ToolParser,
    model_output: str,
    request: ChatCompletionRequest | None = None,
    streaming: bool = False,
    assert_one_tool_per_delta: bool = True,
) -> tuple[str | None, list[ToolCall]]:
    """Extract tool calls via the streaming or one-shot parser path.

    Returns a ``(content, tool_calls)`` pair; empty content maps to None.
    """
    if not streaming:
        extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, request)
        # tools_called must agree with whether any calls were extracted.
        assert extracted.tools_called == bool(extracted.tool_calls)
        return extracted.content, extracted.tool_calls
    reconstructor = run_tool_extraction_streaming(
        tool_parser,
        model_output,
        request,
        assert_one_tool_per_delta=assert_one_tool_per_delta,
    )
    return reconstructor.other_content or None, reconstructor.tool_calls
def run_tool_extraction_nonstreaming(
    tool_parser: ToolParser,
    model_output: str,
    request: ChatCompletionRequest | None = None,
) -> ExtractedToolCallInformation:
    """Run one-shot tool extraction, defaulting to a minimal chat request."""
    if request is None:
        request = ChatCompletionRequest(messages=[], model="test-model")
    return tool_parser.extract_tool_calls(model_output, request)
def split_string_into_token_deltas(tokenizer: TokenizerLike, text: str) -> list[str]:
# Split a string into a series of deltas using the provided tokenizer. Each
# delta will be the string equivalent of a single token.
token_ids = tokenizer.encode(text, add_special_tokens=False)
previously_decoded_text = ""
deltas = []
for i in range(1, len(token_ids) + 1):
current_tokens = token_ids[:i]
current_text = tokenizer.decode(current_tokens)
new_text = current_text[len(previously_decoded_text) :]
previously_decoded_text = current_text
deltas.append(new_text)
return deltas
def run_tool_extraction_streaming(
    tool_parser: ToolParser,
    model_deltas: Iterable[str],
    request: ChatCompletionRequest | None = None,
    assert_one_tool_per_delta: bool = True,
) -> StreamingToolReconstructor:
    """Feed text deltas through the parser's streaming API and reconstruct.

    A bare string is first split into per-token deltas so the parser sees a
    realistic token-by-token stream. Returns the populated reconstructor.
    """
    if isinstance(model_deltas, str):
        model_deltas = split_string_into_token_deltas(
            tool_parser.model_tokenizer, model_deltas
        )
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    reconstructor = StreamingToolReconstructor(
        assert_one_tool_per_delta=assert_one_tool_per_delta
    )
    previous_text = ""
    previous_tokens: list[int] = []
    for delta in model_deltas:
        # Map the text delta back to token ids; tokens absent from the
        # parser's vocab are silently dropped.
        token_delta = [
            tool_parser.vocab.get(token)
            for token in tool_parser.model_tokenizer.tokenize(delta)
            if token in tool_parser.vocab
        ]
        current_text = previous_text + delta
        current_tokens = previous_tokens + token_delta
        delta_message = tool_parser.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta,
            previous_tokens,
            current_tokens,
            token_delta,
            request,
        )
        # The parser may emit nothing for a delta (e.g. mid-call buffering).
        if delta_message is not None:
            reconstructor.append_delta(delta_message)
        previous_text = current_text
        previous_tokens = current_tokens
    return reconstructor

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import AsyncGenerator
from typing import Any
from vllm.entrypoints.openai.protocol import (
ChatCompletionResponse,
ChatCompletionResponseChoice,
ChatCompletionStreamResponse,
ChatMessage,
UsageInfo,
)
async def accumulate_streaming_response(
    stream_generator: AsyncGenerator[str, None],
) -> ChatCompletionResponse:
    """
    Accumulate streaming SSE chunks into a complete ChatCompletionResponse.

    This helper parses the SSE format and builds up the complete response
    by combining all the delta chunks. Malformed JSON chunks are skipped;
    usage is filled with dummy zero values for test purposes.
    """
    accumulated_content = ""
    accumulated_reasoning = None
    accumulated_tool_calls: list[dict[str, Any]] = []
    role = None
    finish_reason = None
    response_id = None
    created = None
    model = None
    index = 0
    async for chunk_str in stream_generator:
        # Skip empty lines and [DONE] marker
        if not chunk_str.strip() or chunk_str.strip() == "data: [DONE]":
            continue
        # Parse SSE format: "data: {json}\n\n"
        if chunk_str.startswith("data: "):
            json_str = chunk_str[6:].strip()
            try:
                chunk_data = json.loads(json_str)
                # print(f"DEBUG: Parsed chunk_data: {chunk_data}")
                chunk = ChatCompletionStreamResponse(**chunk_data)
                # Store metadata from first chunk
                if response_id is None:
                    response_id = chunk.id
                    created = chunk.created
                    model = chunk.model
                # Process each choice in the chunk
                for choice in chunk.choices:
                    if choice.delta.role:
                        role = choice.delta.role
                    if choice.delta.content:
                        accumulated_content += choice.delta.content
                    if choice.delta.reasoning:
                        # Lazily initialize so a response with no reasoning
                        # keeps it as None rather than "".
                        if accumulated_reasoning is None:
                            accumulated_reasoning = ""
                        accumulated_reasoning += choice.delta.reasoning
                    if choice.delta.tool_calls:
                        # Accumulate tool calls
                        for tool_call_delta in choice.delta.tool_calls:
                            # Find or create the tool call at this index
                            while len(accumulated_tool_calls) <= tool_call_delta.index:
                                accumulated_tool_calls.append(
                                    {
                                        "id": None,
                                        "type": "function",
                                        "function": {"name": "", "arguments": ""},
                                    }
                                )
                            if tool_call_delta.id:
                                accumulated_tool_calls[tool_call_delta.index]["id"] = (
                                    tool_call_delta.id
                                )
                            if tool_call_delta.function:
                                # Name and arguments arrive incrementally;
                                # concatenate each piece in order.
                                if tool_call_delta.function.name:
                                    accumulated_tool_calls[tool_call_delta.index][
                                        "function"
                                    ]["name"] += tool_call_delta.function.name
                                if tool_call_delta.function.arguments:
                                    accumulated_tool_calls[tool_call_delta.index][
                                        "function"
                                    ]["arguments"] += tool_call_delta.function.arguments
                    if choice.finish_reason:
                        finish_reason = choice.finish_reason
                    if choice.index is not None:
                        index = choice.index
            except json.JSONDecodeError:
                # Ignore malformed chunks rather than failing the whole
                # accumulation.
                continue
    # Build the final message
    message_kwargs = {
        "role": role or "assistant",
        "content": accumulated_content if accumulated_content else None,
        "reasoning": accumulated_reasoning,
    }
    # Only include tool_calls if there are any
    if accumulated_tool_calls:
        message_kwargs["tool_calls"] = [
            {"id": tc["id"], "type": tc["type"], "function": tc["function"]}
            for tc in accumulated_tool_calls
        ]
    message = ChatMessage(**message_kwargs)
    # Build the final response
    choice = ChatCompletionResponseChoice(
        index=index,
        message=message,
        finish_reason=finish_reason or "stop",
    )
    # Create usage info (with dummy values for tests)
    usage = UsageInfo(
        prompt_tokens=0,
        completion_tokens=0,
        total_tokens=0,
    )
    response = ChatCompletionResponse(
        id=response_id or "chatcmpl-test",
        object="chat.completion",
        created=created or 0,
        model=model or "test-model",
        choices=[choice],
        usage=usage,
    )
    return response
# Field name in an expectation dict -> accessor extracting the comparable
# value from a harmony message. Checked in this declaration order.
_HARMONY_FIELD_ACCESSORS = {
    "role": lambda m: m.author.role,
    "author_name": lambda m: m.author.name,
    "channel": lambda m: m.channel,
    "recipient": lambda m: m.recipient,
    "content": lambda m: m.content[0].text,
    "content_type": lambda m: m.content_type,
    # Tool definitions compare by the list of declared tool names.
    "tool_definitions": lambda m: [
        t.name for t in m.content[0].tools["functions"].tools
    ],
}


def verify_harmony_messages(
    messages: list[Any], expected_messages: list[dict[str, Any]]
):
    """Assert each message matches whichever fields its expectation dict sets."""
    assert len(messages) == len(expected_messages)
    for message, expectation in zip(messages, expected_messages):
        for field, accessor in _HARMONY_FIELD_ACCESSORS.items():
            if field in expectation:
                assert accessor(message) == expectation[field]
def verify_chat_response(
response: ChatCompletionResponse,
content: str | None = None,
reasoning: str | None = None,
tool_calls: list[tuple[str, str]] | None = None,
):
assert len(response.choices) == 1
message = response.choices[0].message
if content is not None:
assert message.content == content
else:
assert not message.content
if reasoning is not None:
assert message.reasoning == reasoning
else:
assert not message.reasoning
if tool_calls:
assert message.tool_calls is not None
assert len(message.tool_calls) == len(tool_calls)
for tc, (expected_name, expected_args) in zip(message.tool_calls, tool_calls):
assert tc.function.name == expected_name
assert tc.function.arguments == expected_args
else:
assert not message.tool_calls

View File

View File

@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
# Skip the whole module on ROCm: per the message below, encoder
# self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
# Small multilingual embedding model used for all tests in this module.
MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Token-id prompts mirroring PROMPTS in count.
TOKEN_IDS = [
    # Using ID={0, 1, 2, 3} results in NaN values,
    # so we add this offset of 1000
    [1000],
    [1000, 1001],
    [1000, 1002, 1001],
    [1000, 1003, 1001, 1002],
]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped LLM instance, proxied so it can be freed at teardown."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(llm)
    # Drop the only strong reference, then tear down the distributed env.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
    """encode() accepts per-prompt params, a single shared param, or None."""
    per_prompt_params = [PoolingParams() for _ in PROMPTS]
    # Multiple PoolingParams should be matched with each prompt
    outputs = llm.encode(
        PROMPTS, pooling_params=per_prompt_params, pooling_task="embed"
    )
    assert len(PROMPTS) == len(outputs)
    # A params list shorter than the prompt list must be rejected.
    with pytest.raises(ValueError):
        llm.encode(PROMPTS, pooling_params=per_prompt_params[:3], pooling_task="embed")
    # One PoolingParams instance is broadcast across all prompts.
    outputs = llm.encode(
        PROMPTS, pooling_params=PoolingParams(), pooling_task="embed"
    )
    assert len(PROMPTS) == len(outputs)
    # Omitting params entirely falls back to defaults.
    outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
    assert len(PROMPTS) == len(outputs)
def test_right_side_truncation(llm: LLM):
    """Embedding models must truncate from the right (end) of the prompt."""
    assert llm.get_tokenizer().truncation_side == "right"

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
# Skip the whole module on ROCm: per the message below, encoder
# self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
# Server-side context limit; the truncation tests compare against this.
max_model_len = 128
# Long prompt shared by all truncation tests.
# NOTE(review): the name shadows the `input` builtin; kept as-is because the
# tests below reference it by this name.
input = """Immerse yourself in the enchanting chronicle of calculus, a
mathematical domain that has radically transformed our comprehension of
change and motion. Despite its roots in ancient civilizations, the
formal birth of calculus predominantly occurred in the 17th century,
primarily under the influential guidance of Sir Isaac Newton and Gottfried
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
ancient Greek mathematics,most notably in the works of Eudoxus and
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
technique for computing areas and volumes through the use of finite sums.
This methodology laid crucial foundational work for integral calculus.
In the 17th century, both Newton and Leibniz independently pioneered
calculus, each contributing unique perspectives that would shape this new
field."""
@pytest.fixture(scope="module")
def server():
    """Launch a pooling-runner vLLM server capped at max_model_len tokens."""
    args = [
        "--runner",
        "pooling",
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        str(max_model_len),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
    """Truncation below max_model_len caps usage at the requested size."""
    truncation_size = 10
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }
    response = await client.post(path="embeddings", cast_to=object, body=body)
    assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
    """truncate_prompt_tokens=0 reports zero prompt tokens in usage."""
    truncation_size = 0
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }
    response = await client.post(path="embeddings", cast_to=object, body=body)
    assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
    """Truncation beyond max_model_len must be rejected with HTTP 400."""
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": max_model_len + 1,
    }
    with pytest.raises(openai.BadRequestError) as err:
        await client.post(path="embeddings", cast_to=object, body=body)
    assert err.value.status_code == 400
    error_details = err.value.response.json()["error"]
    assert error_details["type"] == "BadRequestError"
    # Exact server-side message for an over-sized truncation request.
    expected_message = (
        "truncate_prompt_tokens value is "
        "greater than max_model_len."
        " Please, select a smaller truncation size."
    )
    assert error_details["message"] == expected_message
@pytest.mark.asyncio
async def test_max_truncation_size(client: openai.AsyncOpenAI):
    """truncate_prompt_tokens=-1 truncates to the full max_model_len."""
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": -1,
    }
    response = await client.post(path="embeddings", cast_to=object, body=body)
    assert response["usage"]["prompt_tokens"] == max_model_len

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
# Sequence-classification model; the score/rerank rejection tests below rely
# on it not having num_labels == 1.
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped classification LLM, proxied so it can be freed at teardown."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(llm)
    # Drop the only strong reference, then tear down the distributed env.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
    """Activation defaults on; off must differ, and softmax must recover it."""

    def classify_probs(use_activation):
        # One classify pass with the given activation setting.
        results = llm.classify(
            prompts,
            pooling_params=PoolingParams(use_activation=use_activation),
            use_tqdm=False,
        )
        return torch.tensor([result.outputs.probs for result in results])

    default = classify_probs(None)
    w_activation = classify_probs(True)
    wo_activation = classify_probs(False)
    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )
@pytest.mark.skip_global_cleanup
def test_token_classify(llm: LLM):
    """The token_classify pooling task must run without raising."""
    llm.encode(prompts, use_tqdm=False, pooling_task="token_classify")
def test_score_api(llm: LLM):
    """score() is rejected for this model (it requires num_labels == 1)."""
    expected_error = "Score API is only enabled for num_labels == 1."
    with pytest.raises(ValueError, match=expected_error):
        llm.score("ping", "pong", use_tqdm=False)

View File

@@ -0,0 +1,293 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
# Sequence-classification model served for the endpoint tests below.
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
DTYPE = "float32"  # Use float32 to avoid NaN issue
@pytest.fixture(scope="module")
def server():
    """Launch a vLLM server for the classification model (512-token limit)."""
    args = [
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--dtype",
        DTYPE,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_single_input_classification(server: RemoteOpenAIServer, model_name: str):
    """A single string input yields exactly one labeled result."""
    payload = {
        "model": model_name,
        "input": "This product was excellent and exceeded my expectations",
    }
    raw = requests.post(server.url_for("classify"), json=payload)
    raw.raise_for_status()
    parsed = ClassificationResponse.model_validate(raw.json())
    assert parsed.object == "list"
    assert parsed.model == MODEL_NAME
    assert len(parsed.data) == 1
    assert hasattr(parsed.data[0], "label")
    assert hasattr(parsed.data[0], "probs")
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
    """Classification must accept add_special_tokens=False and respond validly."""
    payload = {"model": model_name, "input": "hello", "add_special_tokens": False}
    raw = requests.post(server.url_for("classify"), json=payload)
    raw.raise_for_status()
    ClassificationResponse.model_validate(raw.json())
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
    """A batch of inputs yields one indexed, labeled result per input."""
    batch = [
        "The product arrived on time and works perfectly",
        "I'm very satisfied with my purchase, would buy again",
        "The customer service was helpful and resolved my issue quickly",
        "This product broke after one week, terrible quality",
        "I'm very disappointed with this purchase, complete waste of money",
        "The customer service was rude and unhelpful",
    ]
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": batch},
    )
    parsed = ClassificationResponse.model_validate(raw.json())
    assert len(parsed.data) == len(batch)
    for position, item in enumerate(parsed.data):
        assert item.index == position
        assert hasattr(item, "label")
        assert hasattr(item, "probs")
        assert len(item.probs) == item.num_classes
        assert item.label in ["Default", "Spoiled"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
    """truncate_prompt_tokens caps usage at the requested token count."""
    oversized_text = "hello " * 600
    raw = requests.post(
        server.url_for("classify"),
        json={
            "model": model_name,
            "input": oversized_text,
            "truncate_prompt_tokens": 5,
        },
    )
    raw.raise_for_status()
    parsed = ClassificationResponse.model_validate(raw.json())
    assert len(parsed.data) == 1
    assert parsed.data[0].index == 0
    assert hasattr(parsed.data[0], "probs")
    assert parsed.usage.prompt_tokens == 5
    assert parsed.usage.total_tokens == 5
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_invalid_truncate_prompt_tokens_error(
    server: RemoteOpenAIServer, model_name: str
):
    """A truncation size beyond the server's max-model-len yields HTTP 400."""
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513},
    )
    assert raw.status_code == 400
    assert "truncate_prompt_tokens" in raw.json()["error"]["message"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
    """An empty string input is rejected with a 400 error payload."""
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": ""},
    )
    assert raw.status_code == 400
    assert "error" in raw.json()
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_batch_classification_empty_list(server: RemoteOpenAIServer, model_name: str):
    """An empty input list succeeds and returns an empty data list."""
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": []},
    )
    raw.raise_for_status()
    parsed = ClassificationResponse.model_validate(raw.json())
    assert parsed.object == "list"
    assert isinstance(parsed.data, list)
    assert len(parsed.data) == 0
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
    """/invocations must mirror /classify output for the same request."""
    request_args = {
        "model": MODEL_NAME,
        "input": "This product was excellent and exceeded my expectations",
    }
    outputs = {}
    for endpoint in ("classify", "invocations"):
        resp = requests.post(server.url_for(endpoint), json=request_args)
        resp.raise_for_status()
        outputs[endpoint] = resp.json()
    classification_output = outputs["classify"]
    invocation_output = outputs["invocations"]
    assert classification_output.keys() == invocation_output.keys()
    # Probabilities may differ slightly between runs; compare approximately.
    for classify_item, invocation_item in zip(
        classification_output["data"], invocation_output["data"]
    ):
        assert classify_item.keys() == invocation_item.keys()
        assert classify_item["probs"] == pytest.approx(
            invocation_item["probs"], rel=0.01
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
    """Activation defaults on; off must differ, and softmax must recover it."""
    texts = ["This product was excellent and exceeded my expectations"]

    async def classify_probs(use_activation):
        # One classify request with the given activation setting.
        raw = requests.post(
            server.url_for("classify"),
            json={
                "model": model_name,
                "input": texts,
                "use_activation": use_activation,
            },
        )
        payload = raw.json()
        return torch.tensor([item["probs"] for item in payload["data"]])

    default = await classify_probs(None)
    w_activation = await classify_probs(True)
    wo_activation = await classify_probs(False)
    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    assert torch.allclose(F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_score(server: RemoteOpenAIServer, model_name: str):
    # score api is only enabled for num_labels == 1.
    payload = {"model": model_name, "text_1": "ping", "text_2": "pong"}
    response = requests.post(server.url_for("score"), json=payload)
    assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_rerank(server: RemoteOpenAIServer, model_name: str):
    # rerank api is only enabled for num_labels == 1.
    payload = {"model": model_name, "query": "ping", "documents": ["pong"]}
    response = requests.post(server.url_for("rerank"), json=payload)
    assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
    """The pooling endpoint's classify task returns one 2-class vector."""
    payload = {
        "model": model_name,
        "input": "This product was excellent and exceeded my expectations",
        "encoding_format": "float",
        "task": "classify",
    }
    response = requests.post(server.url_for("pooling"), json=payload)
    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
    """token_classify returns one per-token vector of class scores."""
    payload = {
        "model": model_name,
        "input": ["This product was excellent and exceeded my expectations"],
        "encoding_format": "float",
        "task": "token_classify",
    }
    response = requests.post(server.url_for("pooling"), json=payload)
    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1
    # This prompt tokenizes to 8 tokens, each scored over 2 classes.
    assert len(poolings.data[0].data) == 8
    assert len(poolings.data[0].data[0]) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Embedding-style pooling tasks are rejected by this classification model."""
    resp = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": "test",
            "encoding_format": "float",
            "task": task,
        },
    )
    error = resp.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")

View File

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import requests
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
# Video-classification checkpoint; its text config is overridden below so the
# model exposes a sequence-classification head.
VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
# At most one video attachment per prompt.
MAXIMUM_VIDEOS = 1
# Small public sample clip used by the video-URL test.
TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
# HF config override forcing the classification architecture onto the text tower.
HF_OVERRIDES = {
    "text_config": {
        "architectures": ["Qwen2_5_VLForSequenceClassification"],
    },
}
@pytest.fixture(scope="module")
def server_vlm_classify():
    """Module-scoped vLLM server running the video-classification model."""
    cli_args = [
        "--runner",
        "pooling",
        "--max-model-len",
        "5000",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"video": MAXIMUM_VIDEOS}),
    ]
    server = RemoteOpenAIServer(
        VLM_MODEL_NAME, cli_args, override_hf_configs=HF_OVERRIDES
    )
    with server as running_server:
        yield running_server
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_text_only(
    server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
    """The classify endpoint accepts chat-formatted text-only messages."""
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please classify this text request."},
            ],
        }
    ]
    resp = requests.post(
        server_vlm_classify.url_for("classify"),
        json={"model": model_name, "messages": chat},
    )
    resp.raise_for_status()

    parsed = ClassificationResponse.model_validate(resp.json())
    assert parsed.object == "list"
    assert parsed.model == model_name
    assert len(parsed.data) == 1
    assert len(parsed.data[0].probs) == 2
    # Prompt length is fixed by the model's chat template; update if it changes.
    assert parsed.usage.prompt_tokens == 22
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_video_url(
    server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
    """The classify endpoint accepts a chat message with a video_url part."""
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please classify this video."},
                {"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
            ],
        }
    ]
    resp = requests.post(
        server_vlm_classify.url_for("classify"),
        json={"model": model_name, "messages": chat},
    )
    resp.raise_for_status()

    parsed = ClassificationResponse.model_validate(resp.json())
    assert parsed.object == "list"
    assert parsed.model == model_name
    assert len(parsed.data) == 1
    assert len(parsed.data[0].probs) == 2
    # Video frames dominate the token count; tied to TEST_VIDEO_URL's content.
    assert parsed.usage.prompt_tokens == 4807

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
MTEB_EMBED_TASKS,
MTEB_EMBED_TOL,
OpenAIClientMtebEncoder,
run_mteb_embed_task,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
# Skip the whole module on ROCm: encoder self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

# Quiet server logs so MTEB progress output stays readable.
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "intfloat/e5-small"
# Reference SentenceTransformers main score for the MTEB embed task set.
MAIN_SCORE = 0.7422994752439667
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server for the MTEB embedding benchmark."""
    cli_args = [
        "--runner",
        "pooling",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as srv:
        yield srv
def test_mteb_embed(server):
    """Compare vLLM's MTEB embed score against the SentenceTransformers baseline."""
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, server.get_client())
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = MAIN_SCORE

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
import torch.nn.functional as F
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
# Skip the whole module on ROCm: encoder self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "intfloat/multilingual-e5-small"
# Single shared prompt used by the offline-inference tests below.
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped offline LLM engine.

    Yields a weakref proxy so pytest's fixture cache does not keep the
    engine alive; the real reference is dropped and cleaned up at teardown.
    """
    engine = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(engine)
    del engine
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_token_embed(llm: LLM):
    """token_embed returns one 384-d vector per input token (11 tokens here)."""
    (result,) = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
    assert result.outputs.data.shape == (11, 384)
def test_pooling_params(llm: LLM):
    """PoolingParams.normalize: None defaults to normalized; False disables it."""

    def embed_with(normalize):
        # Run the shared prompts through llm.embed with the given normalize flag.
        params = PoolingParams(normalize=normalize)
        results = llm.embed(prompts, pooling_params=params, use_tqdm=False)
        return torch.tensor([r.outputs.embedding for r in results])

    default = embed_with(normalize=None)
    w_normal = embed_with(normalize=True)
    wo_normal = embed_with(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    # Unnormalized output, L2-normalized after the fact, must match.
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )

View File

@@ -0,0 +1,680 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import numpy as np
import openai
import pytest
import pytest_asyncio
import requests
import torch
import torch.nn.functional as F
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.platforms import current_platform
from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
MetadataItem,
binary2tensor,
build_metadata_items,
decode_pooling_output,
)
# Skip the whole module on ROCm: encoder self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "intfloat/multilingual-e5-small"
# Minimal chat template so chat-based embedding requests render deterministically.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
# Half precision for speed and memory savings in CI.
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
    """Module-scoped embedding server (bf16, eager, short context, dummy template)."""
    cli_args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as srv:
        yield srv
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module server."""
    async with server.get_async_client() as c:
        yield c
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """SentenceTransformers reference model used for correctness checks."""
    with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as model:
        yield model
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Single-input embedding: dimension, usage accounting, and HF parity."""
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    # test single embedding
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    # e5-small embedding dimension.
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 11
    assert embeddings.usage.total_tokens == 11

    # Values must match the SentenceTransformers reference model.
    vllm_outputs = [d.embedding for d in embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # Token-id input is counted verbatim: 5 ids -> 5 prompt tokens.
    assert embeddings.usage.prompt_tokens == 5
    assert embeddings.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Batch embedding for both list[str] and list[list[int]] inputs."""
    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 33
    assert embeddings.usage.total_tokens == 33

    # Values must match the SentenceTransformers reference model.
    vllm_outputs = [d.embedding for d in embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # 5 + 3 + 5 + 4 = 17 ids across the batch.
    assert embeddings.usage.prompt_tokens == 17
    assert embeddings.usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Embedding chat messages must equal embedding the locally rendered template."""
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # Chat route: the server applies the (dummy) chat template itself.
    chat_response = requests.post(
        server.url_for("v1/embeddings"),
        json={
            "model": model_name,
            "messages": messages,
            "encoding_format": "float",
        },
    )
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Render the same template locally and embed the resulting plain prompt.
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created
    # Responses must be identical apart from request id and creation timestamp.
    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
        completion_embeddings.model_dump(exclude={"id", "created"})
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """float, base64, and default encodings must all match the HF reference."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    # base64 payloads decode to little-endian float32 buffers.
    responses_base64 = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    base64_data = []
    for data in responses_base64.data:
        base64_data.append(
            np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()
        )
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_data = [d.embedding for d in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """base64 output decodes correctly for every embed_dtype x endianness combo."""
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    # Float response is the reference for all encoded variants.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )

            # Decode with the same dtype/endianness the server was asked for.
            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """encoding_format="bytes": decode via the metadata header, compare to float.

    Exercises every embed_dtype x endianness combination and checks the
    decoded tensors against the plain float response.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    # Float response is the reference for all encoded variants.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    # Iterate the dict directly (dropped the redundant `list(...keys())`),
    # matching the base64 variant of this test.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )

            # Per-item tensor layout is described by the "metadata" header;
            # the raw tensors are concatenated in the response body.
            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]
            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """encoding_format="bytes_only": no metadata header; caller rebuilds it.

    The client must know dtype, endianness, embedding size, and request count
    up front to decode the body.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ] * 2

    # Float response is the reference for all encoded variants.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    embedding_size = len(float_data[0])

    # Iterate the dict directly (dropped the redundant `list(...keys())`),
    # matching the base64 variant of this test.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes_only",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # bytes_only mode must NOT send the metadata header.
            assert "metadata" not in responses_bytes.headers

            # Rebuild the metadata client-side from known request parameters.
            body = responses_bytes.content
            items = build_metadata_items(
                embed_dtype=embed_dtype,
                endianness=endianness,
                shape=(embedding_size,),
                n_request=len(input_texts),
            )
            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """Invalid literal values for enum-like params yield a 400 with details."""
    bad_value = f"bad_{param_name}"
    resp = requests.post(
        server.url_for("/v1/embeddings"),
        json={
            "model": model_name,
            "input": [
                "The best thing about vLLM is that it supports many different models",
            ],
            "encoding_format": "base64",
            # Overrides encoding_format when param_name == "encoding_format".
            param_name: bad_value,
        },
    )
    assert resp.status_code == 400
    message = resp.json()["error"]["message"]
    assert "literal_error" in message
    assert bad_value in message
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """truncate_prompt_tokens caps usage at 10 for both text and token-id input."""
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # test single embedding
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # Prompt is longer than 10 tokens, so usage reflects the truncated length.
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10

    # 21 pre-tokenized ids; truncation must apply to token-id input as well.
    input_tokens = [
        1,
        24428,
        289,
        18341,
        26165,
        285,
        19323,
        283,
        289,
        26789,
        3871,
        28728,
        9901,
        340,
        2229,
        385,
        340,
        315,
        28741,
        28804,
        2,
    ]
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """Requesting truncation beyond max_model_len must raise BadRequestError."""
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # The original asserts lived inside the `with pytest.raises(...)` block,
    # *after* the raising call, so they were unreachable dead code. Capture
    # the exception and assert on it instead.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )
    assert (
        "truncate_prompt_tokens value is greater than max_model_len. "
        "Please, select a smaller truncation size." in str(exc_info.value)
    )
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The generic /invocations route must mirror /v1/embeddings output."""
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    request_args = {
        "model": MODEL_NAME,
        "input": input_texts,
        "encoding_format": "float",
    }

    # Same payload through both routes.
    completion_response = await client.embeddings.create(**request_args)

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
    for completion_data, invocation_data in zip(
        completion_output["data"], invocation_output["data"]
    ):
        assert completion_data.keys() == invocation_data.keys()
        # Embedding values must agree within tolerance across routes.
        check_embeddings_close(
            embeddings_0_lst=[completion_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """/invocations must mirror /v1/embeddings for chat-message payloads too."""
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "encoding_format": "float",
    }

    # Same payload through both routes.
    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
    for chat_data, invocation_data in zip(
        chat_output["data"], invocation_output["data"]
    ):
        assert chat_data.keys() == invocation_data.keys()
        # Embedding values must agree within tolerance across routes.
        check_embeddings_close(
            embeddings_0_lst=[chat_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """`normalize` request param: default == normalized; False disables it."""
    input_text = ["The chef prepared a delicious meal."]

    # `requests.post` is blocking, so the original `async def` wrapper added
    # nothing (there was no await point inside); a plain function is clearer
    # and behaves identically.
    def get_outputs(normalize):
        request_args = {
            "model": MODEL_NAME,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }
        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        outputs = response.json()
        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = get_outputs(normalize=None)
    w_normal = get_outputs(normalize=True)
    wo_normal = get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    # Unnormalized output, L2-normalized after the fact, must match.
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """Pooling with task=embed returns one 384-d vector per input."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "embed",
    }
    resp = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(resp.json())
    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """Pooling with task=token_embed returns a 384-d vector per input token."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "token_embed",
    }
    resp = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(resp.json())
    assert len(poolings.data) == 1
    # One row per input token (11 tokens) ...
    assert len(poolings.data[0].data) == 11
    # ... each a 384-d embedding.
    assert len(poolings.data[0].data[0]) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Classification-style pooling tasks are rejected by this embedding model."""
    resp = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": "test",
            "encoding_format": "float",
            "task": task,
        },
    )
    error = resp.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")

Some files were not shown because too many files have changed in this diff Show More