Sync from v0.13
This commit is contained in:
0
tests/entrypoints/__init__.py
Normal file
0
tests/entrypoints/__init__.py
Normal file
203
tests/entrypoints/conftest.py
Normal file
203
tests/entrypoints/conftest.py
Normal file
@@ -0,0 +1,203 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_prompts():
|
||||
return [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_token_ids():
|
||||
return [
|
||||
[0],
|
||||
[0, 1],
|
||||
[0, 2, 1],
|
||||
[0, 3, 1, 2],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_regex():
|
||||
return (
|
||||
r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
|
||||
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_json_schema():
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"age": {"type": "integer"},
|
||||
"skills": {
|
||||
"type": "array",
|
||||
"items": {"type": "string", "maxLength": 10},
|
||||
"minItems": 3,
|
||||
},
|
||||
"work_history": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {"type": "string"},
|
||||
"duration": {"type": "number"},
|
||||
"position": {"type": "string"},
|
||||
},
|
||||
"required": ["company", "position"],
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["name", "age", "skills", "work_history"],
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_complex_json_schema():
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"score": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"maximum": 100, # Numeric range
|
||||
},
|
||||
"grade": {
|
||||
"type": "string",
|
||||
"pattern": "^[A-D]$", # Regex pattern
|
||||
},
|
||||
"email": {
|
||||
"type": "string",
|
||||
"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
|
||||
},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
# Combining length and pattern restrictions
|
||||
"pattern": "^[a-z]{1,10}$",
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["score", "grade", "email", "tags"],
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_definition_json_schema():
|
||||
return {
|
||||
"$defs": {
|
||||
"Step": {
|
||||
"properties": {
|
||||
"explanation": {"title": "Explanation", "type": "string"},
|
||||
"output": {"title": "Output", "type": "string"},
|
||||
},
|
||||
"required": ["explanation", "output"],
|
||||
"title": "Step",
|
||||
"type": "object",
|
||||
}
|
||||
},
|
||||
"properties": {
|
||||
"steps": {
|
||||
"items": {"$ref": "#/$defs/Step"},
|
||||
"title": "Steps",
|
||||
"type": "array",
|
||||
},
|
||||
"final_answer": {"title": "Final Answer", "type": "string"},
|
||||
},
|
||||
"required": ["steps", "final_answer"],
|
||||
"title": "MathReasoning",
|
||||
"type": "object",
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_enum_json_schema():
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"enum": ["active", "inactive", "pending"], # Literal values using enum
|
||||
},
|
||||
"priority": {
|
||||
"type": "string",
|
||||
"enum": ["low", "medium", "high", "critical"],
|
||||
},
|
||||
"category": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["bug", "feature", "improvement"],
|
||||
},
|
||||
"severity": {
|
||||
"type": "integer",
|
||||
"enum": [1, 2, 3, 4, 5], # Enum can also contain numbers
|
||||
},
|
||||
},
|
||||
"required": ["type", "severity"],
|
||||
},
|
||||
"flags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["urgent", "blocked", "needs_review", "approved"],
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["status", "priority", "category", "flags"],
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_structured_outputs_choices():
|
||||
return [
|
||||
"Python",
|
||||
"Java",
|
||||
"JavaScript",
|
||||
"C++",
|
||||
"C#",
|
||||
"PHP",
|
||||
"TypeScript",
|
||||
"Ruby",
|
||||
"Swift",
|
||||
"Kotlin",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_sql_statements():
|
||||
return """
|
||||
start: select_statement
|
||||
select_statement: "SELECT" column "from" table "where" condition
|
||||
column: "col_1" | "col_2"
|
||||
table: "table_1" | "table_2"
|
||||
condition: column "=" number
|
||||
number: "1" | "2"
|
||||
"""
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_lora_files():
|
||||
"""Download Qwen3 LoRA files once per test session."""
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
return snapshot_download(repo_id="charent/self_cognition_Alice")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def opt125_lora_files() -> str:
|
||||
"""Download opt-125m LoRA files once per test session."""
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
return snapshot_download(repo_id="peft-internal-testing/opt-125m-dummy-lora")
|
||||
0
tests/entrypoints/llm/__init__.py
Normal file
0
tests/entrypoints/llm/__init__.py
Normal file
94
tests/entrypoints/llm/test_accuracy.py
Normal file
94
tests/entrypoints/llm/test_accuracy.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file test accuracy of the vLLM server via LMEval.
|
||||
It uses local-completions, which interacts with vLLM
|
||||
through the OAI API with N concurrent connections.
|
||||
This simulates real work usage of the API and makes
|
||||
sure that the zmq frontend mp RPC message passing and
|
||||
AsyncLLMEngine are working correctly.
|
||||
"""
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAMES = [
|
||||
"Qwen/Qwen3-1.7B",
|
||||
"google/gemma-3-1b-it",
|
||||
]
|
||||
FP8_KV_MODEL_NAMES = [
|
||||
"Qwen/Qwen3-1.7B",
|
||||
]
|
||||
NUM_CONCURRENT = 500
|
||||
TASK = "gsm8k"
|
||||
FILTER = "exact_match,strict-match"
|
||||
RTOL = 0.03
|
||||
EXPECTED_VALUES = {
|
||||
"Qwen/Qwen3-1.7B": 0.68,
|
||||
"google/gemma-3-1b-it": 0.25,
|
||||
}
|
||||
|
||||
|
||||
def run_test(model_name, more_args=None):
|
||||
"""Run the end to end accuracy test."""
|
||||
|
||||
model_args = f"pretrained={model_name},max_model_len=4096"
|
||||
|
||||
if more_args is not None:
|
||||
model_args = "{},{}".format(model_args, more_args)
|
||||
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model_args=model_args,
|
||||
tasks="gsm8k",
|
||||
batch_size="auto",
|
||||
)
|
||||
|
||||
measured_value = results["results"][TASK][FILTER]
|
||||
assert model_name in EXPECTED_VALUES, (
|
||||
f"Cannot find the expected value for the model {model_name=}"
|
||||
)
|
||||
expected_value = EXPECTED_VALUES[model_name]
|
||||
assert (
|
||||
measured_value - RTOL < expected_value
|
||||
and measured_value + RTOL > expected_value
|
||||
), f"Expected: {expected_value} | Measured: {measured_value}"
|
||||
|
||||
|
||||
# TODO: [AlexM] Fix it with new CI/CD tests
|
||||
TPU_TP_TEST_STR = "" # "tensor_parallel_size=4"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODEL_NAMES)
|
||||
def test_lm_eval_accuracy_v1_engine(model):
|
||||
"""Run with the V1 Engine."""
|
||||
|
||||
more_args = None
|
||||
if current_platform.is_tpu():
|
||||
# Limit compilation time for TPU V1
|
||||
|
||||
more_args = "max_model_len=2048,max_num_seqs=64"
|
||||
|
||||
# Add TP test (if provided)
|
||||
if TPU_TP_TEST_STR:
|
||||
more_args += ",{}".format(TPU_TP_TEST_STR)
|
||||
|
||||
run_test(model, more_args)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
|
||||
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
|
||||
"""Run with the V1 Engine."""
|
||||
|
||||
more_args = None
|
||||
if current_platform.is_tpu():
|
||||
# Limit compilation time for TPU V1
|
||||
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
|
||||
|
||||
# Add TP test (if provided)
|
||||
if TPU_TP_TEST_STR:
|
||||
more_args += ",{}".format(TPU_TP_TEST_STR)
|
||||
|
||||
run_test(model, more_args)
|
||||
212
tests/entrypoints/llm/test_chat.py
Normal file
212
tests/entrypoints/llm/test_chat.py
Normal file
@@ -0,0 +1,212 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ..openai.test_vision import TEST_IMAGE_ASSETS
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def text_llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def llm_for_failure_test():
|
||||
"""
|
||||
Fixture for testing issue #26081.
|
||||
Uses a small max_model_len to easily trigger length errors.
|
||||
"""
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
max_model_len=128,
|
||||
disable_log_stats=True,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def test_chat(text_llm):
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant"},
|
||||
{"role": "user", "content": prompt1},
|
||||
]
|
||||
outputs = text_llm.chat(messages)
|
||||
assert len(outputs) == 1
|
||||
|
||||
|
||||
def test_multi_chat(text_llm):
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
prompt2 = "Explain what among us is."
|
||||
|
||||
conversation1 = [
|
||||
{"role": "system", "content": "You are a helpful assistant"},
|
||||
{"role": "user", "content": prompt1},
|
||||
]
|
||||
|
||||
conversation2 = [
|
||||
{"role": "system", "content": "You are a helpful assistant"},
|
||||
{"role": "user", "content": prompt2},
|
||||
]
|
||||
|
||||
messages = [conversation1, conversation2]
|
||||
|
||||
outputs = text_llm.chat(messages)
|
||||
assert len(outputs) == 2
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def vision_llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
seed=0,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
|
||||
)
|
||||
def test_chat_multi_image(vision_llm, image_urls: list[str]):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "image_url", "image_url": {"url": image_url}}
|
||||
for image_url in image_urls
|
||||
),
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
outputs = vision_llm.chat(messages)
|
||||
assert len(outputs) >= 0
|
||||
|
||||
|
||||
def test_llm_chat_tokenization_no_double_bos(text_llm):
|
||||
"""
|
||||
LLM.chat() should not add special tokens when using chat templates.
|
||||
Check we get a single BOS token for llama chat.
|
||||
"""
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant"},
|
||||
{"role": "user", "content": "Hello!"},
|
||||
]
|
||||
outputs = text_llm.chat(messages)
|
||||
assert len(outputs) == 1
|
||||
|
||||
prompt_token_ids = outputs[0].prompt_token_ids
|
||||
assert prompt_token_ids is not None
|
||||
|
||||
bos_token = text_llm.get_tokenizer().bos_token_id
|
||||
|
||||
# Ensure we have a single BOS
|
||||
assert prompt_token_ids[0] == bos_token
|
||||
assert prompt_token_ids[1] != bos_token, "Double BOS"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def thinking_llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen3-0.6B",
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enable_thinking", [True, False])
|
||||
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant"},
|
||||
{"role": "user", "content": "What is 1+1?"},
|
||||
]
|
||||
|
||||
outputs = thinking_llm.chat(
|
||||
messages,
|
||||
chat_template_kwargs={"enable_thinking": enable_thinking},
|
||||
)
|
||||
assert len(outputs) == 1
|
||||
|
||||
prompt_token_ids = outputs[0].prompt_token_ids
|
||||
assert prompt_token_ids is not None
|
||||
|
||||
think_id = thinking_llm.get_tokenizer().get_vocab()["<think>"]
|
||||
|
||||
if enable_thinking:
|
||||
assert think_id not in prompt_token_ids
|
||||
else:
|
||||
# The chat template includes dummy thinking process
|
||||
assert think_id in prompt_token_ids
|
||||
|
||||
|
||||
def test_chat_batch_failure_cleanup(llm_for_failure_test):
|
||||
"""
|
||||
Tests that if a batch call to llm.chat() fails mid-way
|
||||
(e.g., due to one invalid prompt), the requests that
|
||||
were already enqueued are properly aborted and do not
|
||||
pollute the queue for subsequent calls.
|
||||
(Fixes Issue #26081)
|
||||
"""
|
||||
llm = llm_for_failure_test
|
||||
valid_msg = [{"role": "user", "content": "Hello"}]
|
||||
long_text = "This is a very long text to test the error " * 50
|
||||
invalid_msg = [{"role": "user", "content": long_text}]
|
||||
batch_1 = [
|
||||
valid_msg,
|
||||
valid_msg,
|
||||
invalid_msg,
|
||||
]
|
||||
batch_2 = [
|
||||
valid_msg,
|
||||
valid_msg,
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
with pytest.raises(ValueError, match="longer than the maximum model length"):
|
||||
llm.chat(batch_1, sampling_params=sampling_params)
|
||||
outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
|
||||
assert len(outputs_2) == len(batch_2)
|
||||
assert llm.llm_engine.get_num_unfinished_requests() == 0
|
||||
36
tests/entrypoints/llm/test_collective_rpc.py
Normal file
36
tests/entrypoints/llm/test_collective_rpc.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("backend", ["mp", "ray"])
|
||||
@create_new_process_for_each_test()
|
||||
def test_collective_rpc(tp_size, backend, monkeypatch):
|
||||
if torch.cuda.device_count() < tp_size:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
if tp_size == 1 and backend == "ray":
|
||||
pytest.skip("Skip duplicate test case")
|
||||
if tp_size == 1:
|
||||
backend = None
|
||||
|
||||
# intentionally define the method and class in the test function,
|
||||
# to test if they can be serialized and sent to the workers
|
||||
def echo_rank(self):
|
||||
return self.rank
|
||||
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
llm = LLM(
|
||||
model="hmellor/tiny-random-LlamaForCausalLM",
|
||||
enforce_eager=True,
|
||||
load_format="dummy",
|
||||
tensor_parallel_size=tp_size,
|
||||
distributed_executor_backend=backend,
|
||||
)
|
||||
assert llm.collective_rpc(echo_rank) == list(range(tp_size))
|
||||
124
tests/entrypoints/llm/test_generate.py
Normal file
124
tests/entrypoints/llm/test_generate.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "distilbert/distilgpt2"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
TOKEN_IDS = [
|
||||
[0],
|
||||
[0, 1],
|
||||
[0, 2, 1],
|
||||
[0, 3, 1, 2],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
max_num_batched_tokens=4096,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.10,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_multiple_sampling_params(llm: LLM):
|
||||
sampling_params = [
|
||||
SamplingParams(temperature=0.01, top_p=0.95),
|
||||
SamplingParams(temperature=0.3, top_p=0.95),
|
||||
SamplingParams(temperature=0.7, top_p=0.95),
|
||||
SamplingParams(temperature=0.99, top_p=0.95),
|
||||
]
|
||||
|
||||
# Multiple SamplingParams should be matched with each prompt
|
||||
outputs = llm.generate(PROMPTS, sampling_params=sampling_params)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# Exception raised, if the size of params does not match the size of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3])
|
||||
|
||||
# Single SamplingParams should be applied to every prompt
|
||||
single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
|
||||
outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# sampling_params is None, default params should be applied
|
||||
outputs = llm.generate(PROMPTS, sampling_params=None)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
|
||||
def test_multiple_priority(llm: LLM):
|
||||
# Generate works when priority is None
|
||||
outputs = llm.generate(PROMPTS, sampling_params=None, priority=None)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# Generate works when length of priority is same as the len(PROMPTS)
|
||||
outputs = llm.generate(PROMPTS, sampling_params=None, priority=[0] * len(PROMPTS))
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# Exception raised, if the length of priority does not match the length of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.generate(
|
||||
PROMPTS, sampling_params=None, priority=[0] * (len(PROMPTS) - 1)
|
||||
)
|
||||
|
||||
# Exception raised, if the priority list is empty
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.generate(PROMPTS, sampling_params=None, priority=[])
|
||||
|
||||
|
||||
def test_max_model_len():
|
||||
max_model_len = 20
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=0.10,
|
||||
enforce_eager=True, # reduce test time
|
||||
)
|
||||
sampling_params = SamplingParams(max_tokens=max_model_len + 10)
|
||||
outputs = llm.generate(PROMPTS, sampling_params)
|
||||
for output in outputs:
|
||||
num_total_tokens = len(output.prompt_token_ids) + len(
|
||||
output.outputs[0].token_ids
|
||||
)
|
||||
# Total tokens must not exceed max_model_len.
|
||||
# It can be less if generation finishes due to other reasons (e.g., EOS)
|
||||
# before reaching the absolute model length limit.
|
||||
assert num_total_tokens <= max_model_len
|
||||
|
||||
|
||||
def test_log_stats():
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.10,
|
||||
enforce_eager=True, # reduce test time
|
||||
)
|
||||
outputs = llm.generate(PROMPTS, sampling_params=None)
|
||||
|
||||
# disable_log_stats is False, every output should have metrics
|
||||
assert all(output.metrics is not None for output in outputs)
|
||||
27
tests/entrypoints/llm/test_gpu_utilization.py
Normal file
27
tests/entrypoints/llm/test_gpu_utilization.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
|
||||
def test_gpu_memory_utilization():
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# makes sure gpu_memory_utilization is per-instance limit,
|
||||
# not a global limit
|
||||
llms = [
|
||||
LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True)
|
||||
for i in range(3)
|
||||
]
|
||||
for llm in llms:
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
98
tests/entrypoints/llm/test_mm_cache_stats.py
Normal file
98
tests/entrypoints/llm/test_mm_cache_stats.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
from vllm.v1.metrics import loggers as stat_loggers
|
||||
from vllm.v1.metrics.reader import Counter, Metric
|
||||
|
||||
from ..openai.test_vision import TEST_IMAGE_ASSETS
|
||||
|
||||
|
||||
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _get_counter_value(metrics: list[Metric], name: str):
|
||||
metric = next(m for m in metrics if m.name == name)
|
||||
assert isinstance(metric, Counter)
|
||||
return metric.value
|
||||
|
||||
|
||||
def _get_mm_cache_stats(metrics: list[Metric]):
|
||||
mm_cache_queries = _get_counter_value(metrics, "vllm:mm_cache_queries")
|
||||
mm_cache_hits = _get_counter_value(metrics, "vllm:mm_cache_hits")
|
||||
|
||||
return mm_cache_queries, mm_cache_hits
|
||||
|
||||
|
||||
def _get_mm_cache_log(llm: LLM, caplog_vllm: pytest.LogCaptureFixture) -> float:
|
||||
caplog_vllm.clear()
|
||||
with caplog_vllm.at_level(logging.INFO, logger=stat_loggers.__name__):
|
||||
llm.llm_engine.do_log_stats()
|
||||
|
||||
assert len(caplog_vllm.records) == 1
|
||||
msg = caplog_vllm.records[0].getMessage()
|
||||
|
||||
assert "MM cache hit rate" in msg
|
||||
match = re.search(r"MM cache hit rate: ([0-9.]+)%", msg)
|
||||
assert match is not None
|
||||
return float(match.group(1))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
|
||||
@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
|
||||
def test_mm_cache_stats(
|
||||
num_gpus_available,
|
||||
image_urls,
|
||||
mm_processor_cache_type,
|
||||
caplog_vllm,
|
||||
):
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
enforce_eager=True,
|
||||
mm_processor_cache_type=mm_processor_cache_type,
|
||||
disable_log_stats=False,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
)
|
||||
|
||||
llm.chat(_make_messages(image_urls[0]))
|
||||
assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)
|
||||
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
|
||||
|
||||
llm.chat(_make_messages(image_urls[1]))
|
||||
assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)
|
||||
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
|
||||
|
||||
llm.chat(_make_messages(image_urls[0]))
|
||||
assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)
|
||||
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(33.3)
|
||||
|
||||
# NOTE: This only resets hit rate stats in CachingMetrics
|
||||
# The raw queries and hits counts remain unaffected
|
||||
llm.reset_mm_cache()
|
||||
|
||||
llm.chat(_make_messages(image_urls[0]))
|
||||
assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)
|
||||
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
|
||||
|
||||
llm.chat(_make_messages(image_urls[1]))
|
||||
assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
|
||||
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
|
||||
34
tests/entrypoints/llm/test_prompt_validation.py
Normal file
34
tests/entrypoints/llm/test_prompt_validation.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
|
||||
def test_empty_prompt():
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
|
||||
llm.generate([""])
|
||||
|
||||
|
||||
def test_out_of_vocab_token():
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match="out of vocabulary"):
|
||||
llm.generate({"prompt_token_ids": [999999]})
|
||||
|
||||
|
||||
def test_require_mm_embeds():
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
enforce_eager=True,
|
||||
enable_mm_embeds=False,
|
||||
)
|
||||
with pytest.raises(ValueError, match="--enable-mm-embeds"):
|
||||
llm.generate(
|
||||
{
|
||||
"prompt": "<image>",
|
||||
"multi_modal_data": {"image": torch.empty(1, 1, 1)},
|
||||
}
|
||||
)
|
||||
0
tests/entrypoints/offline_mode/__init__.py
Normal file
0
tests/entrypoints/offline_mode/__init__.py
Normal file
156
tests/entrypoints/offline_mode/test_offline_mode.py
Normal file
156
tests/entrypoints/offline_mode/test_offline_mode.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for HF_HUB_OFFLINE mode"""
|
||||
|
||||
import dataclasses
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import urllib3
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
|
||||
MODEL_CONFIGS = [
|
||||
{
|
||||
"model": "facebook/opt-125m",
|
||||
"enforce_eager": True,
|
||||
"gpu_memory_utilization": 0.20,
|
||||
"max_model_len": 64,
|
||||
"max_num_batched_tokens": 64,
|
||||
"max_num_seqs": 64,
|
||||
"tensor_parallel_size": 1,
|
||||
},
|
||||
{
|
||||
"model": "Qwen/Qwen3-0.6B",
|
||||
"enforce_eager": True,
|
||||
"gpu_memory_utilization": 0.50,
|
||||
"max_model_len": 64,
|
||||
"max_num_batched_tokens": 64,
|
||||
"max_num_seqs": 64,
|
||||
"tensor_parallel_size": 1,
|
||||
"tokenizer": "Qwen/Qwen3-4B",
|
||||
},
|
||||
{
|
||||
"model": "mistralai/Mistral-7B-Instruct-v0.1",
|
||||
"enforce_eager": True,
|
||||
"gpu_memory_utilization": 0.95,
|
||||
"max_model_len": 64,
|
||||
"max_num_batched_tokens": 64,
|
||||
"max_num_seqs": 64,
|
||||
"tensor_parallel_size": 1,
|
||||
"tokenizer_mode": "mistral",
|
||||
},
|
||||
# TODO: re-enable once these tests are run with V1
|
||||
# {
|
||||
# "model": "sentence-transformers/all-MiniLM-L12-v2",
|
||||
# "enforce_eager": True,
|
||||
# "gpu_memory_utilization": 0.20,
|
||||
# "max_model_len": 64,
|
||||
# "max_num_batched_tokens": 64,
|
||||
# "max_num_seqs": 64,
|
||||
# "tensor_parallel_size": 1,
|
||||
# },
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def cache_models():
|
||||
# Cache model files first
|
||||
for model_config in MODEL_CONFIGS:
|
||||
LLM(**model_config)
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
yield
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.usefixtures("cache_models")
|
||||
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
|
||||
# Set HF to offline mode and ensure we can still construct an LLM
|
||||
with monkeypatch.context() as m:
|
||||
try:
|
||||
m.setenv("HF_HUB_OFFLINE", "1")
|
||||
m.setenv("VLLM_NO_USAGE_STATS", "1")
|
||||
|
||||
def disable_connect(*args, **kwargs):
|
||||
raise RuntimeError("No http calls allowed")
|
||||
|
||||
m.setattr(
|
||||
urllib3.connection.HTTPConnection,
|
||||
"connect",
|
||||
disable_connect,
|
||||
)
|
||||
m.setattr(
|
||||
urllib3.connection.HTTPSConnection,
|
||||
"connect",
|
||||
disable_connect,
|
||||
)
|
||||
|
||||
# Need to re-import huggingface_hub
|
||||
# and friends to set up offline mode
|
||||
_re_import_modules()
|
||||
# Cached model files should be used in offline mode
|
||||
for model_config in MODEL_CONFIGS:
|
||||
LLM(**model_config)
|
||||
finally:
|
||||
# Reset the environment after the test
|
||||
# NB: Assuming tests are run in online mode
|
||||
_re_import_modules()
|
||||
|
||||
|
||||
def _re_import_modules():
|
||||
hf_hub_module_names = [k for k in sys.modules if k.startswith("huggingface_hub")]
|
||||
transformers_module_names = [
|
||||
k
|
||||
for k in sys.modules
|
||||
if k.startswith("transformers") and not k.startswith("transformers_modules")
|
||||
]
|
||||
|
||||
reload_exception = None
|
||||
for module_name in hf_hub_module_names + transformers_module_names:
|
||||
try:
|
||||
importlib.reload(sys.modules[module_name])
|
||||
except Exception as e:
|
||||
reload_exception = e
|
||||
# Try to continue clean up so that other tests are less likely to
|
||||
# be affected
|
||||
|
||||
# Error this test if reloading a module failed
|
||||
if reload_exception is not None:
|
||||
raise reload_exception
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.usefixtures("cache_models")
|
||||
def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
|
||||
# Set HF to offline mode and ensure we can still construct an LLM
|
||||
with monkeypatch.context() as m:
|
||||
try:
|
||||
m.setenv("HF_HUB_OFFLINE", "1")
|
||||
m.setenv("VLLM_NO_USAGE_STATS", "1")
|
||||
|
||||
def disable_connect(*args, **kwargs):
|
||||
raise RuntimeError("No http calls allowed")
|
||||
|
||||
m.setattr(
|
||||
urllib3.connection.HTTPConnection,
|
||||
"connect",
|
||||
disable_connect,
|
||||
)
|
||||
m.setattr(
|
||||
urllib3.connection.HTTPSConnection,
|
||||
"connect",
|
||||
disable_connect,
|
||||
)
|
||||
# Need to re-import huggingface_hub
|
||||
# and friends to set up offline mode
|
||||
_re_import_modules()
|
||||
engine_args = EngineArgs(model="facebook/opt-125m")
|
||||
LLM(**dataclasses.asdict(engine_args))
|
||||
finally:
|
||||
# Reset the environment after the test
|
||||
# NB: Assuming tests are run in online mode
|
||||
_re_import_modules()
|
||||
0
tests/entrypoints/openai/__init__.py
Normal file
0
tests/entrypoints/openai/__init__.py
Normal file
27
tests/entrypoints/openai/conftest.py
Normal file
27
tests/entrypoints/openai/conftest.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
@pytest.fixture
def mary_had_lamb():
    """Yield an open binary file handle to the 'mary_had_lamb' audio asset."""
    local_path = AudioAsset("mary_had_lamb").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file
|
||||
|
||||
|
||||
@pytest.fixture
def winning_call():
    """Yield an open binary file handle to the 'winning_call' audio asset."""
    local_path = AudioAsset("winning_call").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file
|
||||
|
||||
|
||||
@pytest.fixture
def foscolo():
    """Yield an Italian-language audio clip for it->en translation tests."""
    # Test translation it->en
    local_path = AudioAsset("azacinto_foscolo").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file
|
||||
0
tests/entrypoints/openai/correctness/__init__.py
Normal file
0
tests/entrypoints/openai/correctness/__init__.py
Normal file
78
tests/entrypoints/openai/correctness/test_lmeval.py
Normal file
78
tests/entrypoints/openai/correctness/test_lmeval.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file test accuracy of the vLLM server via LMEval.
|
||||
It uses local-completions, which interacts with vLLM
|
||||
through the OAI API with N concurrent connections.
|
||||
This simulates real work usage of the API and makes
|
||||
sure that the zmq frontend mp RPC message passing and
|
||||
AsyncLLMEngine are working correctly.
|
||||
"""
|
||||
|
||||
import lm_eval
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import RemoteOpenAIServer
|
||||
|
||||
# Model served by the remote vLLM server under test.
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
# Number of concurrent OAI connections lm-eval opens against the server.
NUM_CONCURRENT = 500
TASK = "gsm8k"
# lm-eval result key: metric name + filter name.
FILTER = "exact_match,strict-match"
# Accepted absolute deviation from EXPECTED_VALUE.
RTOL = 0.03
EXPECTED_VALUE = 0.54
DEFAULT_ARGS = ["--max-model-len", "4096"]
MORE_ARGS_LIST = [
    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
]
# None -> use the RemoteOpenAIServer default startup timeout.
MAX_WAIT_SECONDS = None

if current_platform.is_tpu():
    # TPU runs only the default config and gets a longer startup window.
    MORE_ARGS_LIST = [
        [],  # Default
    ]
    MAX_WAIT_SECONDS = 600
|
||||
|
||||
|
||||
def run_test(more_args):
    """Run the end to end accuracy test.

    Launches a remote OpenAI-compatible vLLM server with DEFAULT_ARGS plus
    ``more_args``, evaluates TASK through lm-eval's local-completions model,
    and asserts the measured metric is within RTOL of EXPECTED_VALUE.
    """

    args = list(DEFAULT_ARGS)
    args.extend(more_args)
    print(f"Running with: {args}")

    with RemoteOpenAIServer(
        MODEL_NAME, args, max_wait_seconds=MAX_WAIT_SECONDS
    ) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"

        model_args = (
            f"model={MODEL_NAME},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
        )

        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )

        measured_value = results["results"][TASK][FILTER]
        # Two-sided tolerance check: |measured - expected| < RTOL.
        assert (
            measured_value - RTOL < EXPECTED_VALUE
            and measured_value + RTOL > EXPECTED_VALUE
        ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
|
||||
|
||||
|
||||
def test_lm_eval_accuracy_v1_engine():
    """Run with the V1 Engine."""

    more_args = []

    # Limit compilation time for V1
    if current_platform.is_tpu():
        more_args = ["--max-num-seqs", "64"]

    run_test(more_args)
|
||||
@@ -0,0 +1,171 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Evaluate Transcription API correctness by computing Word Error Rate (WER)
|
||||
on a given ASR dataset. When provided, it will also compare the WER against
|
||||
a baseline.
|
||||
This simulates real work usage of the API and makes sure that the frontend and
|
||||
AsyncLLMEngine are working correctly.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import time
|
||||
from statistics import mean, median
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
import soundfile
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from evaluate import load
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from ....utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
def to_bytes(y, sr):
    """Encode audio samples ``y`` at rate ``sr`` as an in-memory WAV buffer.

    The returned BytesIO is rewound so it can be handed directly to an
    upload API expecting a file-like object.
    """
    wav_buffer = io.BytesIO()
    soundfile.write(wav_buffer, y, sr, format="WAV")
    wav_buffer.seek(0)
    return wav_buffer
|
||||
|
||||
|
||||
async def transcribe_audio(client, tokenizer, y, sr):
    """Transcribe one audio clip and return (latency, num_output_tokens, text)."""
    # Send loaded audio directly instead of loading from disk,
    # don't account for that time though
    with to_bytes(y, sr) as f:
        start_time = time.perf_counter()
        transcription = await client.audio.transcriptions.create(
            file=f,
            model=tokenizer.name_or_path,
            language="en",
            temperature=0.0,
        )
        end_time = time.perf_counter()
    # NOTE there's no streaming in transcriptions, can't measure ttft
    latency = end_time - start_time
    # Token count of the transcription, measured with the model's tokenizer.
    num_output_tokens = len(
        tokenizer(transcription.text, add_special_tokens=False).input_ids
    )
    return latency, num_output_tokens, transcription.text
|
||||
|
||||
|
||||
async def bound_transcribe(sem, client, tokenizer, audio, reference):
    """Transcribe under a concurrency semaphore; return normalized out/ref.

    Returns ``(latency, num_output_tokens, normalized_output, normalized_ref)``.
    """
    # Use semaphore to limit concurrent requests.
    async with sem:
        result = await transcribe_audio(client, tokenizer, *audio)
        # Normalize *english* output/reference for evaluation.
        out = tokenizer.normalize(result[2])
        ref = tokenizer.normalize(reference)
        return result[:2] + (out, ref)
|
||||
|
||||
|
||||
async def process_dataset(model, client, data, concurrent_request):
    """Transcribe every sample in ``data`` with bounded concurrency.

    Returns the gathered per-sample tuples from ``bound_transcribe``.
    """
    sem = asyncio.Semaphore(concurrent_request)

    # Load tokenizer once outside the loop
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Warmup call as the first `librosa.load` server-side is quite slow.
    audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
    _ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")

    tasks: list[asyncio.Task] = []
    for sample in data:
        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        task = asyncio.create_task(
            bound_transcribe(sem, client, tokenizer, (audio, sr), sample["text"])
        )
        tasks.append(task)
    return await asyncio.gather(*tasks)
|
||||
|
||||
|
||||
def print_performance_metrics(results, total_time):
    """Print latency and throughput statistics for a batch of results.

    ``results`` is a sequence of tuples whose first element is the request
    latency in seconds and second element is the output token count.
    """
    latencies = [entry[0] for entry in results]
    total_tokens = sum(entry[1] for entry in results)

    total = len(results)
    print(f"Total Requests: {total}")
    print(f"Successful Requests: {len(latencies)}")
    print(f"Average Latency: {mean(latencies):.4f} seconds")
    print(f"Median Latency: {median(latencies):.4f} seconds")
    ordered = sorted(latencies)
    perc = ordered[int(len(ordered) * 0.95) - 1]
    print(f"95th Percentile Latency: {perc:.4f} seconds")
    # Throughput
    req_throughput = len(latencies) / total_time
    print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s")
    throughput = total_tokens / total_time
    print(f"Estimated Throughput: {throughput:.2f} tok/s")
|
||||
|
||||
|
||||
def add_duration(sample):
    """Attach the clip length in milliseconds to a dataset sample in place."""
    audio = sample["audio"]
    duration_s = librosa.get_duration(y=audio["array"], sr=audio["sampling_rate"])
    sample["duration_ms"] = duration_s * 1000
    return sample
|
||||
|
||||
|
||||
def load_hf_dataset(dataset_repo: str, split="validation", **hf_kwargs):
    """Load an ASR dataset and drop clips longer than Whisper's 30s limit."""
    ## Load and filter the dataset
    dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
    if "duration_ms" not in dataset[0]:
        # compute duration to filter
        dataset = dataset.map(add_duration)

    # Whisper max supported duration
    dataset = dataset.filter(lambda example: example["duration_ms"] < 30000)
    return dataset
|
||||
|
||||
|
||||
def run_evaluation(
    model: str,
    client,
    dataset,
    max_concurrent_reqs: int,
    n_examples: int = -1,
    print_metrics: bool = True,
):
    """Transcribe ``dataset`` through the server and return the WER (%).

    A non-positive ``n_examples`` evaluates the whole dataset.
    """
    if n_examples > 0:
        dataset = dataset.select(range(n_examples))
    start = time.perf_counter()
    results = asyncio.run(process_dataset(model, client, dataset, max_concurrent_reqs))
    end = time.perf_counter()
    total_time = end - start
    print(f"Total Test Time: {total_time:.4f} seconds")
    if print_metrics:
        print_performance_metrics(results, total_time)
    # Compute WER
    predictions = [res[2] for res in results]
    references = [res[3] for res in results]
    wer = load("wer")
    wer_score = 100 * wer.compute(references=references, predictions=predictions)
    print("WER:", wer_score)
    return wer_score
|
||||
|
||||
|
||||
# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]
)
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(
    model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None
):
    """End-to-end WER check of the transcription API against a baseline."""
    # TODO refactor to use `ASRDataset`
    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
        dataset = load_hf_dataset(dataset_repo)

        if not max_concurrent_request:
            # No max concurrency
            max_concurrent_request = n_examples if n_examples > 0 else len(dataset)

        client = remote_server.get_async_client()
        wer = run_evaluation(
            model_name, client, dataset, max_concurrent_request, n_examples
        )
        if expected_wer:
            torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)
|
||||
0
tests/entrypoints/openai/parser/__init__.py
Normal file
0
tests/entrypoints/openai/parser/__init__.py
Normal file
1201
tests/entrypoints/openai/parser/test_harmony_utils.py
Normal file
1201
tests/entrypoints/openai/parser/test_harmony_utils.py
Normal file
File diff suppressed because it is too large
Load Diff
82
tests/entrypoints/openai/test_async_tokenization.py
Normal file
82
tests/entrypoints/openai/test_async_tokenization.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from collections.abc import Callable
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Module-scoped vLLM server with dummy weights for tokenization tests."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # dummy weights: tokenization behavior does not need real parameters
        "--load-format",
        "dummy",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the module's server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ids=["completion", "chat"],
    argnames=["create_func_gen", "content_body"],
    argvalues=[
        (lambda x: x.completions.create, {"prompt": " ".join(["A"] * 10_000)}),
        (
            lambda x: x.chat.completions.create,
            {"messages": [{"role": "user", "content": " ".join(["A"] * 10_000)}]},
        ),
    ],
)
async def test_with_and_without_truncate(
    server: RemoteOpenAIServer,
    client: openai.AsyncOpenAI,
    create_func_gen: Callable,
    content_body: dict,
):
    """Interleave truncated and untruncated requests; none may 500."""
    create_func = create_func_gen(client)
    body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}

    # Half the requests truncate the (oversized) prompt, half do not;
    # shuffled so both kinds hit the server interleaved.
    num_requests = 10
    truncate_prompt_tokens = [1000] * (num_requests // 2) + [None] * (
        num_requests - num_requests // 2
    )
    random.shuffle(truncate_prompt_tokens)

    bodies = [
        {**body, "extra_body": {"truncate_prompt_tokens": t}}
        for t in truncate_prompt_tokens
    ]

    async def get_status_code(**kwargs):
        # Map API errors to their status code instead of raising.
        try:
            await create_func(**kwargs)
            return 200
        except openai.APIStatusError as e:
            return e.status_code

    responses = await asyncio.gather(*[get_status_code(**b) for b in bodies])
    # 4xx (too-long prompt) is acceptable; an internal server error is not.
    assert 500 not in responses
|
||||
392
tests/entrypoints/openai/test_audio.py
Normal file
392
tests/entrypoints/openai/test_audio.py
Normal file
@@ -0,0 +1,392 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
TEST_AUDIO_URLS = [
|
||||
AudioAsset("winning_call").url,
|
||||
AudioAsset("mary_had_lamb").url,
|
||||
]
|
||||
MAXIMUM_AUDIOS = 2
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server for the audio chat-completions tests."""
    args = [
        "--dtype",
        "float32",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        # Cap the number of audio items accepted per prompt.
        "--limit-mm-per-prompt",
        json.dumps({"audio": MAXIMUM_AUDIOS}),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the module's server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def base64_encoded_audio() -> dict[str, str]:
    """Map each test audio URL to its base64-encoded WAV payload (cached per session)."""
    return {
        audio_url: encode_audio_base64(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }
|
||||
|
||||
|
||||
def dummy_messages_from_audio_url(
|
||||
audio_urls: str | list[str],
|
||||
content_text: str = "What's happening in this audio?",
|
||||
):
|
||||
if isinstance(audio_urls, str):
|
||||
audio_urls = [audio_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "audio_url", "audio_url": {"url": audio_url}}
|
||||
for audio_url in audio_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Single- then multi-turn chat with an audio URL in the first message."""
    messages = dummy_messages_from_audio_url(audio_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )

    # Fix: the original assigned `message` twice from the same choice
    # (`choice.message` then `chat_completion.choices[0].message`); the
    # first assignment was a dead store. One lookup suffices.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_error_on_invalid_audio_url_type(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """A bare-string audio_url (instead of {"url": ...}) must be rejected."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio_url", "audio_url": audio_url},
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]

    # audio_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Same as the URL test, but the audio is sent as a base64 data URI."""
    messages = dummy_messages_from_audio_url(
        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
    )

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )

    # Fix: removed the duplicated `message = ...` assignment (dead store);
    # `choice.message` already is `chat_completion.choices[0].message`.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Chat using the OpenAI `input_audio` content part instead of a URL."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )

    # Fix: removed the duplicated `message = ...` assignment (dead store);
    # `choice.message` already is `chat_completion.choices[0].message`.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Streaming output must reassemble to the non-streaming completion."""
    messages = dummy_messages_from_audio_url(
        audio_url, "What's a short title for this audio?"
    )

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Streaming with an `input_audio` part must match the non-streamed run."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]
)
async def test_multi_audio_input(
    client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
):
    """At the audio limit the request succeeds; above it, it is rejected
    and the server keeps serving subsequent requests."""
    messages = dummy_messages_from_audio_url(audio_urls)

    if len(audio_urls) > MAXIMUM_AUDIOS:
        with pytest.raises(openai.BadRequestError):  # test multi-audio input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )

        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
|
||||
250
tests/entrypoints/openai/test_basic.py
Normal file
250
tests/entrypoints/openai/test_basic.py
Normal file
@@ -0,0 +1,250 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
from http import HTTPStatus
|
||||
from unittest.mock import AsyncMock, Mock
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.v1.engine.exceptions import EngineDeadError
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server_args(request: pytest.FixtureRequest) -> list[str]:
    """Provide extra arguments to the server via indirect parametrization

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This will run `test_foo` twice with servers with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.

    """
    # Not parametrized indirectly -> no extra server arguments.
    if not hasattr(request, "param"):
        return []

    val = request.param

    # Allow a single string parameter as shorthand for a one-element list.
    if isinstance(val, str):
        return [val]

    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server(server_args):
    """Module-scoped vLLM server with base args plus indirect `server_args`."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        *server_args,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the module's server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    """The /version endpoint must report the running vLLM version."""
    response = requests.get(server.url_for("version"))
    response.raise_for_status()

    assert response.json() == {"version": VLLM_VERSION}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
    """The /health endpoint must return 200 OK on a live server."""
    response = requests.get(server.url_for("health"))

    assert response.status_code == HTTPStatus.OK
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param(
            ["--max-model-len", "10100"], id="default-frontend-multiprocessing"
        ),
        pytest.param(
            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_request_cancellation(server: RemoteOpenAIServer):
    """Client-side timeouts must cancel server-side work, not leave it queued."""
    # clunky test: send an ungodly amount of load in with short timeouts
    # then ensure that it still responds quickly afterwards

    chat_input = [{"role": "user", "content": "Write a long story"}]
    client = server.get_async_client(timeout=0.5)
    tasks = []
    # Request about 2 million tokens
    for _ in range(200):
        task = asyncio.create_task(
            client.chat.completions.create(
                messages=chat_input,
                model=MODEL_NAME,
                max_tokens=10000,
                extra_body={"min_tokens": 10000},
            )
        )
        tasks.append(task)

    done, pending = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)

    # Make sure all requests were sent to the server and timed out
    # (We don't want to hide other errors like 400s that would invalidate this
    # test)
    assert len(pending) == 0
    for d in done:
        with pytest.raises(openai.APITimeoutError):
            d.result()

    # If the server had not cancelled all the other requests, then it would not
    # be able to respond to this one within the timeout
    client = server.get_async_client(timeout=5)
    response = await client.chat.completions.create(
        messages=chat_input, model=MODEL_NAME, max_tokens=10
    )

    assert len(response.choices) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_request_wrong_content_type(server: RemoteOpenAIServer):
    """A non-JSON Content-Type header must be rejected with an API error."""
    chat_input = [{"role": "user", "content": "Write a long story"}]
    client = server.get_async_client()

    with pytest.raises(openai.APIStatusError):
        await client.chat.completions.create(
            messages=chat_input,
            model=MODEL_NAME,
            max_tokens=10000,
            extra_headers={"Content-Type": "application/x-www-form-urlencoded"},
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "server_args",
    [pytest.param(["--enable-server-load-tracking"], id="enable-server-load-tracking")],
    indirect=True,
)
@pytest.mark.asyncio
async def test_server_load(server: RemoteOpenAIServer):
    """/load must report 0 when idle, 1 during a request, and 0 afterwards."""
    # Check initial server load
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 0

    def make_long_completion_request():
        # Long generation keeps the request in flight while load is probed.
        return requests.post(
            server.url_for("v1/completions"),
            headers={"Content-Type": "application/json"},
            json={
                "prompt": "Give me a long story",
                "max_tokens": 1000,
                "temperature": 0,
            },
        )

    # Start the completion request in a background thread.
    completion_future = asyncio.create_task(
        asyncio.to_thread(make_long_completion_request)
    )

    # Give a short delay to ensure the request has started.
    await asyncio.sleep(0.1)

    # Check server load while the completion request is running.
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 1

    # Wait for the completion request to finish.
    await completion_future
    await asyncio.sleep(0.1)

    # Check server load after the completion request has finished.
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_health_check_engine_dead_error():
    """The health handler must return 503 when the engine client reports
    an EngineDeadError."""
    # Import the health function directly to test it in isolation
    from vllm.entrypoints.serve.instrumentator.health import health

    # Create a mock request that simulates what FastAPI would provide
    mock_request = Mock(spec=Request)
    mock_app_state = Mock()
    mock_engine_client = AsyncMock()
    # check_health raising EngineDeadError simulates a dead engine.
    mock_engine_client.check_health.side_effect = EngineDeadError()
    mock_app_state.engine_client = mock_engine_client
    mock_request.app.state = mock_app_state

    # Test the health function directly with our mocked request
    # This simulates what would happen if the engine dies
    response = await health(mock_request)

    # Assert that it returns 503 Service Unavailable
    assert response.status_code == 503
||||
798
tests/entrypoints/openai/test_chat.py
Normal file
798
tests/entrypoints/openai/test_chat.py
Normal file
@@ -0,0 +1,798 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import regex as re
|
||||
import requests
|
||||
import torch
|
||||
from openai import BadRequestError
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Download zephyr LoRA files once per test session."""
    from huggingface_hub import snapshot_download

    # Returns the local snapshot directory path of the LoRA adapter.
    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server(zephyr_lora_files):  # noqa: F811
    """Module-scoped vLLM OpenAI-compatible server with one LoRA adapter."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """With logprobs=False the response must carry no logprobs payload."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=False,
    )

    choice = chat_completion.choices[0]
    assert choice.logprobs is None
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """logprobs=True with top_logprobs=0 yields a logprobs structure whose
    per-token top_logprobs list is empty."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=0,
    )

    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
    assert len(choice.logprobs.content[0].top_logprobs) == 0
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """top_logprobs=5 must produce exactly five alternatives per token."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=5,
    )

    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
    assert len(choice.logprobs.content[0].top_logprobs) == 5
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """Requests exceeding the server's max_logprobs limit must fail, both in
    streaming and non-streaming mode, without breaking later requests."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    # Default max_logprobs is 20, so this should raise an error
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        stream = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            logprobs=True,
            top_logprobs=21,
            stream=True,
        )
        # Streaming errors may only surface while consuming the stream.
        async for chunk in stream:
            ...

    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            logprobs=True,
            top_logprobs=30,
            stream=False,
        )

    # the server should still work afterwards
    chat_completion = await client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=10, stream=False
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, prompt_logprobs",
    [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_chat(
    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
):
    """prompt_logprobs >= 0 returns prompt logprobs, a negative value is
    rejected, and omitting it returns none."""
    params: dict = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"},
            {
                "role": "assistant",
                "content": "The Los Angeles Dodgers won the World Series in 2020.",
            },
            {"role": "user", "content": "Where was it played?"},
        ],
        "model": model_name,
    }

    if prompt_logprobs is not None:
        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}

    if prompt_logprobs is not None and prompt_logprobs < 0:
        # Negative values (other than the special-cased fixture above that
        # omits the field) are invalid and must be rejected.
        with pytest.raises(BadRequestError):
            await client.chat.completions.create(**params)
    else:
        completion = await client.chat.completions.create(**params)
        if prompt_logprobs is not None:
            assert completion.prompt_logprobs is not None
            assert len(completion.prompt_logprobs) > 0
        else:
            assert completion.prompt_logprobs is None
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_more_than_one_prompt_logprobs_chat(
    client: openai.AsyncOpenAI, model_name: str
):
    """The number of returned prompt-logprob alternatives per token must
    track the requested prompt_logprobs value."""
    params: dict = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"},
            {
                "role": "assistant",
                "content": "The Los Angeles Dodgers won the World Series in 2020.",
            },
            {"role": "user", "content": "Where was it played?"},
        ],
        "model": model_name,
        "extra_body": {"prompt_logprobs": 1},
    }

    completion_1 = await client.chat.completions.create(**params)

    params["extra_body"] = {"prompt_logprobs": 2}
    completion_2 = await client.chat.completions.create(**params)

    # Index 3 is an arbitrary non-initial prompt token; the first token has
    # no logprobs, so a later index is compared.
    assert len(completion_1.prompt_logprobs[3]) == 1
    assert len(completion_2.prompt_logprobs[3]) == 2
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
    """End-to-end single- and multi-turn chat: checks id, finish reason,
    exact usage accounting, and assistant role."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Exact token counts depend on this model's chat template.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=37, total_tokens=47
    )

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
    """The concatenated streamed deltas must equal the non-streaming output
    at temperature 0, with exactly one finish_reason chunk."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(
    client: openai.AsyncOpenAI, model_name: str
):
    """Exercises every stream_options combination: usage suppressed, usage
    only in a trailing chunk, invalid combinations with stream=False, and
    continuous per-chunk usage stats."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    # Test stream=True, stream_options={"include_usage": False}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": False},
    )
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options={"include_usage": True,
    # "continuous_usage_stats": False}}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": True, "continuous_usage_stats": False},
    )

    async for chunk in stream:
        if chunk.choices[0].finish_reason is None:
            assert chunk.usage is None
        else:
            # Even the finish_reason chunk carries no usage; usage arrives
            # in one extra trailing chunk with an empty choices list.
            assert chunk.usage is None
            final_chunk = await anext(stream)
            assert final_chunk.usage is not None
            assert final_chunk.usage.prompt_tokens > 0
            assert final_chunk.usage.completion_tokens > 0
            assert final_chunk.usage.total_tokens == (
                final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens
            )
            assert final_chunk.choices == []

    # Test stream=False, stream_options={"include_usage": None}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": None},
        )

    # Test stream=False, stream_options={"include_usage": True}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": True},
        )

    # Test stream=True, stream_options={"include_usage": True,
    # "continuous_usage_stats": True}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
    )
    last_completion_tokens = 0
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
        # completion_tokens must be strictly increasing, except for the
        # trailing usage-only chunk (empty choices) which repeats the total.
        assert (
            last_completion_tokens == 0
            or chunk.usage.completion_tokens > last_completion_tokens
            or (
                not chunk.choices
                and chunk.usage.completion_tokens == last_completion_tokens
            )
        )
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )
        last_completion_tokens = chunk.usage.completion_tokens

    # min_tokens=10 with max_completion_tokens=10 pins the output length.
    assert last_completion_tokens == 10
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_structured_outputs_choice_chat(
    client: openai.AsyncOpenAI,
    sample_structured_outputs_choices,
):
    """Choice-constrained structured output must always pick one of the
    allowed choices, and a follow-up must pick a different one."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    choice1 = chat_completion.choices[0].message.content
    assert choice1 in sample_structured_outputs_choices

    messages.append({"role": "assistant", "content": choice1})
    messages.append({"role": "user", "content": "I disagree, pick another one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    choice2 = chat_completion.choices[0].message.content
    assert choice2 in sample_structured_outputs_choices
    assert choice1 != choice2
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_structured_outputs_json_chat(
    client: openai.AsyncOpenAI,
    sample_json_schema,
):
    """JSON-schema-constrained output must validate against the schema, and
    a follow-up request must produce different name/age values."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example JSON for an employee profile that "
            f"fits this schema: {sample_json_schema}",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        extra_body=dict(structured_outputs={"json": sample_json_schema}),
    )
    message = chat_completion.choices[0].message
    assert message.content is not None
    json1 = json.loads(message.content)
    jsonschema.validate(instance=json1, schema=sample_json_schema)

    messages.append({"role": "assistant", "content": message.content})
    messages.append(
        {"role": "user", "content": "Give me another one with a different name and age"}
    )
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        extra_body=dict(structured_outputs={"json": sample_json_schema}),
    )
    message = chat_completion.choices[0].message
    assert message.content is not None
    json2 = json.loads(message.content)
    jsonschema.validate(instance=json2, schema=sample_json_schema)
    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_structured_outputs_regex_chat(
    client: openai.AsyncOpenAI,
    sample_regex,
):
    """Regex-constrained output must fully match the pattern, and a
    follow-up must produce a different match."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example IP address with this regex: {sample_regex}",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
        extra_body=dict(structured_outputs={"regex": sample_regex}),
    )
    ip1 = chat_completion.choices[0].message.content
    assert ip1 is not None
    assert re.fullmatch(sample_regex, ip1) is not None

    messages.append({"role": "assistant", "content": ip1})
    messages.append({"role": "user", "content": "Give me a different one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
        extra_body=dict(structured_outputs={"regex": sample_regex}),
    )
    ip2 = chat_completion.choices[0].message.content
    assert ip2 is not None
    assert re.fullmatch(sample_regex, ip2) is not None
    assert ip1 != ip2
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
    """Passing a wrongly-typed value (dict instead of string) for the regex
    constraint must be rejected with a 400."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]

    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            extra_body=dict(structured_outputs={"regex": {1: "Python", 2: "C++"}}),
        )
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_structured_outputs_choice_chat_logprobs(
    client: openai.AsyncOpenAI, sample_structured_outputs_choices
):
    """Logprobs returned under choice-constrained decoding must stay within
    the OpenAI-compatible lower bound."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )

    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.content is not None
    top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs

    # -9999.0 is the minimum logprob returned by OpenAI
    for item in top_logprobs:
        assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_response_format_json_object(client: openai.AsyncOpenAI):
    """response_format json_object must yield parseable JSON matching the
    exact object the prompt demands (run twice for stability)."""
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": (
                        "what is 1+1? please respond with a JSON object, "
                        'the format is {"result": 2}'
                    ),
                }
            ],
            response_format={"type": "json_object"},
        )

        content = resp.choices[0].message.content
        assert content is not None

        loaded = json.loads(content)
        assert loaded == {"result": 2}, loaded
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
    """Without json_schema the prompt cannot yield the exact JSON object;
    with json_schema it must."""
    prompt = 'what is 1+1? The format is "result": 2'
    # Check that this prompt cannot lead to a valid JSON without json_schema
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
        )
        content = resp.choices[0].message.content
        assert content is not None
        # Either the output is not JSON at all, or it is JSON but not the
        # expected object — both count as the unconstrained failure mode.
        with pytest.raises((json.JSONDecodeError, AssertionError)):
            loaded = json.loads(content)
            assert loaded == {"result": 2}, loaded

    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "foo_test",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "result": {"type": "integer"},
                        },
                    },
                },
            },
        )

        content = resp.choices[0].message.content
        assert content is not None

        loaded = json.loads(content)
        assert loaded == {"result": 2}, loaded
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
    """Unknown extra fields in a message must be tolerated, not rejected."""
    resp = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": "what is 1+1?",
                "extra_field": "0",
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )

    content = resp.choices[0].message.content
    assert content is not None
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI):
    """List-form (multi-part) message content must be handled like plain
    string content."""
    content = [
        {
            "type": "text",
            "text": "what is 1+1? please provide the result without any other text.",
        }
    ]
    resp = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": content,
            }
        ],
        temperature=0,
        seed=0,
    )
    content = resp.choices[0].message.content
    assert content == "2"
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_custom_role(client: openai.AsyncOpenAI):
    # Not sure how the model handles custom roles so we just check that
    # both string and complex message content are handled in the same way

    resp1 = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "my-custom-role",
                "content": "what is 1+1?",
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )

    resp2 = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "my-custom-role",
                "content": [{"type": "text", "text": "what is 1+1?"}],
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )

    # Deterministic sampling (temperature=0, fixed seed) makes the two
    # outputs directly comparable.
    content1 = resp1.choices[0].message.content
    content2 = resp2.choices[0].message.content
    assert content1 == content2
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_long_seed(client: openai.AsyncOpenAI):
    """Seeds outside the signed 64-bit range must be rejected with a
    validation error naming the violated bound."""
    for seed in [torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).max + 1]:
        with pytest.raises(BadRequestError) as exc_info:
            await client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    }
                ],
                temperature=0,
                seed=seed,
            )

        # Pydantic reports the violated bound as greater_than_equal /
        # less_than_equal depending on which side overflowed.
        assert (
            "greater_than_equal" in exc_info.value.message
            or "less_than_equal" in exc_info.value.message
        )
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The generic /invocations endpoint must mirror the chat completions
    endpoint for the same request payload."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_completion_tokens": 5,
        "temperature": 0.0,
        "logprobs": False,
    }

    chat_completion = await client.chat.completions.create(**request_args)

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_completion.model_dump()
    invocation_output = invocation_response.json()

    # Same schema, and identical choices thanks to temperature=0.
    assert chat_output.keys() == invocation_output.keys()
    assert chat_output["choices"] == invocation_output["choices"]
||||
132
tests/entrypoints/openai/test_chat_echo.py
Normal file
132
tests/entrypoints/openai/test_chat_echo.py
Normal file
@@ -0,0 +1,132 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# # any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
|
||||
|
||||
|
||||
def get_vocab_size(model_name):
    """Return the vocabulary size of *model_name* via vLLM's ModelConfig."""
    config = ModelConfig(
        model=model_name,
        seed=0,
        dtype="float16",
    )
    return config.get_vocab_size()
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Module-scoped server with max-logprobs raised to the full vocab size
    so prompt_logprobs=-1 (all tokens) can be exercised."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        "--max-logprobs",  # test prompt_logprobs equal to -1
        "151936",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
class TestCase(NamedTuple):
    """Parametrization record: which model to hit and whether to echo."""

    # model served by the fixture (base name or LoRA alias)
    model_name: str
    # whether the request asks the server to echo the prompt back
    echo: bool
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(model_name=MODEL_NAME, echo=True),
        TestCase(model_name=MODEL_NAME, echo=False),
    ],
)
async def test_chat_session_with_echo_and_continue_final_message(
    client: openai.AsyncOpenAI, test_case: TestCase
):
    """With continue_final_message, the echo flag controls whether the
    partial assistant message reappears in the response content."""
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    # test echo with continue_final_message parameter
    chat_completion = await client.chat.completions.create(
        model=test_case.model_name,
        messages=[
            {"role": "user", "content": "tell me a common saying"},
            {"role": "assistant", "content": saying},
        ],
        extra_body={
            "echo": test_case.echo,
            "continue_final_message": True,
            "add_generation_prompt": False,
        },
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "stop"

    message = choice.message
    if test_case.echo:
        assert message.content is not None and saying in message.content
    else:
        assert message.content is not None and saying not in message.content
    assert message.role == "assistant"
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_prompt_logprobs(client: openai.AsyncOpenAI):
    """prompt_logprobs=-1 (all alternatives) must still return a non-empty
    prompt logprobs payload."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Beijing is the capital of which country?"},
    ]

    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        extra_body={"prompt_logprobs": -1},
    )

    assert completion.prompt_logprobs is not None
    assert len(completion.prompt_logprobs) > 0
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_top_logprobs(client: openai.AsyncOpenAI):
    """top_logprobs=-1 must return one alternative per vocabulary entry
    (the server was started with --max-logprobs at full vocab size)."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Beijing is the capital of which country?"},
    ]

    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=1,
        extra_body={
            "top_logprobs": -1,
            "logprobs": "true",
        },
    )
    assert completion.choices[0].logprobs is not None
    assert completion.choices[0].logprobs.content is not None
    assert len(completion.choices[0].logprobs.content) > 0
    assert len(
        completion.choices[0].logprobs.content[0].top_logprobs
    ) == get_vocab_size(MODEL_NAME)
|
||||
227
tests/entrypoints/openai/test_chat_error.py
Normal file
227
tests/entrypoints/openai/test_chat_error.py
Normal file
@@ -0,0 +1,227 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from http import HTTPStatus
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
# Served model identifiers: the full HF repo name plus a short alias.
# Both are registered with OpenAIServingModels in _build_serving_chat below.
MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
BASE_MODEL_PATHS = [
    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]
|
||||
|
||||
|
||||
@dataclass
class MockHFConfig:
    # Minimal stand-in for a HuggingFace model config; only model_type
    # is read by the serving path under test.
    model_type: str = "any"
|
||||
|
||||
|
||||
@dataclass
class MockModelConfig:
    """Lightweight stand-in for vLLM's ModelConfig.

    Carries only the attributes that OpenAIServingChat / OpenAIServingModels
    consult, so tests can avoid constructing a real engine config.
    """

    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    # Deliberately tiny so length-validation paths are easy to trigger.
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        # Mirrors ModelConfig.get_diff_sampling_param: never returns None.
        return self.diff_sampling_param or {}
|
||||
|
||||
|
||||
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    """Construct an OpenAIServingChat wired to *engine*, with its input
    preprocessing stubbed out so no tokenizer/template work is exercised."""
    model_registry = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    chat_handler = OpenAIServingChat(
        engine,
        model_registry,
        response_role="assistant",
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
    )

    # Stub: pass the engine prompt through untouched, with no extra kwargs.
    async def _stub_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        return dict(engine_prompt), {}

    # Stub: skip chat templating entirely and hand back a fixed
    # (conversation, engine_prompts) pair.
    async def _stub_preprocess_chat(*args, **kwargs):
        conversation = [{"role": "user", "content": "Test"}]
        engine_prompts = [{"prompt_token_ids": [1, 2, 3]}]
        return conversation, engine_prompts

    chat_handler._process_inputs = AsyncMock(side_effect=_stub_process_inputs)
    chat_handler._preprocess_chat = AsyncMock(side_effect=_stub_preprocess_chat)
    return chat_handler
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    engine = MagicMock(spec=AsyncLLM)
    engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    engine.errored = False
    engine.model_config = MockModelConfig()
    engine.input_processor = MagicMock()
    engine.io_processor = MagicMock()

    chat_handler = _build_serving_chat(engine)

    # A single finished output whose finish_reason signals an engine error.
    failed_output = CompletionOutput(
        index=0,
        text="",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    engine_result = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[failed_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def fake_generate(*args, **kwargs):
        yield engine_result

    engine.generate = MagicMock(side_effect=fake_generate)

    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=False,
    )

    result = await chat_handler.create_chat_completion(req)

    # The error finish_reason must surface as a 500 ErrorResponse.
    assert isinstance(result, ErrorResponse)
    assert result.error.type == "InternalServerError"
    assert result.error.message == "Internal server error"
    assert result.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    engine = MagicMock(spec=AsyncLLM)
    engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    engine.errored = False
    engine.model_config = MockModelConfig()
    engine.input_processor = MagicMock()
    engine.io_processor = MagicMock()

    chat_handler = _build_serving_chat(engine)

    # First chunk: a normal, unfinished delta.
    ok_output = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=None,
    )
    first_chunk = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[ok_output],
        finished=False,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    # Second chunk: the stream terminates with an engine error.
    failed_output = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    final_chunk = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[failed_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def fake_generate(*args, **kwargs):
        yield first_chunk
        yield final_chunk

    engine.generate = MagicMock(side_effect=fake_generate)

    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=True,
    )

    result = await chat_handler.create_chat_completion(req)

    received = [chunk async for chunk in result]

    assert len(received) >= 2
    # Somewhere in the stream the error payload must have been emitted.
    assert any("Internal server error" in chunk for chunk in received), (
        f"Expected error message in chunks: {received}"
    )
    # SSE streams always close with the DONE sentinel.
    assert received[-1] == "data: [DONE]\n\n"
|
||||
79
tests/entrypoints/openai/test_chat_logit_bias_validation.py
Normal file
79
tests/entrypoints/openai/test_chat_logit_bias_validation.py
Normal file
@@ -0,0 +1,79 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
|
||||
|
||||
def get_vocab_size(model_name):
    """Return the tokenizer vocabulary size for *model_name* via ModelConfig."""
    model_config = ModelConfig(
        model=model_name,
        seed=0,
        dtype="bfloat16",
    )
    return model_config.get_vocab_size()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Launch one shared vLLM OpenAI server for this test module."""
    cli_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_logit_bias_valid(client):
    """Test that valid logit_bias values are accepted in chat completions."""
    vocab_size = get_vocab_size(MODEL_NAME)
    # The largest in-range token id.
    in_range_token = vocab_size - 1

    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Testing valid logit bias"}],
        max_tokens=5,
        logit_bias={str(in_range_token): 1.0},
    )

    assert response.choices[0].message.content is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_logit_bias_invalid(client):
    """Test that invalid logit_bias values are rejected in chat completions."""
    vocab_size = get_vocab_size(MODEL_NAME)
    # One past the vocabulary: must be rejected with a 400.
    out_of_range_token = vocab_size + 1

    with pytest.raises(openai.BadRequestError) as excinfo:
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": "Testing invalid logit bias"}],
            max_tokens=5,
            logit_bias={str(out_of_range_token): 1.0},
        )

    err = excinfo.value
    err_text = str(err)

    assert err.status_code == 400
    # The message should name both the offending id and the vocab size.
    assert str(out_of_range_token) in err_text
    assert str(vocab_size) in err_text
|
||||
156
tests/entrypoints/openai/test_chat_template.py
Normal file
156
tests/entrypoints/openai/test_chat_template.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...models.registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import VLLM_PATH
|
||||
|
||||
# Reference ChatML template shipped with vLLM; fail fast if the repo layout
# changes.
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs.
# Tuple fields: (model, template path, add_generation_prompt,
# continue_final_message, expected rendered prompt).
MODEL_TEMPLATE_GENERATION_OUTPUT = [
    (
        "facebook/opt-125m",
        chatml_jinja_path,
        True,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
""",
    ),
    (
        "facebook/opt-125m",
        chatml_jinja_path,
        False,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""",
    ),
    (
        "facebook/opt-125m",
        chatml_jinja_path,
        False,
        True,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of""",
    ),
]

# Base conversation used by every parametrized case.
TEST_MESSAGES = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there!"},
    {"role": "user", "content": "What is the capital of"},
]
# Appended when testing continue_final_message=True.
ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"}
|
||||
|
||||
|
||||
def test_load_chat_template():
    """Loading the bundled ChatML template returns its exact text."""
    # Testing chatml template
    template_content = load_chat_template(chat_template=chatml_jinja_path)

    # Test assertions
    assert template_content is not None
    # Hard coded value for template_chatml.jinja
    assert (
        template_content
        == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
    )
|
||||
|
||||
|
||||
def test_no_load_chat_template_filelike():
    """A path-looking string that doesn't exist must raise, not be treated
    as a literal template."""
    missing_path = "../../examples/does_not_exist"

    with pytest.raises(ValueError, match="looks like a file path"):
        load_chat_template(chat_template=missing_path)
|
||||
|
||||
|
||||
def test_no_load_chat_template_literallike():
    """A Jinja-looking string is accepted verbatim as the template."""
    inline_template = "{{ messages }}"

    loaded = load_chat_template(chat_template=inline_template)

    assert loaded == inline_template
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATION_OUTPUT,
)
def test_get_gen_prompt(
    model, template, add_generation_prompt, continue_final_message, expected_output
):
    """Rendering TEST_MESSAGES through the HF chat template must produce the
    exact expected prompt for each (add_generation_prompt,
    continue_final_message) combination."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    # Build a minimal ModelConfig mirroring the registry entry's settings.
    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        trust_remote_code=model_info.trust_remote_code,
        revision=model_info.revision,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Initialize the tokenizer
    tokenizer = get_tokenizer(
        tokenizer_name=model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )
    template_content = load_chat_template(chat_template=template)

    # Create a mock request object using keyword arguments.
    # continue_final_message requires a trailing assistant message to extend.
    mock_request = ChatCompletionRequest(
        model=model,
        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
        if continue_final_message
        else TEST_MESSAGES,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # Call the function and get the result
    result = apply_hf_chat_template(
        tokenizer=tokenizer,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
        model_config=model_config,
        tools=None,
        add_generation_prompt=mock_request.add_generation_prompt,
        continue_final_message=mock_request.continue_final_message,
    )

    # Test assertion
    assert result == expected_output, (
        f"The generated prompt does not match the expected output for "
        f"model {model} and template {template}"
    )
|
||||
141
tests/entrypoints/openai/test_chat_with_tool_reasoning.py
Normal file
141
tests/entrypoints/openai/test_chat_with_tool_reasoning.py
Normal file
@@ -0,0 +1,141 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# a reasoning and tool calling model
|
||||
MODEL_NAME = "Qwen/QwQ-32B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Server with both a reasoning parser and auto tool-choice enabled."""
    cli_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--reasoning-parser",
        "deepseek_r1",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
# Single weather-lookup tool exposed to the model for tool-calling tests.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "The city to find the weather for, e.g. "
                        "'San Francisco'",
                    },
                    "state": {
                        "type": "string",
                        "description": "the two-letter abbreviation for the state that "
                        "the city is in, e.g. 'CA' which would mean 'California'",
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "unit"],
            },
        },
    }
]

# Conversation designed to elicit exactly one get_current_weather call.
MESSAGES = [
    {"role": "user", "content": "Hi! How are you doing today?"},
    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
    {
        "role": "user",
        "content": "Can you tell me what the temperate will be in Dallas, "
        "in fahrenheit?",
    },
]

# Expected tool call produced by the model for MESSAGES above.
FUNC_NAME = "get_current_weather"
FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
|
||||
|
||||
|
||||
def extract_reasoning_and_calls(chunks: list):
    """Fold a stream of chat-completion chunks into the accumulated
    reasoning text plus per-tool-call argument and function-name strings.

    Returns (reasoning, arguments, function_names) where arguments[i] and
    function_names[i] describe the i-th streamed tool call.
    """
    reasoning_text = ""
    current_idx = -1
    call_args: list = []
    call_names: list = []

    for chunk in chunks:
        delta = chunk.choices[0].delta
        if not delta.tool_calls:
            # Non-tool chunks may carry incremental reasoning text.
            if hasattr(delta, "reasoning"):
                reasoning_text += delta.reasoning
            continue

        call = delta.tool_calls[0]
        if call.index != current_idx:
            # A new tool call began: open fresh accumulators for it.
            current_idx = call.index
            call_args.append("")
            call_names.append("")

        if call.function:
            if call.function.name:
                call_names[current_idx] = call.function.name
            if call.function.arguments:
                call_args[current_idx] += call.function.arguments

    return reasoning_text, call_args, call_names
|
||||
|
||||
|
||||
# test streaming
|
||||
@pytest.mark.asyncio
async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Streaming responses must carry both reasoning and a tool call."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )

    received = [chunk async for chunk in stream]

    reasoning, arguments, function_names = extract_reasoning_and_calls(received)
    assert len(reasoning) > 0
    assert len(function_names) > 0 and function_names[0] == FUNC_NAME
    assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
|
||||
|
||||
|
||||
# test full generate
|
||||
@pytest.mark.asyncio
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Non-streaming responses must carry both reasoning and a tool call."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )

    message = response.choices[0].message
    assert len(message.reasoning) > 0
    assert message.tool_calls[0].function.name == FUNC_NAME
    assert message.tool_calls[0].function.arguments == FUNC_ARGS
|
||||
127
tests/entrypoints/openai/test_chunked_prompt.py
Normal file
127
tests/entrypoints/openai/test_chunked_prompt.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Server configured with chunked prefill and a small token budget so
    long prompts are split across scheduler steps."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
    client: openai.AsyncOpenAI,
):
    """Usage accounting must stay consistent while streaming a completion
    whose prompt spans multiple chunked-prefill steps."""
    long_prompt = "What is the capital of France?" * 400

    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=long_prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )

    generated_count = 0
    saw_finish = False
    async for chunk in stream:
        usage = chunk.usage
        # Continuous usage stats must always be present and self-consistent.
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens

        if not saw_finish:
            generated_count += 1
            assert chunk.choices[0].text

            if chunk.choices[0].finish_reason is not None:
                saw_finish = True

        if saw_finish:
            # After the finish chunk the count must match what we received.
            assert usage.completion_tokens == generated_count
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
    client: openai.AsyncOpenAI,
):
    """Usage accounting must stay consistent while streaming a chat
    completion whose prompt spans multiple chunked-prefill steps; at most
    one empty role-only delta chunk is tolerated."""
    # Test stream with long prompt
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?" * 400},
    ]
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )

    tokens_received = 0
    empty_chunks_received = 0
    finished = False
    async for chunk in stream:
        # Continuous usage stats must always be present and self-consistent.
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )

        if not finished:
            if chunk.choices[0].delta.content == "":
                # when there is no tokens generated
                assert chunk.usage.completion_tokens == 0
                assert chunk.choices[0].logprobs is None
                empty_chunks_received += 1
            else:
                tokens_received += 1

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            # After the finish chunk the count must match what we received.
            assert chunk.usage.completion_tokens == tokens_received

    # Only the initial role announcement may arrive with empty content.
    assert empty_chunks_received <= 1
|
||||
210
tests/entrypoints/openai/test_cli_args.py
Normal file
210
tests/entrypoints/openai/test_cli_args.py
Normal file
@@ -0,0 +1,210 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
from ...utils import VLLM_PATH
|
||||
|
||||
# JSON-format LoRA module spec used by the parsing tests below.
LORA_MODULE = {
    "name": "module2",
    "path": "/path/to/module2",
    "base_model_name": "llama",
}
# Bundled ChatML template; fail fast if the repo layout changes.
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
|
||||
|
||||
|
||||
@pytest.fixture
def serve_parser():
    """Fresh `vllm serve` argument parser for each test."""
    base = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
    return make_arg_parser(base)
|
||||
|
||||
|
||||
### Test config parsing
|
||||
def test_config_arg_parsing(serve_parser, cli_config_file):
    """Explicit CLI flags override --config values regardless of order."""
    # Default with no arguments at all.
    parsed = serve_parser.parse_args([])
    assert parsed.port == 8000

    # Config file alone supplies the port.
    parsed = serve_parser.parse_args(["--config", cli_config_file])
    assert parsed.port == 12312

    # Explicit flag after the config wins.
    parsed = serve_parser.parse_args(
        ["--config", cli_config_file, "--port", "9000"]
    )
    assert parsed.port == 9000

    # Explicit flag before the config also wins.
    parsed = serve_parser.parse_args(
        ["--port", "9000", "--config", cli_config_file]
    )
    assert parsed.port == 9000
|
||||
|
||||
|
||||
### Tests for LoRA module parsing
|
||||
def test_valid_key_value_format(serve_parser):
    """Old-style name=path LoRA specs parse into LoRAModulePath objects."""
    parsed = serve_parser.parse_args(
        ["--lora-modules", "module1=/path/to/module1"]
    )
    assert parsed.lora_modules == [
        LoRAModulePath(name="module1", path="/path/to/module1")
    ]
|
||||
|
||||
|
||||
def test_valid_json_format(serve_parser):
    """JSON-format LoRA specs parse including the optional base model name."""
    parsed = serve_parser.parse_args(["--lora-modules", json.dumps(LORA_MODULE)])
    assert parsed.lora_modules == [
        LoRAModulePath(name="module2", path="/path/to/module2", base_model_name="llama")
    ]
|
||||
|
||||
|
||||
def test_invalid_json_format(serve_parser):
    """Malformed JSON (missing closing brace) aborts argument parsing."""
    bad_spec = '{"name": "module3", "path": "/path/to/module3"'
    with pytest.raises(SystemExit):
        serve_parser.parse_args(["--lora-modules", bad_spec])
|
||||
|
||||
|
||||
def test_invalid_type_error(serve_parser):
    """A value that is neither JSON nor key=value aborts argument parsing."""
    with pytest.raises(SystemExit):
        # This is not JSON or key=value format.
        serve_parser.parse_args(["--lora-modules", "invalid_format"])
|
||||
|
||||
|
||||
def test_invalid_json_field(serve_parser):
    """Well-formed JSON missing the required 'path' field aborts parsing."""
    with pytest.raises(SystemExit):
        serve_parser.parse_args(["--lora-modules", '{"name": "module4"}'])
|
||||
|
||||
|
||||
def test_empty_values(serve_parser):
    """An empty --lora-modules value yields an empty module list."""
    parsed = serve_parser.parse_args(["--lora-modules", ""])
    assert parsed.lora_modules == []
|
||||
|
||||
|
||||
def test_multiple_valid_inputs(serve_parser):
    """Old key=value and JSON spec formats can be mixed in one invocation."""
    parsed = serve_parser.parse_args(
        [
            "--lora-modules",
            "module1=/path/to/module1",
            json.dumps(LORA_MODULE),
        ]
    )
    assert parsed.lora_modules == [
        LoRAModulePath(name="module1", path="/path/to/module1"),
        LoRAModulePath(
            name="module2", path="/path/to/module2", base_model_name="llama"
        ),
    ]
|
||||
|
||||
|
||||
### Tests for serve argument validation that run prior to loading
|
||||
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser"""
    # NOTE(review): the function name says "passes" but the test asserts a
    # failure — consider renaming it in a follow-up.
    parsed = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
|
||||
|
||||
|
||||
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser"""
    parsed = serve_parser.parse_args(
        args=["--enable-auto-tool-choice", "--tool-call-parser", "mistral"]
    )
    validate_parsed_serve_args(parsed)
|
||||
|
||||
|
||||
def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
    """Ensure validation fails if reasoning is enabled with auto tool choice"""
    parsed = serve_parser.parse_args(
        args=["--enable-auto-tool-choice", "--reasoning-parser", "deepseek_r1"]
    )
    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
|
||||
|
||||
|
||||
def test_passes_with_reasoning_parser(serve_parser):
    """Ensure validation passes if reasoning is enabled
    with a reasoning parser"""
    parsed = serve_parser.parse_args(args=["--reasoning-parser", "deepseek_r1"])
    validate_parsed_serve_args(parsed)
|
||||
|
||||
|
||||
def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists"""
    template_arg = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_arg])
    validate_parsed_serve_args(parsed)
|
||||
|
||||
|
||||
def test_chat_template_validation_for_sad_paths(serve_parser):
    """Ensure validation fails if the chat template doesn't exist"""
    parsed = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "cli_args, expected_middleware",
    [
        (
            ["--middleware", "middleware1", "--middleware", "middleware2"],
            ["middleware1", "middleware2"],
        ),
        ([], []),
    ],
)
def test_middleware(serve_parser, cli_args, expected_middleware):
    """Ensure multiple middleware args are parsed properly"""
    parsed = serve_parser.parse_args(args=cli_args)
    assert parsed.middleware == expected_middleware
|
||||
84
tests/entrypoints/openai/test_collective_rpc.py
Normal file
84
tests/entrypoints/openai/test_collective_rpc.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"


class TestWorkerExtension:
    """Worker extension exercised through the /collective_rpc endpoint."""

    def get_model_name(self) -> str:
        """Return a plain string (non-pydantic return type)."""
        return MODEL_NAME

    def echo_args_kwargs(self, *args, **kwargs) -> dict[str, Any]:
        """Echo back both positional and keyword arguments."""
        return {
            "args": list(args),
            "kwargs": kwargs,
            "total_items": len(args) + len(kwargs),
        }

    def return_none(self, *args, **kwargs) -> None:
        """Return nothing, to exercise a None RPC result."""
        return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Launch a dev-mode server exposing the test worker extension."""
    cli_args = [
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--worker-extension-cls",
        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
    ]
    env = {"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"}
    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=env) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
def test_get_model_name(server):
    """Basic RPC round-trip: results should contain the model name."""
    payload = {"method": "get_model_name"}
    response = requests.post(server.url_for("collective_rpc"), json=payload)
    assert response.status_code == 200
    body = response.json()
    assert "results" in body
    assert body["results"] == [MODEL_NAME]
|
||||
|
||||
|
||||
def test_return_none(server):
    """An RPC method returning None should yield [None] in results."""
    payload = {"method": "return_none"}
    response = requests.post(server.url_for("collective_rpc"), json=payload)
    assert response.status_code == 200
    body = response.json()
    assert body["results"] == [None]
|
||||
|
||||
|
||||
def test_echo_args_kwargs(server):
    """args/kwargs should be echoed back along with their total count."""
    call_args = ["arg1", "arg2"]
    call_kwargs = {"key1": "value1", "key2": "value2"}
    response = requests.post(
        server.url_for("collective_rpc"),
        json={"method": "echo_args_kwargs", "args": call_args, "kwargs": call_kwargs},
    )
    assert response.status_code == 200
    echoed = response.json()["results"][0]
    assert echoed["args"] == call_args
    assert echoed["kwargs"] == call_kwargs
    assert echoed["total_items"] == len(call_args) + len(call_kwargs)
|
||||
216
tests/entrypoints/openai/test_completion_error.py
Normal file
216
tests/entrypoints/openai/test_completion_error.py
Normal file
@@ -0,0 +1,216 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from http import HTTPStatus
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
|
||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
MODEL_NAME = "openai-community/gpt2"
|
||||
MODEL_NAME_SHORT = "gpt2"
|
||||
BASE_MODEL_PATHS = [
|
||||
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
|
||||
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
|
||||
]
|
||||
|
||||
|
||||
@dataclass
class MockHFConfig:
    """Minimal stand-in for a HuggingFace model config."""

    model_type: str = "any"
|
||||
|
||||
|
||||
@dataclass
class MockModelConfig:
    """Lightweight substitute for the vLLM model config used by these tests.

    NOTE: unannotated attributes are deliberately class attributes, not
    dataclass fields — adding annotations would change construction.
    """

    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        """Return the sampling-param overrides, or {} when none are set."""
        return self.diff_sampling_param or {}
|
||||
|
||||
|
||||
def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
    """Build an OpenAIServingCompletion wired to *engine*, with input
    processing stubbed out so no real preprocessing runs."""
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_completion = OpenAIServingCompletion(
        engine,
        models,
        request_logger=None,
    )

    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        # Pass the prompt through unchanged, with no extra request metadata.
        return dict(engine_prompt), {}

    serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    return serving_completion
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_completion_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    # Engine double: not errored, real tokenizer, mock config/processors.
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()

    serving_completion = _build_serving_completion(mock_engine)

    # Single output whose finish_reason signals an internal engine error.
    completion_output = CompletionOutput(
        index=0,
        text="",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )

    request_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=False,
    )

    response = await serving_completion.create_completion(request)

    # The error finish_reason must surface as a 500 ErrorResponse.
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InternalServerError"
    assert response.error.message == "Internal server error"
    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_completion_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    # Engine double: not errored, real tokenizer, mock config/processors.
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()

    serving_completion = _build_serving_completion(mock_engine)

    # First chunk: a normal partial output (stream not finished yet).
    completion_output_1 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=None,
    )

    request_output_1 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_1],
        finished=False,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    # Second chunk: the stream terminates with an engine error.
    completion_output_2 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )

    request_output_2 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_2],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output_1
        yield request_output_2

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=True,
    )

    response = await serving_completion.create_completion(request)

    chunks = []
    async for chunk in response:
        chunks.append(chunk)

    # The error must be reported mid-stream, and the SSE stream must still
    # terminate with the [DONE] sentinel.
    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    assert chunks[-1] == "data: [DONE]\n\n"
|
||||
@@ -0,0 +1,486 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
import json
|
||||
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
# downloading lora to test lora requests
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for, e.g. "
|
||||
"'Vienna'",
|
||||
"default": "Vienna",
|
||||
},
|
||||
"country": {
|
||||
"type": "string",
|
||||
"description": "The country that the city is in, e.g. "
|
||||
"'Austria'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
"options": {
|
||||
"$ref": "#/$defs/WeatherOptions",
|
||||
"description": "Optional parameters for weather query",
|
||||
},
|
||||
},
|
||||
"required": ["country", "unit"],
|
||||
"$defs": {
|
||||
"WeatherOptions": {
|
||||
"title": "WeatherOptions",
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"default": "celsius",
|
||||
"description": "Temperature unit",
|
||||
"title": "Temperature Unit",
|
||||
},
|
||||
"include_forecast": {
|
||||
"type": "boolean",
|
||||
"default": False,
|
||||
"description": "Whether to include a 24-hour forecast",
|
||||
"title": "Include Forecast",
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"default": "zh-CN",
|
||||
"description": "Language of the response",
|
||||
"title": "Language",
|
||||
"enum": ["zh-CN", "en-US", "ja-JP"],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_forecast",
|
||||
"description": "Get the weather forecast for a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to get the forecast for, e.g. "
|
||||
"'Vienna'",
|
||||
"default": "Vienna",
|
||||
},
|
||||
"country": {
|
||||
"type": "string",
|
||||
"description": "The country that the city is in, e.g. "
|
||||
"'Austria'",
|
||||
},
|
||||
"days": {
|
||||
"type": "integer",
|
||||
"description": "Number of days to get the forecast for (1-7)",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["country", "days", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "Hi! How are you doing today?"},
|
||||
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Can you tell me what the current weather is in Berlin and the "
|
||||
"forecast for the next 5 days, in fahrenheit?",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Qwen3 server with hermes tool parsing and xgrammar structured outputs."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
        "--reasoning-parser",
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the tool-use server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize(
    "tool_choice",
    [
        "auto",
        "required",
        {"type": "function", "function": {"name": "get_current_weather"}},
    ],
)
@pytest.mark.parametrize("enable_thinking", [True, False])
async def test_function_tool_use(
    client: openai.AsyncOpenAI,
    model_name: str,
    stream: bool,
    tool_choice: str | dict,
    enable_thinking: bool,
):
    """Tool calls (and reasoning content, when thinking is enabled) should be
    produced for every tool_choice mode, streaming and non-streaming."""
    if not stream:
        # Non-streaming test
        chat_completion = await client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
        )
        if enable_thinking:
            assert chat_completion.choices[0].message.reasoning is not None
            assert chat_completion.choices[0].message.reasoning != ""
        assert chat_completion.choices[0].message.tool_calls is not None
        assert len(chat_completion.choices[0].message.tool_calls) > 0
    else:
        # Streaming test
        output_stream = await client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            stream=True,
            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
        )

        output = []
        reasoning = []
        async for chunk in output_stream:
            if chunk.choices:
                # Collect reasoning deltas only when thinking is enabled.
                if enable_thinking and getattr(
                    chunk.choices[0].delta, "reasoning", None
                ):
                    reasoning.append(chunk.choices[0].delta.reasoning)
                if chunk.choices[0].delta.tool_calls:
                    output.extend(chunk.choices[0].delta.tool_calls)

        assert len(output) > 0
        if enable_thinking:
            assert len(reasoning) > 0
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def k2_server():  # noqa: F811
    """Server that masquerades as a kimi_k2 model to exercise its tool-id format."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
        "--reasoning-parser",
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
    ]
    # hack to test kimi_k2 tool use tool_id format.
    # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
    overrides = {"model_type": "kimi_k2", "kv_lora_rank": None}
    with RemoteOpenAIServer(
        MODEL_NAME,
        cli_args,
        override_hf_configs=overrides,
    ) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def k2_client(k2_server):
    """Async OpenAI client bound to the kimi_k2-flavored server."""
    async with k2_server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("tool_choice", ["required"])
async def test_tool_id_kimi_k2(
    k2_client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: str
):
    """kimi_k2 models should emit tool-call ids as 'functions.<name>:<index>'."""
    if not stream:
        # Non-streaming test
        chat_completion = await k2_client.chat.completions.create(
            messages=messages, model=model_name, tools=tools, tool_choice=tool_choice
        )
        assert chat_completion.choices[0].message.tool_calls is not None
        assert len(chat_completion.choices[0].message.tool_calls) > 0
        assert chat_completion.choices[0].message.tool_calls[0].id in [
            "functions.get_current_weather:0",
            "functions.get_forecast:1",
        ]
    else:
        # Streaming test
        output_stream = await k2_client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            stream=True,
        )

        output = []
        async for chunk in output_stream:
            if chunk.choices and chunk.choices[0].delta.tool_calls:
                output.extend(chunk.choices[0].delta.tool_calls)
        # Deltas without an id are argument fragments; every id present must
        # follow the kimi_k2 format.
        for o in output:
            assert o.id is None or o.id in [
                "functions.get_current_weather:0",
                "functions.get_forecast:1",
            ]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("arguments", ["{}", ""])
async def test_no_args_tool_call(
    client: openai.AsyncOpenAI, model_name: str, arguments: str
):
    """A parameterless tool should round-trip with both '' and '{}' as its
    arguments payload."""
    # Step 1: Define a tool that requires no parameters
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_time",
                "description": "Get the current date and time. No parameters needed.",
                "parameters": {
                    "type": "object",
                    "properties": {},  # No parameters
                    "required": [],  # No required fields
                },
            },
        }
    ]
    messages = [{"role": "user", "content": "What time is it now?"}]
    # Step 2: Send user message and let model decide whether to call the tool
    response = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        tools=tools,
        tool_choice="auto",  # Let model choose automatically
    )

    # Step 3: Check if model wants to call a tool
    message = response.choices[0].message
    if message.tool_calls:
        # Get the first tool call
        tool_call = message.tool_calls[0]
        tool_name = tool_call.function.name
        # Step 4: Execute the tool locally (no parameters)
        if tool_name == "get_current_time":
            # Test both empty string and "{}" for no-arg tool calls
            tool_call.function.arguments = arguments
            messages.append(message)
            current_time = datetime.datetime.now()
            result = current_time.isoformat()
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": result,
                }
            )
            # Step 5: Send tool result back to model to continue conversation
            final_response = await client.chat.completions.create(
                model=model_name,
                messages=messages,
            )
            # Output final natural language response
            assert final_response.choices[0].message.content is not None

    else:
        # No tool called — just print model's direct reply
        assert message.content is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_named_tool_use(
    client: openai.AsyncOpenAI,
    sample_json_schema,
):
    """Forcing a named tool should produce schema-valid arguments, in both
    non-streaming and streaming modes."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": (
                "Give an example JSON for an employee profile using the specified tool."
            ),
        },
    ]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "dummy_function_name",
                "description": "This is a dummy function",
                "parameters": sample_json_schema,
            },
        }
    ]
    tool_choice = {"type": "function", "function": {"name": "dummy_function_name"}}

    # non-streaming

    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        tools=tools,
        temperature=0.0,
        tool_choice=tool_choice,
    )
    message = chat_completion.choices[0].message
    # With a forced tool call there should be no assistant text content.
    assert len(message.content) == 0
    json_string = message.tool_calls[0].function.arguments
    json1 = json.loads(json_string)
    jsonschema.validate(instance=json1, schema=sample_json_schema)

    messages.append({"role": "assistant", "content": json_string})
    messages.append(
        {"role": "user", "content": "Give me another one with a different name and age"}
    )

    # streaming

    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        tools=tools,
        tool_choice=tool_choice,
        temperature=0.0,
        stream=True,
    )

    output = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        # No text content should be streamed either, only tool-call deltas.
        assert delta.content is None or len(delta.content) == 0
        if delta.tool_calls:
            output.append(delta.tool_calls[0].function.arguments)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    json2 = json.loads("".join(output))
    jsonschema.validate(instance=json2, schema=sample_json_schema)
    # The second sample must differ in the requested fields.
    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_inconsistent_tool_choice_and_tools(
    client: openai.AsyncOpenAI, sample_json_schema
):
    """Mismatched tool_choice / tools combinations must be rejected (400)."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example JSON for an employee profile that "
            f"fits this schema: {sample_json_schema}",
        },
    ]

    # Case 1: tool_choice names a function but no tools were provided.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tool_choice={
                "type": "function",
                "function": {"name": "dummy_function_name"},
            },
        )

    # Case 2: tool_choice names a function absent from the provided tools.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "dummy_function_name",
                        "description": "This is a dummy function",
                        "parameters": sample_json_schema,
                    },
                }
            ],
            tool_choice={
                "type": "function",
                "function": {"name": "nondefined_function_name"},
            },
        )
    # Case 3: malformed (empty) tool_choice object.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "dummy_function_name",
                        "description": "This is a dummy function",
                        "parameters": sample_json_schema,
                    },
                }
            ],
            tool_choice={},
        )
|
||||
307
tests/entrypoints/openai/test_completion_with_prompt_embeds.py
Normal file
307
tests/entrypoints/openai/test_completion_with_prompt_embeds.py
Normal file
@@ -0,0 +1,307 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch
|
||||
|
||||
# downloading lora to test lora requests
|
||||
from openai import BadRequestError
|
||||
from transformers import AutoConfig
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
LORA_SERVING_MODEL_NAME = "opt125m-lora"
|
||||
|
||||
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=["use-lora"])
def default_server_args(
    request: pytest.FixtureRequest, opt125_lora_files: str
) -> list[str]:
    """Common CLI args for the prompt-embeds server; optionally adds LoRA."""
    server_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        # Prompt Embeds server args
        "--enable-prompt-embeds",
    ]

    if request.param == "use-lora":
        lora_module = {
            "name": LORA_SERVING_MODEL_NAME,
            "path": opt125_lora_files,
            "base_model_name": MODEL_NAME,
        }
        server_args += [
            "--enable-lora",
            "--lora-module",
            json.dumps(lora_module),
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]

    return server_args
|
||||
|
||||
|
||||
EXAMPLE_PROMPTS = [
|
||||
"Hello, my name is",
|
||||
"What is an LLM?",
|
||||
]
|
||||
|
||||
|
||||
def _encode_embeds(embeds: torch.Tensor):
|
||||
buffer = io.BytesIO()
|
||||
torch.save(embeds, buffer)
|
||||
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def example_prompt_embeds(hf_runner):
    """Create example embeddings and return them as base64 encoded string."""
    with hf_runner(MODEL_NAME) as hf_model:
        example_embeddings = hf_model.get_prompt_embeddings(EXAMPLE_PROMPTS)

    # One base64-encoded torch.save payload per example prompt.
    return [_encode_embeds(item) for item in example_embeddings]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request):
    """Start the prompt-embeds server, optionally disabling frontend MP."""
    extra_flag = request.param
    if extra_flag:
        default_server_args.append(extra_flag)

    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client_with_prompt_embeds(server_with_prompt_embeds):
    """Async OpenAI client bound to the prompt-embeds server."""
    async with server_with_prompt_embeds.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_prompt_embeds(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
    model_name: str,
):
    """Exercise prompt_embeds completions: single input, batch, streaming,
    batch streaming, and mixed text + embeds requests."""
    encoded_embeds, encoded_embeds2 = example_prompt_embeds

    # Test case: Single prompt embeds input
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    assert len(completion.choices[0].text) >= 1
    assert completion.choices[0].prompt_logprobs is None

    # Test case: batch completion with prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    assert len(completion.choices) == 2
    assert len(completion.choices[0].text) >= 1
    assert len(completion.choices[1].text) >= 1

    # Test case: streaming with prompt_embeds
    single_completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    single_output = single_completion.choices[0].text

    stream = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        stream=True,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # Exactly one finish chunk; streamed text must match the non-streamed run.
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    assert "".join(chunks) == single_output

    # Test case: batch streaming with prompt_embeds
    stream = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        stream=True,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    chunks_stream_embeds: list[list[str]] = [[], []]
    finish_reason_count = 0
    async for chunk in stream:
        chunks_stream_embeds[chunk.choices[0].index].append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    assert finish_reason_count == 2
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    assert len(chunks_stream_embeds[0]) > 0
    assert len(chunks_stream_embeds[1]) > 0

    # Test case: mixed text and prompt_embeds
    completion_mixed = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="This is a prompt",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    # FIX: the original asserted on the stale `completion` from the batch case
    # above, so the mixed request's choice count was never actually checked.
    assert len(completion_mixed.choices) == 2
    completion_text_only = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="This is a prompt",
        max_tokens=5,
        temperature=0.0,
    )
    completion_embeds_only = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    # Embeddings responses should be handled first
    assert completion_mixed.choices[0].text == completion_embeds_only.choices[0].text
    assert completion_mixed.choices[1].text == completion_text_only.choices[0].text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_errors_with_prompt_embeds(
    client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str
):
    """A non-base64 prompt_embeds payload must be rejected with a 400."""
    # Test error case: invalid prompt_embeds
    with pytest.raises(BadRequestError):
        await client_with_prompt_embeds.completions.create(
            prompt="",
            model=model_name,
            max_tokens=5,
            temperature=0.0,
            extra_body={"prompt_embeds": "invalid_base64"},
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("logprobs_arg", [1, 0])
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_logprobs_and_prompt_embeds(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
    logprobs_arg: int,
    model_name: str,
):
    """Logprobs must be returned for prompt_embeds requests, both for a single
    request and for a batch of two embeddings."""
    encoded_embeds, encoded_embeds2 = example_prompt_embeds

    # Test case: Logprobs using prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        echo=False,
        logprobs=logprobs_arg,
        extra_body={"prompt_embeds": encoded_embeds},
    )

    # All logprobs arrays must have one entry per generated token (5).
    logprobs = completion.choices[0].logprobs
    assert logprobs is not None
    assert len(logprobs.text_offset) == 5
    assert len(logprobs.token_logprobs) == 5
    assert len(logprobs.top_logprobs) == 5
    # Skip position 0: the first top_logprobs entry may differ in size.
    for top_logprobs in logprobs.top_logprobs[1:]:
        assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
    assert len(logprobs.tokens) == 5

    # Test case: Log probs with batch completion and prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        echo=False,
        logprobs=logprobs_arg,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )

    # Two embeddings in, two choices out — each with full logprobs.
    assert len(completion.choices) == 2
    for choice in completion.choices:
        logprobs = choice.logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) == 5
        assert len(logprobs.token_logprobs) == 5
        assert len(logprobs.top_logprobs) == 5
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) == 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_prompt_logprobs_raises_error(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
):
    """Combining prompt_logprobs with prompt_embeds must be rejected with a
    BadRequestError whose message mentions the incompatibility."""
    embeds_payload, _ = example_prompt_embeds

    request_kwargs = dict(
        model=MODEL_NAME,
        prompt="",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": embeds_payload, "prompt_logprobs": True},
    )
    with pytest.raises(BadRequestError, match="not compatible"):
        await client_with_prompt_embeds.completions.create(**request_kwargs)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_empty_prompt_embeds(
    client_with_prompt_embeds: openai.AsyncOpenAI,
) -> None:
    """Smoke test: an empty prompt_embeds list must not cause a request
    error (no assertion on the response beyond its successful return)."""
    await client_with_prompt_embeds.completions.create(
        model=MODEL_NAME,
        prompt="Hello",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": []},
    )
|
||||
96
tests/entrypoints/openai/test_default_mm_loras.py
Normal file
96
tests/entrypoints/openai/test_default_mm_loras.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from ...conftest import AudioTestAssets
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# NOTE - the tests in this module are currently analogous to test_chat, but are
|
||||
# separated to avoid OOM killing due to module-scoped servers, since we
|
||||
# need a multimodal model for these tests.
|
||||
|
||||
# Contains a modality specific lora alongside the base model
|
||||
MULTIMODAL_MODEL_NAME = snapshot_download("microsoft/Phi-4-multimodal-instruct")
|
||||
AUDIO_LORA_PATH = os.path.join(MULTIMODAL_MODEL_NAME, "speech-lora")
|
||||
|
||||
ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def multimodal_server():  # noqa: F811
    """Module-scoped Phi-4-multimodal server with the speech LoRA registered
    and mapped as the default LoRA for the audio modality."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "half",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"speech={AUDIO_LORA_PATH}",
        "--max-lora-rank",
        "320",
        "--max-num-seqs",
        "2",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.8",
        # map the audio modality to the speech LoRA by default
        "--default-mm-loras",
        f'{{"audio": "{AUDIO_LORA_PATH}"}}',
    ]

    # Long max_wait_seconds: the multimodal model is slow to load.
    with RemoteOpenAIServer(
        MULTIMODAL_MODEL_NAME, args, max_wait_seconds=480
    ) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def multi_modal_client(multimodal_server):
    """Async OpenAI client bound to the module-scoped multimodal server."""
    async with multimodal_server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # base model with default lora should give the same response as lora model
    "model_name",
    [MULTIMODAL_MODEL_NAME, "speech"],
)
async def test_default_mm_lora_chat_completions(
    model_name: str,
    multi_modal_client: openai.AsyncOpenAI,
    audio_assets: AudioTestAssets,
):
    """An audio chat request should produce the speech-LoRA transcription
    whether addressed to the base model or to the LoRA by name."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you transcribe this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_assets[0].url},
                },
            ],
        }
    ]

    # temperature=0.0 so the output is deterministic and comparable to the
    # canned ACTIVE_MM_LORA_RESPONSE string.
    chat_completion = await multi_modal_client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=128, temperature=0.0
    )

    assert len(chat_completion.choices) > 0

    message = chat_completion.choices[0].message
    # NOTE(review): `len(...) >= 0` is always true; the equality assertion
    # below is the effective check.
    assert message.content is not None and len(message.content) >= 0
    assert message.content == ACTIVE_MM_LORA_RESPONSE
|
||||
126
tests/entrypoints/openai/test_enable_force_include_usage.py
Normal file
126
tests/entrypoints/openai/test_enable_force_include_usage.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request):  # noqa: F811
    """Module-scoped Qwen3 server started with --enable-force-include-usage."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "128",
        "--enforce-eager",
        "--max-num-seqs",
        "4",
        "--enable-force-include-usage",
        # NOTE(review): fixed port paired with auto_port=False below — can
        # collide with other servers on a shared CI host; confirm intended.
        "--port",
        "55857",
        "--gpu-memory-utilization",
        "0.2",
    ]

    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
    """Async OpenAI client bound to the force-include-usage chat server."""
    async with chat_server_with_force_include_usage.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
    chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
    """With --enable-force-include-usage, streamed chunks that carry choices
    must have usage=None, while usage-bearing chunks (empty choices) must
    report consistent, non-decreasing token counts."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    stream = await chat_client_with_force_include_usage.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
    )
    last_completion_tokens = 0
    async for chunk in stream:
        if not len(chunk.choices):
            assert chunk.usage.prompt_tokens >= 0
            assert (
                last_completion_tokens == 0
                or chunk.usage.completion_tokens > last_completion_tokens
                or (
                    not chunk.choices
                    and chunk.usage.completion_tokens == last_completion_tokens
                )
            )
            assert chunk.usage.total_tokens == (
                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
            )
            # BUG FIX: track the last observed count. Previously this variable
            # was never updated, so it stayed 0 and the monotonicity assertion
            # above was vacuous (its first disjunct was always true).
            last_completion_tokens = chunk.usage.completion_tokens
        else:
            # Chunks with choices must not carry usage info.
            assert chunk.usage is None
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
    """Module-scoped Whisper server started with --enable-force-include-usage."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-num-seqs",
        "4",
        "--enforce-eager",
        "--enable-force-include-usage",
        "--gpu-memory-utilization",
        "0.2",
    ]

    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def transcription_client_with_force_include_usage(
    transcription_server_with_force_include_usage,
):
    """Async OpenAI client bound to the force-include-usage Whisper server."""
    async with (
        transcription_server_with_force_include_usage.get_async_client() as async_client
    ):
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_transcription_with_enable_force_include_usage(
    transcription_client_with_force_include_usage, winning_call
):
    """Streamed transcriptions must end with a usage-only chunk (empty
    choices) carrying positive token counts; other chunks carry no usage."""
    res = (
        await transcription_client_with_force_include_usage.audio.transcriptions.create(
            model="openai/whisper-large-v3-turbo",
            file=winning_call,
            language="en",
            temperature=0.0,
            stream=True,
            timeout=30,
        )
    )

    async for chunk in res:
        if not len(chunk.choices):
            # final usage sent
            # NOTE: transcription usage arrives as a plain dict, unlike the
            # typed usage object on chat chunks.
            usage = chunk.usage
            assert isinstance(usage, dict)
            assert usage["prompt_tokens"] > 0
            assert usage["completion_tokens"] > 0
            assert usage["total_tokens"] > 0
        else:
            assert not hasattr(chunk, "usage")
|
||||
@@ -0,0 +1,280 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""Integration tests for GPT-OSS structural tags functionality (PR #25515)."""
|
||||
|
||||
import json
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
StructuredOutputsParams,
|
||||
)
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
from vllm.reasoning.gptoss_reasoning_parser import (
|
||||
GptOssReasoningParser,
|
||||
)
|
||||
|
||||
|
||||
class TestGptOssStructuralTagsIntegration:
    """Integration tests for structural tags in GPT-OSS tool calls.

    Each test drives GptOssReasoningParser.prepare_structured_tag() against a
    mocked ToolServer and inspects the JSON structural-tag payload it emits.
    """

    @pytest.fixture
    def mock_tokenizer(self):
        """Create a mock tokenizer whose encode() returns a fixed id list."""
        tokenizer = Mock()
        tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
        return tokenizer

    @pytest.fixture
    def gptoss_parser(self, mock_tokenizer):
        """Create a real GptOssReasoningParser instance."""
        return GptOssReasoningParser(mock_tokenizer)

    @pytest.fixture
    def tool_server_with_python(self):
        """Create a tool server that reports only the 'python' tool."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
        return tool_server

    @pytest.fixture
    def tool_server_empty(self):
        """Create a tool server with no tools."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(return_value=False)
        return tool_server

    def test_end_to_end_no_tools(self, gptoss_parser):
        """Test end-to-end flow when no tools are available."""
        # Test the parser directly; a None tool server means no tools.
        result = gptoss_parser.prepare_structured_tag(None, None)
        parsed_result = json.loads(result)

        # Verify basic structure
        assert parsed_result["type"] == "structural_tag"
        assert parsed_result["format"]["type"] == "triggered_tags"
        assert len(parsed_result["format"]["tags"]) == 1

        # Verify only analysis channel is allowed
        analysis_tag = parsed_result["format"]["tags"][0]
        assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
        assert analysis_tag["content"]["type"] == "any_text"
        assert analysis_tag["end"] == "<|end|>"

        # Verify triggers
        assert parsed_result["format"]["triggers"] == ["<|channel|>analysis"]
        assert parsed_result["format"]["stop_after_first"] is False

    def test_end_to_end_with_python_tool(self, gptoss_parser, tool_server_with_python):
        """Test end-to-end flow with Python tool enabled."""
        result = gptoss_parser.prepare_structured_tag(None, tool_server_with_python)
        parsed_result = json.loads(result)

        # Should have analysis tag + 2 python tags
        assert len(parsed_result["format"]["tags"]) == 3

        # Verify all expected tags are present
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        expected_begins = [
            "<|channel|>analysis<|message|>",
            "<|channel|>commentary to=python",
            "<|channel|>analysis to=python",
        ]

        for expected in expected_begins:
            assert expected in tag_begins

        # Verify triggers include commentary
        assert "<|channel|>analysis" in parsed_result["format"]["triggers"]
        assert "<|channel|>commentary to=" in parsed_result["format"]["triggers"]

    def test_structured_outputs_params_integration(
        self, gptoss_parser, tool_server_with_python
    ):
        """Test integration with StructuredOutputsParams."""
        # Generate structural tag
        structural_tag = gptoss_parser.prepare_structured_tag(
            None, tool_server_with_python
        )

        # Create StructuredOutputsParams
        params = StructuredOutputsParams(structural_tag=structural_tag)

        # Verify the tag is properly stored and accessible
        assert params.structural_tag == structural_tag

        # Verify the tag is valid JSON
        parsed_tag = json.loads(params.structural_tag)
        assert parsed_tag["type"] == "structural_tag"

    @pytest.mark.parametrize(
        "browser, python, container, expected_tags",
        [
            # No tools
            (False, False, False, 1),
            # Single tool
            (True, False, False, 3),
            # Multiple tools
            (True, True, False, 5),
            # All tools
            (True, True, True, 7),
        ],
    )
    def test_tool_server_interaction_flow(
        self, gptoss_parser, browser, python, container, expected_tags
    ):
        """Test the complete tool server interaction flow."""

        # Create a mock ToolServer
        tool_server = Mock(spec=ToolServer)

        # Simulate tool availability based on parameters
        tool_server.has_tool = Mock(
            side_effect=lambda tool: {
                "browser": browser,
                "python": python,
                "container": container,
            }.get(tool, False)
        )

        # Run the parser and verify results
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)

        # Validate number of tags: 1 analysis tag + 2 per enabled tool
        assert len(parsed_result["format"]["tags"]) == expected_tags

        # Verify tool-specific tags exist for enabled tools
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        for tool, enabled in {
            "browser": browser,
            "python": python,
            "container": container,
        }.items():
            if enabled:
                assert f"<|channel|>commentary to={tool}" in tag_begins
                assert f"<|channel|>analysis to={tool}" in tag_begins

    def test_original_tag_preservation(self, gptoss_parser, tool_server_with_python):
        """Test that original tags are preserved when provided."""
        original_tag = '{"type": "custom_tag", "data": "preserved"}'

        result = gptoss_parser.prepare_structured_tag(
            original_tag, tool_server_with_python
        )

        # Should return original tag unchanged
        assert result == original_tag

    @pytest.mark.parametrize(
        "tools",
        [
            [],
            ["browser"],
            ["python"],
            ["container"],
            ["browser", "python"],
            ["browser", "container"],
            ["python", "container"],
            ["browser", "python", "container"],
        ],
    )
    def test_json_validity_comprehensive(self, gptoss_parser, tools):
        """Test JSON validity across all possible tool combinations."""

        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool in tools)

        result = gptoss_parser.prepare_structured_tag(None, tool_server)

        # Should be valid JSON
        parsed_result = json.loads(result)

        # Should have correct structure
        assert parsed_result["type"] == "structural_tag"
        assert "format" in parsed_result
        assert "tags" in parsed_result["format"]
        assert "triggers" in parsed_result["format"]

        # Tag count should be: 1 (analysis) + 2 * len(tools)
        expected_tag_count = 1 + (2 * len(tools))
        assert len(parsed_result["format"]["tags"]) == expected_tag_count

    def test_error_handling_invalid_tool_server(self, gptoss_parser):
        """Test error handling with invalid tool server."""
        # Tool server that raises exceptions
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=Exception("Tool server error"))

        # The parser does not swallow tool-server failures: the underlying
        # exception is expected to propagate to the caller.
        with pytest.raises(Exception, match="Tool server error"):
            gptoss_parser.prepare_structured_tag(None, tool_server)

    def test_concurrent_requests_isolation(self, gptoss_parser):
        """Test that concurrent requests don't interfere with each other."""
        # Simulate concurrent requests with different tool servers
        tool_server_1 = Mock(spec=ToolServer)
        tool_server_1.has_tool = Mock(side_effect=lambda tool: tool == "python")

        tool_server_2 = Mock(spec=ToolServer)
        tool_server_2.has_tool = Mock(side_effect=lambda tool: tool == "browser")

        # Generate tags back to back (sequential stand-in for concurrency):
        # each call must reflect only its own tool server.
        result_1 = gptoss_parser.prepare_structured_tag(None, tool_server_1)
        result_2 = gptoss_parser.prepare_structured_tag(None, tool_server_2)

        # Parse results
        parsed_1 = json.loads(result_1)
        parsed_2 = json.loads(result_2)

        # Verify they have different tool configurations
        tags_1 = [tag["begin"] for tag in parsed_1["format"]["tags"]]
        tags_2 = [tag["begin"] for tag in parsed_2["format"]["tags"]]

        # Result 1 should have python tags
        assert "<|channel|>commentary to=python" in tags_1
        assert "<|channel|>commentary to=browser" not in tags_1

        # Result 2 should have browser tags
        assert "<|channel|>commentary to=browser" in tags_2
        assert "<|channel|>commentary to=python" not in tags_2

    def test_tag_format_consistency(self, gptoss_parser):
        """Test that all generated tags follow consistent format."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(
            side_effect=lambda tool: tool in ["python", "browser"]
        )

        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)

        # Verify all tags have required fields
        for tag in parsed_result["format"]["tags"]:
            assert "begin" in tag
            assert "content" in tag
            assert "end" in tag
            assert tag["content"]["type"] == "any_text"
            assert tag["end"] == "<|end|>"

            # Verify begin format
            assert tag["begin"].startswith("<|channel|>")

    def test_trigger_configuration(self, gptoss_parser):
        """Test trigger configuration for different tool setups."""
        # Test with no tools
        result_no_tools = gptoss_parser.prepare_structured_tag(None, None)
        parsed_no_tools = json.loads(result_no_tools)
        assert parsed_no_tools["format"]["triggers"] == ["<|channel|>analysis"]

        # Test with tools
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")

        result_with_tools = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_with_tools = json.loads(result_with_tools)

        expected_triggers = ["<|channel|>analysis", "<|channel|>commentary to="]
        assert set(parsed_with_tools["format"]["triggers"]) == set(expected_triggers)
|
||||
294
tests/entrypoints/openai/test_lora_adapters.py
Normal file
294
tests/entrypoints/openai/test_lora_adapters.py
Normal file
@@ -0,0 +1,294 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import shutil
|
||||
from contextlib import suppress
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
BADREQUEST_CASES = [
|
||||
(
|
||||
"test_rank",
|
||||
{"r": 1024},
|
||||
"is greater than max_lora_rank",
|
||||
),
|
||||
("test_dora", {"use_dora": True}, "does not yet support DoRA"),
|
||||
(
|
||||
"test_modules_to_save",
|
||||
{"modules_to_save": ["lm_head"]},
|
||||
"only supports modules_to_save being None",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, qwen3_lora_files):
    """Server with one statically registered LoRA (JSON --lora-modules form)
    and runtime LoRA loading enabled via environment variable."""
    # Define the json format LoRA module configurations
    lora_module_1 = {
        "name": "qwen3-lora",
        "path": qwen3_lora_files,
        "base_model_name": MODEL_NAME,
    }

    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        json.dumps(lora_module_1),
        "--max-lora-rank",
        "64",
        # small CPU LoRA cache so the LRU path is exercised by tests
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]

    # Enable the /v1/load_lora_adapter endpoint
    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server_with_lora_modules_json):
    """Async OpenAI client bound to the LoRA-enabled server fixture."""
    async with server_with_lora_modules_json.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
    """The statically configured LoRA must appear in /models after the base
    model, with root pointing at the adapter path and parent at the base."""
    models = await client.models.list()
    models = models.data
    # Base model is listed first; any LoRAs follow.
    served_model = models[0]
    lora_models = models[1:]
    assert served_model.id == MODEL_NAME
    assert served_model.root == MODEL_NAME
    assert served_model.parent is None
    assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
    assert lora_models[0].id == "qwen3-lora"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
    """A LoRA loaded at runtime must appear in /models with correct lineage."""
    response = await client.post(
        "load_lora_adapter",
        cast_to=str,
        body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files},
    )
    # Ensure adapter loads before querying /models
    assert "success" in response

    models = await client.models.list()
    models = models.data
    # The newly loaded adapter is appended at the end of the model list.
    dynamic_lora_model = models[-1]
    assert dynamic_lora_model.root == qwen3_lora_files
    assert dynamic_lora_model.parent == MODEL_NAME
    assert dynamic_lora_model.id == "qwen3-lora-3"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
    """Loading an adapter from a nonexistent path must raise NotFoundError."""
    payload = {"lora_name": "notfound", "lora_path": "/not/an/adapter"}
    with pytest.raises(openai.NotFoundError):
        await client.post("load_lora_adapter", cast_to=str, body=payload)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
    """An adapter directory with a malformed adapter_config.json must be
    rejected with a BadRequestError."""
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    # Deliberately unparseable JSON config
    (invalid_files / "adapter_config.json").write_text("this is not json")

    with pytest.raises(openai.BadRequestError):
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": "invalid-json", "lora_path": str(invalid_files)},
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("test_name,config_change,expected_error", BADREQUEST_CASES)
async def test_dynamic_lora_badrequests(
    client: openai.AsyncOpenAI,
    tmp_path,
    qwen3_lora_files,
    test_name: str,
    config_change: dict,
    expected_error: str,
):
    """Unsupported adapter configurations (see BADREQUEST_CASES) must be
    rejected with a BadRequestError matching expected_error."""
    # Create test directory
    test_dir = tmp_path / test_name

    # Copy adapter files
    shutil.copytree(qwen3_lora_files, test_dir)

    # Load and modify configuration
    config_path = test_dir / "adapter_config.json"
    with open(config_path) as f:
        adapter_config = json.load(f)
    # Apply configuration changes
    adapter_config.update(config_change)

    # Save modified configuration
    with open(config_path, "w") as f:
        json.dump(adapter_config, f)

    # Test loading the adapter
    with pytest.raises(openai.BadRequestError, match=expected_error):
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": test_name, "lora_path": str(test_dir)},
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multiple_lora_adapters(
    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
    """Validate that many loras can be dynamically registered and inferenced
    with concurrently"""

    # This test file configures the server with --max-cpu-loras=2 and this test
    # will concurrently load 10 adapters, so it should flex the LRU cache
    async def load_and_run_adapter(adapter_name: str):
        # Register the adapter, then run a few batched completions against it.
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
        )
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
            )

    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}")) for i in range(10)
    ]

    # BUG FIX: asyncio.wait() returns sets of Task objects, never Exception
    # instances, so the previous `isinstance(r, Exception)` check could not
    # fail even when a task raised. gather(..., return_exceptions=True)
    # returns the actual exceptions, making the assertion meaningful.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)

    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_loading_invalid_adapters_does_not_break_others(
    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
    """Repeated failed adapter loads must not disturb in-flight requests on a
    valid adapter, nor prevent loading further valid adapters afterwards."""
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    # Deliberately unparseable JSON config
    (invalid_files / "adapter_config.json").write_text("this is not json")

    stop_good_requests_event = asyncio.Event()

    async def run_good_requests(client):
        # Run chat completions requests until event set; collect either the
        # response or the raised exception for each attempt.
        results = []

        while not stop_good_requests_event.is_set():
            try:
                batch = await client.completions.create(
                    model="qwen3-lora",
                    prompt=["Hello there", "Foo bar bazz buzz"],
                    max_tokens=5,
                )
                results.append(batch)
            except Exception as e:
                results.append(e)

        return results

    # Create task to run good requests
    good_task = asyncio.create_task(run_good_requests(client))

    # Run a bunch of bad adapter loads
    for _ in range(25):
        with suppress(openai.NotFoundError):
            await client.post(
                "load_lora_adapter",
                cast_to=str,
                body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
            )
    for _ in range(25):
        with suppress(openai.BadRequestError):
            await client.post(
                "load_lora_adapter",
                cast_to=str,
                body={"lora_name": "invalid", "lora_path": str(invalid_files)},
            )

    # Ensure all the running requests with lora adapters succeeded
    stop_good_requests_event.set()
    results = await good_task
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"

    # Ensure we can load another adapter and run it
    await client.post(
        "load_lora_adapter",
        cast_to=str,
        body={"lora_name": "valid", "lora_path": qwen3_lora_files},
    )
    await client.completions.create(
        model="valid",
        prompt=["Hello there", "Foo bar bazz buzz"],
        max_tokens=5,
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_beam_search_with_lora_adapters(
    client: openai.AsyncOpenAI,
    tmp_path,
    qwen3_lora_files,
):
    """Validate that async beam search can be used with lora."""

    async def load_and_run_adapter(adapter_name: str):
        # Register the adapter, then run beam-search completions against it.
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
        )
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
                extra_body=dict(use_beam_search=True),
            )

    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}")) for i in range(3)
    ]

    # BUG FIX: asyncio.wait() returns Task objects, so the previous
    # `isinstance(r, Exception)` check was vacuous and task failures were
    # silently ignored. gather with return_exceptions=True surfaces them.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)

    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"
|
||||
230
tests/entrypoints/openai/test_lora_resolvers.py
Normal file
230
tests/entrypoints/openai/test_lora_resolvers.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from http import HTTPStatus
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
|
||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
MODEL_NAME = "openai-community/gpt2"
|
||||
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
|
||||
|
||||
MOCK_RESOLVER_NAME = "mock_test_resolver"
|
||||
|
||||
|
||||
@dataclass
class MockHFConfig:
    """Minimal stand-in for a HuggingFace config; only `model_type` is read."""

    model_type: str = "any"
|
||||
|
||||
|
||||
@dataclass
class MockModelConfig:
    """Minimal mock ModelConfig for testing."""

    # NOTE: field order and defaults are part of the dataclass constructor
    # signature and must stay as-is.
    model: str = MODEL_NAME
    tokenizer: str = MODEL_NAME
    trust_remote_code: bool = False
    tokenizer_mode: str = "auto"
    max_model_len: int = 100
    tokenizer_revision: str | None = None
    multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)
    logits_processors: list[str] | None = None
    logits_processor_pattern: str | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    skip_tokenizer_init: bool = False

    def get_diff_sampling_param(self):
        """Return the configured sampling-param overrides, or an empty dict."""
        if self.diff_sampling_param:
            return self.diff_sampling_param
        return {}
|
||||
|
||||
|
||||
class MockLoRAResolver(LoRAResolver):
    """Test resolver: recognizes two fixed adapter names, rejects the rest."""

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        # Known adapters mapped to the integer ids they resolve to.
        known_adapters = {"test-lora": 1, "invalid-lora": 2}
        lora_id = known_adapters.get(lora_name)
        if lora_id is None:
            return None
        return LoRARequest(
            lora_name=lora_name,
            lora_int_id=lora_id,
            lora_local_path=f"/fake/path/{lora_name}",
        )
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def register_mock_resolver():
    """Install the mock LoRA resolver around each test, removing it afterwards."""
    LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, MockLoRAResolver())
    yield
    # Teardown: drop the resolver so it can't leak into other test modules.
    LoRAResolverRegistry.resolvers.pop(MOCK_RESOLVER_NAME, None)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_serving_setup():
    """Provides a mocked engine and serving completion instance."""
    engine = MagicMock(spec=AsyncLLM)
    engine.errored = False

    engine.get_tokenizer = AsyncMock(return_value=get_tokenizer(MODEL_NAME))

    async def fake_add_lora(lora_request: LoRARequest):
        """Mimic engine.add_lora: fail for the bad adapter, succeed otherwise."""
        if lora_request.lora_name == "invalid-lora":
            # Simulate failure during addition (e.g. invalid format)
            raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}")
        return True

    engine.add_lora = AsyncMock(side_effect=fake_add_lora)

    async def fake_generate(*args, **kwargs):
        # Async generator that yields nothing at all.
        for item in ():
            yield item

    engine.generate = MagicMock(spec=AsyncLLM.generate, side_effect=fake_generate)

    # Clear any call records accumulated while wiring the mocks up.
    engine.generate.reset_mock()
    engine.add_lora.reset_mock()

    engine.model_config = MockModelConfig()
    engine.input_processor = MagicMock()
    engine.io_processor = MagicMock()

    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )

    serving_completion = OpenAIServingCompletion(engine, models, request_logger=None)
    serving_completion._process_inputs = AsyncMock(
        return_value=(MagicMock(name="engine_request"), {})
    )

    return engine, serving_completion
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch):
    """A resolvable adapter is loaded and used for generation."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup

    lora_model_name = "test-lora"
    request = CompletionRequest(model=lora_model_name, prompt="Generate with LoRA")

    # The mocked generate may raise; we only care that add_lora and generate
    # were invoked with the resolved LoRA request.
    with suppress(Exception):
        await serving_completion.create_completion(request)

    mock_engine.add_lora.assert_awaited_once()
    added = mock_engine.add_lora.call_args[0][0]
    assert isinstance(added, LoRARequest)
    assert added.lora_name == lora_model_name

    mock_engine.generate.assert_called_once()
    generated_with = mock_engine.generate.call_args[1]["lora_request"]
    assert isinstance(generated_with, LoRARequest)
    assert generated_with.lora_name == lora_model_name
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch):
    """An unresolvable model name yields 404 without touching the engine."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup

    missing_model = "non-existent-lora-adapter"
    response = await serving_completion.create_completion(
        CompletionRequest(model=missing_model, prompt="what is 1+1?")
    )

    mock_engine.add_lora.assert_not_awaited()
    mock_engine.generate.assert_not_called()

    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.NOT_FOUND.value
    assert missing_model in response.error.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_serving_completion_resolver_add_lora_fails(
    mock_serving_setup, monkeypatch
):
    """Resolution succeeds but loading fails -> 400, and no generation."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup

    invalid_model = "invalid-lora"
    response = await serving_completion.create_completion(
        CompletionRequest(model=invalid_model, prompt="what is 1+1?")
    )

    # The adapter resolved, so add_lora was attempted exactly once...
    mock_engine.add_lora.assert_awaited_once()
    attempted = mock_engine.add_lora.call_args[0][0]
    assert isinstance(attempted, LoRARequest)
    assert attempted.lora_name == invalid_model

    # ...but the simulated load failure prevented any generation.
    mock_engine.generate.assert_not_called()

    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.BAD_REQUEST.value
    assert invalid_model in response.error.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_serving_completion_flag_not_set(mock_serving_setup):
    """Without VLLM_ALLOW_RUNTIME_LORA_UPDATING, no resolution is attempted."""
    mock_engine, serving_completion = mock_serving_setup

    await serving_completion.create_completion(
        CompletionRequest(model="test-lora", prompt="Generate with LoRA")
    )

    mock_engine.add_lora.assert_not_called()
    mock_engine.generate.assert_not_called()
|
||||
155
tests/entrypoints/openai/test_messages.py
Normal file
155
tests/entrypoints/openai/test_messages.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import anthropic
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Qwen3 server exposed under an Anthropic-style served-model name."""
    args = [
        "--max-model-len", "2048",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser", "hermes",
        "--served-model-name", "claude-3-7-sonnet-latest",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async Anthropic client bound to the module-scoped server."""
    async with server.get_async_client_anthropic() as anthropic_client:
        yield anthropic_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_simple_messages(client: anthropic.AsyncAnthropic):
    """A bare user message completes with an end_turn stop reason."""
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
    )

    assert resp.role == "assistant"
    assert resp.stop_reason == "end_turn"

    print(f"Anthropic response: {resp.model_dump_json()}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_system_message(client: anthropic.AsyncAnthropic):
    """A system prompt plus user message completes normally."""
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        system="you are a helpful assistant",
        messages=[{"role": "user", "content": "how are you!"}],
    )

    assert resp.role == "assistant"
    assert resp.stop_reason == "end_turn"

    print(f"Anthropic response: {resp.model_dump_json()}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
    """The stream is non-empty and message_start carries initial usage."""
    stream = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )

    message_start = None
    seen = 0
    async for event in stream:
        seen += 1
        # Remember the first message_start event for the usage checks below.
        if message_start is None and event.type == "message_start":
            message_start = event
        print(event.model_dump_json())

    assert seen > 0
    assert message_start is not None, "message_start chunk was never observed"
    assert message_start.message is not None, "first chunk should include message"
    assert message_start.message.usage is not None, (
        "first chunk should include usage stats"
    )
    assert message_start.message.usage.output_tokens == 0
    assert message_start.message.usage.input_tokens > 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
    """A weather question against a weather tool triggers tool_use."""
    weather_tool = {
        "name": "get_current_weather",
        "description": "Useful for querying the weather in a specified city.",
        "input_schema": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City or region, for example: "
                    "New York, London, Tokyo, etc.",
                }
            },
            "required": ["location"],
        },
    }

    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": "What's the weather like in New York today?"}
        ],
        tools=[weather_tool],
        stream=False,
    )

    assert resp.stop_reason == "tool_use"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
    """Stream a tool-call response and require the stream to be non-empty."""
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in New York today?",
            }
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=True,
    )

    chunk_count = 0
    async for chunk in resp:
        chunk_count += 1
        print(chunk.model_dump_json())

    # BUG FIX: this test previously had no assertions at all, so an empty or
    # broken stream would still pass. At minimum the stream must yield events.
    assert chunk_count > 0
|
||||
454
tests/entrypoints/openai/test_metrics.py
Normal file
454
tests/entrypoints/openai/test_metrics.py
Normal file
@@ -0,0 +1,454 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from http import HTTPStatus
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
from prometheus_client.parser import text_string_to_metric_families
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import version
|
||||
|
||||
from ...conftest import LocalAssetServer
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODELS = {
|
||||
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
|
||||
}
|
||||
PREV_MINOR_VERSION = version._prev_minor_version()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
    """Parametrize dependent fixtures/tests over every key in MODELS."""
    yield request.param
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def default_server_args():
    """Baseline CLI flags shared by every server parametrization."""
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "bfloat16",
        "--max-model-len", "1024",
        "--enforce-eager",
        "--max-num-seqs", "128",
    ]
|
||||
|
||||
|
||||
@pytest.fixture(
    scope="module",
    params=[
        "",
        "--enable-chunked-prefill",
        "--disable-frontend-multiprocessing",
        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
    ],
)
def server(model_key, default_server_args, request):
    """Start a server for `model_key`, optionally with one extra flag."""
    extra_flag = request.param
    if extra_flag:
        default_server_args.append(extra_flag)

    with RemoteOpenAIServer(MODELS[model_key], default_server_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the parametrized server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
_PROMPT = "Hello my name is Robert and I love magic"
|
||||
|
||||
|
||||
def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
|
||||
num_prompt_tokens = len(prompt_ids)
|
||||
|
||||
# {metric_family: [(suffix, expected_value)]}
|
||||
return {
|
||||
"vllm:time_to_first_token_seconds": [("_count", num_requests)],
|
||||
"vllm:time_per_output_token_seconds": [
|
||||
("_count", num_requests * (max_tokens - 1))
|
||||
],
|
||||
"vllm:e2e_request_latency_seconds": [("_count", num_requests)],
|
||||
"vllm:request_queue_time_seconds": [("_count", num_requests)],
|
||||
"vllm:request_inference_time_seconds": [("_count", num_requests)],
|
||||
"vllm:request_prefill_time_seconds": [("_count", num_requests)],
|
||||
"vllm:request_decode_time_seconds": [("_count", num_requests)],
|
||||
"vllm:request_prompt_tokens": [
|
||||
("_sum", num_requests * num_prompt_tokens),
|
||||
("_count", num_requests),
|
||||
],
|
||||
"vllm:request_generation_tokens": [
|
||||
("_sum", num_requests * max_tokens),
|
||||
("_count", num_requests),
|
||||
],
|
||||
"vllm:request_params_n": [("_count", num_requests)],
|
||||
"vllm:request_params_max_tokens": [
|
||||
("_sum", num_requests * max_tokens),
|
||||
("_count", num_requests),
|
||||
],
|
||||
"vllm:iteration_tokens_total": [
|
||||
(
|
||||
"_sum",
|
||||
num_requests * (num_prompt_tokens + max_tokens),
|
||||
),
|
||||
("_count", num_requests * max_tokens),
|
||||
],
|
||||
"vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
|
||||
"vllm:generation_tokens": [("_total", num_requests * max_tokens)],
|
||||
"vllm:request_success": [("_total", num_requests)],
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_metrics_counts(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Counters and histograms on /metrics must reflect the requests sent."""
    if model_key == "multimodal":
        pytest.skip("Unnecessary test")

    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)
    num_requests = 10
    max_tokens = 10

    for _ in range(num_requests):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
            model=model_name,
            prompt=prompt_ids,
            max_tokens=max_tokens,
        )

    response = requests.get(server.url_for("metrics"))
    print(response.text)
    assert response.status_code == HTTPStatus.OK

    # Check every expected metric family against the scraped endpoint.
    expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
    for metric_family, expected_suffixes in expected_values.items():
        hidden = (
            not server.show_hidden_metrics
            and metric_family in HIDDEN_DEPRECATED_METRICS
        )
        if metric_family not in EXPECTED_METRICS_V1 or hidden:
            continue

        family = next(
            (
                f
                for f in text_string_to_metric_families(response.text)
                if f.name == metric_family
            ),
            None,
        )
        assert family is not None, f"Did not find {metric_family} in prom endpoint"

        # Each expected (suffix, value) pair must appear with the right value.
        for suffix, expected_value in expected_suffixes:
            metric_name_w_suffix = f"{metric_family}{suffix}"
            sample = next(
                (s for s in family.samples if s.name == metric_name_w_suffix),
                None,
            )
            assert sample is not None, (
                f"Did not find {metric_name_w_suffix} in prom endpoint"
            )
            assert sample.value == expected_value, (
                f"{metric_name_w_suffix} expected value of "
                f"{expected_value} did not match found value "
                f"{sample.value}"
            )
|
||||
|
||||
|
||||
EXPECTED_METRICS_V1 = [
|
||||
"vllm:num_requests_running",
|
||||
"vllm:num_requests_waiting",
|
||||
"vllm:kv_cache_usage_perc",
|
||||
"vllm:prefix_cache_queries",
|
||||
"vllm:prefix_cache_hits",
|
||||
"vllm:num_preemptions_total",
|
||||
"vllm:prompt_tokens_total",
|
||||
"vllm:generation_tokens_total",
|
||||
"vllm:iteration_tokens_total",
|
||||
"vllm:cache_config_info",
|
||||
"vllm:request_success_total",
|
||||
"vllm:request_prompt_tokens_sum",
|
||||
"vllm:request_prompt_tokens_bucket",
|
||||
"vllm:request_prompt_tokens_count",
|
||||
"vllm:request_generation_tokens_sum",
|
||||
"vllm:request_generation_tokens_bucket",
|
||||
"vllm:request_generation_tokens_count",
|
||||
"vllm:request_params_n_sum",
|
||||
"vllm:request_params_n_bucket",
|
||||
"vllm:request_params_n_count",
|
||||
"vllm:request_params_max_tokens_sum",
|
||||
"vllm:request_params_max_tokens_bucket",
|
||||
"vllm:request_params_max_tokens_count",
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
"vllm:inter_token_latency_seconds_sum",
|
||||
"vllm:inter_token_latency_seconds_bucket",
|
||||
"vllm:inter_token_latency_seconds_count",
|
||||
"vllm:e2e_request_latency_seconds_sum",
|
||||
"vllm:e2e_request_latency_seconds_bucket",
|
||||
"vllm:e2e_request_latency_seconds_count",
|
||||
"vllm:request_queue_time_seconds_sum",
|
||||
"vllm:request_queue_time_seconds_bucket",
|
||||
"vllm:request_queue_time_seconds_count",
|
||||
"vllm:request_inference_time_seconds_sum",
|
||||
"vllm:request_inference_time_seconds_bucket",
|
||||
"vllm:request_inference_time_seconds_count",
|
||||
"vllm:request_prefill_time_seconds_sum",
|
||||
"vllm:request_prefill_time_seconds_bucket",
|
||||
"vllm:request_prefill_time_seconds_count",
|
||||
"vllm:request_decode_time_seconds_sum",
|
||||
"vllm:request_decode_time_seconds_bucket",
|
||||
"vllm:request_decode_time_seconds_count",
|
||||
]
|
||||
|
||||
EXPECTED_METRICS_MM = [
|
||||
"vllm:mm_cache_queries",
|
||||
"vllm:mm_cache_hits",
|
||||
]
|
||||
|
||||
HIDDEN_DEPRECATED_METRICS: list[str] = [
|
||||
"vllm:gpu_cache_usage_perc",
|
||||
"vllm:gpu_prefix_cache_queries",
|
||||
"vllm:gpu_prefix_cache_hits",
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_metrics_exist(
    local_asset_server: LocalAssetServer,
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Every expected metric family must appear on the /metrics endpoint."""
    model_name = MODELS[model_key]

    # sending a request triggers the metrics to be logged.
    if model_key == "text":
        await client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=5,
            temperature=0.0,
        )
    else:
        # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
        image_url = local_asset_server.url_for(
            "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
        )
        await client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": "What's in this image?"},
                    ],
                }
            ],
            max_tokens=5,
            temperature=0.0,
        )

    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    expected_metrics = EXPECTED_METRICS_V1
    if model_key == "multimodal":
        # NOTE: Don't use in-place assignment
        expected_metrics = expected_metrics + EXPECTED_METRICS_MM

    for metric in expected_metrics:
        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
            continue
        assert metric in response.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_abort_metrics_reset(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Gauges must return to zero after in-flight requests are aborted."""
    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)

    # Baseline: nothing running, nothing waiting, empty KV cache.
    running, waiting, kv_usage = _get_running_metrics_from_api(server)
    assert running == 0
    assert waiting == 0
    assert kv_usage == 0.0

    # Start some long-running requests that we can abort
    tasks = [
        asyncio.create_task(
            client.completions.create(
                model=model_name,
                prompt=prompt_ids,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0,
            )
        )
        for _ in range(3)
    ]

    # Wait a bit for requests to start processing
    await asyncio.sleep(0.5)

    # Now there should be running requests and non-empty KV cache.
    running, waiting, kv_usage = _get_running_metrics_from_api(server)
    assert running > 0
    assert kv_usage > 0

    # Cancel all tasks to abort the requests
    for task in tasks:
        task.cancel()

    # Wait for cancellations to be processed
    await asyncio.sleep(1.0)

    # Check that metrics have reset to zero
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
        _get_running_metrics_from_api(server)
    )

    assert running_requests_after == 0, (
        f"Expected 0 running requests after abort, got {running_requests_after}"
    )
    assert waiting_requests_after == 0, (
        f"Expected 0 waiting requests after abort, got {waiting_requests_after}"
    )
    assert kv_cache_usage_after == 0, (
        f"Expected 0% KV cache usage after abort, got {kv_cache_usage_after}"
    )
|
||||
|
||||
|
||||
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)"""
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    # Collect the three gauges in one pass over the scraped families.
    wanted = {
        "vllm:num_requests_running": None,
        "vllm:num_requests_waiting": None,
        "vllm:kv_cache_usage_perc": None,
    }
    for family in text_string_to_metric_families(response.text):
        if family.name in wanted:
            for sample in family.samples:
                if sample.name == family.name:
                    wanted[family.name] = sample.value
                    break

    # All three gauges must be present on the endpoint.
    assert wanted["vllm:num_requests_running"] is not None
    assert wanted["vllm:num_requests_waiting"] is not None
    assert wanted["vllm:kv_cache_usage_perc"] is not None

    return (
        wanted["vllm:num_requests_running"],
        wanted["vllm:num_requests_waiting"],
        wanted["vllm:kv_cache_usage_perc"],
    )
|
||||
|
||||
|
||||
def test_metrics_exist_run_batch():
    """run_batch must expose a /metrics endpoint while processing a batch."""
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "0.0.0.0"
    port = "8001"
    server_url = f"http://{base_url}:{port}"

    def is_server_up(url):
        try:
            response = requests.get(url)
            return response.status_code == 200
        except requests.ConnectionError:
            return False

    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(input_batch)
        input_file.flush()
        proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.openai.run_batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "intfloat/multilingual-e5-small",
                "--enable-metrics",
                "--url",
                base_url,
                "--port",
                port,
            ],
        )
        try:
            # BUG FIX: the readiness poll previously looped forever; bound it
            # so a server that never comes up (or a subprocess that dies)
            # fails the test instead of hanging CI.
            deadline = time.monotonic() + 300
            while not is_server_up(server_url):
                assert proc.poll() is None, "run_batch exited before serving metrics"
                assert time.monotonic() < deadline, (
                    "timed out waiting for run_batch metrics server"
                )
                time.sleep(1)

            response = requests.get(server_url + "/metrics")
            assert response.status_code == HTTPStatus.OK

            proc.wait()
        finally:
            # BUG FIX: don't leak the subprocess when an assertion fires.
            if proc.poll() is None:
                proc.terminate()
                proc.wait()
|
||||
56
tests/entrypoints/openai/test_models.py
Normal file
56
tests/entrypoints/openai/test_models.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
|
||||
# generation quality here
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server(qwen3_lora_files):
    """LoRA-enabled Qwen3 server with one pre-registered adapter."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "bfloat16",
        "--max-model-len", "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules", f"qwen3-lora={qwen3_lora_files}",
        "--max-lora-rank", "64",
        "--max-cpu-loras", "2",
        "--max-num-seqs", "128",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped LoRA server."""
    async with server.get_async_client() as openai_client:
        yield openai_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files):
    """The base model is listed first, followed by the LoRA modules."""
    listed = (await client.models.list()).data
    served_model, *lora_models = listed

    assert served_model.id == MODEL_NAME
    assert served_model.root == MODEL_NAME
    assert all(lora.root == qwen3_lora_files for lora in lora_models)
    assert lora_models[0].id == "qwen3-lora"
|
||||
42
tests/entrypoints/openai/test_oot_registration.py
Normal file
42
tests/entrypoints/openai/test_oot_registration.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from ...utils import VLLM_PATH, RemoteOpenAIServer
|
||||
|
||||
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
|
||||
assert chatml_jinja_path.exists()
|
||||
|
||||
|
||||
def run_and_test_dummy_opt_api_server(model, tp=1):
    """Boot an API server for the plugin-registered dummy model and check
    that a chat completion emits nothing beyond the BOS token."""
    # the model is registered through the plugin
    server_args = [
        "--gpu-memory-utilization", "0.10",
        "--dtype", "float32",
        "--chat-template", str(chatml_jinja_path),
        "--load-format", "dummy",
        "-tp", f"{tp}",
    ]
    with RemoteOpenAIServer(model, server_args) as server:
        client = server.get_client()
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hello!"},
            ],
            temperature=0,
        )
        generated_text = completion.choices[0].message.content
        assert generated_text is not None
        # make sure only the first token is generated
        assert generated_text.replace("<s>", "") == ""
|
||||
|
||||
|
||||
def test_oot_registration_for_api_server(dummy_opt_path: str):
    """End-to-end check of out-of-tree model registration via the API server."""
    run_and_test_dummy_opt_api_server(dummy_opt_path)
|
||||
145
tests/entrypoints/openai/test_openai_schema.py
Normal file
145
tests/entrypoints/openai/test_openai_schema.py
Normal file
@@ -0,0 +1,145 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from typing import Final
|
||||
|
||||
import pytest
|
||||
import schemathesis
|
||||
from hypothesis import settings
|
||||
from schemathesis import GenerationConfig
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
schemathesis.experimental.OPEN_API_3_1.enable()
|
||||
|
||||
MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
|
||||
MAXIMUM_IMAGES = 2
|
||||
DEFAULT_TIMEOUT_SECONDS: Final[int] = 10
|
||||
LONG_TIMEOUT_SECONDS: Final[int] = 60
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--runner",
|
||||
"generate",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"5",
|
||||
"--enforce-eager",
|
||||
"--trust-remote-code",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def get_schema(server):
|
||||
# avoid generating null (\x00) bytes in strings during test case generation
|
||||
return schemathesis.openapi.from_uri(
|
||||
f"{server.url_root}/openapi.json",
|
||||
generation_config=GenerationConfig(allow_x00=False),
|
||||
)
|
||||
|
||||
|
||||
schema = schemathesis.from_pytest_fixture("get_schema")
|
||||
|
||||
|
||||
@schemathesis.hook
|
||||
def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
|
||||
op = context.operation
|
||||
assert op is not None
|
||||
|
||||
def no_invalid_types(case: schemathesis.models.Case):
|
||||
"""
|
||||
This filter skips test cases with invalid data that schemathesis
|
||||
incorrectly generates due to permissive schema configurations.
|
||||
|
||||
1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
|
||||
message content, which isn't implemented.
|
||||
|
||||
2. Skips tool_calls with `"type": "custom"` which schemathesis
|
||||
incorrectly generates instead of the valid `"type": "function"`.
|
||||
|
||||
Example test cases that are skipped:
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
http://localhost:8000/tokenize
|
||||
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
|
||||
http://localhost:8000/v1/chat/completions
|
||||
""" # noqa: E501
|
||||
if hasattr(case, "body") and isinstance(case.body, dict):
|
||||
if (
|
||||
"messages" in case.body
|
||||
and isinstance(case.body["messages"], list)
|
||||
and len(case.body["messages"]) > 0
|
||||
):
|
||||
for message in case.body["messages"]:
|
||||
if not isinstance(message, dict):
|
||||
continue
|
||||
|
||||
# Check for invalid file type in tokenize endpoint
|
||||
if op.method.lower() == "post" and op.path == "/tokenize":
|
||||
content = message.get("content", [])
|
||||
if (
|
||||
isinstance(content, list)
|
||||
and len(content) > 0
|
||||
and any(item.get("type") == "file" for item in content)
|
||||
):
|
||||
return False
|
||||
|
||||
# Check for invalid tool_calls with non-function types
|
||||
tool_calls = message.get("tool_calls", [])
|
||||
if isinstance(tool_calls, list):
|
||||
for tool_call in tool_calls:
|
||||
if isinstance(tool_call, dict):
|
||||
if tool_call.get("type") != "function":
|
||||
return False
|
||||
if "custom" in tool_call:
|
||||
return False
|
||||
|
||||
# Sometimes structured_outputs.grammar is generated to be empty
|
||||
# Causing a server error in EBNF grammar parsing
|
||||
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
|
||||
structured_outputs = case.body.get("structured_outputs", {})
|
||||
grammar = (
|
||||
structured_outputs.get("grammar")
|
||||
if isinstance(structured_outputs, dict)
|
||||
else None
|
||||
)
|
||||
|
||||
if grammar == "":
|
||||
# Allow None (will be handled as no grammar)
|
||||
# But skip empty strings
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
return strategy.filter(no_invalid_types)
|
||||
|
||||
|
||||
@schema.parametrize()
|
||||
@schema.override(headers={"Content-Type": "application/json"})
|
||||
@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
|
||||
def test_openapi_stateless(case: schemathesis.Case):
|
||||
key = (
|
||||
case.operation.method.upper(),
|
||||
case.operation.path,
|
||||
)
|
||||
if case.operation.path.startswith("/v1/responses"):
|
||||
# Skip responses API as it is meant to be stateful.
|
||||
return
|
||||
|
||||
timeout = {
|
||||
# requires a longer timeout
|
||||
("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
|
||||
}.get(key, DEFAULT_TIMEOUT_SECONDS)
|
||||
|
||||
# No need to verify SSL certificate for localhost
|
||||
case.call_and_validate(verify=False, timeout=timeout)
|
||||
118
tests/entrypoints/openai/test_optional_middleware.py
Normal file
118
tests/entrypoints/openai/test_optional_middleware.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for middleware that's off by default and can be toggled through
|
||||
server arguments, mainly --api-key and --enable-request-id-headers.
|
||||
"""
|
||||
|
||||
from http import HTTPStatus
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# Use a small embeddings model for faster startup and smaller memory footprint.
|
||||
# Since we are not testing any chat functionality,
|
||||
# using a chat capable model is overkill.
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(request: pytest.FixtureRequest):
|
||||
passed_params = []
|
||||
if hasattr(request, "param"):
|
||||
passed_params = request.param
|
||||
if isinstance(passed_params, str):
|
||||
passed_params = [passed_params]
|
||||
|
||||
args = [
|
||||
"--runner",
|
||||
"pooling",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--max-model-len",
|
||||
"512",
|
||||
"--enforce-eager",
|
||||
"--max-num-seqs",
|
||||
"2",
|
||||
*passed_params,
|
||||
]
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_api_token(server: RemoteOpenAIServer):
|
||||
response = requests.get(server.url_for("v1/models"))
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_request_id_header(server: RemoteOpenAIServer):
|
||||
response = requests.get(server.url_for("health"))
|
||||
assert "X-Request-Id" not in response.headers
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"server",
|
||||
[["--api-key", "test"]],
|
||||
indirect=True,
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_missing_api_token(server: RemoteOpenAIServer):
|
||||
response = requests.get(server.url_for("v1/models"))
|
||||
assert response.status_code == HTTPStatus.UNAUTHORIZED
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"server",
|
||||
[["--api-key", "test"]],
|
||||
indirect=True,
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_passed_api_token(server: RemoteOpenAIServer):
|
||||
response = requests.get(
|
||||
server.url_for("v1/models"), headers={"Authorization": "Bearer test"}
|
||||
)
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"server",
|
||||
[["--api-key", "test"]],
|
||||
indirect=True,
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_not_v1_api_token(server: RemoteOpenAIServer):
|
||||
# Authorization check is skipped for any paths that
|
||||
# don't start with /v1 (e.g. /v1/chat/completions).
|
||||
response = requests.get(server.url_for("health"))
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"server",
|
||||
["--enable-request-id-headers"],
|
||||
indirect=True,
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_enable_request_id_header(server: RemoteOpenAIServer):
|
||||
response = requests.get(server.url_for("health"))
|
||||
assert "X-Request-Id" in response.headers
|
||||
assert len(response.headers.get("X-Request-Id", "")) == 32
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"server",
|
||||
["--enable-request-id-headers"],
|
||||
indirect=True,
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_custom_request_id_header(server: RemoteOpenAIServer):
|
||||
response = requests.get(
|
||||
server.url_for("health"), headers={"X-Request-Id": "Custom"}
|
||||
)
|
||||
assert "X-Request-Id" in response.headers
|
||||
assert response.headers.get("X-Request-Id") == "Custom"
|
||||
126
tests/entrypoints/openai/test_orca_metrics.py
Normal file
126
tests/entrypoints/openai/test_orca_metrics.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def monkeypatch_module():
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
|
||||
mpatch = MonkeyPatch()
|
||||
yield mpatch
|
||||
mpatch.undo()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[True])
|
||||
def server(request, monkeypatch_module):
|
||||
args = [
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"--enforce-eager",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_completion_with_orca_header(server: RemoteOpenAIServer):
|
||||
messages = [
|
||||
{"role": "system", "content": "you are a helpful assistant"},
|
||||
{"role": "user", "content": "what is 1+1?"},
|
||||
]
|
||||
|
||||
client = openai.OpenAI(
|
||||
api_key="EMPTY",
|
||||
base_url=f"http://localhost:{server.port}/v1",
|
||||
default_headers={"endpoint-load-metrics-format": "TEXT"},
|
||||
)
|
||||
|
||||
# 1. Use raw client to get response headers.
|
||||
raw_client = client.with_raw_response
|
||||
|
||||
# 2. Make the API call using the raw_client
|
||||
response_with_raw = raw_client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
extra_headers={"endpoint-load-metrics-format": "TEXT"},
|
||||
)
|
||||
|
||||
# 3. Access the raw httpx.Response object
|
||||
raw_http_response = response_with_raw.http_response
|
||||
|
||||
# 4. Get the headers from the httpx.Response object
|
||||
response_headers = raw_http_response.headers
|
||||
|
||||
assert "endpoint-load-metrics" in response_headers
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_completion_with_orca_header(client: openai.AsyncOpenAI):
|
||||
# 1. Use raw client to get response headers.
|
||||
raw_client = client.with_raw_response
|
||||
|
||||
# 2. Make the API call using the raw_client
|
||||
completion = await raw_client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
extra_headers={"endpoint-load-metrics-format": "JSON"},
|
||||
)
|
||||
|
||||
# 3. Access the raw httpx.Response object
|
||||
raw_http_response = completion.http_response
|
||||
|
||||
# 4. Get the headers from the httpx.Response object
|
||||
response_headers = raw_http_response.headers
|
||||
|
||||
assert "endpoint-load-metrics" in response_headers
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_completion(client: openai.AsyncOpenAI):
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
extra_headers={"endpoint-load-metrics-format": "JSON"},
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
assert completion.id is not None
|
||||
assert completion.choices is not None and len(completion.choices) == 1
|
||||
|
||||
choice = completion.choices[0]
|
||||
assert len(choice.text) >= 5
|
||||
assert choice.finish_reason == "length"
|
||||
# When using Qwen3-0.6B, prompt tokens=[9707, 11, 847, 829, 374]
|
||||
assert completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=5, prompt_tokens=5, total_tokens=10
|
||||
)
|
||||
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert len(completion.choices[0].text) >= 1
|
||||
assert completion.choices[0].prompt_logprobs is None
|
||||
118
tests/entrypoints/openai/test_prompt_validation.py
Normal file
118
tests/entrypoints/openai/test_prompt_validation.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import io
|
||||
from unittest.mock import Mock
|
||||
|
||||
# imports for structured outputs tests
|
||||
import openai
|
||||
import pybase64
|
||||
import pytest
|
||||
import regex as re
|
||||
import torch
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.renderer import CompletionRenderer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_prompt():
|
||||
model_name = "gpt2"
|
||||
server_args = ["--enforce-eager"]
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
|
||||
with pytest.raises(
|
||||
openai.BadRequestError,
|
||||
match="Either prompt or prompt_embeds must be provided and non-empty.",
|
||||
):
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt="",
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": []},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_out_of_vocab_token_ids():
|
||||
model_name = "gpt2"
|
||||
server_args = ["--enforce-eager"]
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
|
||||
with pytest.raises(
|
||||
openai.BadRequestError, match=re.compile(".*out of vocabulary.*").pattern
|
||||
):
|
||||
await client.completions.create(
|
||||
model=model_name, prompt=[999999], max_tokens=5, temperature=0.0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize(
|
||||
"layout", [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]
|
||||
)
|
||||
@pytest.mark.parametrize("seq_len", [2, 10])
|
||||
@pytest.mark.parametrize("hidden_size", [2, 10])
|
||||
def test_load_prompt_embeds(
|
||||
dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
|
||||
):
|
||||
model_config = Mock(spec=ModelConfig)
|
||||
model_config.enable_prompt_embeds = True
|
||||
renderer = CompletionRenderer(model_config, tokenizer=None)
|
||||
|
||||
# construct arbitrary tensors of various dtypes, layouts, and sizes.
|
||||
# We need to check against different layouts to make sure that if a user
|
||||
# uses sparse tensors to reduce the transmission size of prompt embeddings,
|
||||
# we must cast them to dense/strided before passing them into the engine.
|
||||
# We don't use non-CPU tensors in this test to avoid preemptively
|
||||
# initializing cuda and break other tests in the suite that fork processes.
|
||||
# We also need to make sure that we only use devices that are actually
|
||||
# available in the environment the test is running on. For simplicity,
|
||||
# we just test against CPU.
|
||||
tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
|
||||
if layout == torch.strided:
|
||||
tensor = tensor.contiguous()
|
||||
elif layout == torch.sparse_coo:
|
||||
tensor = tensor.to_sparse_coo()
|
||||
elif layout == torch.sparse_csc:
|
||||
tensor = tensor.to_sparse_csc()
|
||||
elif layout == torch.sparse_csr:
|
||||
tensor = tensor.to_sparse_csr()
|
||||
|
||||
buffer = io.BytesIO()
|
||||
torch.save(tensor, buffer)
|
||||
buffer.seek(0)
|
||||
encoded_tensor = pybase64.b64encode(buffer.getvalue())
|
||||
|
||||
loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
|
||||
assert len(loaded_prompt_embeds) == 1
|
||||
loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
|
||||
assert loaded_tensor.device.type == "cpu"
|
||||
assert loaded_tensor.layout == torch.strided
|
||||
torch.testing.assert_close(
|
||||
loaded_tensor, tensor.to("cpu").to_dense(), equal_nan=True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [torch.float32])
|
||||
@pytest.mark.parametrize("seq_len", [2])
|
||||
@pytest.mark.parametrize("hidden_size", [2])
|
||||
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
|
||||
model_config = Mock(spec=ModelConfig)
|
||||
model_config.enable_prompt_embeds = False
|
||||
renderer = CompletionRenderer(model_config, tokenizer=None)
|
||||
|
||||
tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
|
||||
|
||||
buffer = io.BytesIO()
|
||||
torch.save(tensor, buffer)
|
||||
buffer.seek(0)
|
||||
encoded_tensor = pybase64.b64encode(buffer.getvalue())
|
||||
|
||||
with pytest.raises(ValueError, match="--enable-prompt-embeds"):
|
||||
renderer.load_prompt_embeds(encoded_tensor)
|
||||
36
tests/entrypoints/openai/test_protocol.py
Normal file
36
tests/entrypoints/openai/test_protocol.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from openai_harmony import (
|
||||
Message,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
|
||||
|
||||
|
||||
def test_serialize_message() -> None:
|
||||
dict_value = {"a": 1, "b": "2"}
|
||||
assert serialize_message(dict_value) == dict_value
|
||||
|
||||
msg_value = {
|
||||
"role": "assistant",
|
||||
"name": None,
|
||||
"content": [{"type": "text", "text": "Test 1"}],
|
||||
"channel": "analysis",
|
||||
}
|
||||
msg = Message.from_dict(msg_value)
|
||||
assert serialize_message(msg) == msg_value
|
||||
|
||||
|
||||
def test_serialize_messages() -> None:
|
||||
assert serialize_messages(None) is None
|
||||
assert serialize_messages([]) is None
|
||||
|
||||
dict_value = {"a": 3, "b": "4"}
|
||||
msg_value = {
|
||||
"role": "assistant",
|
||||
"name": None,
|
||||
"content": [{"type": "text", "text": "Test 2"}],
|
||||
"channel": "analysis",
|
||||
}
|
||||
msg = Message.from_dict(msg_value)
|
||||
assert serialize_messages([msg, dict_value]) == [msg_value, dict_value]
|
||||
261
tests/entrypoints/openai/test_response_api_mcp_tools.py
Normal file
261
tests/entrypoints/openai/test_response_api_mcp_tools.py
Normal file
@@ -0,0 +1,261 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai import OpenAI
|
||||
from openai_harmony import ToolDescription, ToolNamespaceConfig
|
||||
|
||||
from vllm.entrypoints.tool_server import MCPToolServer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/gpt-oss-20b"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def monkeypatch_module():
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
|
||||
mpatch = MonkeyPatch()
|
||||
yield mpatch
|
||||
mpatch.undo()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch):
|
||||
args = ["--enforce-eager", "--tool-server", "demo"]
|
||||
|
||||
with monkeypatch_module.context() as m:
|
||||
m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
|
||||
m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
|
||||
# Helps the model follow instructions better
|
||||
m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch):
|
||||
args = ["--enforce-eager", "--tool-server", "demo"]
|
||||
|
||||
with monkeypatch_module.context() as m:
|
||||
m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
|
||||
m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
|
||||
m.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container")
|
||||
# Helps the model follow instructions better
|
||||
m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def mcp_disabled_client(mcp_disabled_server):
|
||||
async with mcp_disabled_server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def mcp_enabled_client(mcp_enabled_server):
|
||||
async with mcp_enabled_server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, model_name: str):
|
||||
response = await mcp_enabled_client.responses.create(
|
||||
model=model_name,
|
||||
input=(
|
||||
"Execute the following code: "
|
||||
"import random; print(random.randint(1, 1000000))"
|
||||
),
|
||||
instructions=(
|
||||
"You must use the Python tool to execute code. Never simulate execution."
|
||||
),
|
||||
tools=[
|
||||
{
|
||||
"type": "mcp",
|
||||
"server_label": "code_interpreter",
|
||||
# URL unused for DemoToolServer
|
||||
"server_url": "http://localhost:8888",
|
||||
}
|
||||
],
|
||||
extra_body={"enable_response_messages": True},
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
# Verify output messages: Tool calls and responses on analysis channel
|
||||
tool_call_found = False
|
||||
tool_response_found = False
|
||||
for message in response.output_messages:
|
||||
recipient = message.get("recipient")
|
||||
if recipient and recipient.startswith("python"):
|
||||
tool_call_found = True
|
||||
assert message.get("channel") == "analysis", (
|
||||
"Tool call should be on analysis channel"
|
||||
)
|
||||
author = message.get("author", {})
|
||||
if (
|
||||
author.get("role") == "tool"
|
||||
and author.get("name")
|
||||
and author.get("name").startswith("python")
|
||||
):
|
||||
tool_response_found = True
|
||||
assert message.get("channel") == "analysis", (
|
||||
"Tool response should be on analysis channel"
|
||||
)
|
||||
|
||||
assert tool_call_found, "Should have found at least one Python tool call"
|
||||
assert tool_response_found, "Should have found at least one Python tool response"
|
||||
for message in response.input_messages:
|
||||
assert message.get("author").get("role") != "developer", (
|
||||
"No developer messages should be present with valid mcp tool"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_mcp_tool_with_allowed_tools_star(
|
||||
mcp_enabled_client: OpenAI, model_name: str
|
||||
):
|
||||
"""Test MCP tool with allowed_tools=['*'] to select all available tools.
|
||||
|
||||
This E2E test verifies that the "*" wildcard works end-to-end.
|
||||
See test_serving_responses.py for detailed unit tests of "*" normalization.
|
||||
"""
|
||||
response = await mcp_enabled_client.responses.create(
|
||||
model=model_name,
|
||||
input=(
|
||||
"Execute the following code: "
|
||||
"import random; print(random.randint(1, 1000000))"
|
||||
),
|
||||
instructions=(
|
||||
"You must use the Python tool to execute code. Never simulate execution."
|
||||
),
|
||||
tools=[
|
||||
{
|
||||
"type": "mcp",
|
||||
"server_label": "code_interpreter",
|
||||
"server_url": "http://localhost:8888",
|
||||
# Using "*" to allow all tools from this MCP server
|
||||
"allowed_tools": ["*"],
|
||||
}
|
||||
],
|
||||
extra_body={"enable_response_messages": True},
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
# Verify tool calls work with allowed_tools=["*"]
|
||||
tool_call_found = False
|
||||
for message in response.output_messages:
|
||||
recipient = message.get("recipient")
|
||||
if recipient and recipient.startswith("python"):
|
||||
tool_call_found = True
|
||||
break
|
||||
assert tool_call_found, "Should have found at least one Python tool call with '*'"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_name: str):
|
||||
response = await mcp_disabled_client.responses.create(
|
||||
model=model_name,
|
||||
input=(
|
||||
"Execute the following code if the tool is present: "
|
||||
"import random; print(random.randint(1, 1000000))"
|
||||
),
|
||||
tools=[
|
||||
{
|
||||
"type": "mcp",
|
||||
"server_label": "code_interpreter",
|
||||
# URL unused for DemoToolServer
|
||||
"server_url": "http://localhost:8888",
|
||||
}
|
||||
],
|
||||
extra_body={"enable_response_messages": True},
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
# Verify output messages: No tool calls and responses
|
||||
tool_call_found = False
|
||||
tool_response_found = False
|
||||
for message in response.output_messages:
|
||||
recipient = message.get("recipient")
|
||||
if recipient and recipient.startswith("python"):
|
||||
tool_call_found = True
|
||||
assert message.get("channel") == "analysis", (
|
||||
"Tool call should be on analysis channel"
|
||||
)
|
||||
author = message.get("author", {})
|
||||
if (
|
||||
author.get("role") == "tool"
|
||||
and author.get("name")
|
||||
and author.get("name").startswith("python")
|
||||
):
|
||||
tool_response_found = True
|
||||
assert message.get("channel") == "analysis", (
|
||||
"Tool response should be on analysis channel"
|
||||
)
|
||||
|
||||
assert not tool_call_found, "Should not have a python call"
|
||||
assert not tool_response_found, "Should not have a tool response"
|
||||
for message in response.input_messages:
|
||||
assert message.get("author").get("role") != "developer", (
|
||||
"No developer messages should be present without a valid tool"
|
||||
)
|
||||
|
||||
|
||||
def test_get_tool_description():
|
||||
"""Test MCPToolServer.get_tool_description filtering logic.
|
||||
|
||||
Note: The wildcard "*" is normalized to None by
|
||||
_extract_allowed_tools_from_mcp_requests before reaching this layer,
|
||||
so we only test None and specific tool filtering here.
|
||||
See test_serving_responses.py for "*" normalization tests.
|
||||
"""
|
||||
pytest.importorskip("mcp")
|
||||
|
||||
server = MCPToolServer()
|
||||
tool1 = ToolDescription.new(
|
||||
name="tool1", description="First", parameters={"type": "object"}
|
||||
)
|
||||
tool2 = ToolDescription.new(
|
||||
name="tool2", description="Second", parameters={"type": "object"}
|
||||
)
|
||||
tool3 = ToolDescription.new(
|
||||
name="tool3", description="Third", parameters={"type": "object"}
|
||||
)
|
||||
|
||||
server.harmony_tool_descriptions = {
|
||||
"test_server": ToolNamespaceConfig(
|
||||
name="test_server", description="test", tools=[tool1, tool2, tool3]
|
||||
)
|
||||
}
|
||||
|
||||
# Nonexistent server
|
||||
assert server.get_tool_description("nonexistent") is None
|
||||
|
||||
# None (no filter) - returns all tools
|
||||
result = server.get_tool_description("test_server", allowed_tools=None)
|
||||
assert len(result.tools) == 3
|
||||
|
||||
# Filter to specific tools
|
||||
result = server.get_tool_description(
|
||||
"test_server", allowed_tools=["tool1", "tool3"]
|
||||
)
|
||||
assert len(result.tools) == 2
|
||||
assert result.tools[0].name == "tool1"
|
||||
assert result.tools[1].name == "tool3"
|
||||
|
||||
# Single tool
|
||||
result = server.get_tool_description("test_server", allowed_tools=["tool2"])
|
||||
assert len(result.tools) == 1
|
||||
assert result.tools[0].name == "tool2"
|
||||
|
||||
# No matching tools - returns None
|
||||
result = server.get_tool_description("test_server", allowed_tools=["nonexistent"])
|
||||
assert result is None
|
||||
|
||||
# Empty list - returns None
|
||||
assert server.get_tool_description("test_server", allowed_tools=[]) is None
|
||||
180
tests/entrypoints/openai/test_response_api_parsable_context.py
Normal file
180
tests/entrypoints/openai/test_response_api_parsable_context.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai import OpenAI
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-8B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
assert importlib.util.find_spec("gpt_oss") is not None, (
|
||||
"Harmony tests require gpt_oss package to be installed"
|
||||
)
|
||||
|
||||
args = [
|
||||
"--reasoning-parser",
|
||||
"qwen3",
|
||||
"--max_model_len",
|
||||
"5000",
|
||||
"--structured-outputs-config.backend",
|
||||
"xgrammar",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"hermes",
|
||||
"--tool-server",
|
||||
"demo",
|
||||
]
|
||||
env_dict = dict(
|
||||
VLLM_ENABLE_RESPONSES_API_STORE="1",
|
||||
VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
|
||||
PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
|
||||
)
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_basic(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
)
|
||||
assert response is not None
|
||||
print("response: ", response)
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
    """Replay reasoning / function_call / function_call_output items as input
    and verify the model produces a fresh reasoning + message turn."""
    reasoning_item = {
        "type": "reasoning",
        "id": "lol",
        "content": [
            {
                "type": "reasoning_text",
                "text": "We need to respond: greeting.",
            }
        ],
        "summary": [],
    }
    function_call_item = {
        "arguments": '{"location": "Paris", "unit": "celsius"}',
        "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
        "name": "get_weather",
        "type": "function_call",
        "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
        "status": "completed",
    }
    function_output_item = {
        "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
        "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
        "output": "The weather in Paris is 20 Celsius",
        "status": "completed",
        "type": "function_call_output",
    }

    resp = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            reasoning_item,
            function_call_item,
            function_output_item,
        ],
        temperature=0.0,
    )
    assert resp is not None
    assert resp.status == "completed"
    # make sure we get a reasoning and text output
    assert resp.output[0].type == "reasoning"
    assert resp.output[1].type == "message"
    assert type(resp.output[1].content[0].text) is str
|
||||
|
||||
|
||||
def get_horoscope(sign):
    """Return a canned horoscope line for *sign* (test stand-in for a real tool)."""
    return f"{sign}: Next Tuesday you will befriend a baby otter."
|
||||
|
||||
|
||||
def call_function(name, args):
    """Dispatch a tool call by *name*; only get_horoscope is known here.

    Raises ValueError for any unrecognized tool name.
    """
    if name != "get_horoscope":
        raise ValueError(f"Unknown function: {name}")
    return get_horoscope(**args)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_first_turn(client: OpenAI, model_name: str):
    """First turn with a tool offered: the model should emit reasoning and a
    well-formed function_call for get_horoscope."""
    horoscope_tool = {
        "type": "function",
        "name": "get_horoscope",
        "description": "Get today's horoscope for an astrological sign.",
        "parameters": {
            "type": "object",
            "properties": {
                "sign": {"type": "string"},
            },
            "required": ["sign"],
            "additionalProperties": False,
        },
        "strict": True,
    }

    resp = await client.responses.create(
        model=model_name,
        input="What is the horoscope for Aquarius today?",
        tools=[horoscope_tool],
        temperature=0.0,
    )
    assert resp is not None
    assert resp.status == "completed"
    assert len(resp.output) == 2
    assert resp.output[0].type == "reasoning"
    assert resp.output[1].type == "function_call"

    call = resp.output[1]
    assert call.name == "get_horoscope"
    assert call.call_id is not None

    parsed_args = json.loads(call.arguments)
    assert "sign" in parsed_args
|
||||
|
||||
# the multi turn function call is tested above in
|
||||
# test_reasoning_and_function_items
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_call(client: OpenAI, model_name: str):
    """Code-interpreter round trip: reasoning -> mcp_call -> reasoning ->
    final message containing the computed product."""
    resp = await client.responses.create(
        model=model_name,
        input="What is 13 * 24? Use python to calculate the result.",
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        temperature=0.0,
    )

    assert resp is not None
    assert resp.status == "completed"
    assert resp.output[0].type == "reasoning"
    mcp_call = resp.output[1]
    assert mcp_call.type == "mcp_call"
    assert type(mcp_call.arguments) is str
    assert type(mcp_call.output) is str
    assert resp.output[2].type == "reasoning"
    # make sure the correct math is in the final output
    assert resp.output[3].type == "message"
    assert "312" in resp.output[3].content[0].text
|
||||
89
tests/entrypoints/openai/test_response_api_simple.py
Normal file
89
tests/entrypoints/openai/test_response_api_simple.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai import OpenAI
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-8B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start a plain Qwen3 server with the responses-API store enabled."""
    cli_args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
    extra_env = {
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
        # uncomment for tool calling
        # "PYTHON_EXECUTION_BACKEND": "dangerously_use_uv",
    }

    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=extra_env) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client connected to the module-scoped server."""
    async with server.get_async_client() as c:
        yield c
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """A plain prompt through the responses API should complete."""
    resp = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
    )
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_enable_response_messages(client: OpenAI, model_name: str):
    """With enable_response_messages set, raw token-level message records are
    attached to both the input and output sides of the response."""
    resp = await client.responses.create(
        model=model_name,
        input="Hello?",
        extra_body={"enable_response_messages": True},
    )
    assert resp.status == "completed"

    first_input = resp.input_messages[0]
    assert first_input["type"] == "raw_message_tokens"
    assert type(first_input["message"]) is str
    assert len(first_input["message"]) > 10
    assert type(first_input["tokens"][0]) is int

    first_output = resp.output_messages[0]
    assert type(first_output["message"]) is str
    assert len(first_output["message"]) > 10
    assert type(first_output["tokens"][0]) is int
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """A replayed reasoning item in the input must be accepted, and the model
    should still produce a reasoning + message turn."""
    reasoning_item = {
        "type": "reasoning",
        "id": "lol",
        "content": [
            {
                "type": "reasoning_text",
                "text": "We need to respond: greeting.",
            }
        ],
        "summary": [],
    }

    resp = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            reasoning_item,
        ],
        temperature=0.0,
    )
    assert resp is not None
    assert resp.status == "completed"
    # make sure we get a reasoning and text output
    assert resp.output[0].type == "reasoning"
    assert resp.output[1].type == "message"
    assert type(resp.output[1].content[0].text) is str
|
||||
988
tests/entrypoints/openai/test_response_api_with_harmony.py
Normal file
988
tests/entrypoints/openai/test_response_api_with_harmony.py
Normal file
@@ -0,0 +1,988 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import importlib
|
||||
import json
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
from openai import BadRequestError, NotFoundError, OpenAI
|
||||
from openai_harmony import (
|
||||
Message,
|
||||
)
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/gpt-oss-20b"
|
||||
|
||||
# Strict function-tool schema shared by the function-calling tests below:
# a "get_weather" tool taking numeric latitude/longitude coordinates.
GET_WEATHER_SCHEMA = {
    "type": "function",
    "name": "get_weather",
    "description": "Get current temperature for provided coordinates in celsius.",  # noqa
    "parameters": {
        "type": "object",
        "properties": {
            "latitude": {"type": "number"},
            "longitude": {"type": "number"},
        },
        "required": ["latitude", "longitude"],
        "additionalProperties": False,
    },
    # strict mode: the model must produce arguments matching the schema exactly
    "strict": True,
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Launch a gpt-oss server with the demo tool server for harmony tests."""
    assert importlib.util.find_spec("gpt_oss") is not None, (
        "Harmony tests require gpt_oss package to be installed"
    )

    cli_args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
    extra_env = {
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
        "PYTHON_EXECUTION_BACKEND": "dangerously_use_uv",
    }

    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=extra_env) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client pointed at the harmony test server."""
    async with server.get_async_client() as c:
        yield c
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """A simple arithmetic prompt should complete without tools."""
    resp = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
    )
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_instructions(client: OpenAI, model_name: str):
    """The instructions parameter should be accepted and still complete."""
    resp = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
        instructions="Respond in Korean.",
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
    """The reasoning.effort knob should be accepted and still complete."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the capital of South Korea?",
        reasoning={"effort": "low"},
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """Hitting max_output_tokens should mark the response incomplete with the
    matching incomplete_details reason."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert resp is not None
    assert resp.status == "incomplete"
    assert resp.incomplete_details.reason == "max_output_tokens"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):
    """A multi-message chat-style input list should complete."""
    conversation = [
        {"role": "system", "content": "Respond in Korean."},
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hello! How can I help you today?"},
        {"role": "user", "content": "What is 13 * 24? Explain your answer."},
    ]
    resp = await client.responses.create(
        model=model_name,
        input=conversation,
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_with_input_type(client: OpenAI, model_name: str):
    """Typed content parts (input_text) in a chat message should be accepted."""
    typed_message = {
        "role": "user",
        "content": [{"type": "input_text", "text": "What is 13*24?"}],
    }
    resp = await client.responses.create(
        model=model_name,
        input=[typed_message],
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output(client: OpenAI, model_name: str):
    """A strict json_schema text format should be accepted and complete."""
    event_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "date": {"type": "string"},
            "participants": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["name", "date", "participants"],
        "additionalProperties": False,
    }

    resp = await client.responses.create(
        model=model_name,
        input=[
            {"role": "system", "content": "Extract the event information."},
            {
                "role": "user",
                "content": "Alice and Bob are going to a science fair on Friday.",
            },
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "calendar_event",
                "schema": event_schema,
                "description": "A calendar event.",
                "strict": True,
            }
        },
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output_with_parse(client: OpenAI, model_name: str):
    """responses.parse with a pydantic model as text_format should complete."""
    from pydantic import BaseModel

    class CalendarEvent(BaseModel):
        name: str
        date: str
        participants: list[str]

    resp = await client.responses.parse(
        model=model_name,
        input="Alice and Bob are going to a science fair on Friday",
        instructions="Extract the event information",
        text_format=CalendarEvent,
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_store(client: OpenAI, model_name: str):
    """A stored response is retrievable; an unstored one raises NotFoundError."""
    for store in [True, False]:
        resp = await client.responses.create(
            model=model_name,
            input="What is 13 * 24?",
            store=store,
        )
        assert resp is not None

        found = True
        try:
            await client.responses.retrieve(resp.id)
        except NotFoundError:
            found = False

        # retrievable exactly when it was stored
        assert found == store
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background(client: OpenAI, model_name: str):
    """A background response should eventually reach "completed" when polled.

    Polls the retrieve endpoint once per second, for up to 30 attempts.
    """
    import asyncio

    response = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
        background=True,
    )
    assert response is not None

    max_retries = 30
    for _ in range(max_retries):
        response = await client.responses.retrieve(response.id)
        if response.status == "completed":
            break
        # FIX: was time.sleep(1), which blocks the event loop inside an async
        # test and stalls the client's own I/O; asyncio.sleep yields instead.
        await asyncio.sleep(1)

    assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background_cancel(client: OpenAI, model_name: str):
    """A background response can be cancelled shortly after creation."""
    import asyncio

    response = await client.responses.create(
        model=model_name,
        input="Write a long story about a cat.",
        background=True,
    )
    assert response is not None
    # Give the request a moment to start before cancelling.
    # FIX: was time.sleep(1), which blocks the event loop inside an async
    # test; asyncio.sleep yields control while waiting.
    await asyncio.sleep(1)

    cancelled_response = await client.responses.cancel(response.id)
    assert cancelled_response is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
    """Three turns chained via previous_response_id should all complete."""
    first = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
    )
    assert first is not None
    assert first.status == "completed"

    second = await client.responses.create(
        model=model_name,
        input="What if I increase both numbers by 1?",
        previous_response_id=first.id,
    )
    assert second is not None
    assert second.status == "completed"

    third = await client.responses.create(
        model=model_name,
        input="Divide the result by 2.",
        previous_response_id=second.id,
    )
    assert third is not None
    assert third.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(client: OpenAI, model_name: str):
    """Verify streamed events are properly nested: every *.done / completed
    event must match the most recent unclosed *.added / delta / created event,
    and no event may remain open when the stream ends."""
    prompts = [
        "tell me a story about a cat in 20 words",
    ]

    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.content_part.done": "response.content_part.added",
        "response.output_text.done": "response.output_text.delta",
        "response.web_search_call.done": "response.web_search_call.added",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
    }

    for prompt in prompts:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[],
            stream=True,
            background=False,
        )

        # Treat event types as open/close brackets on a stack.
        stack_of_event_types = []
        async for event in response:
            if event.type == "response.created":
                stack_of_event_types.append(event.type)
            elif event.type == "response.completed":
                # "completed" closes the initial "created"
                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                stack_of_event_types.pop()
            if event.type.endswith("added"):
                stack_of_event_types.append(event.type)
            elif event.type.endswith("delta"):
                # consecutive deltas of the same type count as one open block
                if stack_of_event_types[-1] == event.type:
                    continue
                stack_of_event_types.append(event.type)
            elif event.type.endswith("done"):
                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                stack_of_event_types.pop()
        # every opened block must have been closed
        assert len(stack_of_event_types) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str):
    """Same bracket-matching check as test_streaming_types, but for a stream
    that includes function_call_arguments events from a tool call."""
    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.output_text.done": "response.output_text.delta",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
        "response.function_call_arguments.done": "response.function_call_arguments.delta",  # noqa
    }

    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )

    # Treat event types as open/close brackets on a stack.
    stack_of_event_types = []
    async for event in stream_response:
        if event.type == "response.created":
            stack_of_event_types.append(event.type)
        elif event.type == "response.completed":
            # "completed" closes the initial "created"
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
        if event.type.endswith("added"):
            stack_of_event_types.append(event.type)
        elif event.type.endswith("delta"):
            # consecutive deltas of the same type count as one open block
            if stack_of_event_types[-1] == event.type:
                continue
            stack_of_event_types.append(event.type)
        elif event.type.endswith("done"):
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
    # every opened block must have been closed
    assert len(stack_of_event_types) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("background", [True, False])
async def test_streaming(client: OpenAI, model_name: str, background: bool):
    """End-to-end streaming check: event item/content-index bookkeeping,
    vLLM's extra input/output message fields, and (for background requests)
    that re-streaming via retrieve(starting_after=...) replays the same events."""
    # TODO: Add back when web search and code interpreter are available in CI
    prompts = [
        "tell me a story about a cat in 20 words",
        "What is 13 * 24? Use python to calculate the result.",
        # "When did Jensen found NVIDIA? Search it and answer the year only.",
    ]

    for prompt in prompts:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[
                # {
                #     "type": "web_search_preview"
                # },
                {"type": "code_interpreter", "container": {"type": "auto"}},
            ],
            stream=True,
            background=background,
            extra_body={"enable_response_messages": True},
        )

        # last item id / content index seen, used to validate delta events
        current_item_id = ""
        current_content_index = -1

        events = []
        current_event_mode = None
        resp_id = None
        checked_response_completed = False
        async for event in response:
            if event.type == "response.created":
                resp_id = event.response.id

            # test vllm custom types are in the response
            if event.type in [
                "response.completed",
                "response.in_progress",
                "response.created",
            ]:
                assert "input_messages" in event.response.model_extra
                assert "output_messages" in event.response.model_extra
                if event.type == "response.completed":
                    # make sure the serialization of content works
                    for msg in event.response.model_extra["output_messages"]:
                        # make sure we can convert the messages back into harmony
                        Message.from_dict(msg)

                    for msg in event.response.model_extra["input_messages"]:
                        # make sure we can convert the messages back into harmony
                        Message.from_dict(msg)
                    checked_response_completed = True

            # print a header whenever the event type changes (debug output)
            if current_event_mode != event.type:
                current_event_mode = event.type
                print(f"\n[{event.type}] ", end="", flush=True)

            # verify current_item_id is correct: a new output item must carry a
            # fresh id, and deltas must reference the most recent item
            if event.type == "response.output_item.added":
                assert event.item.id != current_item_id
                current_item_id = event.item.id
            elif event.type in [
                "response.output_text.delta",
                "response.reasoning_text.delta",
            ]:
                assert event.item_id == current_item_id

            # verify content_index_id is correct: analogous check for the
            # per-item content index
            if event.type in [
                "response.content_part.added",
                "response.reasoning_part.added",
            ]:
                assert event.content_index != current_content_index
                current_content_index = event.content_index
            elif event.type in [
                "response.output_text.delta",
                "response.reasoning_text.delta",
            ]:
                assert event.content_index == current_content_index

            # echo streamed content for debugging
            if "text.delta" in event.type:
                print(event.delta, end="", flush=True)
            elif "reasoning_text.delta" in event.type:
                print(f"{event.delta}", end="", flush=True)
            elif "response.code_interpreter_call_code.done" in event.type:
                print(f"Code: {event.code}", end="", flush=True)
            elif (
                "response.output_item.added" in event.type
                and event.item.type == "web_search_call"
            ):
                print(f"Web search: {event.item.action}", end="", flush=True)
            events.append(event)

        assert len(events) > 0
        response_completed_event = events[-1]
        assert len(response_completed_event.response.output) > 0
        assert checked_response_completed

        if background:
            # Re-stream from the stored response, skipping the first few
            # events, and check the replay matches what we already received.
            starting_after = 5
            async with await client.responses.retrieve(
                response_id=resp_id, stream=True, starting_after=starting_after
            ) as stream:
                counter = starting_after
                async for event in stream:
                    counter += 1
                    assert event == events[counter]
                assert counter == len(events) - 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
async def test_web_search(client: OpenAI, model_name: str):
    """Web-search tool smoke test (skipped until the tool exists in CI)."""
    resp = await client.responses.create(
        model=model_name,
        input="Who is the president of South Korea as of now?",
        tools=[{"type": "web_search_preview"}],
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str):
    """The code interpreter tool should run python and surface the digits in
    the final message."""
    # TODO: Ideally should be able to set max tool calls
    # to prevent multi-turn, but it is not currently supported
    # would speed up the test
    prompt = (
        "What's the first 4 digits after the decimal point of "
        "cube root of `19910212 * 20250910`? "
        "Show only the digits. The python interpreter is not stateful "
        "and you must print to see the output."
    )
    resp = await client.responses.create(
        model=model_name,
        input=prompt,
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        temperature=0.0,  # More deterministic output in response
    )
    assert resp is not None
    assert resp.status == "completed"
    assert resp.usage.output_tokens_details.tool_output_tokens > 0
    for item in resp.output:
        if item.type == "message":
            output_string = item.content[0].text
            print("output_string: ", output_string, flush=True)
            assert "5846" in output_string
|
||||
|
||||
|
||||
def get_weather(latitude, longitude):
    """Fetch the current temperature in celsius for the given coordinates
    from the open-meteo API."""
    url = f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m"  # noqa
    payload = requests.get(url).json()
    return payload["current"]["temperature_2m"]
|
||||
|
||||
|
||||
def get_place_to_travel():
    """Stub travel picker used by the multi-turn tool tests: always Paris."""
    return "Paris"
|
||||
|
||||
|
||||
def get_horoscope(sign):
    """Return a canned horoscope line for *sign* (test stand-in for a real tool)."""
    return f"{sign}: Next Tuesday you will befriend a baby otter."
|
||||
|
||||
|
||||
def call_function(name, args):
    """Dispatch a tool call by *name* to the matching local helper.

    get_place_to_travel takes no arguments, so *args* is ignored for it.
    Raises ValueError for unknown names.
    """
    if name == "get_weather":
        return get_weather(**args)
    if name == "get_place_to_travel":
        return get_place_to_travel()
    if name == "get_horoscope":
        return get_horoscope(**args)
    raise ValueError(f"Unknown function: {name}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """A replayed reasoning item in the harmony input should be accepted."""
    reasoning_item = {
        "type": "reasoning",
        "id": "lol",
        "content": [
            {
                "type": "reasoning_text",
                "text": "We need to respond: greeting.",
            }
        ],
        "summary": [],
    }

    resp = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            reasoning_item,
        ],
        temperature=0.0,
    )
    assert resp is not None
    assert resp.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling(client: OpenAI, model_name: str):
    """Single tool-call round trip chained via previous_response_id, then a
    follow-up turn that must not resurface the chain of thought."""
    tools = [GET_WEATHER_SCHEMA]

    first = await client.responses.create(
        model=model_name,
        input="What's the weather like in Paris today?",
        tools=tools,
        temperature=0.0,
        extra_body={"request_id": "test_function_calling_non_resp"},
    )
    assert first is not None
    assert first.status == "completed"
    assert len(first.output) == 2
    assert first.output[0].type == "reasoning"
    assert first.output[1].type == "function_call"

    tool_call = first.output[1]
    result = call_function(tool_call.name, json.loads(tool_call.arguments))

    second = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=first.id,
    )
    assert second is not None
    assert second.status == "completed"
    assert second.output_text is not None

    # NOTE: chain-of-thought should be removed.
    third = await client.responses.create(
        model=model_name,
        input="What's the weather like in Paris today?",
        tools=tools,
        previous_response_id=second.id,
    )
    assert third is not None
    assert third.status == "completed"
    assert third.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=5)
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
    """Two chained tool calls (pick a place, then its weather) followed by a
    final text answer, each turn linked via previous_response_id.

    Marked flaky: the model is expected, but not guaranteed, to call the two
    tools in sequence.
    """
    tools = [
        {
            "type": "function",
            "name": "get_place_to_travel",
            "description": "Get a random place to travel",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
                "additionalProperties": False,
            },
            "strict": True,
        },
        GET_WEATHER_SCHEMA,
    ]

    # Turn 1: expect reasoning + a first function call
    response = await client.responses.create(
        model=model_name,
        input="Help me plan a trip to a random place. And tell me the weather there.",
        tools=tools,
    )
    assert response is not None
    assert response.status == "completed"
    assert len(response.output) == 2
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "function_call"

    tool_call = response.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)

    result = call_function(name, args)

    # Turn 2: feed the first tool result back; expect a second function call
    response_2 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response.id,
    )
    assert response_2 is not None
    assert response_2.status == "completed"
    assert len(response_2.output) == 2
    assert response_2.output[0].type == "reasoning"
    assert response_2.output[1].type == "function_call"

    tool_call = response_2.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)

    result = call_function(name, args)

    # Turn 3: feed the second tool result back; expect a final text answer
    response_3 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response_2.id,
    )
    assert response_3 is not None
    assert response_3.status == "completed"
    assert response_3.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_required(client: OpenAI, model_name: str):
    """tool_choice="required" is unsupported and must be rejected."""
    tools = [GET_WEATHER_SCHEMA]

    with pytest.raises(BadRequestError):
        await client.responses.create(
            model=model_name,
            input="What's the weather like in Paris today?",
            tools=tools,
            tool_choice="required",
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_message_with_tools(client: OpenAI, model_name: str):
    """The "commentary" channel is valid iff custom tools are enabled."""
    from vllm.entrypoints.openai.parser.harmony_utils import get_system_message

    # Check both settings: with custom tools the commentary channel must be
    # present; without them it must be removed.
    for custom_tools_enabled, expect_commentary in ((True, True), (False, False)):
        sys_msg = get_system_message(with_custom_tools=custom_tools_enabled)
        channels = sys_msg.content[0].channel_config.valid_channels
        assert ("commentary" in channels) == expect_commentary
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_full_history(client: OpenAI, model_name: str):
    """Multi-turn function calling where the full message history is resent."""
    tools = [GET_WEATHER_SCHEMA]
    conversation = [
        {"role": "user", "content": "What's the weather like in Paris today?"}
    ]

    first = await client.responses.create(
        model=model_name,
        input=conversation,
        tools=tools,
    )

    assert first is not None
    assert first.status == "completed"

    # The last output item is the model's function call; execute it locally.
    call = first.output[-1]
    call_result = call_function(call.name, json.loads(call.arguments))

    # Replay the model's output plus the tool result on the next turn.
    conversation.extend(first.output)  # append model's function call message
    conversation.append(
        {  # append result message
            "type": "function_call_output",
            "call_id": call.call_id,
            "output": str(call_result),
        }
    )

    second = await client.responses.create(
        model=model_name,
        input=conversation,
        tools=tools,
    )
    assert second is not None
    assert second.status == "completed"
    assert second.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_stream(client: OpenAI, model_name: str):
    """Stream a function call, execute it, then stream the follow-up answer.

    Accumulates streamed function-call argument deltas, checks that the
    `response.function_call_arguments.done` payload matches the accumulated
    deltas, and verifies the second (post-tool-output) stream contains no
    function-call events but does produce output text.
    """
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    assert stream_response is not None
    final_tool_calls = {}  # output_index -> in-progress tool call item
    final_tool_calls_named = {}  # tool name -> in-progress tool call item
    async for event in stream_response:
        if event.type == "response.output_item.added":
            if event.item.type != "function_call":
                continue
            final_tool_calls[event.output_index] = event.item
            final_tool_calls_named[event.item.name] = event.item
        elif event.type == "response.function_call_arguments.delta":
            index = event.output_index
            tool_call = final_tool_calls[index]
            if tool_call:
                tool_call.arguments += event.delta
                final_tool_calls_named[tool_call.name] = tool_call
        elif event.type == "response.function_call_arguments.done":
            # The final payload must equal the concatenation of the deltas.
            assert event.arguments == final_tool_calls_named[event.name].arguments

    # Execute the get_weather call, if any. FIX: initialize `result` and
    # `tool_call` first so a stream that produced no matching call fails the
    # assertions below instead of raising NameError.
    result = None
    tool_call = None
    for tool_call in final_tool_calls.values():
        if (
            tool_call
            and tool_call.type == "function_call"
            and tool_call.name == "get_weather"
        ):
            args = json.loads(tool_call.arguments)
            result = call_function(tool_call.name, args)
            input_list += [tool_call]
            break
    assert result is not None
    assert tool_call is not None
    response = await client.responses.create(
        model=model_name,
        input=input_list
        + [
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        stream=True,
    )
    assert response is not None
    async for event in response:
        # check that no function call events in the stream
        assert event.type != "response.function_call_arguments.delta"
        assert event.type != "response.function_call_arguments.done"
        # check that the response contains output text
        if event.type == "response.completed":
            assert len(event.response.output) > 0
            assert event.response.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
    """enable_response_messages exposes the raw input/output message lists."""
    response = await client.responses.create(
        model=model_name,
        input="What is the capital of South Korea?",
        extra_body={"enable_response_messages": True},
    )

    assert response is not None
    assert response.status == "completed"
    # Both raw message transcripts must be non-empty when the flag is set.
    assert len(response.input_messages) > 0
    assert len(response.output_messages) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_with_previous_input_messages(
    client: OpenAI, model_name: str
):
    """Test function calling using previous_input_messages
    for multi-turn conversation with a function call"""

    # Define the get_horoscope tool
    tools = [
        {
            "type": "function",
            "name": "get_horoscope",
            "description": "Get today's horoscope for an astrological sign.",
            "parameters": {
                "type": "object",
                "properties": {
                    "sign": {"type": "string"},
                },
                "required": ["sign"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]

    # Step 1: First call with the function tool
    stream_response = await client.responses.create(
        model=model_name,
        input="What is the horoscope for Aquarius today?",
        tools=tools,
        extra_body={"enable_response_messages": True},
        stream=True,
    )

    response = None
    async for event in stream_response:
        if event.type == "response.completed":
            response = event.response

    assert response is not None
    assert response.status == "completed"

    # Step 2: Parse the first output to find the function_call type
    function_call = None
    for item in response.output:
        if item.type == "function_call":
            function_call = item
            break

    assert function_call is not None, "Expected a function_call in the output"
    assert function_call.name == "get_horoscope"
    assert function_call.call_id is not None

    # Verify the format matches expectations
    args = json.loads(function_call.arguments)
    assert "sign" in args

    # Step 3: Call the get_horoscope function
    result = call_function(function_call.name, args)
    assert "Aquarius" in result
    assert "baby otter" in result

    # Get the input_messages and output_messages from the first response
    first_input_messages = response.input_messages
    first_output_messages = response.output_messages

    # Construct the full conversation history using previous_input_messages
    previous_messages = (
        first_input_messages
        + first_output_messages
        + [
            {
                "role": "tool",
                "name": "functions.get_horoscope",
                "content": [{"type": "text", "text": str(result)}],
            }
        ]
    )

    # Step 4: Make another responses.create() call with previous_input_messages
    stream_response_2 = await client.responses.create(
        model=model_name,
        tools=tools,
        input="",
        extra_body={
            "previous_input_messages": previous_messages,
            "enable_response_messages": True,
        },
        stream=True,
    )

    # FIX: initialize before the loop (mirrors the first stream above);
    # otherwise a stream with no "response.completed" event raises
    # UnboundLocalError instead of failing the assertion below.
    response_2 = None
    async for event in stream_response_2:
        if event.type == "response.completed":
            response_2 = event.response

    assert response_2 is not None
    assert response_2.status == "completed"
    assert response_2.output_text is not None

    # verify only one system message / developer message
    num_system_messages_input = 0
    num_developer_messages_input = 0
    num_function_call_input = 0
    for message_dict in response_2.input_messages:
        message = Message.from_dict(message_dict)
        if message.author.role == "system":
            num_system_messages_input += 1
        elif message.author.role == "developer":
            num_developer_messages_input += 1
        elif message.author.role == "tool":
            num_function_call_input += 1
    assert num_system_messages_input == 1
    assert num_developer_messages_input == 1
    assert num_function_call_input == 1

    # Verify the output makes sense - should contain information about the horoscope
    output_text = response_2.output_text.lower()
    assert (
        "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
    )
|
||||
89
tests/entrypoints/openai/test_responses_error.py
Normal file
89
tests/entrypoints/openai/test_responses_error.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from http import HTTPStatus
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raise_if_error_raises_generation_error():
    """test _raise_if_error raises GenerationError"""
    # Build a minimal OpenAIServing around a mocked engine client.
    mock_engine = MagicMock()
    mock_engine.model_config = MagicMock()
    mock_engine.model_config.max_model_len = 100
    mock_models = MagicMock()

    serving = OpenAIServing(
        engine_client=mock_engine,
        models=mock_models,
        request_logger=None,
    )

    # An "error" finish_reason must surface as a GenerationError (HTTP 500).
    with pytest.raises(GenerationError) as exc_info:
        serving._raise_if_error("error", "test-request-id")
    assert str(exc_info.value) == "Internal server error"
    assert exc_info.value.status_code == HTTPStatus.INTERNAL_SERVER_ERROR

    # Any other finish_reason (or None) must pass through without raising.
    for finish_reason in ("stop", "length", None):
        serving._raise_if_error(finish_reason, "test-request-id")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_generation_error_to_response():
    """test _convert_generation_error_to_response creates proper ErrorResponse"""
    engine_stub = MagicMock()
    engine_stub.model_config = MagicMock()
    engine_stub.model_config.max_model_len = 100

    serving = OpenAIServing(
        engine_client=engine_stub,
        models=MagicMock(),
        request_logger=None,
    )

    # Convert a GenerationError and inspect the resulting ErrorResponse.
    error_response = serving._convert_generation_error_to_response(
        GenerationError("Internal server error")
    )

    assert isinstance(error_response, ErrorResponse)
    assert error_response.error.type == "InternalServerError"
    assert error_response.error.message == "Internal server error"
    assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_generation_error_to_streaming_response():
    """test _convert_generation_error_to_streaming_response output"""
    engine_stub = MagicMock()
    engine_stub.model_config = MagicMock()
    engine_stub.model_config.max_model_len = 100

    serving = OpenAIServing(
        engine_client=engine_stub,
        models=MagicMock(),
        request_logger=None,
    )

    # The streaming variant serializes the error payload to a JSON string.
    error_json = serving._convert_generation_error_to_streaming_response(
        GenerationError("Internal server error")
    )

    assert isinstance(error_json, str)
    assert "Internal server error" in error_json
    assert "InternalServerError" in error_json
|
||||
330
tests/entrypoints/openai/test_responses_function_call_parsing.py
Normal file
330
tests/entrypoints/openai/test_responses_function_call_parsing.py
Normal file
@@ -0,0 +1,330 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test function call parsing in ResponsesRequest."""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
from openai.types.responses import ResponseFunctionToolCall
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ResponsesRequest
|
||||
|
||||
|
||||
def test_function_call_dict_converted_to_object():
    """A function_call input dict is coerced into a ResponseFunctionToolCall."""
    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            {
                "type": "function_call",
                "call_id": "fc_123",
                "name": "get_weather",
                "arguments": '{"location": "Boston", "unit": "celsius"}',
            }
        ],
    )

    # The lone input item must now be a typed object with all fields intact.
    assert len(request.input) == 1
    parsed = request.input[0]
    assert isinstance(parsed, ResponseFunctionToolCall)
    assert parsed.call_id == "fc_123"
    assert parsed.name == "get_weather"
    assert parsed.arguments == '{"location": "Boston", "unit": "celsius"}'
|
||||
|
||||
|
||||
def test_direct_function_call_object_preservation():
    """Test that ResponseFunctionToolCall objects passed directly are preserved."""
    tool_call = ResponseFunctionToolCall(
        type="function_call",
        call_id="fc_456",
        name="get_stock_price",
        arguments='{"symbol": "AAPL"}',
    )

    request = ResponsesRequest(model="gpt-oss", input=[tool_call])

    # The exact same object instance must come back out of validation.
    assert len(request.input) == 1
    assert request.input[0] is tool_call
|
||||
|
||||
|
||||
def test_mixed_input_types_with_function_calls():
    """Test parsing with mixed input types including function calls."""
    message_item = {
        "type": "message",
        "role": "user",
        "content": [{"type": "input_text", "text": "What's the weather?"}],
    }
    weather_call = {
        "type": "function_call",
        "call_id": "fc_789",
        "name": "check_weather",
        "arguments": '{"location": "NYC"}',
    }
    time_call = {
        "type": "function_call",
        "call_id": "fc_790",
        "name": "get_time",
        "arguments": "{}",
    }

    request = ResponsesRequest(
        model="gpt-oss", input=[message_item, weather_call, time_call]
    )

    assert len(request.input) == 3
    # The plain message keeps its dict/Message shape...
    assert request.input[0]["type"] == "message"
    # ...while both function calls are coerced into typed objects.
    for parsed, expected_call_id, expected_name in (
        (request.input[1], "fc_789", "check_weather"),
        (request.input[2], "fc_790", "get_time"),
    ):
        assert isinstance(parsed, ResponseFunctionToolCall)
        assert parsed.call_id == expected_call_id
        assert parsed.name == expected_name
|
||||
|
||||
|
||||
def test_function_call_with_complex_arguments():
    """Test parsing function calls with complex nested arguments."""
    complex_args = {
        "query": "weather forecast",
        "filters": {
            "location": {"city": "San Francisco", "state": "CA"},
            "timeRange": {"start": "2024-01-01", "end": "2024-01-07"},
            "metrics": ["temperature", "humidity", "precipitation"],
        },
        "options": {"format": "detailed", "includeAlerts": True},
    }

    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            {
                "type": "function_call",
                "call_id": "fc_complex",
                "name": "advanced_weather_query",
                "arguments": json.dumps(complex_args),
            }
        ],
    )

    assert len(request.input) == 1
    parsed_call = request.input[0]
    assert isinstance(parsed_call, ResponseFunctionToolCall)
    assert parsed_call.call_id == "fc_complex"
    assert parsed_call.name == "advanced_weather_query"

    # Round-trip the JSON arguments to prove nothing was lost or mangled.
    assert json.loads(parsed_call.arguments) == complex_args
|
||||
|
||||
|
||||
def test_invalid_function_call_fallback():
    """Test that invalid function call dictionaries fall back gracefully."""
    # 'call_id' is required for a function_call item. The validator keeps the
    # original dict, and Pydantic then rejects the malformed structure.
    incomplete_item = {
        "type": "function_call",
        "name": "incomplete_function",
        "arguments": "{}",
    }

    with pytest.raises(ValueError):
        ResponsesRequest(model="gpt-oss", input=[incomplete_item])
|
||||
|
||||
|
||||
def test_string_input_not_affected():
    """Test that string input is not affected by the validator."""
    prompt = "This is a simple string input"
    request = ResponsesRequest(model="gpt-oss", input=prompt)
    # A bare string must pass through the validator untouched.
    assert request.input == prompt
|
||||
|
||||
|
||||
def test_empty_list_input():
    """Test that empty list input is handled correctly."""
    request = ResponsesRequest(model="gpt-oss", input=[])
    # An empty list must survive validation as-is.
    assert request.input == []
|
||||
|
||||
|
||||
def test_function_call_output_not_affected():
    """Test that FunctionCallOutput is not affected by the function_call parsing."""
    output_item = {
        "type": "function_call_output",
        "call_id": "fc_output_123",
        "output": "The weather in Boston is 72°F and sunny.",
    }

    request = ResponsesRequest(model="gpt-oss", input=[output_item])

    # function_call_output items are left as plain dicts, not parsed objects.
    assert len(request.input) == 1
    parsed = request.input[0]
    assert isinstance(parsed, dict)
    assert parsed["type"] == "function_call_output"
    assert parsed["call_id"] == "fc_output_123"
    assert parsed["output"] == "The weather in Boston is 72°F and sunny."
|
||||
|
||||
|
||||
def test_mixed_function_call_and_output():
    """Test that function_call is parsed while function_call_output is preserved."""
    call_item = {
        "type": "function_call",
        "call_id": "fc_call_456",
        "name": "get_weather",
        "arguments": '{"location": "NYC"}',
    }
    output_item = {
        "type": "function_call_output",
        "call_id": "fc_call_456",
        "output": "NYC weather is 68°F with light rain",
    }

    request = ResponsesRequest(model="gpt-oss", input=[call_item, output_item])

    assert len(request.input) == 2

    # The call is coerced into a typed object...
    parsed_call = request.input[0]
    assert isinstance(parsed_call, ResponseFunctionToolCall)
    assert parsed_call.call_id == "fc_call_456"
    assert parsed_call.name == "get_weather"

    # ...while the matching output stays a plain dict.
    parsed_output = request.input[1]
    assert isinstance(parsed_output, dict)
    assert parsed_output["type"] == "function_call_output"
    assert parsed_output["call_id"] == "fc_call_456"
    assert parsed_output["output"] == "NYC weather is 68°F with light rain"
|
||||
|
||||
|
||||
def test_function_call_validation_failure_logs_debug(caplog):
    """Test that validation failures are logged at debug level."""
    from unittest.mock import patch

    bad_request = {
        "model": "gpt-oss",
        "input": [
            {
                "type": "function_call",
                "name": "incomplete_function",
                "arguments": "{}",  # Missing call_id
            }
        ],
    }

    # Patch the module logger so the debug call can be observed.
    with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
        with pytest.raises(ValueError):
            ResponsesRequest(**bad_request)

        # The failed parse must be reported exactly once at debug level.
        mock_logger.debug.assert_called_once()
        logged_message = mock_logger.debug.call_args[0][0]
        assert "Failed to parse function_call" in logged_message
|
||||
|
||||
|
||||
def test_validator_handles_iterator_input():
    """Test that validator can handle ValidatorIterator input (Pydantic internal).

    Simulates Pydantic passing an iterator instead of a list, which happened
    with complex nested structures containing reasoning + function_call.
    """
    test_input_items = [
        {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "Test"}],
        },
        {
            "type": "reasoning",
            "id": "rs_1",
            "summary": [{"type": "summary_text", "text": "Test reasoning"}],
            "content": [{"type": "reasoning_text", "text": "Test content"}],
        },
        {
            "type": "function_call",
            "call_id": "call_1",
            "name": "test_function",
            "arguments": '{"test": "value"}',
            "id": "fc_1",
        },
    ]

    # Iterator instead of a list, mimicking Pydantic's ValidatorIterator.
    # FIX: the original wrapped this in `try/except Exception` + pytest.fail,
    # which swallows the real traceback on failure; letting any exception
    # propagate gives pytest the full failure context with the same pass/fail
    # outcome.
    request = ResponsesRequest(
        model="test-model",
        input=iter(test_input_items),
    )

    # Verify the validator processed the data correctly
    assert len(request.input) == 3

    # Verify function_call was converted to ResponseFunctionToolCall object
    function_call_item = next(
        (item for item in request.input if isinstance(item, ResponseFunctionToolCall)),
        None,
    )
    assert function_call_item is not None
    assert function_call_item.call_id == "call_1"
    assert function_call_item.name == "test_function"
|
||||
|
||||
|
||||
def test_validator_handles_empty_iterator():
    """Test validator handles empty iterator gracefully."""
    request = ResponsesRequest(model="test-model", input=iter([]))
    # An exhausted iterator must normalize to an empty list.
    assert request.input == []
|
||||
369
tests/entrypoints/openai/test_return_token_ids.py
Normal file
369
tests/entrypoints/openai/test_return_token_ids.py
Normal file
@@ -0,0 +1,369 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Launch a module-scoped vLLM OpenAI server for the token-id tests.

    Enables auto tool choice with the hermes tool-call parser and eager
    execution; yields the running RemoteOpenAIServer handle.
    """
    args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--enforce-eager",
    ]
    # Context manager tears the server down when the module's tests finish.
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("return_token_ids", [True, False, None])
async def test_basic_completion_with_emoji(server, return_token_ids: bool | None):
    """Test basic completion with emoji to verify token_ids field.

    When `return_token_ids` is enabled, each choice carries `token_ids` and
    `prompt_token_ids`, and both must decode consistently with the prompt and
    returned text; when disabled (or unset), both fields must be None.
    """
    extra_body = None
    if return_token_ids is not None:
        extra_body = {"return_token_ids": return_token_ids}
    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Complete this sentence with emojis: I love coding 🚀",
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body=extra_body,
        )

        # Check the raw response to see the structure
        completion_dict = completion.model_dump()

        # Verify prompt_token_ids field is present in the completion response
        assert "prompt_token_ids" in completion_dict["choices"][0]
        if not return_token_ids:
            # If return_token_ids is False, token_ids should not be present
            assert completion_dict["choices"][0].get("token_ids") is None
            assert completion_dict["choices"][0].get("prompt_token_ids") is None
            # Skip further checks
            return
        assert isinstance(completion.choices[0].prompt_token_ids, list)

        # Check against the expected prompt token IDs
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        encoded_tokens = tokenizer.encode(
            "Complete this sentence with emojis: I love coding 🚀"
        )
        # Check that encoded_tokens is a subsequence of prompt_token_ids
        # (the server may add chat/BOS framing around the raw prompt tokens)
        assert any(
            completion.choices[0].prompt_token_ids[i : i + len(encoded_tokens)]
            == encoded_tokens
            for i in range(
                len(completion.choices[0].prompt_token_ids) - len(encoded_tokens) + 1
            )
        )

        # Verify token_ids field is present in the choice
        assert completion.choices[0].token_ids is not None
        assert isinstance(completion.choices[0].token_ids, list)
        assert len(completion.choices[0].token_ids) > 0

        # Verify decoding works correctly
        decoded_text = tokenizer.decode(completion.choices[0].token_ids)
        # The decoded text should contain a <|im_end|> at the end
        assert decoded_text.startswith(completion.choices[0].text)

        # Test without return_token_ids (should be None)
        completion_without = await client.completions.create(
            model=MODEL_NAME,
            prompt="Complete this sentence with emojis: I love coding 🚀",
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": False},
        )

        completion_without_dict = completion_without.model_dump()
        assert completion_without_dict["choices"][0].get("token_ids") is None
        # FIX: the original checked the top-level dict for "prompt_token_ids",
        # which is always absent there (the field lives on each choice, as the
        # assertions above show) — making the check vacuous. Check the choice.
        assert completion_without_dict["choices"][0].get("prompt_token_ids") is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_completion_with_tool_use(server):
    """Test chat completion with tool use (get_weather function).

    Runs the same tool-calling chat request twice: once with
    `return_token_ids` enabled (token-id fields must be present and decode
    back to the rendered prompt / tool-call response) and once disabled
    (both fields must be None).
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "The unit of temperature",
                        },
                    },
                    "required": ["location"],
                },
            },
        }
    ]

    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What's the weather like in Paris?"},
            ],
            tools=tools,
            tool_choice="auto",
            max_tokens=100,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )

        # Verify token_ids field is present in choices
        assert response.choices[0].token_ids is not None
        assert isinstance(response.choices[0].token_ids, list)

        # Verify prompt_token_ids field is present
        # (for chat completions it lives on the top-level response object)
        assert response.prompt_token_ids is not None
        assert isinstance(response.prompt_token_ids, list)

        # Verify the prompt texts and response texts
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        prompt_text = tokenizer.decode(response.prompt_token_ids)
        # prompt_token_ids must decode to the chat-template-rendered prompt
        # (ChatML framing: <|im_start|>...<|im_end|> markers).
        assert prompt_text.startswith(
            "<|im_start|>system\nYou are a helpful assistant."
        )
        assert prompt_text.endswith(
            "What's the weather like in Paris?<|im_end|>\n<|im_start|>assistant\n"
        )

        # The completion tokens must decode to a hermes-style <tool_call> block.
        response_text = tokenizer.decode(response.choices[0].token_ids)
        assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
        assert response_text.endswith("</tool_call><|im_end|>")

        # If tool call was made, verify the response structure
        if response.choices[0].message.tool_calls:
            assert len(response.choices[0].message.tool_calls) > 0
            tool_call = response.choices[0].message.tool_calls[0]
            assert tool_call.function.name == "get_weather"

        # Test without return_token_ids
        response_without = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What's the weather like in Paris?"},
            ],
            tools=tools,
            tool_choice="auto",
            max_tokens=100,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": False},
        )

        # Token-id fields must be absent when the flag is off.
        assert response_without.choices[0].token_ids is None
        assert response_without.prompt_token_ids is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_comparison_with_prompt_logprobs_and_logprobs(server):
    """
    Test that token_ids align with prompt_logprobs and
    logprobs when return_tokens_as_token_ids is enabled.

    Covers both the non-streaming path (with echo=True, so logprobs span
    prompt + response) and the streaming path (echo=False).
    """
    async with server.get_async_client() as client:
        # Test with both return_token_ids and return_tokens_as_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world! How are you today?",
            max_tokens=20,
            temperature=0,
            echo=True,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True,
                "prompt_logprobs": 1,
            },
        )

        # Verify all fields are present
        assert completion.choices[0].token_ids is not None
        assert completion.choices[0].prompt_token_ids is not None
        assert completion.choices[0].prompt_logprobs is not None
        assert completion.choices[0].logprobs is not None

        # Extract token IDs from logprobs
        # (when return_tokens_as_token_ids is True)
        logprobs_token_ids = []
        for token_str in completion.choices[0].logprobs.tokens:
            # Token format is "token_id:12345" when
            # return_tokens_as_token_ids is True
            if token_str.startswith("token_id:"):
                token_id = int(token_str.removeprefix("token_id:"))
                logprobs_token_ids.append(token_id)

        # When echo=True, the logprobs include both prompt and response tokens
        # The token_ids field should match the suffix of response portion
        # The prompt_token_ids should match the prompt portion
        assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
        response_token_ids_length = len(completion.choices[0].token_ids)
        assert (
            logprobs_token_ids[-response_token_ids_length:]
            == completion.choices[0].token_ids
        )

        # Verify tokenizer consistency
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        # Decode prompt tokens
        if completion.choices[0].prompt_token_ids:
            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
            # The decoded prompt should match or close to original prompt
            # (special tokens such as BOS may be prepended, so we only check
            # containment rather than exact equality)
            assert "Hello, world" in prompt_text

        # Decode response tokens
        if completion.choices[0].token_ids:
            response_text = tokenizer.decode(completion.choices[0].token_ids)
            assert completion.choices[0].text.endswith(response_text)

        # Test streaming mode
        stream = await client.completions.create(
            model=MODEL_NAME,
            prompt="Tell me a short fact about Python:",
            max_tokens=30,
            temperature=0,
            stream=True,
            echo=False,
            logprobs=1,
            extra_body={"return_token_ids": True, "return_tokens_as_token_ids": True},
        )

        # Collect streamed tokens
        streamed_prompt_token_ids = []
        streamed_token_ids = []
        streamed_logprob_token_ids = []
        first_chunk = True
        async for chunk in stream:
            for token_str in chunk.choices[0].logprobs.tokens:
                # Token format is "token_id:12345" when
                # return_tokens_as_token_ids is True
                if token_str.startswith("token_id:"):
                    token_id = int(token_str.removeprefix("token_id:"))
                    streamed_logprob_token_ids.append(token_id)
            # Only the first streamed chunk carries prompt_token_ids.
            if first_chunk:
                streamed_prompt_token_ids = chunk.choices[0].prompt_token_ids
                first_chunk = False
            streamed_token_ids += chunk.choices[0].token_ids

        # Verify we collected some tokens and first chunk had prompt_token_ids
        assert len(streamed_prompt_token_ids) > 0
        assert streamed_token_ids == streamed_logprob_token_ids
@pytest.mark.asyncio
async def test_chat_completion_with_emoji_and_token_ids(server):
    """Test chat completion with emojis to verify token_ids handling.

    Multi-byte characters (🐱) can span several tokens, so decoding the
    returned token_ids must round-trip exactly to the message content.
    Checks both non-streaming and streaming responses.
    """
    chat_messages = [
        {"role": "system", "content": "You like to use emojis in your responses."},
        {"role": "user", "content": "Repeat after me: I love cats 🐱"},
    ]
    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )

        # Verify token_ids are present
        response_dict = response.model_dump()
        assert response.choices[0].token_ids is not None
        assert "prompt_token_ids" in response_dict

        # Verify the response contains the expected fields
        assert response.choices[0].message.content is not None

        # Decode token_ids and verify consistency
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        # The prompt must be the chat-templated rendering of chat_messages
        # (ChatML-style markers for this Qwen tokenizer).
        decoded_prompt = tokenizer.decode(response.prompt_token_ids)
        assert decoded_prompt.startswith(
            "<|im_start|>system\nYou like to use emojis in your responses."
        )
        assert decoded_prompt.endswith(
            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n"
        )

        decoded_response = tokenizer.decode(response.choices[0].token_ids)
        # The content should match the response text
        # except the ending <|im_end|>
        assert decoded_response == response.choices[0].message.content + "<|im_end|>"

        # Test with streaming
        stream = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            stream=True,
            extra_body={"return_token_ids": True},
        )

        collected_content = ""
        collected_token_ids = []
        first_chunk = True

        async for chunk in stream:
            if first_chunk:
                assert chunk.prompt_token_ids is not None
                assert isinstance(chunk.prompt_token_ids, list)
                # Check the prompt_token_ids match the initial prompt
                decoded_prompt_stream = tokenizer.decode(chunk.prompt_token_ids)
                assert decoded_prompt_stream == decoded_prompt
                first_chunk = False
            else:
                chunk_dump = chunk.model_dump()
                assert "prompt_token_ids" not in chunk_dump, (
                    "Subsequent chunks should not have prompt_token_ids"
                )

            if chunk.choices:
                if chunk.choices[0].delta.content:
                    collected_content += chunk.choices[0].delta.content
                # token_ids may not present in all chunks
                choice_dump = chunk.choices[0].model_dump()
                if "token_ids" in choice_dump:
                    collected_token_ids.extend(chunk.choices[0].token_ids)

        # Verify we got response and token_ids
        assert len(collected_content) > 0
        assert len(collected_token_ids) > 0

        # Verify token_ids decode properly
        decoded_response = tokenizer.decode(collected_token_ids)
        assert decoded_response == collected_content + "<|im_end|>"
123
tests/entrypoints/openai/test_return_tokens_as_ids.py
Normal file
123
tests/entrypoints/openai/test_return_tokens_as_ids.py
Normal file
@@ -0,0 +1,123 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Separate these tests out from test_completion and test_chat, because they
|
||||
# require launching a second server with a different flag. Running both servers
|
||||
# at the same time on a single node will OOM.
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def default_server_args(qwen3_lora_files):
    """CLI arguments shared by both server variants in this module."""
    # use half precision for speed and memory savings in CI environment
    engine_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]
    # lora config
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"qwen3-lora={qwen3_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
    ]
    return engine_args + lora_args
||||
|
||||
@pytest.fixture(scope="module")
def server_fixture(request, default_server_args):  # noqa: F811
    """Yield (server, flag) where flag records whether the server was
    launched with --return-tokens-as-token-ids.

    Parametrized indirectly: request.param selects the server variant, so
    tests can exercise both the server-level flag and the per-request one.
    """
    use_server_flag = request.param
    if use_server_flag:
        args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
        with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
            yield (remote_server, True)
    else:
        with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
            yield (remote_server, False)
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_completion_return_tokens_as_token_ids_completion(server_fixture):
    """Completions API: tokens come back as "token_id:<int>" strings and
    decode back to the returned text, whether the feature is enabled via
    the server flag or the per-request extra_body parameter."""
    server, use_server_flag = server_fixture
    request_args = {}
    if not use_server_flag:
        # Server was started without the flag; opt in per-request instead.
        request_args["return_tokens_as_token_ids"] = True

    async with server.get_async_client() as client:
        completion = await client.completions.create(
            model=MODEL_NAME,
            # Include Unicode characters to test for dividing a single
            # character across multiple tokens: 🎉 is [28705, 31862] for the
            # Zephyr tokenizer
            prompt="Say 'Hello, world! 🎉'",
            echo=True,
            temperature=0,
            max_tokens=10,
            logprobs=1,
            extra_body=request_args,
        )

        text = completion.choices[0].text
        token_strs = completion.choices[0].logprobs.tokens
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        # Check that the token representations are consistent between raw
        # tokens and top_logprobs
        # Slice off the first one, because there's no scoring associated
        # with BOS
        top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
        top_logprob_keys = [
            next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
        ]
        assert token_strs[1:] == top_logprob_keys

        # Check that decoding the tokens gives the expected text
        tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
        assert text == tokenizer.decode(tokens, skip_special_tokens=True)
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
    """Chat API: logprob tokens are "token_id:<int>" strings that decode
    back to the message content, for both the server flag and the
    per-request opt-in."""
    server, use_server_flag = server_fixture
    request_args = {}
    if not use_server_flag:
        # Server was started without the flag; opt in per-request instead.
        request_args["return_tokens_as_token_ids"] = True

    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            # Include Unicode characters to test for dividing a single
            # character across multiple tokens: 🎉 is [28705, 31862] for the
            # Zephyr tokenizer
            messages=[
                {
                    "role": "system",
                    "content": "You like to respond in only emojis, like 🎉",
                },
                {"role": "user", "content": "Please write some emojis: 🐱🐶🎉"},
            ],
            temperature=0,
            max_tokens=8,
            logprobs=True,
            extra_body=request_args,
        )

        text = response.choices[0].message.content
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        token_ids = []
        for logprob_content in response.choices[0].logprobs.content:
            token_ids.append(int(logprob_content.token.removeprefix("token_id:")))
        assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
||||
104
tests/entrypoints/openai/test_root_path.py
Normal file
104
tests/entrypoints/openai/test_root_path.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# # any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
|
||||
API_KEY = "abc-123"
|
||||
ERROR_API_KEY = "abc"
|
||||
ROOT_PATH = "llm"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server mounted under a root path and gated by an API key."""
    # use half precision for speed and memory savings in CI environment
    cli_args = [
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        # use --root-path=/llm for testing
        "--root-path",
        "/" + ROOT_PATH,
    ]

    # Propagate the current environment plus the API key the server enforces.
    server_env = os.environ.copy()
    server_env["VLLM_API_KEY"] = API_KEY

    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=server_env) as remote_server:
        yield remote_server
|
||||
|
||||
class TestCase(NamedTuple):
    """One parametrized scenario for the root-path / API-key test."""

    # Model to request.
    model_name: str
    # URL path segments joined onto the server base, e.g. ["llm", "v1"].
    base_url: list[str]
    # API key to authenticate with (valid or deliberately wrong).
    api_key: str
    # Expected exception type, or None when the request should succeed.
    expected_error: Any
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=API_KEY,
            expected_error=None,
        ),
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=API_KEY,
            expected_error=None,
        ),
    ],
)
async def test_chat_session_root_path_with_api_key(
    server: RemoteOpenAIServer, test_case: TestCase
):
    """Requests with a bad key fail with AuthenticationError on both the
    plain and root-path-prefixed base URLs; a valid key succeeds on both."""
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    # Expect success by default; swap in pytest.raises for error cases.
    ctx = contextlib.nullcontext()
    if test_case.expected_error is not None:
        ctx = pytest.raises(test_case.expected_error)
    with ctx:
        client = openai.AsyncOpenAI(
            api_key=test_case.api_key,
            base_url=server.url_for(*test_case.base_url),
            max_retries=0,
        )
        chat_completion = await client.chat.completions.create(
            model=test_case.model_name,
            messages=[
                {"role": "user", "content": "tell me a common saying"},
                {"role": "assistant", "content": saying},
            ],
            extra_body={"continue_final_message": True, "add_generation_prompt": False},
        )

        # Only reached in the success cases; error cases raise above.
        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "stop"
        message = choice.message
        assert len(message.content) > 0
        assert message.role == "assistant"
||||
240
tests/entrypoints/openai/test_run_batch.py
Normal file
240
tests/entrypoints/openai/test_run_batch.py
Normal file
@@ -0,0 +1,240 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.run_batch import BatchRequestOutput
|
||||
|
||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
|
||||
# ruff: noqa: E501
|
||||
INPUT_BATCH = (
|
||||
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
|
||||
).format(MODEL_NAME)
|
||||
|
||||
INVALID_INPUT_BATCH = (
|
||||
'{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
|
||||
).format(MODEL_NAME)
|
||||
|
||||
INPUT_EMBEDDING_BATCH = (
|
||||
'{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n'
|
||||
'{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n'
|
||||
'{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n'
|
||||
'{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
|
||||
)
|
||||
|
||||
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
|
||||
|
||||
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
|
||||
|
||||
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
|
||||
|
||||
|
||||
def test_empty_file():
    """An empty input batch should succeed and produce an empty output file."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write("")
        input_file.flush()

        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/multilingual-e5-small",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        # No requests in -> no responses out.
        assert output_file.read().strip() == ""
||||
|
||||
|
||||
def test_completions():
    """Run the chat-completions batch end to end and schema-check every line."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_BATCH)
        input_file.flush()

        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            MODEL_NAME,
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        # Ensure that the output format conforms to the openai api.
        # Validation should throw if the schema is wrong.
        for line in output_file.read().strip().split("\n"):
            BatchRequestOutput.model_validate_json(line)
||||
|
||||
|
||||
def test_completions_invalid_input():
    """
    Ensure that we fail when the input doesn't conform to the openai api.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INVALID_INPUT_BATCH)
        input_file.flush()

        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            MODEL_NAME,
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        # A malformed request line must make the whole batch run fail.
        assert proc.returncode != 0, f"{proc=}"
||||
|
||||
|
||||
def test_embeddings():
    """Run the embeddings batch end to end and schema-check every output line."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_EMBEDDING_BATCH)
        input_file.flush()

        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/multilingual-e5-small",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        # Ensure that the output format conforms to the openai api.
        # Validation should throw if the schema is wrong.
        for line in output_file.read().strip().split("\n"):
            BatchRequestOutput.model_validate_json(line)
||||
|
||||
|
||||
@pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
def test_score(input_batch):
    """Run score / rerank batches end to end: every output line must be
    schema-valid and carry no error."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(input_batch)
        input_file.flush()
        proc = subprocess.Popen(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "BAAI/bge-reranker-v2-m3",
            ],
        )
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        contents = output_file.read()
        for line in contents.strip().split("\n"):
            # Ensure that the output format conforms to the openai api.
            # Validation should throw if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)

            # Ensure that there is no error in the response.
            line_dict = json.loads(line)
            assert isinstance(line_dict, dict)
            assert line_dict["error"] is None
||||
|
||||
|
||||
def test_reasoning_parser():
    """
    Test that reasoning_parser parameter works correctly in run_batch.

    Every response line must be schema-valid, error-free, and contain a
    non-empty "reasoning" field produced by the qwen3 reasoning parser.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_REASONING_BATCH)
        input_file.flush()
        proc = subprocess.Popen(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "Qwen/Qwen3-0.6B",
                "--reasoning-parser",
                "qwen3",
            ],
        )
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        contents = output_file.read()
        for line in contents.strip().split("\n"):
            # Ensure that the output format conforms to the openai api.
            # Validation should throw if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)

            # Ensure that there is no error in the response.
            line_dict = json.loads(line)
            assert isinstance(line_dict, dict)
            assert line_dict["error"] is None

            # Check that reasoning is present and not empty
            reasoning = line_dict["response"]["body"]["choices"][0]["message"][
                "reasoning"
            ]
            assert reasoning is not None
            assert len(reasoning) > 0
||||
File diff suppressed because it is too large
Load Diff
71
tests/entrypoints/openai/test_serving_engine.py
Normal file
71
tests/entrypoints/openai/test_serving_engine.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
|
||||
@pytest.fixture()
def serving() -> OpenAIServing:
    """Create a minimal OpenAIServing instance for testing."""
    # Stand-ins for the engine and model registry; only the attributes the
    # serving layer actually touches are filled in.
    fake_engine = Mock()

    fake_model_config = Mock(spec=ModelConfig)
    fake_model_config.max_model_len = 32768

    fake_models = Mock(spec=OpenAIServingModels)
    fake_models.model_config = fake_model_config
    fake_models.input_processor = Mock()
    fake_models.io_processor = Mock()

    return OpenAIServing(
        engine_client=fake_engine,
        models=fake_models,
        request_logger=None,
    )
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
    serving: OpenAIServing,
):
    """The async Mistral chat-template path must offload the blocking
    tokenizer call so the event loop keeps turning while it runs."""
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template

    task = serving._apply_mistral_chat_template_async(
        tokenizer=mock_tokenizer, messages=[], chat_template=None, tools=[]
    )

    # Ensure the event loop is not blocked: while the 2s tokenization runs,
    # repeatedly yield to the loop and measure how long each yield took.
    blocked_count = 0
    for _i in range(20):  # Check over ~2 seconds
        start = time.perf_counter()
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start

        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1

        await asyncio.sleep(0.1)

    # Ensure task completes
    tokens = await task
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, "Event loop blocked during tokenization"
||||
129
tests/entrypoints/openai/test_serving_models.py
Normal file
129
tests/entrypoints/openai/test_serving_models.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from http import HTTPStatus
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
LoadLoRAAdapterRequest,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
|
||||
LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
|
||||
LORA_UNLOADING_SUCCESS_MESSAGE = (
|
||||
"Success: LoRA adapter '{lora_name}' removed successfully."
|
||||
)
|
||||
|
||||
|
||||
async def _async_serving_models_init() -> OpenAIServingModels:
    """Build an OpenAIServingModels wired to a fully mocked engine client.

    Shared setup for the LoRA load/unload tests below; no LoRA modules are
    preloaded (lora_modules=None).
    """
    mock_engine_client = MagicMock(spec=EngineClient)
    # Set the max_model_len attribute to avoid missing attribute
    mock_model_config = MagicMock(spec=ModelConfig)
    mock_model_config.max_model_len = 2048
    mock_engine_client.model_config = mock_model_config
    mock_engine_client.input_processor = MagicMock()
    mock_engine_client.io_processor = MagicMock()

    serving_models = OpenAIServingModels(
        engine_client=mock_engine_client,
        base_model_paths=BASE_MODEL_PATHS,
        lora_modules=None,
    )
    await serving_models.init_static_loras()

    return serving_models
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_serving_model_name():
    """model_name() falls back to the base model when no LoRA request is given."""
    serving_models = await _async_serving_models_init()

    # No adapter selected -> base model name.
    assert serving_models.model_name(None) == MODEL_NAME

    # With an adapter, the adapter's name wins.
    lora = LoRARequest(
        lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1
    )
    assert serving_models.model_name(lora) == lora.lora_name
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
    """A well-formed load request succeeds and registers the adapter by name."""
    serving_models = await _async_serving_models_init()
    request = LoadLoRAAdapterRequest(lora_name="adapter", lora_path="/path/to/adapter2")
    response = await serving_models.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter")
    # The adapter is now tracked in the name-keyed registry.
    assert len(serving_models.lora_requests) == 1
    assert "adapter" in serving_models.lora_requests
    assert serving_models.lora_requests["adapter"].lora_name == "adapter"
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields():
    """Loading an adapter with empty name/path is rejected as invalid input."""
    serving_models = await _async_serving_models_init()

    bad_request = LoadLoRAAdapterRequest(lora_name="", lora_path="")
    result = await serving_models.load_lora_adapter(bad_request)

    assert isinstance(result, ErrorResponse)
    assert result.error.type == "InvalidUserInput"
    assert result.error.code == HTTPStatus.BAD_REQUEST
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_load_lora_adapter_duplicate():
    """Loading the same adapter name twice fails and leaves one registration."""
    serving_models = await _async_serving_models_init()
    request = LoadLoRAAdapterRequest(
        lora_name="adapter1", lora_path="/path/to/adapter1"
    )
    response = await serving_models.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
    assert len(serving_models.lora_requests) == 1

    # Second load with the same name must be rejected, not overwrite.
    request = LoadLoRAAdapterRequest(
        lora_name="adapter1", lora_path="/path/to/adapter1"
    )
    response = await serving_models.load_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
    assert len(serving_models.lora_requests) == 1
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_unload_lora_adapter_success():
    """An adapter that was loaded can be unloaded, emptying the registry."""
    models = await _async_serving_models_init()
    load_req = LoadLoRAAdapterRequest(
        lora_name="adapter1", lora_path="/path/to/adapter1"
    )
    await models.load_lora_adapter(load_req)
    assert len(models.lora_requests) == 1

    unload_req = UnloadLoRAAdapterRequest(lora_name="adapter1")
    response = await models.unload_lora_adapter(unload_req)
    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
    assert len(models.lora_requests) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields():
    """Unloading with an empty name and no id is invalid user input."""
    models = await _async_serving_models_init()
    bad_request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
    response = await models.unload_lora_adapter(bad_request)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_unload_lora_adapter_not_found():
    """Unloading an adapter that was never loaded yields a 404-style error."""
    models = await _async_serving_models_init()
    request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
    response = await models.unload_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "NotFoundError"
    assert response.error.code == HTTPStatus.NOT_FOUND
|
||||
352
tests/entrypoints/openai/test_serving_responses.py
Normal file
352
tests/entrypoints/openai/test_serving_responses.py
Normal file
@@ -0,0 +1,352 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from contextlib import AsyncExitStack
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai.types.responses.tool import (
|
||||
CodeInterpreterContainerCodeInterpreterToolAuto,
|
||||
LocalShell,
|
||||
Mcp,
|
||||
Tool,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.context import ConversationContext
|
||||
from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
|
||||
from vllm.entrypoints.openai.serving_responses import (
|
||||
OpenAIServingResponses,
|
||||
_extract_allowed_tools_from_mcp_requests,
|
||||
extract_tool_types,
|
||||
)
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
|
||||
|
||||
class MockConversationContext(ConversationContext):
    """Minimal ConversationContext stub that records init_tool_sessions calls."""

    def __init__(self):
        # Flags/args inspected by tests to verify tool-session initialization.
        self.init_tool_sessions_called = False
        self.init_tool_sessions_args = None
        self.init_tool_sessions_kwargs = None

    def append_output(self, output) -> None:
        # No-op: output is discarded in tests.
        pass

    def append_tool_output(self, output) -> None:
        # No-op: tool output is discarded in tests.
        pass

    async def call_tool(self):
        # No tool ever produces results in this stub.
        return []

    def need_builtin_tool_call(self) -> bool:
        # The stub never requests a built-in tool call.
        return False

    def render_for_completion(self):
        # An empty prompt is sufficient for these tests.
        return []

    async def init_tool_sessions(self, tool_server, exit_stack, request_id, mcp_tools):
        # Record the call and its positional arguments for later assertions.
        self.init_tool_sessions_called = True
        self.init_tool_sessions_args = (tool_server, exit_stack, request_id, mcp_tools)

    async def cleanup_session(self) -> None:
        # Nothing to clean up.
        pass
|
||||
|
||||
|
||||
@pytest.fixture
def mock_serving_responses():
    """Create a mock OpenAIServingResponses instance."""
    serving_responses = MagicMock(spec=OpenAIServingResponses)
    # Attach a mocked tool server so code paths that touch it keep working.
    serving_responses.tool_server = MagicMock(spec=ToolServer)
    return serving_responses
|
||||
|
||||
|
||||
@pytest.fixture
def mock_context():
    """Provide a fresh MockConversationContext that records tool-session init."""
    return MockConversationContext()
|
||||
|
||||
|
||||
@pytest.fixture
def mock_exit_stack():
    """Provide a mock AsyncExitStack (no real contexts are entered)."""
    return MagicMock(spec=AsyncExitStack)
|
||||
|
||||
|
||||
def test_extract_tool_types(monkeypatch: pytest.MonkeyPatch) -> None:
    """extract_tool_types honours VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS for MCP tools."""
    tools: list[Tool] = []
    assert extract_tool_types(tools) == set()

    tools.append(LocalShell(type="local_shell"))
    assert extract_tool_types(tools) == {"local_shell"}

    tools.append(CodeInterpreterContainerCodeInterpreterToolAuto(type="auto"))
    assert extract_tool_types(tools) == {"local_shell", "auto"}

    mcp_labels = ("random", "container", "code_interpreter", "web_search_preview")
    tools.extend(
        Mcp(type="mcp", server_label=label, server_url="") for label in mcp_labels
    )
    # When envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS is not set,
    # mcp tool types are all ignored.
    assert extract_tool_types(tools) == {"local_shell", "auto"}

    # Allow-listing "container" makes that label show up.
    monkeypatch.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "container")
    assert extract_tool_types(tools) == {"local_shell", "auto", "container"}

    # Multiple allowed labels are all extracted.
    monkeypatch.setenv(
        "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,web_search_preview"
    )
    assert extract_tool_types(tools) == {
        "local_shell",
        "auto",
        "code_interpreter",
        "web_search_preview",
    }
|
||||
|
||||
|
||||
class TestInitializeToolSessions:
    """Tests for OpenAIServingResponses._initialize_tool_sessions."""

    @pytest_asyncio.fixture
    async def serving_responses_instance(self):
        """Build a real OpenAIServingResponses backed by minimal mocks."""
        engine_client = MagicMock()

        model_config = MagicMock()
        model_config.hf_config.model_type = "test"
        model_config.get_diff_sampling_param.return_value = {}
        engine_client.model_config = model_config

        engine_client.input_processor = MagicMock()
        engine_client.io_processor = MagicMock()

        return OpenAIServingResponses(
            engine_client=engine_client,
            models=MagicMock(),
            request_logger=None,
            chat_template=None,
            chat_template_content_format="auto",
            tool_server=MagicMock(spec=ToolServer),
        )

    @pytest.mark.asyncio
    async def test_initialize_tool_sessions(
        self, serving_responses_instance, mock_context, mock_exit_stack
    ):
        """Tool sessions are initialized only when the request carries tools."""
        # No tools: init_tool_sessions must not be invoked.
        empty_request = ResponsesRequest(input="test input", tools=[])
        await serving_responses_instance._initialize_tool_sessions(
            empty_request, mock_context, mock_exit_stack
        )
        assert mock_context.init_tool_sessions_called is False

        # With tools present, initialization should happen.
        tool_specs = [
            {"type": "web_search_preview"},
            {"type": "code_interpreter", "container": {"type": "auto"}},
        ]
        tooled_request = ResponsesRequest(input="test input", tools=tool_specs)
        await serving_responses_instance._initialize_tool_sessions(
            tooled_request, mock_context, mock_exit_stack
        )
        assert mock_context.init_tool_sessions_called

    def test_validate_create_responses_input(
        self, serving_responses_instance, mock_context, mock_exit_stack
    ):
        """A request carrying both previous_input_messages and a
        previous_response_id is reported as an invalid request."""
        request = ResponsesRequest(
            input="test input",
            previous_input_messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "What is my horoscope? I am an Aquarius.",
                        }
                    ],
                }
            ],
            previous_response_id="lol",
        )
        error = serving_responses_instance._validate_create_responses_input(request)
        assert error is not None
        assert error.error.type == "invalid_request_error"
|
||||
|
||||
|
||||
class TestValidateGeneratorInput:
    """Tests for OpenAIServingResponses._validate_generator_input."""

    @pytest_asyncio.fixture
    async def serving_responses_instance(self):
        """Build a real OpenAIServingResponses (no tool server), max_model_len=100."""
        engine_client = MagicMock()

        model_config = MagicMock()
        model_config.hf_config.model_type = "test"
        model_config.get_diff_sampling_param.return_value = {}
        engine_client.model_config = model_config

        engine_client.input_processor = MagicMock()
        engine_client.io_processor = MagicMock()

        instance = OpenAIServingResponses(
            engine_client=engine_client,
            models=MagicMock(),
            request_logger=None,
            chat_template=None,
            chat_template_content_format="auto",
        )
        # Small context window so the over-length case is easy to trigger.
        instance.max_model_len = 100
        return instance

    def test_validate_generator_input(self, serving_responses_instance):
        """Prompts within max_model_len pass; longer ones yield an ErrorResponse."""
        # 5 tokens < 100 max_model_len -> accepted (None means "no error").
        short_prompt = TokensPrompt(prompt_token_ids=list(range(5)))
        result = serving_responses_instance._validate_generator_input(short_prompt)
        assert result is None

        # 200 tokens >= 100 max_model_len -> rejected.
        long_prompt = TokensPrompt(prompt_token_ids=list(range(200)))
        result = serving_responses_instance._validate_generator_input(long_prompt)
        assert result is not None
        assert isinstance(result, ErrorResponse)
|
||||
|
||||
|
||||
class TestExtractAllowedToolsFromMcpRequests:
    """Tests for the _extract_allowed_tools_from_mcp_requests helper."""

    def test_extract_allowed_tools_basic_formats(self):
        """List-form, object-form, and absent filters all map to the right values."""
        from openai.types.responses.tool import McpAllowedToolsMcpToolFilter

        tools = [
            # List format.
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["tool1", "tool2"],
            ),
            # Object format.
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=McpAllowedToolsMcpToolFilter(
                    tool_names=["tool3", "tool4"]
                ),
            ),
            # None means "no filter".
            Mcp(
                type="mcp",
                server_label="server3",
                allowed_tools=None,
            ),
        ]
        assert _extract_allowed_tools_from_mcp_requests(tools) == {
            "server1": ["tool1", "tool2"],
            "server2": ["tool3", "tool4"],
            "server3": None,
        }

    def test_extract_allowed_tools_star_normalization(self):
        """The '*' wildcard normalizes to None (select all tools).

        This is the key test requested by reviewers to explicitly demonstrate
        that the "*" select-all scenario is handled correctly.
        """
        from openai.types.responses.tool import McpAllowedToolsMcpToolFilter

        tools = [
            # "*" alone in list format.
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["*"],
            ),
            # "*" mixed with explicit names still means "everything".
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=["tool1", "*"],
            ),
            # "*" in object format.
            Mcp(
                type="mcp",
                server_label="server3",
                allowed_tools=McpAllowedToolsMcpToolFilter(tool_names=["*"]),
            ),
        ]
        # All should be normalized to None (allows all tools).
        assert _extract_allowed_tools_from_mcp_requests(tools) == {
            "server1": None,
            "server2": None,
            "server3": None,
        }

    def test_extract_allowed_tools_filters_non_mcp(self):
        """Non-MCP tools in the list are skipped entirely."""
        tools = [
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["tool1"],
            ),
            LocalShell(type="local_shell"),  # should be ignored
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=["tool2"],
            ),
        ]
        assert _extract_allowed_tools_from_mcp_requests(tools) == {
            "server1": ["tool1"],
            "server2": ["tool2"],
        }
|
||||
262
tests/entrypoints/openai/test_serving_tokens.py
Normal file
262
tests/entrypoints/openai/test_serving_tokens.py
Normal file
@@ -0,0 +1,262 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.v1.engine.detokenizer import check_stop_strings
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
GEN_ENDPOINT = "/inference/v1/generate"
|
||||
|
||||
|
||||
def get_vocab_size(model_name):
    """Resolve the vocabulary size of *model_name* via vLLM's ModelConfig."""
    return ModelConfig(
        model=model_name,
        seed=0,
        dtype="bfloat16",
    ).get_vocab_size()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def tokenizer():
    """Module-scoped HF tokenizer for MODEL_NAME (loaded once per module)."""
    return AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def messages():
    """A fixed two-turn chat used by every test in this module."""
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How many countries are in the EU?"},
    ]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server(request):
    """Launch a remote vLLM server; extra CLI args come via indirect parametrize."""
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]

    extra = getattr(request, "param", None)
    if extra is not None:
        # Accept either a list/tuple of args or a single scalar.
        if isinstance(extra, (list, tuple)):
            args = args + list(extra)
        else:
            args = args + [str(extra)]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Async HTTP client pointed at the test server (UDS-aware)."""
    # Use a Unix-domain-socket transport when the server exposes one.
    transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None
    async with httpx.AsyncClient(
        transport=transport,
        base_url=server.url_root,
        timeout=600,
        headers={"Authorization": f"Bearer {server.DUMMY_API_KEY}"},
    ) as http_client:
        yield http_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_generate_endpoint(client):
    """Smoke test: the generate endpoint accepts raw token ids and returns choices."""
    payload = {
        "model": MODEL_NAME,
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }
    resp = await client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    assert "choices" in resp.json()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_same_response_as_chat_completions(client, tokenizer, messages):
    """The generate endpoint must decode to the same text as /v1/chat/completions."""
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    for ignore_eos in (True, False):
        gen_payload = {
            "model": MODEL_NAME,
            "token_ids": token_ids,
            "sampling_params": {
                "max_tokens": 24,
                "temperature": 0.0,
                # NOTE coordinator will set this to skip detokenization
                "detokenize": False,
                "ignore_eos": ignore_eos,
            },
            "stream": False,
        }
        gen_data = (await client.post(GEN_ENDPOINT, json=gen_payload)).json()
        gen_text = tokenizer.decode(
            gen_data["choices"][0]["token_ids"], skip_special_tokens=True
        )

        chat_payload = {
            "model": MODEL_NAME,
            "messages": messages,
            "max_tokens": 24,
            "temperature": 0.0,
            "stream": False,
            "ignore_eos": ignore_eos,
            "chat_template_kwargs": dict(enable_thinking=False),
        }
        chat_resp = await client.post("/v1/chat/completions", json=chat_payload)
        chat_text = chat_resp.json()["choices"][0]["message"]["content"]

        assert gen_text == chat_text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_stop_string_workflow(client, tokenizer, messages):
    """Stop strings are rejected with detokenize=False; truncation happens client-side."""
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    payload = {
        "model": MODEL_NAME,
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
            # stop strings are only supported when detokenize is True,
            # so this request must be rejected.
            "stop": ["27 member"],
        },
        # TODO stream test is much more interesting
        "stream": False,
    }
    with pytest.raises(httpx.HTTPStatusError):
        bad_resp = await client.post(GEN_ENDPOINT, json=payload)
        bad_resp.raise_for_status()

    # Dropping the stop list makes the same request valid.
    payload["sampling_params"]["stop"] = None
    generate_resp = await client.post(
        GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
    )
    generate_data = generate_resp.json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )

    # NOTE This is under the responsibility of the coordinator
    # stop_checker = StopChecker(
    #     max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
    # )
    stop_str, truncate_to = check_stop_strings(
        generate_res, len(generate_res), ["27 member"], False
    )
    assert stop_str == "27 member"
    # abort request that hit stop string (requires tokens-only mode)
    # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]})  # noqa: E501
    # res.raise_for_status()
    generate_res = generate_res[:truncate_to]

    # Get stop_str response from chat completions for comparison.
    chat_payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "stop": ["27 member"],
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=chat_payload)
    completions_res = completions_resp.json()["choices"][0]["message"]["content"]
    assert generate_res == completions_res
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "server",
    [
        [
            "--enable-lora",
            "--lora-modules",
            "Alice=charent/self_cognition_Alice",
            "Bob=charent/self_cognition_Bob",
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]
    ],
    indirect=True,
)
async def test_generate_with_lora_adapter(client, tokenizer, messages):
    """LoRA adapters are listed, usable via generate, and match chat completions."""
    # The configured adapters must appear in /v1/models.
    models_resp = await client.get("/v1/models")
    models_resp.raise_for_status()
    served = {m["id"] for m in models_resp.json().get("data", [])}
    assert {"Alice", "Bob"}.issubset(served)

    # Generate using a LoRA adapter by specifying its name as the model.
    smoke_payload = {
        "model": "Alice",
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }
    resp = await client.post(GEN_ENDPOINT, json=smoke_payload)
    resp.raise_for_status()
    assert "choices" in resp.json()

    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    gen_payload = {
        "model": "Alice",
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
        },
        "stream": False,
    }
    generate_data = (await client.post(GEN_ENDPOINT, json=gen_payload)).json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )

    chat_payload = {
        "model": "Alice",
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=chat_payload)
    completions_res = completions_resp.json()["choices"][0]["message"]["content"]

    assert generate_res == completions_res
|
||||
93
tests/entrypoints/openai/test_shutdown.py
Normal file
93
tests/entrypoints/openai/test_shutdown.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_shutdown_on_engine_failure():
    """Verify that API returns connection error when server process is killed.

    Starts a vLLM server, kills it to simulate a crash, then verifies that
    subsequent API calls fail appropriately.
    """
    port = get_open_port()

    # dtype, max-len etc set so that this can run in CI
    server_cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        MODEL_NAME,
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "128",
        "--enforce-eager",
        "--port",
        str(port),
        "--gpu-memory-utilization",
        "0.05",
        "--max-num-seqs",
        "2",
        "--disable-frontend-multiprocessing",
    ]
    proc = subprocess.Popen(
        server_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Shield the child from any SIGINT the test runner receives.
        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
    )

    # Wait for server startup
    start_time = time.time()
    client = openai.AsyncOpenAI(
        base_url=f"http://localhost:{port}/v1",
        api_key="dummy",
        max_retries=0,
        timeout=10,
    )

    # Poll until the server answers, dies, or 30 seconds elapse.
    while time.time() - start_time < 30:
        try:
            await client.completions.create(
                model=MODEL_NAME, prompt="Hello", max_tokens=1
            )
            break
        except Exception:
            time.sleep(0.5)
            if proc.poll() is not None:
                stdout, stderr = proc.communicate(timeout=1)
                pytest.fail(
                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
                )
    else:
        proc.terminate()
        proc.wait(timeout=5)
        pytest.fail("Server failed to start in 30 seconds")

    # Kill server to simulate crash
    proc.terminate()
    time.sleep(1)

    # Verify API calls now fail
    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
        await client.completions.create(
            model=MODEL_NAME, prompt="This should fail", max_tokens=1
        )

    return_code = proc.wait(timeout=5)
    assert return_code is not None
|
||||
110
tests/entrypoints/openai/test_sleep.py
Normal file
110
tests/entrypoints/openai/test_sleep.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import requests
|
||||
from prometheus_client.parser import text_string_to_metric_families
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B"
|
||||
|
||||
|
||||
def test_sleep_mode():
    """Exercise sleep/wake_up endpoints and the engine_sleep_state metrics."""
    # dtype, max-len etc set so that this can run in CI
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enable-sleep-mode",
    ]

    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
        env_dict={"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"},
    ) as remote_server:
        # Put the engine to sleep (level 1 offloads weights).
        sleep_resp = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert sleep_resp.status_code == 200
        status = requests.get(remote_server.url_for("is_sleeping"))
        assert status.status_code == 200
        assert status.json().get("is_sleeping") is True

        # check sleep metrics
        metrics = requests.get(remote_server.url_for("metrics"))
        assert metrics.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(metrics)
        assert awake == 0
        assert weights_offloaded == 1
        assert discard_all == 0

        # A plain wake_up restores everything.
        wake_resp = requests.post(remote_server.url_for("wake_up"))
        assert wake_resp.status_code == 200
        status = requests.get(remote_server.url_for("is_sleeping"))
        assert status.status_code == 200
        assert status.json().get("is_sleeping") is False

        # check sleep metrics
        metrics = requests.get(remote_server.url_for("metrics"))
        assert metrics.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(metrics)
        assert awake == 1
        assert weights_offloaded == 0
        assert discard_all == 0

        # Test wake up with tags: sleep again first.
        sleep_resp = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert sleep_resp.status_code == 200

        wake_resp = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["weights"]}
        )
        assert wake_resp.status_code == 200

        # Waking only the weights leaves the engine still reported as sleeping.
        status = requests.get(remote_server.url_for("is_sleeping"))
        assert status.status_code == 200
        assert status.json().get("is_sleeping") is True

        wake_resp = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["kv_cache"]}
        )
        assert wake_resp.status_code == 200

        # Once every tag is awake, is_sleeping flips to False.
        status = requests.get(remote_server.url_for("is_sleeping"))
        assert status.status_code == 200
        assert status.json().get("is_sleeping") is False

        # check sleep metrics
        metrics = requests.get(remote_server.url_for("metrics"))
        assert metrics.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(metrics)
        assert awake == 1
        assert weights_offloaded == 0
        assert discard_all == 0
|
||||
|
||||
|
||||
def _get_sleep_metrics_from_api(response: requests.Response):
    """Parse a /metrics scrape and return (awake, weights_offloaded, discard_all)."""
    awake = weights_offloaded = discard_all = None

    for family in text_string_to_metric_families(response.text):
        if family.name != "vllm:engine_sleep_state":
            continue
        for sample in family.samples:
            if sample.name != "vllm:engine_sleep_state":
                continue
            # The state is encoded as a label value on each sample.
            for label_value in sample.labels.values():
                if label_value == "awake":
                    awake = sample.value
                elif label_value == "weights_offloaded":
                    weights_offloaded = sample.value
                elif label_value == "discard_all":
                    discard_all = sample.value

    # Every state gauge must be present in the scrape.
    assert awake is not None
    assert weights_offloaded is not None
    assert discard_all is not None

    return awake, weights_offloaded, discard_all
|
||||
342
tests/entrypoints/openai/test_sparse_tensor_validation.py
Normal file
342
tests/entrypoints/openai/test_sparse_tensor_validation.py
Normal file
@@ -0,0 +1,342 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Sparse tensor validation in embedding APIs.
|
||||
|
||||
Tests verify that malicious sparse tensors are rejected before they can trigger
|
||||
out-of-bounds memory writes during to_dense() operations.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import io
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.entrypoints.renderer import CompletionRenderer
|
||||
from vllm.multimodal.audio import AudioEmbeddingMediaIO
|
||||
from vllm.multimodal.image import ImageEmbeddingMediaIO
|
||||
|
||||
|
||||
def _encode_tensor(tensor: torch.Tensor) -> bytes:
|
||||
"""Helper to encode a tensor as base64 bytes."""
|
||||
buffer = io.BytesIO()
|
||||
torch.save(tensor, buffer)
|
||||
buffer.seek(0)
|
||||
return base64.b64encode(buffer.read())
|
||||
|
||||
|
||||
def _create_malicious_sparse_tensor() -> torch.Tensor:
|
||||
"""
|
||||
Create a malicious sparse COO tensor with out-of-bounds indices.
|
||||
|
||||
This tensor has indices that point beyond the declared shape, which would
|
||||
cause an out-of-bounds write when converted to dense format without
|
||||
validation.
|
||||
"""
|
||||
# Create a 3x3 sparse tensor but with indices pointing to (10, 10)
|
||||
indices = torch.tensor([[10], [10]]) # Out of bounds for 3x3 shape
|
||||
values = torch.tensor([1.0])
|
||||
shape = (3, 3)
|
||||
|
||||
# Create sparse tensor (this will be invalid)
|
||||
sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
|
||||
return sparse_tensor
|
||||
|
||||
|
||||
def _create_valid_sparse_tensor() -> torch.Tensor:
|
||||
"""Create a valid sparse COO tensor for baseline testing."""
|
||||
indices = torch.tensor([[0, 1, 2], [0, 1, 2]])
|
||||
values = torch.tensor([1.0, 2.0, 3.0])
|
||||
shape = (3, 3)
|
||||
|
||||
sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
|
||||
return sparse_tensor
|
||||
|
||||
|
||||
def _create_valid_dense_tensor() -> torch.Tensor:
|
||||
"""Create a valid dense tensor for baseline testing."""
|
||||
return torch.randn(10, 768, dtype=torch.float32) # (seq_len, hidden_size)
|
||||
|
||||
|
||||
class TestPromptEmbedsValidation:
    """Test sparse tensor validation in prompt embeddings (Completions API)."""

    def test_valid_dense_tensor_accepted(self, model_config):
        """Baseline: Valid dense tensors should work normally."""
        renderer = CompletionRenderer(model_config)
        dense = _create_valid_dense_tensor()

        # A well-formed dense tensor must load without raising.
        loaded = renderer.load_prompt_embeds(_encode_tensor(dense))
        assert len(loaded) == 1
        assert loaded[0]["prompt_embeds"].shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should load successfully."""
        # NOTE(review): this test exercises the image-embedding IO path even
        # though the class covers prompt embeds -- confirm placement is intended.
        handler = ImageEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()

        # Valid sparse input loads cleanly (sparse tensors remain sparse here).
        loaded = handler.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.shape == sparse.shape

    def test_malicious_sparse_tensor_rejected(self, model_config):
        """Security: Malicious sparse tensors should be rejected."""
        renderer = CompletionRenderer(model_config)
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        # The loader must fail before any out-of-bounds write can happen.
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            renderer.load_prompt_embeds(payload)

        # The failure must point at sparse-tensor / index validation.
        message = str(exc_info.value).lower()
        assert any(word in message for word in ("sparse", "index", "bounds"))

    def test_extremely_large_indices_rejected(self, model_config):
        """Security: Sparse tensors with extremely large indices should be rejected."""
        renderer = CompletionRenderer(model_config)

        # Indices far beyond the declared 10x10 shape.
        oob_tensor = torch.sparse_coo_tensor(
            torch.tensor([[999999], [999999]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )

        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(oob_tensor))

    def test_negative_indices_rejected(self, model_config):
        """Security: Sparse tensors with negative indices should be rejected."""
        renderer = CompletionRenderer(model_config)

        negative_tensor = torch.sparse_coo_tensor(
            torch.tensor([[-1], [-1]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )

        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(negative_tensor))
|
||||
|
||||
|
||||
class TestImageEmbedsValidation:
    """Test sparse tensor validation in image embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: Valid dense tensors should work normally."""
        io_handler = ImageEmbeddingMediaIO()

        valid_tensor = _create_valid_dense_tensor()
        encoded = _encode_tensor(valid_tensor)

        # Should not raise any exception
        result = io_handler.load_base64("", encoded.decode("utf-8"))
        assert result.shape == valid_tensor.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should load successfully."""
        # Fixed: the original instantiated AudioEmbeddingMediaIO here -- a
        # copy-paste slip, since this class exercises the *image* path.
        io_handler = ImageEmbeddingMediaIO()

        valid_sparse = _create_valid_sparse_tensor()
        encoded = _encode_tensor(valid_sparse)

        # Should not raise any exception (sparse tensors remain sparse)
        result = io_handler.load_base64("", encoded.decode("utf-8"))
        assert result.shape == valid_sparse.shape

    def test_malicious_sparse_tensor_rejected(self):
        """Security: Malicious sparse tensors should be rejected."""
        io_handler = ImageEmbeddingMediaIO()

        malicious_tensor = _create_malicious_sparse_tensor()
        encoded = _encode_tensor(malicious_tensor)

        # Should raise RuntimeError due to invalid sparse tensor
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            io_handler.load_base64("", encoded.decode("utf-8"))

        # Error should indicate sparse tensor validation failure.
        error_msg = str(exc_info.value).lower()
        assert "sparse" in error_msg or "index" in error_msg or "bounds" in error_msg

    def test_load_bytes_validates(self):
        """Security: Validation should also work for load_bytes method."""
        io_handler = ImageEmbeddingMediaIO()

        # Serialize the malicious tensor directly, bypassing base64.
        malicious_tensor = _create_malicious_sparse_tensor()
        buffer = io.BytesIO()
        torch.save(malicious_tensor, buffer)
        buffer.seek(0)

        with pytest.raises((RuntimeError, ValueError)):
            io_handler.load_bytes(buffer.read())
|
||||
|
||||
|
||||
class TestAudioEmbedsValidation:
    """Test sparse tensor validation in audio embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: Valid dense tensors should work normally."""
        handler = AudioEmbeddingMediaIO()
        dense = _create_valid_dense_tensor()

        # A well-formed dense tensor must round-trip without raising.
        loaded = handler.load_base64("", _encode_tensor(dense).decode("utf-8"))
        assert loaded.shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should be converted successfully."""
        handler = AudioEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()

        # Loading succeeds and the audio path densifies the tensor.
        loaded = handler.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.is_sparse is False

    def test_malicious_sparse_tensor_rejected(self):
        """Security: Malicious sparse tensors should be rejected."""
        handler = AudioEmbeddingMediaIO()
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        # Must fail before any out-of-bounds write can happen.
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            handler.load_base64("", payload.decode("utf-8"))

        # The failure must point at sparse-tensor / index validation.
        message = str(exc_info.value).lower()
        assert any(word in message for word in ("sparse", "index", "bounds"))

    def test_load_bytes_validates(self):
        """Security: Validation should also work for load_bytes method."""
        handler = AudioEmbeddingMediaIO()

        # Serialize the malicious tensor directly, bypassing base64.
        raw = io.BytesIO()
        torch.save(_create_malicious_sparse_tensor(), raw)
        raw.seek(0)

        with pytest.raises((RuntimeError, ValueError)):
            handler.load_bytes(raw.read())
|
||||
|
||||
|
||||
class TestSparseTensorValidationIntegration:
    """
    These tests verify the complete attack chain is blocked at all entry points.
    """

    def test_attack_scenario_completions_api(self, model_config):
        """
        Simulate a complete attack through the Completions API.

        Attack scenario:
        1. Attacker crafts malicious sparse tensor
        2. Encodes it as base64
        3. Sends to /v1/completions with prompt_embeds parameter
        4. Server should reject before memory corruption occurs
        """
        renderer = CompletionRenderer(model_config)

        # Steps 1-2: craft and encode the payload.
        attack_payload = _encode_tensor(_create_malicious_sparse_tensor())

        # Steps 3-4: the server-side loader must reject it.
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(attack_payload)

    def test_attack_scenario_chat_api_image(self):
        """
        Simulate attack through Chat API with image_embeds.

        Verifies the image embeddings path is protected.
        """
        handler = ImageEmbeddingMediaIO()
        attack_payload = _encode_tensor(_create_malicious_sparse_tensor())

        with pytest.raises((RuntimeError, ValueError)):
            handler.load_base64("", attack_payload.decode("utf-8"))

    def test_attack_scenario_chat_api_audio(self):
        """
        Simulate attack through Chat API with audio_embeds.

        Verifies the audio embeddings path is protected.
        """
        handler = AudioEmbeddingMediaIO()
        attack_payload = _encode_tensor(_create_malicious_sparse_tensor())

        with pytest.raises((RuntimeError, ValueError)):
            handler.load_base64("", attack_payload.decode("utf-8"))

    def test_multiple_valid_embeddings_in_batch(self, model_config):
        """
        Regression test: Multiple valid embeddings should still work.

        Ensures the fix doesn't break legitimate batch processing.
        """
        renderer = CompletionRenderer(model_config)

        # Three independent, well-formed dense embeddings.
        batch = [_encode_tensor(_create_valid_dense_tensor()) for _ in range(3)]

        loaded = renderer.load_prompt_embeds(batch)
        assert len(loaded) == 3

    def test_mixed_valid_and_malicious_rejected(self, model_config):
        """
        Security: Batch with one malicious tensor should be rejected.

        Even if most tensors are valid, a single malicious one should
        cause rejection of the entire batch.
        """
        renderer = CompletionRenderer(model_config)

        mixed_batch = [
            _encode_tensor(_create_valid_dense_tensor()),
            _encode_tensor(_create_malicious_sparse_tensor()),  # Malicious
            _encode_tensor(_create_valid_dense_tensor()),
        ]

        # The single bad entry must sink the whole batch.
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(mixed_batch)
|
||||
|
||||
|
||||
# Pytest fixtures
|
||||
# Pytest fixtures
@pytest.fixture
def model_config():
    """Mock ModelConfig for testing."""
    # Imported locally so collecting this module does not require a fully
    # initialized vllm.config at import time.
    from vllm.config import ModelConfig

    return ModelConfig(
        model="facebook/opt-125m",
        tokenizer="facebook/opt-125m",
        tokenizer_mode="auto",
        trust_remote_code=False,
        dtype="float32",
        seed=0,
        enable_prompt_embeds=True,  # Required for prompt embeds tests
    )
|
||||
105
tests/entrypoints/openai/test_tensorizer_entrypoint.py
Normal file
105
tests/entrypoints/openai/test_tensorizer_entrypoint.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch.cuda
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.model_executor.model_loader.tensorizer import (
|
||||
TensorizerConfig,
|
||||
tensorize_lora_adapter,
|
||||
tensorize_vllm_model,
|
||||
)
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
|
||||
LORA_PATH = "davzoku/finqa_adapter_1b"
|
||||
|
||||
|
||||
def _cleanup():
    """Collect Python garbage and release cached CUDA memory between tests."""
    gc.collect()
    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def cleanup():
    """Autouse fixture: free memory before every test in this module."""
    _cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def tmp_dir():
    """Module-scoped scratch directory, removed when the module finishes."""
    with tempfile.TemporaryDirectory() as path:
        yield path
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def model_uri(tmp_dir):
    """Path where the tensorized model weights are written and read back."""
    yield f"{tmp_dir}/model.tensors"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def tensorize_model_and_lora(tmp_dir, model_uri):
    """Serialize the test model and its LoRA adapter with Tensorizer."""
    config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
    engine_args = EngineArgs(model=MODEL_NAME)

    tensorize_lora_adapter(LORA_PATH, config)
    tensorize_vllm_model(engine_args, config)

    # Manually invoke a _cleanup() here, as the cleanup()
    # fixture won't be guaranteed to be called after this
    # when this fixture is used for a test
    _cleanup()
    yield
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
    """Launch an OpenAI-compatible server that loads the tensorized model."""
    # In this case, model_uri is a directory with a model.tensors
    # file and all necessary model artifacts, particularly a
    # HF `config.json` file. In this case, Tensorizer can infer the
    # `TensorizerConfig` so --model-loader-extra-config can be completely
    # omitted.

    ## Start OpenAI API server
    cli_args = [
        "--load-format",
        "tensorizer",
        "--served-model-name",
        MODEL_NAME,
        "--enable-lora",
    ]

    serving_dir = os.path.dirname(model_uri)
    with RemoteOpenAIServer(serving_dir, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the test server, closed after the test."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Smoke-test a single completion against the tensorizer-loaded model."""
    _cleanup()
    completion = await client.completions.create(
        model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
    )

    assert completion.id is not None
    # Exactly one choice is expected (the duplicate
    # `len(completion.choices) == 1` assertion was removed).
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.model == MODEL_NAME
    assert len(completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    # Usage accounting: 6 prompt tokens + 5 generated tokens.
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11
    )
|
||||
74
tests/entrypoints/openai/test_token_in_token_out.py
Normal file
74
tests/entrypoints/openai/test_token_in_token_out.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start a server over a model snapshot with tokenizer files stripped.

    Tokenizer/vocab files are deliberately excluded from the download so the
    server must run with --skip-tokenizer-init (token-in/token-out mode).
    """
    # Rebind the module-level MODEL_PATH so the test body can decode against
    # the same downloaded snapshot the server serves.
    global MODEL_PATH
    MODEL_PATH = download_weights_from_hf(
        MODEL_NAME,
        allow_patterns=["*"],
        cache_dir=MODEL_PATH,
        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"],
    )
    args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        "--skip-tokenizer-init",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
    """
    Test token-in-token-out and token_ids align with prompt_logprobs
    & logprobs when return_tokens_as_token_ids is enabled.
    """
    # The server runs with --skip-tokenizer-init, so encoding/decoding
    # happens client-side with the reference tokenizer.
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    text = "Hello, world! How are you today?"
    token_ids = tokenizer.encode(text)
    async with server.get_async_client() as client:
        # Test with both return_token_ids and return_tokens_as_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_PATH,
            prompt=token_ids,
            max_tokens=20,
            temperature=0,
            echo=True,
            extra_body={
                "return_token_ids": True,
            },
        )

        # Verify all fields are present
        assert (
            completion.choices[0].token_ids is not None
            and 0 < len(completion.choices[0].token_ids) <= 20
        )
        assert completion.choices[0].prompt_token_ids is not None

        # Decode prompt tokens
        if completion.choices[0].prompt_token_ids:
            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
            # The decoded prompt should match or close to original prompt
            assert prompt_text == text
|
||||
355
tests/entrypoints/openai/test_tokenization.py
Normal file
355
tests/entrypoints/openai/test_tokenization.py
Normal file
@@ -0,0 +1,355 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Module-scoped zephyr server with the tokenizer-info endpoint enabled."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        "--enable-tokenizer-info-endpoint",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def tokenizer_name(request):
    """Resolve the tokenizer name supplied via indirect parametrization.

    Fixed: the original declared ``model_name`` as a fixture argument, but a
    module-scoped fixture cannot depend on the function-scoped parametrized
    ``model_name`` value (pytest raises ScopeMismatch). With
    ``indirect=["tokenizer_name"]`` the parametrized value is delivered as
    ``request.param`` instead.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the tokenization server, closed after use."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """The /tokenize endpoint must agree with the HF tokenizer for plain prompts."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    prompt = "vllm1 This is a test prompt."

    # Exercise both with and without special tokens.
    for add_special in (False, True):
        expected = tokenizer.encode(prompt, add_special_tokens=add_special)

        response = requests.post(
            server.url_for("tokenize"),
            json={
                "add_special_tokens": add_special,
                "model": model_name,
                "prompt": prompt,
            },
        )
        response.raise_for_status()

        payload = response.json()
        assert payload["tokens"] == expected
        assert payload["count"] == len(expected)
        assert payload["max_model_len"] == 8192
        assert payload["token_strs"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """The /tokenize endpoint must match the tokenizer's chat-template output."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)

    # Sweep the valid combinations of generation-prompt / special-token /
    # continue-final-message flags.
    for add_generation in [False, True]:
        for add_special in [False, True]:
            # Rebuilt per iteration because the continue_final branch below
            # appends to it.
            conversation = [
                {"role": "user", "content": "Hi there!"},
                {"role": "assistant", "content": "Nice to meet you!"},
                {"role": "user", "content": "Can I ask a question? vllm1"},
            ]
            for continue_final in [False, True]:
                # add_generation_prompt and continue_final_message are
                # mutually exclusive, so skip that combination.
                if add_generation and continue_final:
                    continue
                if continue_final:
                    conversation.append({"role": "assistant", "content": "Sure,"})

                # Reference rendering + encoding done client-side.
                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                    },
                )
                response.raise_for_status()

                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                assert result["max_model_len"] == 8192
                assert result["token_strs"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat_with_tools(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Like test_tokenize_chat, but with a tool definition in the template."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)

    for add_generation in [False, True]:
        for add_special in [False, True]:
            # Rebuilt per iteration because the continue_final branch below
            # appends to it.
            conversation = [
                {
                    "role": "user",
                    "content": "What's the weather like in Paris today?",
                }
            ]

            # Minimal OpenAI-style tool schema passed through to the template.
            tools = [
                {
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "parameters": {
                            "type": "object",
                            "properties": {"location": {"type": "string"}},
                        },
                    },
                }
            ]

            for continue_final in [False, True]:
                # add_generation_prompt and continue_final_message are
                # mutually exclusive, so skip that combination.
                if add_generation and continue_final:
                    continue
                if continue_final:
                    conversation.append({"role": "assistant", "content": "Sure,"})

                # Reference rendering + encoding done client-side.
                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tools=tools,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                        "tools": tools,
                    },
                )
                response.raise_for_status()

                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                assert result["max_model_len"] == 8192
                assert result["token_strs"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_with_return_token_strs(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """return_token_strs should add per-token strings to the response."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    prompt = "This is a token_strs test prompt! vllm1"

    response = requests.post(
        server.url_for("tokenize"),
        json={"prompt": prompt, "model": model_name, "return_token_strs": True},
    )
    response.raise_for_status()

    # Expected ids and their string forms from the reference tokenizer.
    expected_ids = tokenizer.encode(prompt, add_special_tokens=True)
    expected_strs = tokenizer.convert_ids_to_tokens(expected_ids)

    payload = response.json()
    assert payload["tokens"] == expected_ids
    assert payload["count"] == len(expected_ids)
    assert payload["max_model_len"] == 8192
    assert payload["token_strs"] == expected_strs
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_detokenize(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """The /detokenize endpoint must invert the tokenizer's encode."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    prompt = "This is a test prompt. vllm1"

    encoded = tokenizer.encode(prompt, add_special_tokens=False)
    response = requests.post(
        server.url_for("detokenize"), json={"model": model_name, "tokens": encoded}
    )
    response.raise_for_status()

    # Round-trip must reproduce the original prompt exactly.
    assert response.json() == {"prompt": prompt}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenizer_info_basic(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Test basic tokenizer info endpoint functionality."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()

    info = response.json()
    # The endpoint must at least report a non-empty tokenizer class name.
    assert "tokenizer_class" in info
    assert isinstance(info["tokenizer_class"], str)
    assert info["tokenizer_class"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
    """Test that the response matches expected schema types."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    info = response.json()

    # Fields are optional, but when present (and non-null) they must have
    # these types.
    expected_types = {
        "add_bos_token": bool,
        "add_prefix_space": bool,
        "clean_up_tokenization_spaces": bool,
        "split_special_tokens": bool,
        "bos_token": str,
        "eos_token": str,
        "pad_token": str,
        "unk_token": str,
        "chat_template": str,
        "errors": str,
        "model_max_length": int,
        "additional_special_tokens": list,
        "added_tokens_decoder": dict,
    }
    for field, expected_type in expected_types.items():
        value = info.get(field)
        if value is not None:
            assert isinstance(value, expected_type), (
                f"{field} should be {expected_type.__name__}"
            )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tokenizer_info_added_tokens_structure(
    server: RemoteOpenAIServer,
):
    """Test added_tokens_decoder structure if present."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()

    decoder = response.json().get("added_tokens_decoder")
    if decoder:
        for token_id, token_info in decoder.items():
            # JSON object keys are always strings.
            assert isinstance(token_id, str), "Token IDs should be strings"
            assert isinstance(token_info, dict), "Token info should be a dict"
            assert "content" in token_info, "Token info should have content"
            assert "special" in token_info, "Token info should have special flag"
            assert isinstance(token_info["special"], bool), (
                "Special flag should be boolean"
            )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tokenizer_info_consistency_with_tokenize(
    server: RemoteOpenAIServer,
):
    """Test that tokenizer info is consistent with tokenization endpoint."""
    info_response = requests.get(server.url_for("tokenizer_info"))
    info_response.raise_for_status()
    info = info_response.json()

    tokenize_response = requests.post(
        server.url_for("tokenize"),
        json={"model": MODEL_NAME, "prompt": "Hello world!"},
    )
    tokenize_response.raise_for_status()
    tokenize_result = tokenize_response.json()

    # Both endpoints report a maximum length; the tokenizer info value
    # must not be smaller than what /tokenize enforces.
    info_max_len = info.get("model_max_length")
    tokenize_max_len = tokenize_result.get("max_model_len")
    if info_max_len and tokenize_max_len:
        assert info_max_len >= tokenize_max_len, (
            "Info max length should be >= tokenize max length"
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
    """Test chat template is properly included."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()

    template = response.json().get("chat_template")
    if template:
        assert isinstance(template, str), "Chat template should be a string"
        assert template.strip(), "Chat template should not be empty"
|
||||
100
tests/entrypoints/openai/test_transcription_validation.py
Normal file
100
tests/entrypoints/openai/test_transcription_validation.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode",
|
||||
"mistral",
|
||||
"--config_format",
|
||||
"mistral",
|
||||
"--load_format",
|
||||
"mistral",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
    """Transcribe a known clip and check the text and usage accounting."""
    server_args = ["--enforce-eager"]

    # Mistral-format checkpoints need the mistral tokenizer/config/load flags.
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS

    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Fixed: use the client as an async context manager so its connection
        # pool is closed (the original never closed the client).
        async with remote_server.get_async_client() as client:
            transcription = await client.audio.transcriptions.create(
                model=model_name,
                file=mary_had_lamb,
                language="en",
                response_format="text",
                temperature=0.0,
            )
        out = json.loads(transcription)
        out_text = out["text"]
        out_usage = out["usage"]
        assert "Mary had a little lamb," in out_text
        assert out_usage["seconds"] == 16, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
    model_name = "ibm-granite/granite-speech-3.3-2b"
    lora_model_name = "speech"
    server_args = [
        "--enforce-eager",
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "1",
    ]

    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Fixed: close the async client via `async with` (the original leaked
        # its connection pool by never closing it).
        async with remote_server.get_async_client() as client:
            # Requesting the LoRA-served name routes through the adapter.
            transcription = await client.audio.transcriptions.create(
                model=lora_model_name,
                file=mary_had_lamb,
                language="en",
                response_format="text",
                temperature=0.0,
            )
        out = json.loads(transcription)
        out_text = out["text"]
        out_usage = out["usage"]
        assert "mary had a little lamb" in out_text
        assert out_usage["seconds"] == 16, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo):
    """Transcribe an Italian clip with Gemma and spot-check the output."""
    # Gemma accuracy on some of the audio samples we use is particularly bad,
    # hence we use a different one here. WER is evaluated separately.
    model_name = "google/gemma-3n-E2B-it"
    server_args = ["--enforce-eager"]

    with RemoteOpenAIServer(
        model_name, server_args, max_wait_seconds=480
    ) as remote_server:
        # Fixed: close the async client via `async with` (the original leaked
        # its connection pool by never closing it).
        async with remote_server.get_async_client() as client:
            transcription = await client.audio.transcriptions.create(
                model=model_name,
                file=foscolo,
                language="it",
                response_format="text",
                temperature=0.0,
            )
        out = json.loads(transcription)["text"]
        assert "da cui vergine nacque Venere" in out
|
||||
@@ -0,0 +1,246 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for structured outputs tests
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    # One shared Whisper server per module keeps CI time and GPU memory bounded.
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def whisper_client(server):
    # Fresh async client per test; closed automatically on fixture teardown.
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_basic_audio(whisper_client, mary_had_lamb):
    """Transcribe a short clip and check both the text and usage accounting."""
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    # response_format="text" still carries a JSON payload here; parse it.
    out = json.loads(transcription)
    out_text = out["text"]
    out_usage = out["usage"]
    assert "Mary had a little lamb," in out_text
    assert out_usage["seconds"] == 16, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
    """Run two transcription requests concurrently and validate both outputs."""
    # NOTE: coroutines are deliberately created un-awaited here so that both
    # requests are in flight at the same time once gathered below.
    transcription = whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    transcription2 = whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    # Await both transcriptions by scheduling coroutines together
    transcription, transcription2 = await asyncio.gather(transcription, transcription2)
    out = json.loads(transcription)
    out_text = out["text"]
    assert "Mary had a little lamb," in out_text
    out2 = json.loads(transcription2)
    out_text2 = out2["text"]
    assert "Edgar Martinez" in out_text2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, whisper_client):
    """Invalid request parameters must surface as HTTP 400 errors."""
    # invalid language
    with pytest.raises(openai.BadRequestError):
        await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client):
    """Long audio (clip tiled 10x) should transcribe each repetition."""
    mary_had_lamb.seek(0)
    audio, sr = librosa.load(mary_had_lamb)
    # Add small silence after each audio for repeatability in the split process
    audio = np.pad(audio, (0, 1600))
    repeated_audio = np.tile(audio, 10)
    # Repeated audio to buffer
    buffer = io.BytesIO()
    sf.write(buffer, repeated_audio, sr, format="WAV")
    buffer.seek(0)
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=buffer,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    out = json.loads(transcription)
    out_text = out["text"]
    out_usage = out["usage"]
    # The phrase must appear exactly once per tiled repetition.
    counts = out_text.count("Mary had a little lamb")
    assert counts == 10, counts
    assert out_usage["seconds"] == 161, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
    """Chat/Completions endpoints must reject an ASR-only model with 400."""
    # text to text model
    res = await whisper_client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "system", "content": "You are a helpful assistant."}],
    )
    err = res.error
    assert err["code"] == 400
    assert err["message"] == "The model does not support Chat Completions API"

    res = await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello")
    err = res.error
    assert err["code"] == 400
    assert err["message"] == "The model does not support Completions API"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_streaming_response(winning_call, whisper_client):
    """Streamed transcription must reassemble to the non-streamed text."""
    transcription = ""
    res_no_stream = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        response_format="json",
        language="en",
        temperature=0.0,
    )
    res = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        timeout=30,
    )
    # Reconstruct from chunks and validate
    async for chunk in res:
        # NOTE(review): choices[0] is indexed like a dict here, unlike the
        # attribute access used elsewhere — presumably the stream yields
        # dict-shaped deltas; confirm against the client SDK's stream type.
        text = chunk.choices[0]["delta"]["content"]
        transcription += text

    assert transcription == res_no_stream.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_stream_options(winning_call, whisper_client):
    """Verify per-chunk usage stats and a final usage-only chunk are sent."""
    res = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
        timeout=30,
    )
    final = False        # saw the trailing chunk with no choices (final usage)
    continuous = True    # every content chunk carried a usage attribute
    async for chunk in res:
        if not len(chunk.choices):
            # final usage sent
            final = True
        else:
            continuous = continuous and hasattr(chunk, "usage")
    assert final and continuous
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, whisper_client):
    """
    Compare sampling with params and greedy sampling to assert results
    are different when extreme sampling parameters values are picked.
    """
    # Deliberately extreme values so the output measurably diverges from greedy.
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.8,
        extra_body=dict(
            seed=42,
            repetition_penalty=1.9,
            top_k=12,
            top_p=0.4,
            min_p=0.5,
            frequency_penalty=1.8,
            presence_penalty=2.0,
        ),
    )

    # Same seed, temperature 0 -> deterministic greedy baseline.
    greedy_transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.0,
        extra_body=dict(seed=42),
    )

    assert greedy_transcription.text != transcription.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, whisper_client):
    """A conditioning prompt must not drop content from the transcription."""
    prompt = "This is a speech, recorded in a phonograph."
    # Prompts should not omit the part of original prompt while transcribing.
    prefix = "The first words I spoke in the original phonograph"
    # Baseline without prompt: the prefix must be transcribed.
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    out = json.loads(transcription)["text"]
    assert prefix in out
    # With prompt: the prefix must still be present in the output.
    transcription_wprompt = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        prompt=prompt,
        temperature=0.0,
    )
    out_prompt = json.loads(transcription_wprompt)["text"]
    assert prefix in out_prompt
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
    """verbose_json responses must include non-empty timestamped segments."""
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="verbose_json",
        temperature=0.0,
    )
    assert transcription.segments is not None
    assert len(transcription.segments) > 0
|
||||
229
tests/entrypoints/openai/test_translation_validation.py
Normal file
229
tests/entrypoints/openai/test_translation_validation.py
Normal file
@@ -0,0 +1,229 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import io
|
||||
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
|
||||
import httpx
|
||||
import librosa
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
|
||||
|
||||
@pytest.fixture(
    scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request):
    # Parametrize over model name
    # Yields a (server, model_name) tuple so tests know which model is live.
    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
        yield remote_server, request.param
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client_and_model(server):
    # Unpack the (server, model_name) tuple from the parametrized fixture.
    server, model_name = server
    async with server.get_async_client() as async_client:
        yield async_client, model_name
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    """The Translations API must return 400 for a text-only model."""
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(
            model=model_name, file=foscolo, temperature=0.0
        )
        err = res.error
        assert err["code"] == 400 and not res.text
        assert err["message"] == "The model does not support Translations API"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
    """Ensure STT (translate) requests can pass LoRA through to generate."""
    # NOTE - careful to call this test before the module scoped server
    # fixture, otherwise it'll OOMkill the CI
    model_name = "ibm-granite/granite-speech-3.3-2b"
    lora_model_name = "speech"
    server_args = [
        "--enforce-eager",
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "1",
    ]

    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        # Request by LoRA adapter name, not the base model name.
        translation = await client.audio.translations.create(
            model=lora_model_name,
            file=mary_had_lamb,
            extra_body=dict(language="en", to_language="es"),
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(translation)["text"].strip().lower()
        assert "pequeño" in out.split(" ")
|
||||
|
||||
|
||||
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client_and_model):
    """Translate Italian audio to English and spot-check the output."""
    client, model_name = client_and_model
    translation = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="text",
        # TODO remove `language="it"` once language detection is implemented
        extra_body=dict(language="it", to_language="en"),
        temperature=0.0,
    )
    out = json.loads(translation)["text"].strip().lower()
    assert "greek sea" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client_and_model):
    """A conditioning prompt must steer the translation away from the default."""
    client, model_name = client_and_model
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    transcription = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        prompt=prompt,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0,
    )
    out = json.loads(transcription)["text"]
    # The default (unprompted) phrasing must not appear, and the prompt
    # itself must not be echoed back into the output.
    assert "Nor will I ever touch the sacred" not in out
    assert prompt not in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_streaming_response(foscolo, client_and_model, server):
    """Streamed translation should closely match the non-streamed result."""
    client, model_name = client_and_model
    translation = ""
    res_no_stream = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="json",
        extra_body=dict(language="it", to_language="en", seed=42),
        temperature=0.0,
    )

    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "temperature": 0.0,
        "seed": 42,
    }
    # The file was already consumed by the non-streaming request above.
    foscolo.seek(0)
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream(
            "POST", url, headers=headers, data=data, files=files
        ) as response:
            # Parse the SSE stream by hand: "data: <json>" lines until [DONE].
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: ") :]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                text = chunk["choices"][0].get("delta", {}).get("content")
                translation += text or ""

    res_stream = translation.split()
    # NOTE There's a small non-deterministic issue here, likely in the attn
    # computation, which will cause a few tokens to be different, while still
    # being very close semantically.
    assert (
        sum([x == y for x, y in zip(res_stream, res_no_stream.text.split())])
        >= len(res_stream) * 0.9
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_stream_options(foscolo, server):
    """Verify continuous usage stats and a final usage-only chunk (raw SSE)."""
    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
        "temperature": 0.0,
    }
    foscolo.seek(0)
    final = False        # saw the trailing chunk with no choices (final usage)
    continuous = True    # every content chunk carried a "usage" key
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream(
            "POST", url, headers=headers, data=data, files=files
        ) as response:
            # Parse the SSE stream by hand: "data: <json>" lines until [DONE].
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: ") :]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                choices = chunk.get("choices", [])
                if not choices:
                    # final usage sent
                    final = True
                else:
                    continuous = continuous and ("usage" in chunk)
    assert final and continuous
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client_and_model):
    """Audio tiled twice should yield the key phrase exactly twice."""
    client, model_name = client_and_model
    if model_name == "google/gemma-3n-E2B-it":
        pytest.skip("Gemma3n does not support long audio requests")
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    repeated_audio = np.tile(audio, 2)
    # Repeated audio to buffer
    buffer = io.BytesIO()
    sf.write(buffer, repeated_audio, sr, format="WAV")
    buffer.seek(0)
    translation = await client.audio.translations.create(
        model=model_name,
        file=buffer,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0,
    )
    out = json.loads(translation)["text"].strip().lower()
    assert out.count("greek sea") == 2
|
||||
43
tests/entrypoints/openai/test_uds.py
Normal file
43
tests/entrypoints/openai/test_uds.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    # Socket file lives in a temp dir that is removed on teardown.
    with TemporaryDirectory() as tmpdir:
        args = [
            # use half precision for speed and memory savings in CI environment
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "8192",
            "--enforce-eager",
            "--max-num-seqs",
            "128",
            # Serve over a Unix domain socket instead of TCP.
            "--uds",
            f"{tmpdir}/vllm.sock",
        ]

        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    """Query /version over the Unix domain socket and check the payload.

    Uses a raw httpx client because the OpenAI SDK cannot target a UDS.
    """
    transport = httpx.HTTPTransport(uds=server.uds)
    # Close the client deterministically rather than leaking the transport
    # (the original never called client.close()).
    with httpx.Client(transport=transport) as client:
        response = client.get(server.url_for("version"))
        response.raise_for_status()

        assert response.json() == {"version": VLLM_VERSION}
|
||||
325
tests/entrypoints/openai/test_video.py
Normal file
325
tests/entrypoints/openai/test_video.py
Normal file
@@ -0,0 +1,325 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.multimodal.utils import encode_video_base64, fetch_video
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
MAXIMUM_VIDEOS = 4
|
||||
|
||||
TEST_VIDEO_URLS = [
|
||||
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4",
|
||||
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4",
|
||||
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4",
|
||||
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    # Module-scoped server configured to accept up to MAXIMUM_VIDEOS videos
    # per prompt; tests above that limit expect a 400.
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "2",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"video": MAXIMUM_VIDEOS}),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    # Fresh async client per test; closed automatically on fixture teardown.
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def base64_encoded_video() -> dict[str, str]:
    # Session-scoped: each video is downloaded and encoded only once.
    return {
        video_url: encode_video_base64(fetch_video(video_url)[0])
        for video_url in TEST_VIDEO_URLS
    }
|
||||
|
||||
|
||||
def dummy_messages_from_video_url(
|
||||
video_urls: str | list[str],
|
||||
content_text: str = "What's in this video?",
|
||||
):
|
||||
if isinstance(video_urls, str):
|
||||
video_urls = [video_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "video_url", "video_url": {"url": video_url}}
|
||||
for video_url in video_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Single-turn + multi-turn chat over a video URL, checking usage counts."""
    messages = dummy_messages_from_video_url(video_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )

    # The original assigned `message` twice from the same choice; keep one.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        # Greedy decode for determinism, consistent with the base64 variant.
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_error_on_invalid_video_url_type(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """A bare-string video_url (not a dict) must be rejected with 400."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video_url", "video_url": video_url},
                {"type": "text", "text": "What's in this video?"},
            ],
        }
    ]

    # video_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Beam search with n=2 should produce two distinct completions."""
    messages = dummy_messages_from_video_url(video_url)

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    # Distinct beams must not collapse to the same text.
    assert (
        chat_completion.choices[0].message.content
        != chat_completion.choices[1].message.content
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    base64_encoded_video: dict[str, str],
):
    """Same as the URL test, but the video is passed inline as a data URI."""
    messages = dummy_messages_from_video_url(
        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    )

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )

    # The original assigned `message` twice from the same choice; keep one.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    base64_encoded_video: dict[str, str],
):
    """Beam search (n=2) with an inline base64 video; beams must differ."""
    messages = dummy_messages_from_video_url(
        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    )

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    assert (
        chat_completion.choices[0].message.content
        != chat_completion.choices[1].message.content
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Streamed chat over a video must reassemble to the non-streamed output."""
    messages = dummy_messages_from_video_url(video_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
)
async def test_multi_video_input(
    client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
):
    """Within the video limit, the request succeeds; over it, 400 — and the
    server must remain usable afterwards."""
    messages = dummy_messages_from_video_url(video_urls)

    if len(video_urls) > MAXIMUM_VIDEOS:
        with pytest.raises(openai.BadRequestError):  # test multi-video input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )

        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
|
||||
567
tests/entrypoints/openai/test_vision.py
Normal file
567
tests/entrypoints/openai/test_vision.py
Normal file
@@ -0,0 +1,567 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
|
||||
MAXIMUM_IMAGES = 2
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
TEST_IMAGE_ASSETS = [
|
||||
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
"Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
|
||||
"1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
|
||||
"RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
|
||||
]
|
||||
|
||||
EXPECTED_MM_BEAM_SEARCH_RES = [
|
||||
[
|
||||
"The image shows a wooden boardwalk leading through a",
|
||||
"The image shows a wooden boardwalk extending into a",
|
||||
],
|
||||
[
|
||||
"The image shows two parrots perched on",
|
||||
"The image shows two birds perched on a cur",
|
||||
],
|
||||
[
|
||||
"The image shows a Venn diagram with three over",
|
||||
"The image shows a colorful Venn diagram with",
|
||||
],
|
||||
[
|
||||
"This image displays a gradient of colors ranging from",
|
||||
"This image displays a gradient of colors forming a spectrum",
|
||||
],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    # Module-scoped server configured to accept up to MAXIMUM_IMAGES images
    # per prompt.
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    # Fresh async client per test; closed automatically on fixture teardown.
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
    # Session-scoped: each asset is fetched and encoded only once.
    return {
        image_asset: encode_image_base64(
            local_asset_server.get_image_asset(image_asset)
        )
        for image_asset in TEST_IMAGE_ASSETS
    }
|
||||
|
||||
|
||||
def dummy_messages_from_image_url(
|
||||
image_urls: str | list[str],
|
||||
content_text: str = "What's in this image?",
|
||||
):
|
||||
if isinstance(image_urls, str):
|
||||
image_urls = [image_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "image_url", "image_url": {"url": image_url}}
|
||||
for image_url in image_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def get_hf_prompt_tokens(model_name, content, image_url):
    """Count the prompt tokens HF's processor produces for one image+text turn.

    Used as the ground truth for the server's usage accounting.
    """
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
    )

    placeholder = "<|image_1|>\n"
    messages = [{"role": "user", "content": f"{placeholder}{content}"}]

    raw_image = fetch_image(image_url)
    # Unwrap MediaWithBytes if present
    if isinstance(raw_image, MediaWithBytes):
        raw_image = raw_image.media

    chat_prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    encoded = processor(chat_prompt, [raw_image], return_tensors="pt")

    return encoded.input_ids.shape[1]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Greedy single-turn image chat: checks finish reason, exact usage
    accounting against the HF processor, and that a follow-up turn works."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(image_url, content_text)

    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Prompt-token count must match what the HF processor computes for the
    # same image + text.
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )

    # Fix: `message` was previously assigned twice to the same value
    # (choice.message and chat_completion.choices[0].message); keep one.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        # Fix: pin temperature for determinism, consistent with the base64
        # variant of this test below.
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_error_on_invalid_image_url_type(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """A bare string where `image_url` must be a dict is rejected."""
    prompt_text = "What's in this image?"
    # image_url should be a dict {"url": "some url"}, not directly a string
    bad_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": image_url},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]

    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=bad_messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Beam search with n=2 must yield two distinct completions."""
    messages = dummy_messages_from_image_url(image_url, "What's in this image?")

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body={"use_beam_search": True},
    )
    assert len(chat_completion.choices) == 2

    first, second = chat_completion.choices
    assert first.message.content != second.message.content
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Same as test_single_chat_session_image, but the image is supplied as a
    base64 data URL instead of a fetched URL."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(
        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
        content_text,
    )

    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Usage must match the HF processor's token count for the same content.
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )

    # Fix: `message` was previously assigned twice to the same value
    # (choice.message and chat_completion.choices[0].message); keep one.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
async def test_single_chat_session_image_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_idx: int,
    base64_encoded_image: dict[str, str],
):
    """Deterministic beam search over a base64 image matches golden outputs."""
    # NOTE: This test also validates that we pass MM data through beam search
    asset_key = TEST_IMAGE_ASSETS[image_idx]
    golden = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]

    data_url = f"data:image/jpeg;base64,{base64_encoded_image[asset_key]}"
    messages = dummy_messages_from_image_url(data_url)

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        temperature=0.0,
        extra_body={"use_beam_search": True},
    )
    assert len(chat_completion.choices) == 2
    for choice, expected_text in zip(chat_completion.choices, golden):
        assert choice.message.content == expected_text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_chat_streaming_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Streamed chunks must reassemble exactly to the non-streamed output."""
    messages = dummy_messages_from_image_url(image_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    # `chunk`/`delta` deliberately leak out of the loop: the assertions below
    # inspect the final streamed chunk.
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_multi_image_input(
    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
):
    """Within the per-prompt image limit the request succeeds; over it the
    request is rejected and the server keeps serving."""
    messages = dummy_messages_from_image_url(image_urls)

    if len(image_urls) <= MAXIMUM_IMAGES:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        reply = chat_completion.choices[0].message
        assert reply.content is not None and len(reply.content) >= 0
        return

    with pytest.raises(openai.BadRequestError):  # test multi-image input
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    text = completion.choices[0].text
    assert text is not None and len(text) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Each image URL, sent one at a time, yields a non-empty text reply."""
    for image_url in image_urls:
        user_content = [
            {
                "type": "text",
                "text": "Describe this image.",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url,
                },
            },
        ]
        chat_completion = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_content},
            ],
            model=model_name,
        )
        reply = chat_completion.choices[0].message.content
        assert reply is not None
        assert isinstance(reply, str)
        assert len(reply) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_uuid(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """A uuid-tagged image request succeeds, and a later request carrying an
    EMPTY image with the same uuid also succeeds — presumably served from a
    server-side media cache keyed by uuid (TODO confirm against server docs)."""
    for image_url in image_urls:
        # First request: real image URL, tagged with a uuid (the URL itself
        # doubles as the uuid here).
        chat_completion = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                            },
                            "uuid": image_url,
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0

        # Second request, with empty image but the same uuid.
        chat_completion_with_empty_image = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {"type": "image_url", "image_url": {}, "uuid": image_url},
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion_with_empty_image.choices[0].message.content is not None
        assert isinstance(
            chat_completion_with_empty_image.choices[0].message.content, str
        )
        assert len(chat_completion_with_empty_image.choices[0].message.content) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_empty_image_with_uuid_without_cache_hit(
    client: openai.AsyncOpenAI,
    model_name: str,
):
    """An empty image part whose uuid was never seen before must be rejected."""
    uncached_part = {
        "type": "image_url",
        "image_url": {},
        "uuid": "uuid_not_previously_seen",
    }
    request_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe this image.",
                },
                uncached_part,
            ],
        },
    ]

    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            messages=request_messages,
            model=model_name,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_incorrect_uuid_format(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Misplaced uuid-like keys are tolerated and the request still succeeds."""
    for image_url in image_urls:
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": image_url,
                "incorrect_uuid_key": image_url,
            },
            "also_incorrect_uuid_key": image_url,
        }
        chat_completion = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        image_part,
                    ],
                },
            ],
            model=model_name,
        )
        answer = chat_completion.choices[0].message.content
        assert answer is not None
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
70
tests/entrypoints/openai/test_vision_embeds.py
Normal file
70
tests/entrypoints/openai/test_vision_embeds.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
def _terratorch_dummy_messages():
    """One user turn carrying constant raw tensors as base64 image embeddings."""
    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

    embeds_part = {
        "type": "image_embeds",
        "image_embeds": {
            "pixel_values": tensor2base64(pixel_values),
            "location_coords": tensor2base64(location_coords),
        },
    }
    return [{"role": "user", "content": [embeds_part]}]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)
def test_single_request(model_name: str):
    """Pooling request with raw tensor embeddings returns the expected
    number of float32 values."""
    server_args = [
        "--runner", "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype", "float16",
        "--enforce-eager",
        "--trust-remote-code",
        "--max-num-seqs", "32",
        "--model-impl", "terratorch",
        "--skip-tokenizer-init",
        "--enable-mm-embeds",
    ]

    with RemoteOpenAIServer(model_name, server_args) as server:
        payload = {
            "model": model_name,
            "messages": _terratorch_dummy_messages(),
            "encoding_format": "base64",
        }
        response = requests.post(server.url_for("pooling"), json=payload)
        response.raise_for_status()

        encoded = response.json()["data"][0]["data"]

        decoded = np.frombuffer(base64.b64decode(encoded), dtype=np.float32)
        assert len(decoded) == 524288
|
||||
0
tests/entrypoints/openai/tool_parsers/__init__.py
Normal file
0
tests/entrypoints/openai/tool_parsers/__init__.py
Normal file
12
tests/entrypoints/openai/tool_parsers/conftest.py
Normal file
12
tests/entrypoints/openai/tool_parsers/conftest.py
Normal file
@@ -0,0 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def default_tokenizer() -> TokenizerLike:
    """Fresh GPT-2 tokenizer per test, so parser state never leaks across tests."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer
|
||||
@@ -0,0 +1,176 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.entrypoints.openai.tool_parsers.utils import (
|
||||
run_tool_extraction,
|
||||
run_tool_extraction_streaming,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers import ToolParser, ToolParserManager
|
||||
|
||||
# Fixtures for the gigachat3 tool-parser tests: a simple call, a call with
# no arguments, and a call with nested arguments. For each case we build
# (args dict, serialized function JSON, expected FunctionCall); the model
# output is the literal marker "function call" followed by the JSON.
SIMPLE_ARGS_DICT = {
    "action": "create",
    "id": "preferences",
}
SIMPLE_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": SIMPLE_ARGS_DICT,
    },
    ensure_ascii=False,
)
SIMPLE_FUNCTION_OUTPUT = "function call" + SIMPLE_FUNCTION_JSON
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(SIMPLE_ARGS_DICT, ensure_ascii=False),
)


# Empty-arguments variant: the parser must still emit "{}" as arguments.
PARAMETERLESS_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": {},
    },
    ensure_ascii=False,
)
PARAMETERLESS_FUNCTION_OUTPUT = "function call" + PARAMETERLESS_FUNCTION_JSON
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps({}, ensure_ascii=False),
)


# Nested-object arguments, to exercise multi-level JSON reconstruction.
COMPLEX_ARGS_DICT = {
    "action": "create",
    "id": "preferences",
    "content": {
        "short_answers": True,
        "hate_emojis": True,
        "english_ui": False,
        "russian_math_explanations": True,
    },
}
COMPLEX_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": COMPLEX_ARGS_DICT,
    },
    ensure_ascii=False,
)
COMPLEX_FUNCTION_OUTPUT = "function call" + COMPLEX_FUNCTION_JSON
COMPLEX_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(COMPLEX_ARGS_DICT, ensure_ascii=False),
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain text output must pass through unchanged with zero tool calls."""
    parser_cls = ToolParserManager.get_tool_parser("gigachat3")
    tool_parser: ToolParser = parser_cls(default_tokenizer)

    model_output = "How can I help you today?"
    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=streaming
    )

    assert content == model_output
    assert len(tool_calls) == 0
|
||||
|
||||
|
||||
# (streaming, model_output, expected_tool_calls, expected_content) tuples.
# Covers simple / parameterless / nested-argument calls in both streaming
# and non-streaming modes; expected_content is None in every case (the model
# output starts directly with the tool-call marker, no free text).
TEST_CASES = [
    pytest.param(
        True,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        None,
        id="simple_streaming",
    ),
    pytest.param(
        False,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        None,
        id="simple_nonstreaming",
    ),
    pytest.param(
        True,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        COMPLEX_FUNCTION_OUTPUT,
        [COMPLEX_FUNCTION_CALL],
        None,
        id="complex_streaming",
    ),
    pytest.param(
        False,
        COMPLEX_FUNCTION_OUTPUT,
        [COMPLEX_FUNCTION_CALL],
        None,
        id="complex_nonstreaming",
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "streaming, model_output, expected_tool_calls, expected_content", TEST_CASES
)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    expected_content: str | None,
    default_tokenizer: TokenizerLike,
):
    """Extracted tool calls must match the expected name and arguments."""
    parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
        default_tokenizer
    )
    content, tool_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )

    assert content == expected_content
    assert len(tool_calls) == len(expected_tool_calls)
    for got, want in zip(tool_calls, expected_tool_calls):
        assert got.type == "function"
        assert got.function.name == want.name
        # Compare parsed JSON so key ordering / whitespace don't matter.
        assert json.loads(got.function.arguments) == json.loads(want.arguments)
|
||||
|
||||
|
||||
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """A tool call split across a few large streaming deltas is reassembled."""
    parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
        default_tokenizer
    )
    deltas = [
        "function call",
        COMPLEX_FUNCTION_JSON[:40],
        COMPLEX_FUNCTION_JSON[40:],
    ]
    reconstructor = run_tool_extraction_streaming(
        parser,
        deltas,
        assert_one_tool_per_delta=False,
    )

    assert len(reconstructor.tool_calls) == 1
    call = reconstructor.tool_calls[0]
    assert call.type == "function"
    assert call.function.name == "manage_user_memory"
    assert json.loads(call.function.arguments) == COMPLEX_ARGS_DICT
|
||||
460
tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
Normal file
460
tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
Normal file
@@ -0,0 +1,460 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
|
||||
|
||||
from ....utils import RemoteOpenAIServer
|
||||
|
||||
# Base model served by vLLM; the LoRA adapter below provides tool-calling
# behavior and also supplies the tokenizer (and chat template) for the server.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci"

SERVER_ARGS = [
    "--enforce-eager",
    "--enable-auto-tool-choice",
    "--tool-call-parser",
    "hermes",
    "--enable-lora",
    "--lora-modules",
    f"{LORA_MODEL}={LORA_MODEL}",
    "--tokenizer",
    f"{LORA_MODEL}",
]

# Single weather tool: one required string parameter plus an enum.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["location"],
            },
        },
    }
]

# Tool with integer and boolean parameters — used to verify the parser
# emits true JSON scalars (int/bool), not stringified values.
PRODUCT_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_product_info",
            "description": "Get detailed information of a product based on its "
            "product ID.",
            "parameters": {
                "type": "object",
                "properties": {
                    "inserted": {
                        "type": "boolean",
                        "description": "inserted.",
                    },
                    "product_id": {
                        "type": "integer",
                        "description": "The product ID of the product.",
                    },
                },
                "required": ["product_id", "inserted"],
            },
        },
    }
]

MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]

# Prompt designed to elicit both an integer (7355608) and a boolean (true).
PRODUCT_MESSAGES = [
    {
        "role": "user",
        "content": "Hi! Do you have any detailed information about the product id "
        "7355608 and inserted true?",
    }
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_non_streaming_tool_call():
    """Test tool call in non-streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        # Fix: close the async client deterministically via `async with`
        # (matches the fixture usage elsewhere in this test suite); the
        # client was previously never closed.
        async with server.get_async_client() as client:
            response = await client.chat.completions.create(
                model=LORA_MODEL,
                messages=MESSAGES,
                tools=TOOLS,
                tool_choice="auto",
                temperature=0.0,
            )

            assert response.choices
            choice = response.choices[0]
            message = choice.message

            assert choice.finish_reason == "tool_calls"
            assert message.tool_calls is not None

            tool_call = message.tool_calls[0]
            assert tool_call.type == "function"
            assert tool_call.function.name == "get_current_weather"

            arguments = json.loads(tool_call.function.arguments)
            assert "location" in arguments
            assert "Boston" in arguments["location"]
            print("\n[Non-Streaming Test Passed]")
            print(f"Tool Call: {tool_call.function.name}")
            print(f"Arguments: {arguments}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_streaming_tool_call():
    """Test tool call in streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        # Fix: close the async client deterministically via `async with`
        # (matches the fixture usage elsewhere in this test suite); the
        # client was previously never closed.
        async with server.get_async_client() as client:
            stream = await client.chat.completions.create(
                model=LORA_MODEL,
                messages=MESSAGES,
                tools=TOOLS,
                tool_choice="auto",
                temperature=0.0,
                stream=True,
            )

            # Accumulate name/argument fragments per tool-call index.
            tool_call_chunks = {}
            async for chunk in stream:
                if not chunk.choices:
                    continue

                delta = chunk.choices[0].delta
                if not delta or not delta.tool_calls:
                    continue

                for tool_chunk in delta.tool_calls:
                    index = tool_chunk.index
                    if index not in tool_call_chunks:
                        tool_call_chunks[index] = {"name": "", "arguments": ""}

                    if tool_chunk.function.name:
                        tool_call_chunks[index]["name"] += tool_chunk.function.name
                    if tool_chunk.function.arguments:
                        tool_call_chunks[index]["arguments"] += (
                            tool_chunk.function.arguments
                        )

            assert len(tool_call_chunks) == 1
            reconstructed_tool_call = tool_call_chunks[0]

            assert reconstructed_tool_call["name"] == "get_current_weather"

            arguments = json.loads(reconstructed_tool_call["arguments"])
            assert "location" in arguments
            assert "Boston" in arguments["location"]
            print("\n[Streaming Test Passed]")
            print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
            print(f"Reconstructed Arguments: {arguments}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_non_streaming_product_tool_call():
    """Test tool call integer and boolean parameters in non-streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        # Fix: close the async client deterministically via `async with`
        # (matches the fixture usage elsewhere in this test suite); the
        # client was previously never closed.
        async with server.get_async_client() as client:
            response = await client.chat.completions.create(
                model=LORA_MODEL,
                messages=PRODUCT_MESSAGES,
                tools=PRODUCT_TOOLS,
                tool_choice="auto",
                temperature=0.66,
            )

            assert response.choices
            choice = response.choices[0]
            message = choice.message

            assert choice.finish_reason == "tool_calls"
            assert message.tool_calls is not None

            tool_call = message.tool_calls[0]
            assert tool_call.type == "function"
            assert tool_call.function.name == "get_product_info"

            arguments = json.loads(tool_call.function.arguments)
            assert "product_id" in arguments
            assert "inserted" in arguments

            product_id = arguments.get("product_id")
            inserted = arguments.get("inserted")

            # The arguments must arrive as real JSON scalars, not strings.
            assert isinstance(product_id, int)
            assert product_id == 7355608
            assert isinstance(inserted, bool)
            assert inserted is True

            print("\n[Non-Streaming Product Test Passed]")
            print(f"Tool Call: {tool_call.function.name}")
            print(f"Arguments: {arguments}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_streaming_product_tool_call():
    """Test tool call integer and boolean parameters in streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        # Fix: close the async client deterministically via `async with`
        # (matches the fixture usage elsewhere in this test suite); the
        # client was previously never closed.
        async with server.get_async_client() as client:
            stream = await client.chat.completions.create(
                model=LORA_MODEL,
                messages=PRODUCT_MESSAGES,
                tools=PRODUCT_TOOLS,
                tool_choice="auto",
                temperature=0.66,
                stream=True,
            )

            # Accumulate name/argument fragments per tool-call index.
            tool_call_chunks = {}
            async for chunk in stream:
                if not chunk.choices:
                    continue

                delta = chunk.choices[0].delta
                if not delta or not delta.tool_calls:
                    continue

                for tool_chunk in delta.tool_calls:
                    index = tool_chunk.index
                    if index not in tool_call_chunks:
                        tool_call_chunks[index] = {"name": "", "arguments": ""}

                    if tool_chunk.function.name:
                        tool_call_chunks[index]["name"] += tool_chunk.function.name
                    if tool_chunk.function.arguments:
                        tool_call_chunks[index]["arguments"] += (
                            tool_chunk.function.arguments
                        )

            assert len(tool_call_chunks) == 1
            reconstructed_tool_call = tool_call_chunks[0]

            assert reconstructed_tool_call["name"] == "get_product_info"

            arguments = json.loads(reconstructed_tool_call["arguments"])
            assert "product_id" in arguments
            assert "inserted" in arguments

            # Handle type coercion for streaming test as well
            product_id = arguments.get("product_id")
            inserted = arguments.get("inserted")

            assert isinstance(product_id, int)
            assert product_id == 7355608
            assert isinstance(inserted, bool)
            assert inserted is True

            print("\n[Streaming Product Test Passed]")
            print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
            print(f"Reconstructed Arguments: {arguments}")
|
||||
|
||||
|
||||
@pytest.fixture
def qwen_tokenizer() -> TokenizerLike:
    """Provide the Qwen3-32B tokenizer used by the Hermes parser tests."""
    from vllm.tokenizers import get_tokenizer

    tokenizer = get_tokenizer("Qwen/Qwen3-32B")
    return tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
def hermes_parser(qwen_tokenizer: TokenizerLike) -> Hermes2ProToolParser:
    """Build a Hermes2Pro tool parser bound to the Qwen tokenizer fixture."""
    parser = Hermes2ProToolParser(qwen_tokenizer)
    return parser
|
||||
|
||||
|
||||
@pytest.fixture
def any_chat_request() -> ChatCompletionRequest:
    """Provide a minimal, deterministic request object for the parser calls."""
    request = ChatCompletionRequest(
        seed=42,
        model="Qwen/Qwen3-32B",
        messages=[],
    )
    return request
|
||||
|
||||
|
||||
def test_hermes_parser_streaming_just_forward_text(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Plain text with no <tool_call> tags must be forwarded verbatim."""
    text = """This is some prior text that has nothing to do with tool calling."""
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    # Replay generation token by token, as the streaming endpoint would.
    for token in tokens:
        delta_text = qwen_tokenizer.decode([token])
        current_text = previous_text + delta_text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        delta_messages.append(delta)

    # Every delta must be emitted as content, never as a tool call.
    for delta in delta_messages:
        assert delta is not None
        assert not delta.tool_calls

    print(delta_messages)
    # Concatenated forwarded content must reproduce the input exactly.
    assert "".join([delta.content for delta in delta_messages]) == text
|
||||
|
||||
|
||||
def test_hermes_parser_streaming_failure_case_bug_19056(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Regression test for issue #19056: a tool call streamed token-by-token
    must yield the function name and the exact argument JSON."""
    text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    for token in tokens:
        # NOTE: `text` is reused here as the per-token delta; `tokens` was
        # computed above, so shadowing the original prompt is harmless.
        text = qwen_tokenizer.decode([token])
        current_text = previous_text + text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        # The parser may return None while it buffers partial tags.
        if delta is not None:
            delta_messages.append(delta)

    assert delta_messages[0].tool_calls[0].function.name == "final_answer"
    # Argument fragments concatenated across deltas must form the exact JSON.
    tool_call_args = "".join(
        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
    )
    assert tool_call_args == '{"trigger": true}'
|
||||
|
||||
|
||||
def test_hermes_parser_streaming(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Streamed <tool_call> output is reassembled into name + argument JSON."""
    # Backslash line continuations keep the literal a single line of content
    # (no embedded newlines) — do not reformat.
    text = '<tool_call>\
{"name": "get_current_temperature",\
"arguments": {"location":\
"San Francisco, California, United States", "unit": "celsius"}}\
</tool_call>'

    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    for token in tokens:
        # `text` is reused as the per-token delta; `tokens` is already built.
        text = qwen_tokenizer.decode([token])
        current_text = previous_text + text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        if delta is not None:
            delta_messages.append(delta)
    print(delta_messages)
    assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature"
    tool_call_args = "".join(
        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
    )
    assert tool_call_args == (
        '{"location":"San Francisco, California, United States", "unit": "celsius"}'
    )
|
||||
|
||||
|
||||
def test_hermes_parser_non_streaming_no_tool_call(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Plain text output must not be reported as a tool call."""
    model_output = """This is not a tool call."""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )

    assert extraction is not None
    assert not extraction.tools_called
|
||||
|
||||
|
||||
def test_hermes_parser_non_streaming_tool_call_between_tags(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """A well-formed, fully closed <tool_call> block is extracted."""
    text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    tool_call = hermes_parser.extract_tool_calls(
        model_output=text,
        request=any_chat_request,
    )

    assert tool_call is not None
    assert tool_call.tools_called
    assert tool_call.tool_calls[0].function.name == "final_answer"
    assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'
|
||||
|
||||
|
||||
def test_hermes_parser_non_streaming_tool_call_until_eos(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """A tool call whose closing </tool_call> tag is cut off by EOS is still
    extracted, as long as the JSON payload itself is complete."""
    text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}"""
    tool_call = hermes_parser.extract_tool_calls(
        model_output=text,
        request=any_chat_request,
    )

    assert tool_call is not None
    assert tool_call.tools_called
    assert tool_call.tool_calls[0].function.name == "final_answer"
    assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'
|
||||
|
||||
|
||||
def test_hermes_parser_non_streaming_tool_call_invalid_json(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Malformed JSON inside <tool_call> must not be reported as a call."""
    # Missing closing brace to trigger exception
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}"""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )

    assert extraction is not None
    assert extraction.tools_called is False
|
||||
@@ -0,0 +1,179 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.entrypoints.openai.tool_parsers.utils import (
|
||||
run_tool_extraction,
|
||||
run_tool_extraction_streaming,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
|
||||
from vllm.tool_parsers import ToolParser, ToolParserManager
|
||||
|
||||
|
||||
def make_tool_call(name, arguments):
    """Build a function-type ToolCall whose arguments are JSON-serialized."""
    serialized_args = json.dumps(arguments)
    function = FunctionCall(name=name, arguments=serialized_args)
    return ToolCall(type="function", function=function)
|
||||
|
||||
|
||||
# TODO: add reason prefix and suffix.
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_output,expected_tool_calls,expected_content",
    [
        # No tool call
        ("How can I help you today?", [], "How can I help you today?"),
        # Single tool call, no content
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}]</tool_calls>',  # noqa: E501
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                )
            ],
            None,
        ),
        # Multiple tool calls
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}, {"name": "register_user", "arguments": {"name": "John Doe", "age": 37, "address": {"city": "San Francisco", "state": "CA"}, "role": null, "passed_test": true, "aliases": ["John", "Johnny"]}}]</tool_calls>',  # noqa: E501
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                ),
                make_tool_call(
                    "register_user",
                    {
                        "name": "John Doe",
                        "age": 37,
                        "address": {"city": "San Francisco", "state": "CA"},
                        "role": None,
                        "passed_test": True,
                        "aliases": ["John", "Johnny"],
                    },
                ),
            ],
            None,
        ),
        # Content before tool call
        (
            'I will call the tool now. <tool_calls>[{"name": "get_weather", "arguments": {"city": "Boston"}}]</tool_calls>',  # noqa: E501
            [make_tool_call("get_weather", {"city": "Boston"})],
            "I will call the tool now. ",
        ),
        # Content after tool call (should be stripped)
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "Seattle"}}]</tool_calls>\nThank you!',  # noqa: E501
            [make_tool_call("get_weather", {"city": "Seattle"})],
            None,
        ),
        # Deeply nested arguments survive non-streaming extraction
        (
            '<tool_calls>[{"name": "complex_tool", "arguments": {"level1": {"level2": {"level3": {"value": 123}}}}}]</tool_calls>',  # noqa: E501
            [
                make_tool_call(
                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
                )
            ],
            None,
        ),
    ],
)
def test_hunyuan_a13b_tool_parser_extract(
    model_output, expected_tool_calls, expected_content
):
    """Non-streaming extraction for the hunyuan_a13b tool parser."""
    # The parser does not touch the tokenizer here, so a MagicMock suffices.
    mock_tokenizer = MagicMock()
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
        mock_tokenizer
    )
    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=False
    )

    # align the random id.
    for idx in range(len(tool_calls)):
        tool_calls[idx].id = expected_tool_calls[idx].id
    assert tool_calls == expected_tool_calls
    assert content == expected_content
|
||||
|
||||
|
||||
# Streaming test: simulate incremental output
@pytest.mark.parametrize(
    "model_deltas,expected_tool_calls",
    [
        # Tool call split on JSON-value boundaries
        (
            [
                '<tool_calls>[{"name": "get_weather", ',
                '"arguments": {"city": "San Francisco", ',
                '"metric": "celsius"}}]',
                "</tool_calls>",
            ],
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                )
            ],
        ),
        # Tool call split into many tiny deltas
        (
            [
                '<tool_calls>[{"name":',
                ' "get_weather",',
                ' "arguments":',
                ' {"city": "Boston"}',
                "}]",
                "</tool_calls>",
            ],
            [make_tool_call("get_weather", {"city": "Boston"})],
        ),
        # Leading empty delta and trailing text after the close tag
        (
            [
                "",
                '<tool_calls>[{"name":',
                ' "get_weather",',
                ' "arguments":',
                ' {"city": "Boston"}',
                "}]",
                "</tool_calls>",
                "\n</answer>",
            ],
            [make_tool_call("get_weather", {"city": "Boston"})],
        ),
        pytest.param(
            [
                '<tool_calls>[{"name": "complex_tool",',
                ' "arguments": ',
                ' {"level1": {"level2": ',
                '{"level3": {"value": 123}}}}}',
                "]</tool_calls>",
            ],
            [
                make_tool_call(
                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
                )
            ],
            marks=pytest.mark.xfail(
                reason="stream parsing not support nested json yet."
            ),
        ),
    ],
)
def test_hunyuan_a13b_tool_parser_streaming(model_deltas, expected_tool_calls):
    """Streaming extraction for the hunyuan_a13b tool parser."""
    mock_tokenizer = MagicMock()

    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
        mock_tokenizer
    )
    reconstructor = run_tool_extraction_streaming(
        tool_parser, model_deltas, assert_one_tool_per_delta=False
    )

    # align the random id.
    for idx in range(len(reconstructor.tool_calls)):
        reconstructor.tool_calls[idx].id = expected_tool_calls[idx].id

    assert reconstructor.tool_calls == expected_tool_calls
|
||||
@@ -0,0 +1,262 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser
|
||||
|
||||
|
||||
@pytest.fixture
def parser(default_tokenizer: TokenizerLike):
    """Provide a Llama3JsonToolParser built on the default tokenizer."""
    tool_parser = Llama3JsonToolParser(default_tokenizer)
    return tool_parser
|
||||
|
||||
|
||||
def test_extract_tool_calls_simple(parser):
    """A JSON tool call embedded in surrounding prose is extracted whole."""
    # Test with a simple tool call
    model_output = (
        'Here is the result: {"name": "getOpenIncidentsTool", '
        '"parameters": {}} Would you like to know more?'
    )
    result = parser.extract_tool_calls(model_output, None)

    assert isinstance(result, ExtractedToolCallInformation)
    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].type == "function"
    assert result.tool_calls[0].function.name == "getOpenIncidentsTool"
    assert result.tool_calls[0].function.arguments == "{}"
    # Surrounding prose is discarded once a tool call is found.
    assert result.content is None
|
||||
|
||||
|
||||
def test_extract_tool_calls_with_arguments(parser):
    """Parameter values are carried through into the arguments JSON."""
    # Test with a tool call that has arguments
    model_output = (
        '{"name": "searchTool", "parameters": {"query": "test query", "limit": 10}}'
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "searchTool"
    # Substring checks avoid depending on key ordering in the output JSON.
    assert '"query": "test query"' in result.tool_calls[0].function.arguments
    assert '"limit": 10' in result.tool_calls[0].function.arguments
|
||||
|
||||
|
||||
def test_extract_tool_calls_no_json(parser):
    """Output with no JSON object is returned unchanged as content."""
    # Test with text that doesn't contain a JSON object
    model_output = "This is just some text without any tool calls"
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is False
    assert len(result.tool_calls) == 0
    assert result.content == model_output
|
||||
|
||||
|
||||
def test_extract_tool_calls_invalid_json(parser):
    """Invalid JSON falls back to plain content instead of raising."""
    # Test with invalid JSON
    model_output = '{"name": "invalidTool", "parameters": {invalid json}'
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is False
    assert len(result.tool_calls) == 0
    assert result.content == model_output
|
||||
|
||||
|
||||
def test_extract_tool_calls_with_arguments_key(parser):
    """The "arguments" key is accepted as an alias for "parameters"."""
    # Test with a tool call that uses "arguments" instead of "parameters"
    model_output = '{"name": "searchTool", "arguments": {"query": "test"}}'
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "searchTool"
    assert '"query": "test"' in result.tool_calls[0].function.arguments
|
||||
|
||||
|
||||
def test_extract_tool_calls_multiple_json(parser):
    """Semicolon-separated JSON objects become multiple tool calls, in order."""
    # Test with multiple JSONs separated by semicolons
    model_output = (
        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
        '{"name": "searchTool", "parameters": {"query": "test2"}}'
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 3

    # Check first tool call
    assert result.tool_calls[0].function.name == "searchTool"
    assert '"query": "test1"' in result.tool_calls[0].function.arguments

    # Check second tool call
    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
    assert result.tool_calls[1].function.arguments == "{}"

    # Check third tool call
    assert result.tool_calls[2].function.name == "searchTool"
    assert '"query": "test2"' in result.tool_calls[2].function.arguments
|
||||
|
||||
|
||||
def test_extract_tool_calls_multiple_json_with_whitespace(parser):
    """Whitespace around the semicolon separators is tolerated."""
    # Test with multiple JSONs separated by semicolons and extra whitespace
    model_output = (
        '{"name": "searchTool", "parameters": {"query": "test1"}} ; '
        '{"name": "getOpenIncidentsTool", "parameters": {}} ; '
        '{"name": "searchTool", "parameters": {"query": "test2"}}'
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 3
    assert result.tool_calls[0].function.name == "searchTool"
    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
    assert result.tool_calls[2].function.name == "searchTool"
|
||||
|
||||
|
||||
def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
    """Multiple tool calls are still found when wrapped in prose."""
    # Test with multiple JSONs and surrounding text
    model_output = (
        "Here are the results: "
        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
        '{"name": "searchTool", "parameters": {"query": "test2"}} '
        "Would you like to know more?"
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 3
    assert result.tool_calls[0].function.name == "searchTool"
    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
    assert result.tool_calls[2].function.name == "searchTool"
|
||||
|
||||
|
||||
def test_extract_tool_calls_deeply_nested_json(parser):
    """Brace matching must handle deeply nested parameter objects."""
    # Test with deeply nested JSON parameters (5 levels)
    model_output = (
        '{"name": "complexTool", '
        '"parameters": {'
        '"level1": {'
        '"level2": {'
        '"level3": {'
        '"level4": {'
        '"value": "deep"'
        "}}}}}}"
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "complexTool"
    # Verify the nested structure is preserved in the arguments
    import json

    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
|
||||
|
||||
|
||||
def test_extract_tool_calls_multiple_with_deep_nesting(parser):
    """Mix of flat and deeply nested calls in one output parses correctly."""
    # Test with multiple tool calls where some have deeply nested parameters
    model_output = (
        '{"name": "simpleTool", "parameters": {"value": "test"}}; '
        '{"name": "complexTool", "parameters": '
        '{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 2

    # Check first tool call
    assert result.tool_calls[0].function.name == "simpleTool"
    import json

    args0 = json.loads(result.tool_calls[0].function.arguments)
    assert args0["value"] == "test"

    # Check second tool call with deep nesting
    assert result.tool_calls[1].function.name == "complexTool"
    args1 = json.loads(result.tool_calls[1].function.arguments)
    assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
|
||||
|
||||
|
||||
def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
    """Braces/brackets inside string values must not confuse brace matching."""
    # Test with quotes and brackets inside quoted string values
    model_output = (
        '{"name": "searchTool", '
        '"parameters": {'
        '"query": "test {value} [complex]",'
        '"nested": {"inner": "more {brackets}"}'
        "}}"
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "searchTool"
    # Verify the string values are preserved including brackets and quotes
    import json

    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["query"] == "test {value} [complex]"
    assert args["nested"]["inner"] == "more {brackets}"
|
||||
|
||||
|
||||
def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
    """Escaped quotes inside string values survive extraction intact."""
    # Test with escaped quotes in deeply nested JSON
    model_output = (
        '{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
    )
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "parserTool"
    # Verify escaped quotes are preserved
    import json

    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["text"] == 'He said "Hello {world}"'
|
||||
|
||||
|
||||
def test_extract_tool_calls_missing_name_key(parser):
    """A JSON object lacking the "name" key is treated as plain content."""
    model_output = '{"parameters": {}}'
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is False
    assert result.tool_calls == []
    assert result.content == model_output
|
||||
|
||||
|
||||
def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
    """An object with a name but no parameters/arguments is not a tool call."""
    # Test that missing both "parameters" and "arguments" keys returns content
    model_output = '{"name": "toolWithoutParams"}'
    result = parser.extract_tool_calls(model_output, None)

    assert result.tools_called is False
    assert len(result.tool_calls) == 0
    assert result.content == model_output
|
||||
|
||||
|
||||
def test_regex_timeout_handling(parser):
    """Test regex timeout is handled gracefully"""
    # Input shaped like a pathological backtracking case; the actual timeout
    # is simulated via the mock below rather than triggered for real.
    fake_problematic_input = "{hello world[A(A=" + "\t)A(A=,\t" * 2

    # create a mock regex that raises TimeoutError
    mock_regex = MagicMock()
    mock_regex.finditer.side_effect = TimeoutError("Regex timeout")

    with patch.object(parser, "tool_call_start_regex", mock_regex):
        result = parser.extract_tool_calls(fake_problematic_input, None)

    # should treat as regular text when regex times out
    assert result.content == fake_problematic_input
    assert result.tools_called is False
    assert len(result.tool_calls) == 0
    mock_regex.finditer.assert_called_once()
|
||||
@@ -0,0 +1,269 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.entrypoints.openai.tool_parsers.utils import (
|
||||
run_tool_extraction,
|
||||
run_tool_extraction_streaming,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers import ToolParser, ToolParserManager
|
||||
|
||||
# Test cases similar to pythonic parser but with Llama4 specific format
# Each *_FUNCTION_OUTPUT is raw model text; the matching *_FUNCTION_CALL is
# the FunctionCall the parser is expected to produce from it.
SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "LA", "metric": "C"}',
)
# Exercises str/int/dict/None/bool/list argument types in one call.
MORE_TYPES_FUNCTION_OUTPUT = (
    "[register_user(name='Doe', "
    "age=9, "
    "address={'city': 'LA', 'state': 'CA'}, "
    "role=None, "
    "passed_test=True, "
    "aliases=['John', 'Johnny'])]"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
    name="register_user",
    arguments='{"name": "Doe", '
    '"age": 9, '
    '"address": {"city": "LA", "state": "CA"}, '
    '"role": null, '
    '"passed_test": true, '
    '"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments="{}",
)
EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"steps": []}',
)
# Escaped quotes inside the pythonic call must survive into the JSON args.
ESCAPED_STRING_FUNCTION_OUTPUT = (
    r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
# Llama4 wraps pythonic tool calls in <|python_start|>/<|python_end|> tags.
PYTHON_TAG_FUNCTION_OUTPUT = (
    "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>"
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain text passes through untouched in both streaming modes."""
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    model_output = "How can I help you today?"

    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=streaming
    )

    assert content == model_output
    assert len(tool_calls) == 0
|
||||
|
||||
|
||||
# Parallel calls preceded by a <|python_start|> tag (no closing tag).
test_str = "<|python_start|>"
test_str += "[get_weather(city='LA', metric='C'),"
test_str += "register_user(name='Doe', age=9)]"
# (streaming flag, raw model output, expected FunctionCalls) triples.
# Each output shape is covered in both streaming and non-streaming mode.
TEST_CASES = [
    # FIX: this case was wired to ESCAPED_STRING_FUNCTION_OUTPUT (duplicating
    # "escaped_string_streaming" below) while labeled "simple_streaming",
    # leaving the simple streaming path untested.
    pytest.param(
        True,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="simple_streaming",
    ),
    pytest.param(
        False, SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], id="simple_nonstreaming"
    ),
    pytest.param(
        True,
        MORE_TYPES_FUNCTION_OUTPUT,
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming",
    ),
    pytest.param(
        False,
        MORE_TYPES_FUNCTION_OUTPUT,
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming",
    ),
    pytest.param(
        True,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        EMPTY_DICT_FUNCTION_OUTPUT,
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_streaming",
    ),
    pytest.param(
        False,
        EMPTY_DICT_FUNCTION_OUTPUT,
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_nonstreaming",
    ),
    pytest.param(
        True,
        EMPTY_LIST_FUNCTION_OUTPUT,
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_streaming",
    ),
    pytest.param(
        False,
        EMPTY_LIST_FUNCTION_OUTPUT,
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_nonstreaming",
    ),
    pytest.param(
        True,
        ESCAPED_STRING_FUNCTION_OUTPUT,
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_streaming",
    ),
    pytest.param(
        False,
        ESCAPED_STRING_FUNCTION_OUTPUT,
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_nonstreaming",
    ),
    pytest.param(
        True,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="parallel_calls_streaming",
    ),
    pytest.param(
        False,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="parallel_calls_nonstreaming",
    ),
    pytest.param(
        True,
        PYTHON_TAG_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="python_tag_streaming",
    ),
    pytest.param(
        False,
        PYTHON_TAG_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="python_tag_nonstreaming",
    ),
    # FIX: ids below previously duplicated parallel_calls_{non,}streaming.
    pytest.param(
        True,
        test_str,
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="python_tag_parallel_calls_streaming",
    ),
    pytest.param(
        False,
        "<|python_start|>[get_weather(city='LA', metric='C'), "
        + "register_user(name='Doe', age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="python_tag_parallel_calls_nonstreaming",
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """The llama4_pythonic parser extracts the expected calls per TEST_CASES."""
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )

    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=streaming
    )

    # Compare per-call function payloads; tool-call ids are random.
    assert len(tool_calls) == len(expected_tool_calls)
    for actual, expected in zip(tool_calls, expected_tool_calls):
        assert actual.type == "function"
        assert actual.function == expected
|
||||
|
||||
|
||||
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """Multiple parallel calls arriving in a single large delta all parse."""
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    # One delta carrying three complete tool calls at once.
    model_output_deltas = [
        "<|python_start|>[get_weather(city='LA', metric='C'), "
        "get_weather(), "
        "do_something_cool(steps=[])]<|python_end|>",
    ]

    reconstructor = run_tool_extraction_streaming(
        tool_parser, model_output_deltas, assert_one_tool_per_delta=False
    )

    assert reconstructor.other_content == ""
    assert len(reconstructor.tool_calls) == 3
    assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
    assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
    assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """test regex timeout is handled gracefully"""
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )

    # Input shaped like a backtracking hazard; the timeout itself is
    # simulated via the mock below rather than triggered for real.
    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2

    # create a mock regex that raises TimeoutError
    mock_regex = MagicMock()
    mock_regex.match.side_effect = TimeoutError("Regex timeout")

    with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex):
        content, tool_calls = run_tool_extraction(
            tool_parser, fake_problematic_input, streaming=streaming
        )

    # should treat as regular text when regex times out
    assert content == fake_problematic_input
    assert len(tool_calls) == 0
    mock_regex.match.assert_called_once()
|
||||
251
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
Normal file
251
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
Normal file
@@ -0,0 +1,251 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.entrypoints.openai.tool_parsers.utils import (
|
||||
run_tool_extraction,
|
||||
run_tool_extraction_streaming,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers import ToolParser, ToolParserManager
|
||||
|
||||
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
# Each *_OUTPUT constant is the raw pythonic call text as the model would emit
# it; the paired *_CALL constant is the FunctionCall the parser must produce.
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "San Francisco", "metric": "celsius"}',
)
# Mixed argument types: str, int, dict, None, bool, list.
MORE_TYPES_FUNCTION_OUTPUT = (
    "register_user(name='John Doe', "
    "age=37, "
    "address={'city': 'San Francisco', 'state': 'CA'}, "
    "role=None, "
    "passed_test=True, "
    "aliases=['John', 'Johnny'])"
)
# Same call, but with JSON-style literals (null/true) instead of Python ones.
MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
    "register_user(name='John Doe', "
    "age=37, "
    "address={'city': 'San Francisco', 'state': 'CA'}, "
    "role=null, "
    "passed_test=true, "
    "aliases=['John', 'Johnny'])"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
    name="register_user",
    arguments='{"name": "John Doe", '
    '"age": 37, '
    '"address": {"city": "San Francisco", "state": "CA"}, '
    '"role": null, '
    '"passed_test": true, '
    '"aliases": ["John", "Johnny"]}',
)
# Call with no arguments at all.
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments="{}",
)
# Empty-container edge cases.
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"steps": []}',
)
# Escaped quotes inside string arguments must survive the round trip.
ESCAPED_STRING_FUNCTION_OUTPUT = (
    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain assistant text must pass through unchanged with zero tool calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"

    content, calls = run_tool_extraction(parser, plain_text, streaming=streaming)

    assert content == plain_text
    assert calls == []
|
||||
|
||||
|
||||
# (streaming?, model output, expected parsed calls) triples. Every scenario
# appears twice — once for the streaming path and once for the non-streaming
# path — with matching pytest ids.
TEST_CASES = [
    pytest.param(
        True,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL],
        id="simple_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL],
        id="simple_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming_json_literals",
    ),
    pytest.param(
        False,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming_json_literals",
    ),
    pytest.param(
        True,
        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_nonstreaming",
    ),
    # Multiple calls within one <function_calls> block, newline-separated.
    pytest.param(
        True,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_nonstreaming",
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Every TEST_CASES entry must parse into exactly the expected calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )

    content, calls = run_tool_extraction(parser, model_output, streaming=streaming)

    # The whole output is tool-call markup, so no text content remains.
    assert content is None
    assert len(calls) == len(expected_tool_calls)
    for position, expected in enumerate(expected_tool_calls):
        assert calls[position].type == "function"
        assert calls[position].function == expected
|
||||
|
||||
|
||||
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """Calls split across two large streamed deltas must all be reconstructed."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    deltas = [
        "<function_calls>get_weather(city='San",
        " Francisco', metric='celsius')\n"
        f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
        f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
    ]

    result = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )

    assert result.other_content == ""
    expected = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    assert len(result.tool_calls) == len(expected)
    for reconstructed, want in zip(result.tool_calls, expected):
        assert reconstructed.function == want
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A TimeoutError raised by the tool-call regex must degrade to plain text."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )

    pathological_text = "hello world[A(A=" + "\t)A(A=,\t" * 2

    # Stand-in regex object whose match() always times out.
    timeout_regex = MagicMock()
    timeout_regex.match.side_effect = TimeoutError("Regex timeout")

    with patch.object(parser, "TOOL_CALL_REGEX", timeout_regex):
        content, calls = run_tool_extraction(
            parser, pathological_text, streaming=streaming
        )

    # On timeout the parser must hand the raw text back unmodified.
    assert content == pathological_text
    assert calls == []
    timeout_regex.match.assert_called_once()
|
||||
359
tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py
Normal file
359
tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py
Normal file
@@ -0,0 +1,359 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import jsonschema
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
from ....utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/gpt-oss-20b"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Launch one vLLM OpenAI-compatible server for the whole module.

    The server is started with auto tool choice and the "openai" tool-call
    parser enabled, which is what every test here exercises.
    """
    args = [
        "--max-model-len",
        "8192",
        # Skip CUDA-graph capture to keep startup fast for tests.
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "openai",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async fixture providing an OpenAI-compatible vLLM client."""
    # The async context manager closes the underlying HTTP session after
    # each test, so connections are not leaked across tests.
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
# ==========================================================
|
||||
# Tool Definitions
|
||||
# ==========================================================
|
||||
# OpenAI-format tool definitions shared by every test: a calculator and a
# city-time lookup, each with a single required string parameter.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "calculator",
            "description": "Performs basic arithmetic calculations.",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": (
                            "Arithmetic expression to evaluate, e.g. '123 + 456'."
                        ),
                    }
                },
                "required": ["expression"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_time",
            "description": "Retrieves the current local time for a given city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "City name, e.g. 'New York'.",
                    }
                },
                "required": ["city"],
            },
        },
    },
]
|
||||
|
||||
|
||||
# ==========================================================
|
||||
# Message Examples
|
||||
# ==========================================================
|
||||
# Prompt fixtures for the tool-call tests below.
MESSAGES_CALC = [
    {"role": "user", "content": "Calculate 123 + 456 using the calculator."}
]

MESSAGES_GET_TIME = [
    {"role": "user", "content": "What is the current time in New York?"}
]

MESSAGES_MULTIPLE_CALLS = [
    {
        "role": "system",
        # Fix: the implicit string concatenation previously produced
        # "tool_calls arraycontaining" (missing separator space).
        "content": (
            "You can call multiple tools. "
            "When using more than one, return single JSON object with tool_calls array "
            "containing each tool call with its function name and arguments. "
            "Do not output multiple JSON objects separately."
        ),
    },
    {
        "role": "user",
        "content": "First, calculate 7 * 8 using the calculator. "
        "Then, use get_time to tell me the current time in New York.",
    },
]

# Ambiguous request that should NOT trigger any tool call.
MESSAGES_INVALID_CALL = [
    {
        "role": "user",
        "content": "Can you help with something, "
        "but don’t actually perform any calculation?",
    }
]


# Expected outputs
FUNC_CALC = "calculator"
FUNC_ARGS_CALC = '{"expression":"123 + 456"}'

FUNC_TIME = "get_time"
FUNC_ARGS_TIME = '{"city": "New York"}'
|
||||
|
||||
|
||||
# ==========================================================
|
||||
# Utility to extract reasoning and tool calls
|
||||
# ==========================================================
|
||||
def extract_reasoning_and_calls(chunks: list) -> tuple[str, list[str], list[str]]:
    """
    Accumulate the streamed reasoning text and per-index tool-call fragments
    from a sequence of chat-completion chunks.

    Returns (reasoning_text, arguments_per_call, function_names_per_call),
    with the two lists ordered by tool-call index.
    """
    reasoning = ""
    calls_by_index: dict[int, dict[str, str]] = {}

    for chunk in chunks:
        delta = getattr(chunk.choices[0], "delta", None)
        if not delta:
            continue

        # Reasoning text arrives incrementally on each delta.
        if hasattr(delta, "reasoning_content") and delta.reasoning_content:
            reasoning += delta.reasoning_content

        for fragment in getattr(delta, "tool_calls", []) or []:
            slot = calls_by_index.setdefault(
                getattr(fragment, "index", 0), {"name": "", "arguments": ""}
            )
            func = getattr(fragment, "function", None)
            if func:
                # The name is sent once; arguments stream in pieces.
                if getattr(func, "name", None):
                    slot["name"] = func.name
                if getattr(func, "arguments", None):
                    slot["arguments"] += func.arguments

    ordered = [calls_by_index[i] for i in sorted(calls_by_index)]
    names = [entry["name"] for entry in ordered]
    args = [entry["arguments"] for entry in ordered]
    return reasoning, args, names
|
||||
|
||||
|
||||
# ==========================================================
|
||||
# Test Scenarios
|
||||
# ==========================================================
|
||||
@pytest.mark.asyncio
async def test_calculator_tool_call_and_argument_accuracy(client: openai.AsyncOpenAI):
    """Verify calculator tool call is made and arguments are accurate."""

    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )

    message = response.choices[0].message
    tool_calls = getattr(message, "tool_calls", [])
    assert tool_calls, "No tool calls detected"

    # There may be several calls; only the calculator one is validated.
    calc_call = next((c for c in tool_calls if c.function.name == FUNC_CALC), None)
    assert calc_call, "Calculator function not called"

    raw_args = calc_call.function.arguments
    assert raw_args, "Calculator arguments missing"
    assert "123" in raw_args and "456" in raw_args, (
        f"Expected values not in raw arguments: {raw_args}"
    )

    # Arguments must be well-formed JSON before checking their content.
    try:
        parsed_args = json.loads(raw_args)
    except json.JSONDecodeError:
        pytest.fail(f"Invalid JSON in calculator arguments: {raw_args}")

    expected_expr = "123 + 456"
    actual_expr = parsed_args.get("expression", "")
    # Fuzzy match tolerates minor formatting differences (e.g. spacing)
    # in the model-produced expression.
    similarity = fuzz.ratio(actual_expr, expected_expr)

    assert similarity > 90, (
        f"Expression mismatch: expected '{expected_expr}' "
        f"got '{actual_expr}' (similarity={similarity}%)"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_streaming_tool_call_get_time_with_reasoning(client: openai.AsyncOpenAI):
    """Verify streamed reasoning and tool call behavior for get_time."""

    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_GET_TIME,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )

    # Collect every streamed chunk, then reassemble reasoning and calls.
    chunks = [chunk async for chunk in stream]
    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)

    assert FUNC_TIME in function_names, "get_time function not called"

    assert any("New York" in arg for arg in arguments), (
        f"Expected get_time arguments for New York not found in {arguments}"
    )

    assert len(reasoning) > 0, "Expected reasoning content missing"

    # Loose relevance check: the reasoning should mention the request topic.
    assert any(keyword in reasoning for keyword in ["New York", "time", "current"]), (
        f"Reasoning is not relevant to the request: {reasoning}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_streaming_multiple_tools(client: openai.AsyncOpenAI):
    """Test streamed multi-tool response with reasoning."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )

    chunks = [chunk async for chunk in stream]
    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)

    # NOTE(review): catching AssertionError and printing means this test can
    # never fail — presumably deliberate best-effort for nondeterministic
    # model output, but confirm; otherwise drop the try/except so failures
    # surface.
    try:
        assert FUNC_CALC in function_names, (
            f"Calculator tool missing — found {function_names}"
        )
        assert FUNC_TIME in function_names, (
            f"Time tool missing — found {function_names}"
        )
        assert len(reasoning) > 0, "Expected reasoning content in streamed response"
    except AssertionError as e:
        print(f"ERROR: {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_invalid_tool_call(client: openai.AsyncOpenAI):
    """
    Verify that ambiguous instructions that should not trigger a tool
    do not produce any tool calls.
    """
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_INVALID_CALL,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )

    message = response.choices[0].message

    assert message is not None, "Expected message in response"
    assert hasattr(message, "content"), "Expected 'content' field in message"

    # The model should answer in plain text rather than calling a tool.
    tool_calls = getattr(message, "tool_calls", [])
    assert not tool_calls, (
        f"Model unexpectedly attempted a tool call on invalid input: {tool_calls}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tool_call_with_temperature(client: openai.AsyncOpenAI):
    """
    Verify model produces valid tool or text output
    under non-deterministic sampling.
    """
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.7,
        stream=False,
    )

    message = response.choices[0].message
    assert message is not None, "Expected non-empty message in response"
    # With sampling enabled the model may legitimately answer either way,
    # so only require that *some* output (text or tool call) is produced.
    assert message.tool_calls or message.content, (
        "Response missing both text and tool calls"
    )

    print(f"\nTool calls: {message.tool_calls}")
    print(f"Text: {message.content}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tool_response_schema_accuracy(client: openai.AsyncOpenAI):
    """Validate that tool call arguments adhere to their declared JSON schema."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
    )

    calls = response.choices[0].message.tool_calls
    assert calls, "No tool calls produced"

    for call in calls:
        func_name = call.function.name
        args = json.loads(call.function.arguments)

        # Look up the parameter schema declared for this function in TOOLS.
        schema: dict[str, object] | None = None
        for tool_entry in TOOLS:
            function_def = tool_entry.get("function")
            if (
                function_def
                and isinstance(function_def, dict)
                and function_def.get("name") == func_name
            ):
                schema = function_def.get("parameters")
                break

        assert schema is not None, f"No matching tool schema found for {func_name}"

        # Raises jsonschema.ValidationError (failing the test) on mismatch.
        jsonschema.validate(instance=args, schema=schema)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_semantic_consistency_with_temperature(client: openai.AsyncOpenAI):
    """Test that temperature variation doesn't cause contradictory reasoning."""
    responses = []
    for temp in [0.0, 0.5, 1.0]:
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=MESSAGES_CALC,
            tools=TOOLS,
            temperature=temp,
        )
        # Content may be None when the model answers with a tool call only.
        text = (resp.choices[0].message.content or "").strip()
        responses.append(text)

    # Compare fuzzy similarity between low- and mid-temperature outputs
    # NOTE(review): the T=1.0 response is collected but never compared —
    # confirm whether a high-temperature check was intended here.
    low_mid_sim = fuzz.ratio(responses[0], responses[1])
    assert low_mid_sim > 60, (
        f"Semantic drift too large between T=0.0 and T=0.5 ({low_mid_sim}%)"
    )
|
||||
@@ -0,0 +1,231 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.entrypoints.openai.tool_parsers.utils import (
|
||||
run_tool_extraction,
|
||||
run_tool_extraction_streaming,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers import ToolParser, ToolParserManager
|
||||
|
||||
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
# Each *_OUTPUT constant is the raw pythonic call text as the model would emit
# it; the paired *_CALL constant is the FunctionCall the parser must produce.
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "San Francisco", "metric": "celsius"}',
)
# Mixed argument types: str, int, dict, None, bool, list.
MORE_TYPES_FUNCTION_OUTPUT = (
    "register_user(name='John Doe', "
    "age=37, "
    "address={'city': 'San Francisco', 'state': 'CA'}, "
    "role=None, "
    "passed_test=True, "
    "aliases=['John', 'Johnny'])"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
    name="register_user",
    arguments='{"name": "John Doe", '
    '"age": 37, '
    '"address": {"city": "San Francisco", "state": "CA"}, '
    '"role": null, '
    '"passed_test": true, '
    '"aliases": ["John", "Johnny"]}',
)
# Call with no arguments at all.
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments="{}",
)
# Empty-container edge cases.
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"steps": []}',
)
# Escaped quotes inside string arguments must survive the round trip.
ESCAPED_STRING_FUNCTION_OUTPUT = (
    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain assistant text must pass through unchanged with zero tool calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"

    content, calls = run_tool_extraction(parser, plain_text, streaming=streaming)

    assert content == plain_text
    assert calls == []
|
||||
|
||||
|
||||
# (streaming?, model output, expected parsed calls) triples. Every scenario
# appears twice — once for the streaming path and once for the non-streaming
# path — with matching pytest ids. Pythonic format wraps calls in [ ... ].
TEST_CASES = [
    pytest.param(
        True,
        f"[{SIMPLE_FUNCTION_OUTPUT}]",
        [SIMPLE_FUNCTION_CALL],
        id="simple_streaming",
    ),
    pytest.param(
        False,
        f"[{SIMPLE_FUNCTION_OUTPUT}]",
        [SIMPLE_FUNCTION_CALL],
        id="simple_nonstreaming",
    ),
    pytest.param(
        True,
        f"[{MORE_TYPES_FUNCTION_OUTPUT}]",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming",
    ),
    pytest.param(
        False,
        f"[{MORE_TYPES_FUNCTION_OUTPUT}]",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming",
    ),
    pytest.param(
        True,
        f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        f"[{EMPTY_DICT_FUNCTION_OUTPUT}]",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_streaming",
    ),
    pytest.param(
        False,
        f"[{EMPTY_DICT_FUNCTION_OUTPUT}]",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_nonstreaming",
    ),
    pytest.param(
        True,
        f"[{EMPTY_LIST_FUNCTION_OUTPUT}]",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_streaming",
    ),
    pytest.param(
        False,
        f"[{EMPTY_LIST_FUNCTION_OUTPUT}]",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_nonstreaming",
    ),
    pytest.param(
        True,
        f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_streaming",
    ),
    pytest.param(
        False,
        f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_nonstreaming",
    ),
    # Multiple calls inside one bracketed list, comma-separated.
    pytest.param(
        True,
        f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_streaming",
    ),
    pytest.param(
        False,
        f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_nonstreaming",
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Every TEST_CASES entry must parse into exactly the expected calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )

    content, calls = run_tool_extraction(parser, model_output, streaming=streaming)

    # The whole output is tool-call markup, so no text content remains.
    assert content is None
    assert len(calls) == len(expected_tool_calls)
    for position, expected in enumerate(expected_tool_calls):
        assert calls[position].type == "function"
        assert calls[position].function == expected
|
||||
|
||||
|
||||
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """Calls split across two large streamed deltas must all be reconstructed."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    deltas = [
        "[get_weather(city='San",
        " Francisco', metric='celsius'), "
        f"{PARAMETERLESS_FUNCTION_OUTPUT}, "
        f"{EMPTY_LIST_FUNCTION_OUTPUT}]",
    ]

    result = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )

    assert result.other_content == ""
    expected = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    assert len(result.tool_calls) == len(expected)
    for reconstructed, want in zip(result.tool_calls, expected):
        assert reconstructed.function == want
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A TimeoutError raised by the tool-call regex must degrade to plain text."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )

    pathological_text = "hello world[A(A=" + "\t)A(A=,\t" * 2

    # Stand-in regex object whose match() always times out.
    timeout_regex = MagicMock()
    timeout_regex.match.side_effect = TimeoutError("Regex timeout")

    with patch.object(parser, "TOOL_CALL_REGEX", timeout_regex):
        content, calls = run_tool_extraction(
            parser, pathological_text, streaming=streaming
        )

    # On timeout the parser must hand the raw text back unmodified.
    assert content == pathological_text
    assert calls == []
    timeout_regex.match.assert_called_once()
|
||||
167
tests/entrypoints/openai/tool_parsers/utils.py
Normal file
167
tests/entrypoints/openai/tool_parsers/utils.py
Normal file
@@ -0,0 +1,167 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Iterable
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
DeltaMessage,
|
||||
ExtractedToolCallInformation,
|
||||
FunctionCall,
|
||||
ToolCall,
|
||||
)
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers import ToolParser
|
||||
|
||||
|
||||
class StreamingToolReconstructor:
    """Reassembles a stream of DeltaMessages into complete tool calls.

    Feed each streamed delta to `append_delta`; afterwards `tool_calls`
    holds the fully reconstructed calls and `other_content` the
    concatenated non-tool text. Along the way, structural invariants of
    the streaming protocol are asserted (id/name emitted exactly once per
    call, monotonically consistent indices, etc.).
    """

    def __init__(self, assert_one_tool_per_delta: bool = True):
        # Reconstructed calls, in index order.
        self.tool_calls: list[ToolCall] = []
        # Accumulated plain-text (non-tool) content.
        self.other_content: str = ""
        # When True, each delta may carry at most one tool-call fragment.
        self._assert_one_tool_per_delta = assert_one_tool_per_delta

    def append_delta(self, delta: DeltaMessage):
        """Fold one streamed delta into the accumulated state, asserting
        the streaming protocol invariants described on the class."""
        if delta.content is not None:
            self.other_content += delta.content
        else:
            assert delta.tool_calls, (
                "Streaming results should have either content or tool calls (or both)"
            )
            if self._assert_one_tool_per_delta:
                # Note: This isn't strictly required by the API and may not be
                # possible to adhere to depending on the token space and number of
                # tokens per streamed response from the model, but it is required
                # by tool_use tests, so we enforce it here by default also.
                assert len(delta.tool_calls) < 2, (
                    "Streaming should include only one tool call per update."
                )
            for call_delta in delta.tool_calls:
                assert call_delta.type is None or call_delta.type == "function", (
                    "Streaming tool calls should only emit function calls. Got "
                    f"{call_delta.type}"
                )
                # The fragment either extends the most recent call (index
                # already seen) or starts a brand-new one.
                current_tool_call = (
                    self.tool_calls[call_delta.index]
                    if call_delta.index < len(self.tool_calls)
                    else None
                )
                if current_tool_call:
                    # Continuation fragment: only argument text may arrive.
                    assert not call_delta.function.name, (
                        "Streaming tool calls should emit the full function name "
                        f"exactly once. Got {call_delta.function.name}"
                    )
                    assert not call_delta.id, (
                        "Streaming tool calls must emit function id only once. Got "
                        f"{call_delta.id}"
                    )
                    assert call_delta.index == len(self.tool_calls) - 1, (
                        f"Incorrect index for tool delta. Got {call_delta.index}, "
                        f"expected {len(self.tool_calls) - 1}"
                    )
                    current_tool_call.function.arguments += call_delta.function.arguments
                else:
                    # First fragment of a new call: id and name are mandatory.
                    assert call_delta.id is not None, (
                        "Streaming tool calls must have an id on first appearance"
                    )
                    assert call_delta.function.name is not None, (
                        "Streaming tool calls must have a function name on first appearance"
                    )
                    assert call_delta.index == len(self.tool_calls), (
                        f"Incorrect index for tool delta. Got {call_delta.index}, "
                        f"expected {len(self.tool_calls)}"
                    )
                    self.tool_calls.append(
                        ToolCall(
                            id=call_delta.id,
                            function=FunctionCall(
                                name=call_delta.function.name,
                                arguments=call_delta.function.arguments or "",
                            ),
                        )
                    )
|
||||
|
||||
|
||||
def run_tool_extraction(
|
||||
tool_parser: ToolParser,
|
||||
model_output: str,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
streaming: bool = False,
|
||||
assert_one_tool_per_delta: bool = True,
|
||||
) -> tuple[str | None, list[ToolCall]]:
|
||||
if streaming:
|
||||
reconstructor = run_tool_extraction_streaming(
|
||||
tool_parser,
|
||||
model_output,
|
||||
request,
|
||||
assert_one_tool_per_delta=assert_one_tool_per_delta,
|
||||
)
|
||||
return reconstructor.other_content or None, reconstructor.tool_calls
|
||||
else:
|
||||
extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, request)
|
||||
assert extracted.tools_called == bool(extracted.tool_calls)
|
||||
return extracted.content, extracted.tool_calls
|
||||
|
||||
|
||||
def run_tool_extraction_nonstreaming(
|
||||
tool_parser: ToolParser,
|
||||
model_output: str,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
) -> ExtractedToolCallInformation:
|
||||
request = request or ChatCompletionRequest(messages=[], model="test-model")
|
||||
return tool_parser.extract_tool_calls(model_output, request)
|
||||
|
||||
|
||||
def split_string_into_token_deltas(tokenizer: TokenizerLike, text: str) -> list[str]:
|
||||
# Split a string into a series of deltas using the provided tokenizer. Each
|
||||
# delta will be the string equivalent of a single token.
|
||||
token_ids = tokenizer.encode(text, add_special_tokens=False)
|
||||
previously_decoded_text = ""
|
||||
deltas = []
|
||||
for i in range(1, len(token_ids) + 1):
|
||||
current_tokens = token_ids[:i]
|
||||
current_text = tokenizer.decode(current_tokens)
|
||||
new_text = current_text[len(previously_decoded_text) :]
|
||||
previously_decoded_text = current_text
|
||||
deltas.append(new_text)
|
||||
return deltas
|
||||
|
||||
|
||||
def run_tool_extraction_streaming(
|
||||
tool_parser: ToolParser,
|
||||
model_deltas: Iterable[str],
|
||||
request: ChatCompletionRequest | None = None,
|
||||
assert_one_tool_per_delta: bool = True,
|
||||
) -> StreamingToolReconstructor:
|
||||
if isinstance(model_deltas, str):
|
||||
model_deltas = split_string_into_token_deltas(
|
||||
tool_parser.model_tokenizer, model_deltas
|
||||
)
|
||||
|
||||
request = request or ChatCompletionRequest(messages=[], model="test-model")
|
||||
reconstructor = StreamingToolReconstructor(
|
||||
assert_one_tool_per_delta=assert_one_tool_per_delta
|
||||
)
|
||||
previous_text = ""
|
||||
previous_tokens: list[int] = []
|
||||
for delta in model_deltas:
|
||||
token_delta = [
|
||||
tool_parser.vocab.get(token)
|
||||
for token in tool_parser.model_tokenizer.tokenize(delta)
|
||||
if token in tool_parser.vocab
|
||||
]
|
||||
current_text = previous_text + delta
|
||||
current_tokens = previous_tokens + token_delta
|
||||
delta_message = tool_parser.extract_tool_calls_streaming(
|
||||
previous_text,
|
||||
current_text,
|
||||
delta,
|
||||
previous_tokens,
|
||||
current_tokens,
|
||||
token_delta,
|
||||
request,
|
||||
)
|
||||
if delta_message is not None:
|
||||
reconstructor.append_delta(delta_message)
|
||||
previous_text = current_text
|
||||
previous_tokens = current_tokens
|
||||
return reconstructor
|
||||
190
tests/entrypoints/openai/utils.py
Normal file
190
tests/entrypoints/openai/utils.py
Normal file
@@ -0,0 +1,190 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Any
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionResponseChoice,
|
||||
ChatCompletionStreamResponse,
|
||||
ChatMessage,
|
||||
UsageInfo,
|
||||
)
|
||||
|
||||
|
||||
async def accumulate_streaming_response(
|
||||
stream_generator: AsyncGenerator[str, None],
|
||||
) -> ChatCompletionResponse:
|
||||
"""
|
||||
Accumulate streaming SSE chunks into a complete ChatCompletionResponse.
|
||||
|
||||
This helper parses the SSE format and builds up the complete response
|
||||
by combining all the delta chunks.
|
||||
"""
|
||||
accumulated_content = ""
|
||||
accumulated_reasoning = None
|
||||
accumulated_tool_calls: list[dict[str, Any]] = []
|
||||
role = None
|
||||
finish_reason = None
|
||||
response_id = None
|
||||
created = None
|
||||
model = None
|
||||
index = 0
|
||||
|
||||
async for chunk_str in stream_generator:
|
||||
# Skip empty lines and [DONE] marker
|
||||
if not chunk_str.strip() or chunk_str.strip() == "data: [DONE]":
|
||||
continue
|
||||
|
||||
# Parse SSE format: "data: {json}\n\n"
|
||||
if chunk_str.startswith("data: "):
|
||||
json_str = chunk_str[6:].strip()
|
||||
try:
|
||||
chunk_data = json.loads(json_str)
|
||||
# print(f"DEBUG: Parsed chunk_data: {chunk_data}")
|
||||
chunk = ChatCompletionStreamResponse(**chunk_data)
|
||||
|
||||
# Store metadata from first chunk
|
||||
if response_id is None:
|
||||
response_id = chunk.id
|
||||
created = chunk.created
|
||||
model = chunk.model
|
||||
|
||||
# Process each choice in the chunk
|
||||
for choice in chunk.choices:
|
||||
if choice.delta.role:
|
||||
role = choice.delta.role
|
||||
if choice.delta.content:
|
||||
accumulated_content += choice.delta.content
|
||||
if choice.delta.reasoning:
|
||||
if accumulated_reasoning is None:
|
||||
accumulated_reasoning = ""
|
||||
accumulated_reasoning += choice.delta.reasoning
|
||||
if choice.delta.tool_calls:
|
||||
# Accumulate tool calls
|
||||
for tool_call_delta in choice.delta.tool_calls:
|
||||
# Find or create the tool call at this index
|
||||
while len(accumulated_tool_calls) <= tool_call_delta.index:
|
||||
accumulated_tool_calls.append(
|
||||
{
|
||||
"id": None,
|
||||
"type": "function",
|
||||
"function": {"name": "", "arguments": ""},
|
||||
}
|
||||
)
|
||||
|
||||
if tool_call_delta.id:
|
||||
accumulated_tool_calls[tool_call_delta.index]["id"] = (
|
||||
tool_call_delta.id
|
||||
)
|
||||
if tool_call_delta.function:
|
||||
if tool_call_delta.function.name:
|
||||
accumulated_tool_calls[tool_call_delta.index][
|
||||
"function"
|
||||
]["name"] += tool_call_delta.function.name
|
||||
if tool_call_delta.function.arguments:
|
||||
accumulated_tool_calls[tool_call_delta.index][
|
||||
"function"
|
||||
]["arguments"] += tool_call_delta.function.arguments
|
||||
|
||||
if choice.finish_reason:
|
||||
finish_reason = choice.finish_reason
|
||||
if choice.index is not None:
|
||||
index = choice.index
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
# Build the final message
|
||||
message_kwargs = {
|
||||
"role": role or "assistant",
|
||||
"content": accumulated_content if accumulated_content else None,
|
||||
"reasoning": accumulated_reasoning,
|
||||
}
|
||||
|
||||
# Only include tool_calls if there are any
|
||||
if accumulated_tool_calls:
|
||||
message_kwargs["tool_calls"] = [
|
||||
{"id": tc["id"], "type": tc["type"], "function": tc["function"]}
|
||||
for tc in accumulated_tool_calls
|
||||
]
|
||||
|
||||
message = ChatMessage(**message_kwargs)
|
||||
|
||||
# Build the final response
|
||||
choice = ChatCompletionResponseChoice(
|
||||
index=index,
|
||||
message=message,
|
||||
finish_reason=finish_reason or "stop",
|
||||
)
|
||||
|
||||
# Create usage info (with dummy values for tests)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=0,
|
||||
completion_tokens=0,
|
||||
total_tokens=0,
|
||||
)
|
||||
|
||||
response = ChatCompletionResponse(
|
||||
id=response_id or "chatcmpl-test",
|
||||
object="chat.completion",
|
||||
created=created or 0,
|
||||
model=model or "test-model",
|
||||
choices=[choice],
|
||||
usage=usage,
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
def verify_harmony_messages(
|
||||
messages: list[Any], expected_messages: list[dict[str, Any]]
|
||||
):
|
||||
assert len(messages) == len(expected_messages)
|
||||
for msg, expected in zip(messages, expected_messages):
|
||||
if "role" in expected:
|
||||
assert msg.author.role == expected["role"]
|
||||
if "author_name" in expected:
|
||||
assert msg.author.name == expected["author_name"]
|
||||
if "channel" in expected:
|
||||
assert msg.channel == expected["channel"]
|
||||
if "recipient" in expected:
|
||||
assert msg.recipient == expected["recipient"]
|
||||
if "content" in expected:
|
||||
assert msg.content[0].text == expected["content"]
|
||||
if "content_type" in expected:
|
||||
assert msg.content_type == expected["content_type"]
|
||||
if "tool_definitions" in expected:
|
||||
# Check that the tool definitions match the expected list of tool names
|
||||
actual_tools = [t.name for t in msg.content[0].tools["functions"].tools]
|
||||
assert actual_tools == expected["tool_definitions"]
|
||||
|
||||
|
||||
def verify_chat_response(
|
||||
response: ChatCompletionResponse,
|
||||
content: str | None = None,
|
||||
reasoning: str | None = None,
|
||||
tool_calls: list[tuple[str, str]] | None = None,
|
||||
):
|
||||
assert len(response.choices) == 1
|
||||
message = response.choices[0].message
|
||||
|
||||
if content is not None:
|
||||
assert message.content == content
|
||||
else:
|
||||
assert not message.content
|
||||
|
||||
if reasoning is not None:
|
||||
assert message.reasoning == reasoning
|
||||
else:
|
||||
assert not message.reasoning
|
||||
|
||||
if tool_calls:
|
||||
assert message.tool_calls is not None
|
||||
assert len(message.tool_calls) == len(tool_calls)
|
||||
for tc, (expected_name, expected_args) in zip(message.tool_calls, tool_calls):
|
||||
assert tc.function.name == expected_name
|
||||
assert tc.function.arguments == expected_args
|
||||
else:
|
||||
assert not message.tool_calls
|
||||
0
tests/entrypoints/pooling/__init__.py
Normal file
0
tests/entrypoints/pooling/__init__.py
Normal file
0
tests/entrypoints/pooling/basic/__init__.py
Normal file
0
tests/entrypoints/pooling/basic/__init__.py
Normal file
90
tests/entrypoints/pooling/basic/test_encode.py
Normal file
90
tests/entrypoints/pooling/basic/test_encode.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
TOKEN_IDS = [
|
||||
# Using ID={0, 1, 2, 3} results in NaN values,
|
||||
# so we add this offset of 1000
|
||||
[1000],
|
||||
[1000, 1001],
|
||||
[1000, 1002, 1001],
|
||||
[1000, 1003, 1001, 1002],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
max_num_batched_tokens=32768,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_multiple_pooling_params(llm: LLM):
|
||||
pooling_params = [
|
||||
PoolingParams(),
|
||||
PoolingParams(),
|
||||
PoolingParams(),
|
||||
PoolingParams(),
|
||||
]
|
||||
|
||||
# Multiple PoolingParams should be matched with each prompt
|
||||
outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# Exception raised, if the size of params does not match the size of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.encode(
|
||||
PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
|
||||
)
|
||||
|
||||
# Single PoolingParams should be applied to every prompt
|
||||
single_pooling_params = PoolingParams()
|
||||
outputs = llm.encode(
|
||||
PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
|
||||
)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# pooling_params is None, default params should be applied
|
||||
outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
|
||||
def test_right_side_truncation(llm: LLM):
|
||||
# Embeddings models should truncate the end of the prompt
|
||||
tokenizer = llm.get_tokenizer()
|
||||
assert tokenizer.truncation_side == "right"
|
||||
119
tests/entrypoints/pooling/basic/test_truncation.py
Normal file
119
tests/entrypoints/pooling/basic/test_truncation.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
max_model_len = 128
|
||||
|
||||
input = """Immerse yourself in the enchanting chronicle of calculus, a
|
||||
mathematical domain that has radically transformed our comprehension of
|
||||
change and motion. Despite its roots in ancient civilizations, the
|
||||
formal birth of calculus predominantly occurred in the 17th century,
|
||||
primarily under the influential guidance of Sir Isaac Newton and Gottfried
|
||||
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
|
||||
ancient Greek mathematics,most notably in the works of Eudoxus and
|
||||
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
|
||||
technique for computing areas and volumes through the use of finite sums.
|
||||
This methodology laid crucial foundational work for integral calculus.
|
||||
In the 17th century, both Newton and Leibniz independently pioneered
|
||||
calculus, each contributing unique perspectives that would shape this new
|
||||
field."""
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
str(max_model_len),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
|
||||
truncation_size = 10
|
||||
kwargs: dict[str, Any] = {
|
||||
"model": MODEL_NAME,
|
||||
"input": input,
|
||||
"truncate_prompt_tokens": truncation_size,
|
||||
}
|
||||
|
||||
response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
|
||||
|
||||
assert response["usage"]["prompt_tokens"] == truncation_size
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
|
||||
truncation_size = 0
|
||||
kwargs: dict[str, Any] = {
|
||||
"model": MODEL_NAME,
|
||||
"input": input,
|
||||
"truncate_prompt_tokens": truncation_size,
|
||||
}
|
||||
|
||||
response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
|
||||
|
||||
assert response["usage"]["prompt_tokens"] == truncation_size
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
|
||||
truncation_size = max_model_len + 1
|
||||
kwargs: dict[str, Any] = {
|
||||
"model": MODEL_NAME,
|
||||
"input": input,
|
||||
"truncate_prompt_tokens": truncation_size,
|
||||
}
|
||||
|
||||
with pytest.raises(openai.BadRequestError) as err:
|
||||
await client.post(path="embeddings", cast_to=object, body={**kwargs})
|
||||
|
||||
assert err.value.status_code == 400
|
||||
error_details = err.value.response.json()["error"]
|
||||
assert error_details["type"] == "BadRequestError"
|
||||
expected_message = (
|
||||
"truncate_prompt_tokens value is "
|
||||
"greater than max_model_len."
|
||||
" Please, select a smaller truncation size."
|
||||
)
|
||||
assert error_details["message"] == expected_message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_max_truncation_size(client: openai.AsyncOpenAI):
|
||||
truncation_size = -1
|
||||
kwargs: dict[str, Any] = {
|
||||
"model": MODEL_NAME,
|
||||
"input": input,
|
||||
"truncate_prompt_tokens": truncation_size,
|
||||
}
|
||||
|
||||
response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
|
||||
|
||||
assert response["usage"]["prompt_tokens"] == max_model_len
|
||||
0
tests/entrypoints/pooling/classify/__init__.py
Normal file
0
tests/entrypoints/pooling/classify/__init__.py
Normal file
71
tests/entrypoints/pooling/classify/test_offline.py
Normal file
71
tests/entrypoints/pooling/classify/test_offline.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.models.utils import softmax
|
||||
from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
|
||||
|
||||
prompts = ["The chef prepared a delicious meal."]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
max_num_batched_tokens=32768,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(use_activation):
|
||||
outputs = llm.classify(
|
||||
prompts,
|
||||
pooling_params=PoolingParams(use_activation=use_activation),
|
||||
use_tqdm=False,
|
||||
)
|
||||
return torch.tensor([x.outputs.probs for x in outputs])
|
||||
|
||||
default = get_outputs(use_activation=None)
|
||||
w_activation = get_outputs(use_activation=True)
|
||||
wo_activation = get_outputs(use_activation=False)
|
||||
|
||||
assert torch.allclose(default, w_activation, atol=1e-2), (
|
||||
"Default should use activation."
|
||||
)
|
||||
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
|
||||
"wo_activation should not use activation."
|
||||
)
|
||||
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
|
||||
"w_activation should be close to activation(wo_activation)."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_token_classify(llm: LLM):
|
||||
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
|
||||
|
||||
|
||||
def test_score_api(llm: LLM):
|
||||
err_msg = "Score API is only enabled for num_labels == 1."
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.score("ping", "pong", use_tqdm=False)
|
||||
293
tests/entrypoints/pooling/classify/test_online.py
Normal file
293
tests/entrypoints/pooling/classify/test_online.py
Normal file
@@ -0,0 +1,293 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
|
||||
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
|
||||
|
||||
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
|
||||
DTYPE = "float32" # Use float32 to avoid NaN issue
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"512",
|
||||
"--dtype",
|
||||
DTYPE,
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_single_input_classification(server: RemoteOpenAIServer, model_name: str):
|
||||
input_text = "This product was excellent and exceeded my expectations"
|
||||
|
||||
classification_response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": input_text},
|
||||
)
|
||||
|
||||
classification_response.raise_for_status()
|
||||
output = ClassificationResponse.model_validate(classification_response.json())
|
||||
|
||||
assert output.object == "list"
|
||||
assert output.model == MODEL_NAME
|
||||
assert len(output.data) == 1
|
||||
assert hasattr(output.data[0], "label")
|
||||
assert hasattr(output.data[0], "probs")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
|
||||
response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": "hello", "add_special_tokens": False},
|
||||
)
|
||||
response.raise_for_status()
|
||||
ClassificationResponse.model_validate(response.json())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
|
||||
input_texts = [
|
||||
"The product arrived on time and works perfectly",
|
||||
"I'm very satisfied with my purchase, would buy again",
|
||||
"The customer service was helpful and resolved my issue quickly",
|
||||
"This product broke after one week, terrible quality",
|
||||
"I'm very disappointed with this purchase, complete waste of money",
|
||||
"The customer service was rude and unhelpful",
|
||||
]
|
||||
|
||||
classification_response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": input_texts},
|
||||
)
|
||||
output = ClassificationResponse.model_validate(classification_response.json())
|
||||
|
||||
assert len(output.data) == len(input_texts)
|
||||
for i, item in enumerate(output.data):
|
||||
assert item.index == i
|
||||
assert hasattr(item, "label")
|
||||
assert hasattr(item, "probs")
|
||||
assert len(item.probs) == item.num_classes
|
||||
assert item.label in ["Default", "Spoiled"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
|
||||
long_text = "hello " * 600
|
||||
|
||||
classification_response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": long_text, "truncate_prompt_tokens": 5},
|
||||
)
|
||||
|
||||
classification_response.raise_for_status()
|
||||
output = ClassificationResponse.model_validate(classification_response.json())
|
||||
|
||||
assert len(output.data) == 1
|
||||
assert output.data[0].index == 0
|
||||
assert hasattr(output.data[0], "probs")
|
||||
assert output.usage.prompt_tokens == 5
|
||||
assert output.usage.total_tokens == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_invalid_truncate_prompt_tokens_error(
|
||||
server: RemoteOpenAIServer, model_name: str
|
||||
):
|
||||
classification_response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513},
|
||||
)
|
||||
|
||||
error = classification_response.json()
|
||||
assert classification_response.status_code == 400
|
||||
assert "truncate_prompt_tokens" in error["error"]["message"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
|
||||
classification_response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": ""},
|
||||
)
|
||||
|
||||
error = classification_response.json()
|
||||
assert classification_response.status_code == 400
|
||||
assert "error" in error
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_batch_classification_empty_list(server: RemoteOpenAIServer, model_name: str):
|
||||
classification_response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": []},
|
||||
)
|
||||
classification_response.raise_for_status()
|
||||
output = ClassificationResponse.model_validate(classification_response.json())
|
||||
|
||||
assert output.object == "list"
|
||||
assert isinstance(output.data, list)
|
||||
assert len(output.data) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_invocations(server: RemoteOpenAIServer):
|
||||
request_args = {
|
||||
"model": MODEL_NAME,
|
||||
"input": "This product was excellent and exceeded my expectations",
|
||||
}
|
||||
|
||||
classification_response = requests.post(
|
||||
server.url_for("classify"), json=request_args
|
||||
)
|
||||
classification_response.raise_for_status()
|
||||
|
||||
invocation_response = requests.post(
|
||||
server.url_for("invocations"), json=request_args
|
||||
)
|
||||
invocation_response.raise_for_status()
|
||||
|
||||
classification_output = classification_response.json()
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert classification_output.keys() == invocation_output.keys()
|
||||
for classification_data, invocation_data in zip(
|
||||
classification_output["data"], invocation_output["data"]
|
||||
):
|
||||
assert classification_data.keys() == invocation_data.keys()
|
||||
assert classification_data["probs"] == pytest.approx(
|
||||
invocation_data["probs"], rel=0.01
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
|
||||
input_text = ["This product was excellent and exceeded my expectations"]
|
||||
|
||||
async def get_outputs(use_activation):
|
||||
response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_text,
|
||||
"use_activation": use_activation,
|
||||
},
|
||||
)
|
||||
outputs = response.json()
|
||||
return torch.tensor([x["probs"] for x in outputs["data"]])
|
||||
|
||||
default = await get_outputs(use_activation=None)
|
||||
w_activation = await get_outputs(use_activation=True)
|
||||
wo_activation = await get_outputs(use_activation=False)
|
||||
|
||||
assert torch.allclose(default, w_activation, atol=1e-2), (
|
||||
"Default should use activation."
|
||||
)
|
||||
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
|
||||
"wo_activation should not use activation."
|
||||
)
|
||||
assert torch.allclose(F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2), (
|
||||
"w_activation should be close to activation(wo_activation)."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_score(server: RemoteOpenAIServer, model_name: str):
|
||||
# score api is only enabled for num_labels == 1.
|
||||
response = requests.post(
|
||||
server.url_for("score"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"text_1": "ping",
|
||||
"text_2": "pong",
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_rerank(server: RemoteOpenAIServer, model_name: str):
|
||||
# rerank api is only enabled for num_labels == 1.
|
||||
response = requests.post(
|
||||
server.url_for("rerank"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"query": "ping",
|
||||
"documents": ["pong"],
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
|
||||
input_text = "This product was excellent and exceeded my expectations"
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_text,
|
||||
"encoding_format": "float",
|
||||
"task": "classify",
|
||||
},
|
||||
)
|
||||
poolings = PoolingResponse.model_validate(response.json())
|
||||
assert len(poolings.data) == 1
|
||||
assert len(poolings.data[0].data) == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
|
||||
task = "token_classify"
|
||||
input_text = ["This product was excellent and exceeded my expectations"]
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_text,
|
||||
"encoding_format": "float",
|
||||
"task": task,
|
||||
},
|
||||
)
|
||||
poolings = PoolingResponse.model_validate(response.json())
|
||||
assert len(poolings.data) == 1
|
||||
assert len(poolings.data[0].data) == 8
|
||||
assert len(poolings.data[0].data[0]) == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
|
||||
async def test_pooling_not_supported(
|
||||
server: RemoteOpenAIServer, model_name: str, task: str
|
||||
):
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": "test",
|
||||
"encoding_format": "float",
|
||||
"task": task,
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
assert response.json()["error"]["message"].startswith(
|
||||
f"Task {task} is not supported"
|
||||
)
|
||||
95
tests/entrypoints/pooling/classify/test_online_vision.py
Normal file
95
tests/entrypoints/pooling/classify/test_online_vision.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
|
||||
|
||||
VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
|
||||
MAXIMUM_VIDEOS = 1
|
||||
TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
|
||||
|
||||
HF_OVERRIDES = {
|
||||
"text_config": {
|
||||
"architectures": ["Qwen2_5_VLForSequenceClassification"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server_vlm_classify():
|
||||
args = [
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--max-model-len",
|
||||
"5000",
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"video": MAXIMUM_VIDEOS}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_text_only(
    server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
    """Text-only chat messages are accepted by the classify endpoint."""
    content = [{"type": "text", "text": "Please classify this text request."}]
    messages = [{"role": "user", "content": content}]

    response = requests.post(
        server_vlm_classify.url_for("classify"),
        json={"model": model_name, "messages": messages},
    )
    response.raise_for_status()

    output = ClassificationResponse.model_validate(response.json())

    assert output.object == "list"
    assert output.model == model_name
    assert len(output.data) == 1
    assert len(output.data[0].probs) == 2
    assert output.usage.prompt_tokens == 22
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_video_url(
    server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
    """A chat message carrying a video URL is accepted and classified."""
    content = [
        {"type": "text", "text": "Please classify this video."},
        {"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
    ]
    messages = [{"role": "user", "content": content}]

    response = requests.post(
        server_vlm_classify.url_for("classify"),
        json={"model": model_name, "messages": messages},
    )
    response.raise_for_status()

    output = ClassificationResponse.model_validate(response.json())

    assert output.object == "list"
    assert output.model == model_name
    assert len(output.data) == 1
    assert len(output.data[0].probs) == 2
    assert output.usage.prompt_tokens == 4807
|
||||
0
tests/entrypoints/pooling/embed/__init__.py
Normal file
0
tests/entrypoints/pooling/embed/__init__.py
Normal file
47
tests/entrypoints/pooling/embed/test_correctness_mteb.py
Normal file
47
tests/entrypoints/pooling/embed/test_correctness_mteb.py
Normal file
@@ -0,0 +1,47 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
MTEB_EMBED_TASKS,
|
||||
MTEB_EMBED_TOL,
|
||||
OpenAIClientMtebEncoder,
|
||||
run_mteb_embed_task,
|
||||
)
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Skip the whole module on ROCm: these encoder-only models require
# encoder self-attention, which is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

# Quiet vLLM logging so the MTEB run output stays readable.
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "intfloat/e5-small"
# Reference main score for the MTEB embed tasks (SentenceTransformers
# baseline the vLLM result is compared against).
MAIN_SCORE = 0.7422994752439667
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Serve MODEL_NAME with the pooling runner for the MTEB check."""
    cli_args = [
        "--runner",
        "pooling",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
def test_mteb_embed(server):
    """vLLM's MTEB embed score must not fall below the ST reference."""
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, server.get_client())
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = MAIN_SCORE

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    # One-sided check: a vLLM score better than the reference is fine;
    # only a regression beyond the tolerance fails.
    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL
|
||||
68
tests/entrypoints/pooling/embed/test_offline.py
Normal file
68
tests/entrypoints/pooling/embed/test_offline.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Skip the whole module on ROCm: these encoder-only models require
# encoder self-attention, which is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "intfloat/multilingual-e5-small"

# Single prompt shared by every test in this module.
prompts = ["The chef prepared a delicious meal."]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def llm():
    """Module-scoped LLM, yielded as a weakref proxy so GC can reclaim it."""
    # pytest caches the fixture object, so hand out a proxy instead of a
    # strong reference; teardown then deletes the real engine and cleans up.
    engine = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )

    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
def test_token_embed(llm: LLM):
    """token_embed yields one 384-d vector per input token (11 here)."""
    outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
    assert outputs[0].outputs.data.shape == (11, 384)
|
||||
|
||||
|
||||
def test_pooling_params(llm: LLM):
    """normalize defaults to on and actually L2-normalizes embeddings."""

    def embed_with(normalize):
        # Embed the shared prompts with the given normalize flag.
        results = llm.embed(
            prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
        )
        return torch.tensor([r.outputs.embedding for r in results])

    default = embed_with(normalize=None)
    w_normal = embed_with(normalize=True)
    wo_normal = embed_with(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
|
||||
680
tests/entrypoints/pooling/embed/test_online.py
Normal file
680
tests/entrypoints/pooling/embed/test_online.py
Normal file
@@ -0,0 +1,680 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
|
||||
from tests.models.utils import check_embeddings_close
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
MetadataItem,
|
||||
binary2tensor,
|
||||
build_metadata_items,
|
||||
decode_pooling_output,
|
||||
)
|
||||
|
||||
# Skip the whole module on ROCm: these encoder-only models require
# encoder self-attention, which is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "intfloat/multilingual-e5-small"
# Minimal role/content Jinja template so chat-style requests can be
# rendered into a plain prompt.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
DTYPE = "bfloat16"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Pooling-runner server over MODEL_NAME shared by every test here."""
    cli_args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as api_client:
        yield api_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """SentenceTransformers reference model for correctness checks."""
    with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as model:
        yield model
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Single text and token-ID inputs each produce one 384-d embedding."""

    async def embed(payload):
        # Issue one embeddings request and re-validate it as our protocol type.
        raw = await client.embeddings.create(
            model=model_name, input=payload, encoding_format="float"
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    # test single embedding
    input_texts = [
        "The chef prepared a delicious meal.",
    ]
    embeddings = await embed(input_texts)

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 11
    assert embeddings.usage.total_tokens == 11

    run_embedding_correctness_test(
        hf_model, input_texts, [d.embedding for d in embeddings.data]
    )

    # test using token IDs
    embeddings = await embed([1, 1, 1, 1, 1])

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 5
    assert embeddings.usage.total_tokens == 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Batched str and token-ID inputs embed with correct usage accounting."""

    async def embed(payload):
        # Issue one embeddings request and re-validate it as our protocol type.
        raw = await client.embeddings.create(
            model=model_name, input=payload, encoding_format="float"
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    embeddings = await embed(input_texts)

    assert embeddings.id is not None
    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 33
    assert embeddings.usage.total_tokens == 33

    run_embedding_correctness_test(
        hf_model, input_texts, [d.embedding for d in embeddings.data]
    )

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    embeddings = await embed(input_tokens)

    assert embeddings.id is not None
    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 17
    assert embeddings.usage.total_tokens == 17
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Chat-message embedding equals embedding the rendered template text."""
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # Chat-style request straight against the HTTP endpoint.
    chat_response = requests.post(
        server.url_for("v1/embeddings"),
        json={
            "model": model_name,
            "messages": messages,
            "encoding_format": "float",
        },
    )
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Render the same conversation via the dummy template and embed the
    # resulting plain prompt through the OpenAI client.
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created
    excluded = {"id", "created"}
    assert chat_embeddings.model_dump(exclude=excluded) == (
        completion_embeddings.model_dump(exclude=excluded)
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """float, base64, and default encodings all match the HF reference."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    responses_base64 = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    base64_data = [
        np.frombuffer(base64.b64decode(d.embedding), dtype="float32").tolist()
        for d in responses_base64.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_data = [d.embedding for d in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """base64 embeddings round-trip for every dtype/endianness combination."""
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            payload = {
                "model": model_name,
                "input": input_texts,
                "encoding_format": "base64",
                "embed_dtype": embed_dtype,
                "endianness": endianness,
            }
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"), json=payload
            )

            base64_data = []
            for row in responses_base64.json()["data"]:
                binary = base64.b64decode(row["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """bytes responses decode via the metadata header for every combo."""
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
        for endianness in ENDIANNESS:
            payload = {
                "model": model_name,
                "input": input_texts,
                "encoding_format": "bytes",
                "embed_dtype": embed_dtype,
                "endianness": endianness,
            }
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"), json=payload
            )

            # Per-row tensor layout travels in the "metadata" response header.
            metadata = json.loads(responses_bytes.headers["metadata"])
            items = [MetadataItem(**x) for x in metadata["data"]]

            decoded = decode_pooling_output(items=items, body=responses_bytes.content)
            bytes_data = [x.to(torch.float32).tolist() for x in decoded]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """bytes_only omits metadata; the client rebuilds it from known shape."""
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ] * 2

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    embedding_size = len(float_data[0])

    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
        for endianness in ENDIANNESS:
            payload = {
                "model": model_name,
                "input": input_texts,
                "encoding_format": "bytes_only",
                "embed_dtype": embed_dtype,
                "endianness": endianness,
            }
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"), json=payload
            )

            # No metadata header in bytes_only mode: the caller must already
            # know the shape and dtype of each returned tensor.
            assert "metadata" not in responses_bytes.headers
            items = build_metadata_items(
                embed_dtype=embed_dtype,
                endianness=endianness,
                shape=(embedding_size,),
                n_request=len(input_texts),
            )

            decoded = decode_pooling_output(items=items, body=responses_bytes.content)
            bytes_data = [x.to(torch.float32).tolist() for x in decoded]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """Invalid literal values for encoding params are rejected with 400."""
    bad_value = f"bad_{param_name}"
    payload = {
        "model": model_name,
        "input": [
            "The best thing about vLLM is that it supports many different models",
        ],
        "encoding_format": "base64",
        # Overwrites encoding_format when that is the parameter under test.
        param_name: bad_value,
    }
    responses_base64 = requests.post(server.url_for("/v1/embeddings"), json=payload)

    assert responses_base64.status_code == 400
    message = responses_base64.json()["error"]["message"]
    assert "literal_error" in message
    assert bad_value in message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """truncate_prompt_tokens caps usage at 10 for text and token inputs."""

    def check(embeddings: EmbeddingResponse) -> None:
        # Shared shape/usage assertions for one truncated single input.
        assert embeddings.id is not None
        assert len(embeddings.data) == 1
        assert len(embeddings.data[0].embedding) == 384
        assert embeddings.usage.completion_tokens == 0
        assert embeddings.usage.prompt_tokens == 10
        assert embeddings.usage.total_tokens == 10

    # test single embedding
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}
    )
    check(EmbeddingResponse.model_validate(embedding_response.model_dump(mode="json")))

    # Same request, but supplying pre-tokenized input (longer than 10 tokens).
    input_tokens = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789,
        3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2,
    ]
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}
    )
    check(EmbeddingResponse.model_validate(embedding_response.model_dump(mode="json")))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """An out-of-range truncate_prompt_tokens must be rejected.

    Bug fix: the original asserted on ``response`` *inside* the
    ``pytest.raises`` block, after the awaited call that raises — those
    assertions were unreachable dead code. Capture the exception via
    ``exc_info`` and assert on its message instead.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )

    assert (
        "truncate_prompt_tokens value is greater than max_model_len. "
        "Please, select a smaller truncation size." in str(exc_info.value)
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """/invocations mirrors the /v1/embeddings response for text input."""
    request_args = {
        "model": MODEL_NAME,
        "input": [
            "The chef prepared a delicious meal.",
        ],
        "encoding_format": "float",
    }

    completion_response = await client.embeddings.create(**request_args)

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
    pairs = zip(completion_output["data"], invocation_output["data"])
    for completion_data, invocation_data in pairs:
        assert completion_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[completion_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """/invocations mirrors chat-style /v1/embeddings responses."""
    request_args = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": "The cat sat on the mat.",
            },
            {
                "role": "assistant",
                "content": "A feline was resting on a rug.",
            },
            {
                "role": "user",
                "content": "Stars twinkle brightly in the night sky.",
            },
        ],
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
    pairs = zip(chat_output["data"], invocation_output["data"])
    for chat_data, invocation_data in pairs:
        assert chat_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[chat_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """normalize defaults to on and matches F.normalize of the raw output."""
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        # POST directly so the non-OpenAI "normalize" field can be sent.
        response = requests.post(
            server.url_for("v1/embeddings"),
            json={
                "model": MODEL_NAME,
                "input": input_text,
                "encoding_format": "float",
                "normalize": normalize,
            },
        )
        payload = response.json()
        return torch.tensor([row["embedding"] for row in payload["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """The pooling endpoint with task=embed returns one 384-d vector."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "embed",
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 384
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """task=token_embed returns a 384-d vector per token (11 tokens here)."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "token_embed",
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 11
    assert len(poolings.data[0].data[0]) == 384
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """The pooling endpoint must reject tasks this model does not support."""
    payload = {
        "model": model_name,
        "input": "test",
        "encoding_format": "float",
        "task": task,
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    error = response.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user