Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,173 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
@pytest.fixture
def sample_prompts():
    """Provide four short, open-ended completion prompts."""
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    return prompts
@pytest.fixture
def sample_token_ids():
    """Provide four token-id sequences of increasing length (1 to 4)."""
    token_id_sequences = [
        [0],
        [0, 1],
        [0, 2, 1],
        [0, 3, 1, 2],
    ]
    return token_id_sequences
@pytest.fixture
def sample_regex():
    """Provide a regex matching four dot-separated decimal octets (0-255)."""
    # A single octet: 250-255, or an optional leading part (20-24, 1x, 1-9,
    # or nothing) followed by one more digit.
    octet = r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
    # Three "octet." groups followed by a final bare octet.
    return "(" + octet + r"\.){3}" + octet
# Note: Ensure this only uses attributes compatible with xgrammar
@pytest.fixture
def sample_json_schema():
    """Provide an employee-profile JSON schema.

    The schema combines string patterns, numeric ranges, bounded arrays and
    nested objects. Key order is kept stable because tests format the schema
    into prompts.
    """
    # Schema of a single entry in the "work_history" array.
    work_history_entry = {
        "type": "object",
        "properties": {
            "company": {"type": "string"},
            "duration": {
                "type": "number",
                "minimum": 0.0,
                "maximum": 100.0,  # Numeric range
            },
            "position": {"type": "string"},
        },
        "required": ["company", "duration", "position"],
        "additionalProperties": False,
    }

    profile_properties = {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {
            "type": "array",
            "items": {
                "type": "string",
            },
        },
        "grade": {
            "type": "string",
            "pattern": "^[A-D]$",  # Regex pattern
        },
        "email": {
            "type": "string",
            "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
        },
        "work_history": {
            "type": "array",
            "items": work_history_entry,
            "minItems": 0,
            "maxItems": 3,
        },
    }

    return {
        "type": "object",
        "properties": profile_properties,
        "required": ["name", "age", "skills", "grade", "email", "work_history"],
        "additionalProperties": False,
        "minProperties": 1,
        "maxProperties": 10,
    }
# A schema unsupported by xgrammar
@pytest.fixture
def unsupported_json_schema():
    """Provide a JSON schema using `multipleOf`, string length bounds and
    `patternProperties`."""
    score_schema = {
        "type": "integer",
        "multipleOf": 5,  # Numeric multiple
    }
    tags_schema = {
        "type": "array",
        "items": {"type": "string", "minLength": 10, "maxLength": 20},
    }
    return {
        "type": "object",
        "properties": {
            "score": score_schema,
            "tags": tags_schema,
        },
        "required": ["score", "tags"],
        "additionalProperties": False,
        "patternProperties": {
            "^score$": {"type": "integer"},
        },
    }
@pytest.fixture
def sample_definition_json_schema():
    """Provide a "MathReasoning" schema whose array items reference a `Step`
    definition through `$defs` / `$ref`."""
    # Reusable definition referenced by "#/$defs/Step" below.
    step_definition = {
        "properties": {
            "explanation": {"title": "Explanation", "type": "string"},
            "output": {"title": "Output", "type": "string"},
        },
        "required": ["explanation", "output"],
        "title": "Step",
        "type": "object",
    }
    return {
        "$defs": {"Step": step_definition},
        "properties": {
            "steps": {
                "items": {"$ref": "#/$defs/Step"},
                "title": "Steps",
                "type": "array",
            },
            "final_answer": {"title": "Final Answer", "type": "string"},
        },
        "required": ["steps", "final_answer"],
        "title": "MathReasoning",
        "type": "object",
        "additionalProperties": False,
    }
@pytest.fixture
def sample_structured_outputs_choices():
    """Provide ten programming-language names used as `choice` options."""
    language_choices = [
        "Python",
        "Java",
        "JavaScript",
        "C++",
        "C#",
        "PHP",
        "TypeScript",
        "Ruby",
        "Swift",
        "Kotlin",
    ]
    return language_choices
@pytest.fixture
def sample_sql_ebnf():
    """Provide a tiny EBNF grammar (`::=` rules) for one SELECT statement."""
    ebnf_grammar = """
root ::= select_statement
select_statement ::= "SELECT" column "from" table "where" condition
column ::= "col_1" | "col_2"
table ::= "table_1" | "table_2"
condition ::= column "=" number
number ::= "1" | "2"
"""
    return ebnf_grammar
@pytest.fixture
def sample_sql_lark():
    """Provide the same tiny SELECT grammar in Lark syntax (`start:` rule)."""
    lark_grammar = """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
    return lark_grammar

View File

View File

@@ -0,0 +1,916 @@
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from enum import Enum
from typing import Any
import jsonschema
import pytest
import regex as re
import torch
from pydantic import BaseModel
from tests.reasoning.utils import run_reasoning_extraction
from vllm.config import StructuredOutputsConfig
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.sampling_params import (
SamplingParams,
StructuredOutputsParams,
)
# Speculative-decoding configuration using n-gram prompt lookup.
NGRAM_SPEC_CONFIG = {
    "model": "[ngram]",
    "num_speculative_tokens": 5,
    "prompt_lookup_max": 5,
    "prompt_lookup_min": 1,
}

# Speculative-decoding configuration using an EAGLE draft model.
EAGLE_SPEC_CONFIG = {
    "method": "eagle",
    "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
    "num_speculative_tokens": 5,
}

# Shared skip marker for the lm-format-enforcer cases. Each fragment ends
# with a space so the implicitly concatenated reason reads as sentences.
_LM_FORMAT_ENFORCER_FLAKY_SKIP = pytest.mark.skip(
    reason=(
        "Flaky: lm-format-enforcer intermittently returns "
        "incomplete JSON. "
        "See https://github.com/noamgat/lm-format-enforcer/issues/169"
    )
)

# Each entry is (model_name, backend, tokenizer_mode, speculative_config).
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None),
    # FIXME: Since "auto" will use Mistral tokenizer and these backends do not support
    # it, we skip these tests for now.
    # ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None),
    # ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None),
    ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", None),
    pytest.param(
        "mistralai/Ministral-8B-Instruct-2410",
        "lm-format-enforcer",
        "hf",
        None,
        marks=_LM_FORMAT_ENFORCER_FLAKY_SKIP,
    ),
    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None),
    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None),
    pytest.param(
        "Qwen/Qwen2.5-1.5B-Instruct",
        "lm-format-enforcer",
        "auto",
        None,
        marks=_LM_FORMAT_ENFORCER_FLAKY_SKIP,
    ),
    # FIXME: These tests are flaky on CI and thus disabled. Tracking in Issue #24402
    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None),
    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
    # ("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto", None),
    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", NGRAM_SPEC_CONFIG),
    ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", NGRAM_SPEC_CONFIG),
    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG),
    ("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto", EAGLE_SPEC_CONFIG),
]

# Each entry is (model_name, tokenizer_mode).
PARAMS_MODELS_TOKENIZER_MODE = [
    ("mistralai/Ministral-8B-Instruct-2410", "auto"),
    ("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
]
# String-valued enum of car body types, referenced by CarDescription.
# NOTE: documented with comments rather than a docstring on purpose; an enum
# docstring may be emitted as "description" in a pydantic-generated JSON
# schema, which would change the schema the tests send to the backend.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"
# Pydantic model whose JSON schema drives the enum-based structured-output
# scenario in test_structured_output.
# NOTE: kept free of a docstring on purpose; pydantic copies a model
# docstring into the generated schema as "description".
class CarDescription(BaseModel):
    # Manufacturer name.
    brand: str
    # Model name.
    model: str
    # Body type, restricted to the CarType values.
    car_type: CarType
def _load_json_or_fail(generated_text: str, backend: str, schema: dict[str, Any]) -> Any:
    """Parse `generated_text` as JSON, failing the test with context if invalid."""
    try:
        return json.loads(generated_text)
    except json.JSONDecodeError as e:
        pytest.fail(
            f"Invalid JSON from backend={backend}: {generated_text!r}\n"
            f"Schema: {schema}\nError: {e}"
        )


@pytest.mark.parametrize(
    "model_name, backend, tokenizer_mode, speculative_config",
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
)
def test_structured_output(
    sample_json_schema: dict[str, Any],
    unsupported_json_schema: dict[str, Any],
    sample_sql_ebnf: str,
    sample_sql_lark: str,
    sample_regex: str,
    sample_structured_outputs_choices: list[str],
    backend: str,
    tokenizer_mode: str,
    model_name: str,
    speculative_config: dict[str, Any] | None,
):
    """Run the structured-output scenarios against one model/backend pair.

    Scenarios, all sharing a single `LLM` instance:
      1.  JSON constrained by a schema.
      2.  Free-form JSON object (skipped for "outlines").
      3.  A schema xgrammar rejects: xgrammar must raise, others must emit JSON.
      4-6. EBNF grammar, Lark grammar and an invalid grammar
           (skipped for "outlines" and "lm-format-enforcer").
      7.  Regex-constrained text.
      8.  Choice-constrained text.
      9.  JSON from a Pydantic model containing an enum.
      10. JSON string with equal minLength/maxLength.
      11. structural_tag format (skipped for "outlines" and
          "lm-format-enforcer").

    Args:
        speculative_config: Speculative decoding settings, or None to
            disable speculative decoding.
    """
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

    is_mistral = model_name.startswith("mistralai/")

    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
    llm = LLM(
        model=model_name,
        enforce_eager=True,
        max_model_len=1024,
        structured_outputs_config=dict(
            backend=backend, disable_any_whitespace=backend in {"xgrammar", "guidance"}
        ),
        seed=120,
        tokenizer_mode=tokenizer_mode,
        load_format="hf" if is_mistral else "auto",
        config_format="hf" if is_mistral else "auto",
        speculative_config=speculative_config,
    )

    #
    # Test 1: Generate JSON output based on a provided schema
    #
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
        structured_outputs=StructuredOutputsParams(json=sample_json_schema),
    )

    prompt = (
        "Give an example JSON for an employee profile that fits this "
        "schema. Make the response as short as possible. Schema: "
        f"{sample_json_schema}"
    )
    outputs = llm.generate(
        [prompt] * 2,
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        if backend != "lm-format-enforcer":
            assert "\n" not in generated_text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = _load_json_or_fail(generated_text, backend, sample_json_schema)
        jsonschema.validate(instance=output_json, schema=sample_json_schema)

    #
    # Test 2: Generate JSON object without a schema
    #
    if backend != "outlines":
        # Number of completions requested per prompt and checked below.
        num_samples = 2
        sampling_params = SamplingParams(
            temperature=1.0,
            max_tokens=4096,
            n=num_samples,
            structured_outputs=StructuredOutputsParams(json_object=True),
        )

        outputs = llm.generate(
            prompts=(
                "Generate a JSON object with curly braces for a person with "
                "name and age fields for John Smith who is 31 years old. "
                "Make the response as short as possible."
            ),
            sampling_params=sampling_params,
            use_tqdm=True,
        )

        assert outputs is not None
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)

            for i in range(num_samples):
                generated_text = output.outputs[i].text
                print(generated_text)
                assert generated_text is not None

                # Parse to verify it is a valid JSON object
                parsed_json = json.loads(generated_text)
                assert isinstance(parsed_json, dict)

    #
    # Test 3: test a jsonschema incompatible with xgrammar
    #
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
        structured_outputs=StructuredOutputsParams(json=unsupported_json_schema),
    )
    if backend.startswith("xgrammar"):
        prompt = (
            f"Give an example JSON for an employee profile that "
            f"fits this schema: {unsupported_json_schema}. "
            f"Make the response as short as possible."
        )
        # Only the call expected to raise lives inside the raises block.
        with pytest.raises(
            ValueError,
            match="The provided JSON schema contains features "
            "not supported by xgrammar.",
        ):
            llm.generate(
                [prompt] * 2,
                sampling_params=sampling_params,
                use_tqdm=True,
            )
    else:
        prompt = (
            f"Give an example JSON object for a grade that "
            f"fits this schema: {unsupported_json_schema}. "
            f"Make the response as short as possible."
        )
        outputs = llm.generate(
            prompt,
            sampling_params=sampling_params,
            use_tqdm=True,
        )
        assert outputs is not None
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            generated_text = output.outputs[0].text
            assert generated_text is not None
            print(generated_text)

            # Parse to verify it is valid JSON
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)

    if backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 4: Generate SQL statement using EBNF grammar
        #
        sampling_params = SamplingParams(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
            structured_outputs=StructuredOutputsParams(grammar=sample_sql_ebnf),
        )
        outputs = llm.generate(
            (
                "Generate a sql statement that selects col_1 from "
                "table_1 where it is equal to 1. Make the response as short as "
                "possible."
            ),
            sampling_params=sampling_params,
            use_tqdm=True,
        )

        assert outputs is not None
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            prompt = output.prompt

            generated_text = output.outputs[0].text
            assert generated_text is not None

            # remove spaces for comparison b/c we removed them in the grammar
            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")

            assert generated_text.strip() == ground_truth

            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        #
        # Test 5: Generate SQL statement using Lark grammar
        #
        sampling_params = SamplingParams(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
            structured_outputs=StructuredOutputsParams(grammar=sample_sql_lark),
        )
        outputs = llm.generate(
            (
                "Generate a sql statement that selects col_1 from "
                "table_1 where it is equal to 1. Make the response as short as "
                "possible."
            ),
            sampling_params=sampling_params,
            use_tqdm=True,
        )

        assert outputs is not None

        # use Lark to parse the output, and make sure it's a valid parse tree.
        # The parser does not depend on the output, so build it once.
        from lark import Lark

        parser = Lark(sample_sql_lark)

        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            prompt = output.prompt

            generated_text = output.outputs[0].text
            assert generated_text is not None

            parser.parse(generated_text)

            # remove spaces for comparison b/c we removed them in the grammar
            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")

            assert generated_text.strip() == ground_truth

            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        #
        # Test 6: Test invalid grammar input
        #
        sampling_params = SamplingParams(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
            structured_outputs=StructuredOutputsParams(grammar="not a grammar"),
        )
        with pytest.raises(ValueError, match="Failed to convert the grammar "):
            llm.generate(
                (
                    "Generate a sql statement that selects col_1 from "
                    "table_1 where it is equal to 1. Make the response as short "
                    "as possible."
                ),
                sampling_params=sampling_params,
                use_tqdm=True,
            )

    #
    # Test 7: Generate text based on a regex pattern
    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        structured_outputs=StructuredOutputsParams(regex=sample_regex),
    )

    prompt = (
        f"Give an example IPv4 address with this regex: {sample_regex}. "
        f"Make the response as short as possible."
    )
    outputs = llm.generate(
        [prompt] * 2,
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(generated_text)
        assert generated_text is not None
        assert re.fullmatch(sample_regex, generated_text) is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    #
    # Test 8: Generate text based on a choices
    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        structured_outputs=StructuredOutputsParams(
            choice=sample_structured_outputs_choices
        ),
    )

    outputs = llm.generate(
        (
            "The best language for type-safe systems programming is "
            "(Make the response as short as possible.) "
        ),
        sampling_params=sampling_params,
        use_tqdm=True,
    )
    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(generated_text)
        assert generated_text is not None
        assert generated_text in sample_structured_outputs_choices
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    #
    # Test 9: Generate structured output using a Pydantic model with an enum
    #
    json_schema = CarDescription.model_json_schema()
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
        structured_outputs=StructuredOutputsParams(json=json_schema),
    )

    outputs = llm.generate(
        (
            "Generate a JSON with the brand, model and car_type of the most "
            "iconic car from the 90's. Make the response as short as "
            "possible."
        ),
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = _load_json_or_fail(generated_text, backend, json_schema)
        jsonschema.validate(instance=output_json, schema=json_schema)

    #
    # Test 10: Generate structured with minLength and maxLength
    #
    min_length = 50
    max_length = 50
    json_schema = {
        "type": "object",
        "properties": {
            "description": {
                "type": "string",
                "maxLength": max_length,
                "minLength": min_length,
            }
        },
        "required": ["description"],
        "additionalProperties": False,
    }

    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
        structured_outputs=StructuredOutputsParams(json=json_schema),
    )

    outputs = llm.generate(
        (
            "Generate a description of a frog using 50 characters. "
            "Make the response as short as possible."
        ),
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = _load_json_or_fail(generated_text, backend, json_schema)
        jsonschema.validate(instance=output_json, schema=json_schema)

    if backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 11: Generate structured output using structural_tag format
        #
        structural_tag_config = {
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "additionalProperties": False,
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        }

        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=4096,
            structured_outputs=StructuredOutputsParams(
                structural_tag=json.dumps(structural_tag_config)
            ),
        )

        # Plain (non-f) string: the braces below are sent to the model as-is.
        prompt = """
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name
as key and function argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City? \
Make the response as short as possible.
"""

        # Change this once other backends support structural_tag
        outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
        assert outputs is not None

        # Search for function call pattern in the response
        function_call_pattern = r"<function=get_weather>(.*?)</function>"

        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            generated_text = output.outputs[0].text
            assert generated_text is not None

            matches = re.findall(function_call_pattern, generated_text)

            # A response without a function call is tolerated (best effort).
            if not matches:
                print(
                    f"Warning: No function calls found in response: {generated_text!r}"
                )
                continue

            # Take the first function call if multiple are found
            json_str = matches[0]
            try:
                json_content = json.loads(json_str)
                assert "city" in json_content
                assert isinstance(json_content["city"], str)
                print(f"Found valid function call: {generated_text!r}")
            except (json.JSONDecodeError, AssertionError) as e:
                pytest.fail(
                    f"Invalid function call format: {generated_text!r}\nError: {e}"
                )
@pytest.mark.parametrize(
    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
    [
        (
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
            "xgrammar",
            "auto",
            "deepseek_r1",
            NGRAM_SPEC_CONFIG,
        ),
        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None),
    ],
)
def test_structured_output_with_reasoning_matrices(
    backend: str,
    tokenizer_mode: str,
    reasoning_parser: str,
    model_name: str,
    speculative_config: dict[str, Any] | None,
):
    """Check schema-constrained JSON output combined with a reasoning parser.

    The generated text is split into reasoning and content; reasoning must be
    present, and any content must validate against the schema.
    """
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    use_eager = not current_platform.is_tpu()
    is_qwen3 = "Qwen3" in model_name

    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
    llm = LLM(
        model=model_name,
        enforce_eager=use_eager,
        max_model_len=1024,
        max_num_seqs=16,
        structured_outputs_config=dict(
            backend=backend,
            disable_any_whitespace=backend in {"xgrammar", "guidance"},
            reasoning_parser=reasoning_parser,
        ),
        tokenizer_mode=tokenizer_mode,
        speculative_config=speculative_config,
    )
    parser_cls = ReasoningParserManager.get_reasoning_parser(reasoning_parser)
    reasoner = parser_cls(tokenizer=llm.get_tokenizer())

    reasoning_prompt = (
        "Solve the following math problem step-by-step, then provide the "
        "final answer as JSON object with a single key 'result'. Make sure to "
        "correct your reasoning if there are any issue should it arise.\n"
        "Problem: What is 5 * 8 + 2?"
    )
    if is_qwen3:
        reasoning_prompt += "<think>\n"

    reasoning_schema = {
        "type": "object",
        "properties": {"result": {"type": "integer"}},
        "required": ["result"],
        "additionalProperties": False,
    }

    outputs = llm.generate(
        [reasoning_prompt],
        sampling_params=SamplingParams(
            temperature=0.1,
            max_tokens=8192,
            structured_outputs=StructuredOutputsParams(json=reasoning_schema),
        ),
        use_tqdm=True,
    )

    assert outputs is not None
    output = outputs[0]
    assert output is not None and isinstance(output, RequestOutput)

    prompt = output.prompt
    generated_text = output.outputs[0].text
    reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
    print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")

    # Content is only mandatory for the Qwen3 case.
    if is_qwen3:
        assert content is not None

    assert reasoning is not None

    if content is None:
        return
    output_json = json.loads(content)
    jsonschema.validate(instance=output_json, schema=reasoning_schema)
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode(
    unsupported_json_schema: dict[str, Any],
    model_name: str,
    tokenizer_mode: str,
):
    """Check that backend="auto" yields JSON for a schema that the default
    xgrammar backend would reject, and that the params can be reused."""
    llm = LLM(
        model=model_name,
        max_model_len=1024,
        structured_outputs_config=dict(backend="auto"),
        tokenizer_mode=tokenizer_mode,
        load_format="auto",
        config_format="auto",
    )

    structured_params = StructuredOutputsParams(json=unsupported_json_schema)
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
        structured_outputs=structured_params,
    )

    prompts = (
        "Give an example JSON object for a grade "
        "that fits this schema: "
        f"{unsupported_json_schema}. Make the response as short as possible."
    )

    # This would fail with the default of "xgrammar", but in "auto"
    # we will handle fallback automatically.
    outputs = llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True)

    # Make sure `auto` backend handling doesn't mess up sampling_params
    # and that we can reuse it without error.
    second_round = llm.generate(
        prompts, sampling_params=sampling_params, use_tqdm=True
    )
    outputs.extend(second_round)

    assert outputs is not None
    for request_output in outputs:
        assert request_output is not None
        assert isinstance(request_output, RequestOutput)
        generated_text = request_output.outputs[0].text
        assert generated_text is not None
        print(generated_text)

        # Parse to verify it is valid JSON
        assert isinstance(json.loads(generated_text), dict)
def test_guidance_no_additional_properties():
    """Check that guidance with `disable_additional_properties=True` emits
    only the three declared keys even though the prompt asks for twenty."""
    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        max_model_len=1024,
        structured_outputs_config=dict(
            backend="guidance",
            disable_any_whitespace=True,
            disable_additional_properties=True,
        ),
    )

    # Note: the schema itself does not set "additionalProperties".
    schema = {
        "type": "object",
        "properties": {
            "a1": {"type": "string"},
            "a2": {"type": "string"},
            "a3": {"type": "string"},
        },
        "required": ["a1", "a2", "a3"],
    }

    prompt = (
        "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a "
        "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a "
        "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20. "
        "Make the response as short as possible."
        "<|im_end|>\n<|im_start|>assistant\n"
    )

    def generate_with_backend(backend):
        """Generate once with `backend` and return the schema-valid dict."""
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=256,
            structured_outputs=StructuredOutputsParams(
                json=schema,
                backend=backend,
                disable_any_whitespace=True,
                disable_additional_properties=True,
            ),
        )

        outputs = llm.generate(prompt, sampling_params=sampling_params)
        assert outputs is not None
        generated_text = outputs[0].outputs[0].text
        assert generated_text is not None
        parsed_json = json.loads(generated_text)
        assert isinstance(parsed_json, dict)
        jsonschema.validate(instance=parsed_json, schema=schema)
        return parsed_json

    generated = generate_with_backend("guidance")

    # Declared keys must be present ...
    for declared_key in ("a1", "a2", "a3"):
        assert declared_key in generated
    # ... and keys the prompt merely asked for must be absent.
    for extra_key in ("a4", "a5", "a6"):
        assert extra_key not in generated
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_structured_outputs_requests(
    sample_json_schema: dict[str, Any],
    backend: str,
):
    """Batch one structured and one unconstrained request together and check
    that each gets the kind of output it asked for."""
    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    enforce_eager = not current_platform.is_tpu()

    outputs_config = StructuredOutputsConfig(
        backend=backend,
        disable_any_whitespace=backend in {"xgrammar", "guidance"},
    )
    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        enforce_eager=enforce_eager,
        max_model_len=1024,
        structured_outputs_config=outputs_config,
    )

    structured_outputs_prompt = (
        "Give an example JSON for an employee profile that fits this "
        "schema. Make the response as short as possible. Schema: "
        f"{sample_json_schema}"
    )
    non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "

    structured_params = SamplingParams(
        temperature=1.0,
        max_tokens=400,
        structured_outputs=StructuredOutputsParams(json=sample_json_schema),
    )
    # No max tokens, temp=0 to assert on contents
    plain_params = SamplingParams(
        seed=42,
        temperature=0,
        top_p=1.0,
    )

    outputs = llm.generate(
        prompts=[structured_outputs_prompt, non_structured_outputs_prompt],
        sampling_params=[structured_params, plain_params],
        use_tqdm=True,
    )

    assert outputs is not None

    # Free memory as soon as possible as failed assertions
    # will short circuit and not free up memory
    del llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    for index, output in enumerate(outputs):
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")

        if index == 0:
            # First prompt is structured outputs, expect valid JSON
            assert "\n" not in generated_text
            output_json = json.loads(generated_text)
            jsonschema.validate(instance=output_json, schema=sample_json_schema)
            continue

        # Second prompt is not structured outputs, expect valid output
        # Cannot assert on exact output, but we can expect it to be factual
        assert "12,742" in generated_text

        # non-structured outputs requests should not return a valid JSON here
        with pytest.raises(ValueError):
            output_json = json.loads(generated_text)
@pytest.mark.parametrize("backend", ["xgrammar"])
def test_structured_output_with_structural_tag(backend: str):
    """Check that a `triggered_tags` structural tag makes "hello_flag"
    appear in the generated text."""
    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        structured_outputs_config=StructuredOutputsConfig(backend=backend),
    )

    # Tag definition: begins with "hello_flag", any text, ends with "hello".
    hello_tag = {
        "begin": "hello_flag",
        "content": {"type": "any_text"},
        "end": "hello",
    }
    structural_tag_config = {
        "type": "structural_tag",
        "format": {
            "type": "triggered_tags",
            "tags": [hello_tag],
            "triggers": ["hello"],
            "stop_after_first": False,
        },
    }
    serialized_tag = json.dumps(structural_tag_config)

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=500,
        structured_outputs=StructuredOutputsParams(structural_tag=serialized_tag),
    )

    prompt = "Hello and repete hello 10 times, do not say anything else. Only say hello hello hello, now start"
    outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        assert "hello_flag" in generated_text, (
            f"Expected 'hello_flag' to be in generated text, but got: {generated_text}"
        )

View File

@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
# Use a small reasoning model to test the responses API.
# This is the model name handed to RemoteOpenAIServer by the
# `server_with_store` fixture in this module.
MODEL_NAME = "Qwen/Qwen3-1.7B"
@pytest.fixture(scope="module")
def default_server_args():
    """Provide the command-line arguments used to launch the test server."""
    server_args = ["--max-model-len", "8192"]
    server_args.append("--enforce-eager")  # For faster startup.
    server_args.append("--enable-auto-tool-choice")
    server_args += ["--structured-outputs-config.backend", "xgrammar"]
    server_args += ["--tool-call-parser", "hermes"]
    server_args += ["--reasoning-parser", "qwen3"]
    return server_args
@pytest.fixture(scope="module")
def server_with_store(default_server_args):
    """Start a RemoteOpenAIServer for MODEL_NAME with the Responses API store
    and server dev mode enabled via environment variables, and yield it."""
    server_env = {
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
        "VLLM_SERVER_DEV_MODE": "1",
    }
    with RemoteOpenAIServer(
        MODEL_NAME, default_server_args, env_dict=server_env
    ) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server_with_store):
    """Yield an async client obtained from the running test server."""
    client_context = server_with_store.get_async_client()
    async with client_context as async_client:
        yield async_client

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import openai.types.responses as openai_responses_types
import pytest
@pytest.mark.asyncio
async def test_simple_input(client: openai.AsyncOpenAI):
    """A plain string input yields a reasoning item first and a message
    containing the answer last."""
    response = await client.responses.create(input="What is 13 * 24?")
    print(response)

    output_items = response.output
    final_item = output_items[-1]
    # Whether the output contains the answer.
    assert final_item.type == "message"
    assert "312" in final_item.content[0].text

    leading_item = output_items[0]
    # Whether the output contains the reasoning.
    assert leading_item.type == "reasoning"
    assert leading_item.content[0].text != ""
@pytest.mark.asyncio
async def test_instructions(client: openai.AsyncOpenAI):
    """The `instructions` field is honoured alongside the input question."""
    request = {
        "instructions": "Finish the answer with QED.",
        "input": "What is 13 * 24?",
    }
    response = await client.responses.create(**request)
    print(response)

    output_text = response.output[-1].content[0].text
    for expected in ("312", "QED"):
        assert expected in output_text
@pytest.mark.asyncio
async def test_chat(client: openai.AsyncOpenAI):
    """A multi-turn message list is accepted and the last turn is answered."""
    conversation = [
        {"role": "system", "content": "Finish the answer with QED."},
        {"role": "user", "content": "What is 5 * 3?"},
        {"role": "assistant", "content": "15. QED."},
        {"role": "user", "content": "Multiply the result by 2."},
    ]
    response = await client.responses.create(input=conversation)
    print(response)

    output_text = response.output[-1].content[0].text
    for expected in ("30", "QED"):
        assert expected in output_text
@pytest.mark.asyncio
async def test_chat_with_input_type(client: openai.AsyncOpenAI):
    """Message content given as typed `input_text` parts completes normally."""
    user_message = {
        "role": "user",
        "content": [{"type": "input_text", "text": "Hello!"}],
    }
    response = await client.responses.create(input=[user_message])
    print(response)
    assert response.status == "completed"
@pytest.mark.asyncio
async def test_logprobs(client: openai.AsyncOpenAI):
    """Requesting output-text logprobs returns them with five top entries."""
    requested_top_logprobs = 5
    response = await client.responses.create(
        include=["message.output_text.logprobs"],
        input="What is 13 * 24?",
        top_logprobs=requested_top_logprobs,
    )
    print(response)

    # Logprobs are read from the last content part of the last output item.
    token_logprobs = response.output[-1].content[-1].logprobs
    assert token_logprobs
    assert len(token_logprobs[0].top_logprobs) == requested_top_logprobs
@pytest.mark.asyncio
async def test_streaming(client: openai.AsyncOpenAI):
    """A streamed response starts with a created event, contains at least one
    text delta, and ends with a completed event."""
    stream = await client.responses.create(
        input="What is 13 * 24?",
        stream=True,
    )

    # Drain the stream so the event sequence can be inspected as a whole.
    events = []
    async for event in stream:
        events.append(event)

    first_event, last_event = events[0], events[-1]
    assert isinstance(first_event, openai_responses_types.ResponseCreatedEvent)

    saw_text_delta = False
    for event in events:
        if isinstance(event, openai_responses_types.ResponseTextDeltaEvent):
            saw_text_delta = True
            break
    assert saw_text_delta

    assert isinstance(last_event, openai_responses_types.ResponseCompletedEvent)

View File

@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai # use the official client for correctness check
import pytest
# Model name passed to the function-calling tests in this file.
MODEL_NAME = "Qwen/Qwen3-1.7B"
# Function tool definitions shared by test_function_tool_use.
# Two tools are declared: `get_current_weather` (which also exercises a
# `$ref` into `$defs`) and `get_forecast`.
tools = [
    {
        "type": "function",
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "The city to find the weather for, e.g. 'Vienna'",
                    "default": "Vienna",
                },
                "country": {
                    "type": "string",
                    "description": "The country that the city is in, e.g. 'Austria'",
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"],
                },
                # Nested object resolved through the `$defs` section below.
                "options": {
                    "$ref": "#/$defs/WeatherOptions",
                    "description": "Optional parameters for weather query",
                },
            },
            # `city` and `options` are optional.
            "required": ["country", "unit"],
            "$defs": {
                "WeatherOptions": {
                    "title": "WeatherOptions",
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "default": "celsius",
                            "description": "Temperature unit",
                            "title": "Temperature Unit",
                        },
                        "include_forecast": {
                            "type": "boolean",
                            "default": False,
                            "description": "Whether to include a 24-hour forecast",
                            "title": "Include Forecast",
                        },
                        "language": {
                            "type": "string",
                            "default": "zh-CN",
                            "description": "Language of the response",
                            "title": "Language",
                            "enum": ["zh-CN", "en-US", "ja-JP"],
                        },
                    },
                },
            },
        },
    },
    {
        "type": "function",
        "name": "get_forecast",
        "description": "Get the weather forecast for a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "The city to get the forecast for, e.g. 'Vienna'",
                    "default": "Vienna",
                },
                "country": {
                    "type": "string",
                    "description": "The country that the city is in, e.g. 'Austria'",
                },
                "days": {
                    "type": "integer",
                    "description": "Number of days to get the forecast for (1-7)",
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"],
                },
            },
            # Only `city` is optional for the forecast tool.
            "required": ["country", "days", "unit"],
        },
    },
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("tool_choice", ["auto", "required"])
async def test_function_tool_use(
    client: openai.AsyncOpenAI, model_name: str, tool_choice: str
):
    """Ask a weather question and expect a function call plus a reasoning item.

    Runs once per ``tool_choice`` value. The response must contain at least
    one ``function_call`` item whose arguments are valid JSON and at least one
    ``reasoning`` item.
    """
    user_turn = {
        "role": "user",
        "content": "Can you tell me what the current weather is in Berlin and the "
        "forecast for the next 5 days, in fahrenheit?",
    }
    response = await client.responses.create(
        model=model_name,
        input=[user_turn],
        tools=tools,
        tool_choice=tool_choice,
        temperature=0.0,
    )
    assert len(response.output) >= 1

    # Keep the last item seen for each kind; later items overwrite earlier ones.
    last_by_type = {"function_call": None, "reasoning": None}
    for item in response.output:
        if item.type in last_by_type:
            last_by_type[item.type] = item
    tool_call = last_by_type["function_call"]
    reasoning = last_by_type["reasoning"]

    assert tool_call is not None
    assert tool_call.type == "function_call"
    assert json.loads(tool_call.arguments) is not None
    assert reasoning is not None
    assert reasoning.type == "reasoning"
@pytest.mark.asyncio
async def test_named_tool_use(client: openai.AsyncOpenAI):
    """Force a specific tool via ``tool_choice`` and complete the tool round trip.

    The first request must produce a ``get_weather`` function call with both
    coordinates. The mock tool is then executed locally, its result is appended
    as a ``function_call_output`` item, and a second request must yield
    non-empty output text.
    """

    def get_weather(latitude: float, longitude: float) -> str:
        """
        Mock function to simulate getting weather data.
        In a real application, this would call an external weather API.
        """
        return f"Current temperature at ({latitude}, {longitude}) is 20°C."

    tools = [
        {
            "type": "function",
            "name": "get_weather",
            "description": (
                "Get current temperature for provided coordinates in celsius."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "latitude": {"type": "number"},
                    "longitude": {"type": "number"},
                },
                "required": ["latitude", "longitude"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    input_messages = [
        {"role": "user", "content": "What's the weather like in Paris today?"}
    ]
    response = await client.responses.create(
        model=MODEL_NAME,
        input=input_messages,
        tools=tools,
        tool_choice={"type": "function", "name": "get_weather"},
    )
    assert len(response.output) >= 1

    # Initialise before the loop so that a response without any function call
    # fails the assertion below instead of raising UnboundLocalError.
    tool_call = None
    for out in response.output:
        if out.type == "function_call":
            tool_call = out
    assert tool_call is not None
    assert tool_call.type == "function_call"
    assert tool_call.name == "get_weather"
    args = json.loads(tool_call.arguments)
    assert args["latitude"] is not None
    assert args["longitude"] is not None

    # call the tool
    result = get_weather(args["latitude"], args["longitude"])
    input_messages.append(tool_call)  # append model's function call message
    input_messages.append(
        {  # append result message
            "type": "function_call_output",
            "call_id": tool_call.call_id,
            "output": str(result),
        }
    )

    # create a new response with the tool call result
    response_2 = await client.responses.create(model=MODEL_NAME, input=input_messages)
    # check the output
    assert len(response_2.output_text) > 0

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64
# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
# Per-prompt image limit; passed to the server through --limit-mm-per-prompt
# and used by test_multi_image_input to decide whether a request must fail.
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
# Each entry is an asset file name; the trailing comment records its public URL.
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    "Grayscale_8bits_palette_sample_image.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
    "1280px-Venn_diagram_rgb.svg.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
    "RGBA_comp.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
@pytest.fixture(scope="module")
def default_image_server_args():
    """Return the CLI arguments used to launch the vision-model server."""
    # The multimodal limit is passed as a JSON object, e.g. {"image": 2}.
    mm_limit = json.dumps({"image": MAXIMUM_IMAGES})

    server_args = ["--enforce-eager"]
    server_args += ["--max-model-len", "6000"]
    server_args += ["--max-num-seqs", "128"]
    server_args += ["--limit-mm-per-prompt", mm_limit]
    return server_args
@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    """Start one server per module with the Responses API store enabled.

    The server context is entered before the yield and exited once the
    module's tests are finished.
    """
    launcher = RemoteOpenAIServer(
        MODEL_NAME,
        default_image_server_args,
        env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
    )
    with launcher as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(image_server):
    """Yield an async client obtained from the module's image server."""
    client_context = image_server.get_async_client()
    async with client_context as openai_client:
        yield openai_client
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
    """Map every test asset name to its base64-encoded image data.

    Images are fetched from ``local_asset_server`` once per session.
    """
    encoded: dict[str, str] = {}
    for asset_name in TEST_IMAGE_ASSETS:
        image = local_asset_server.get_image_asset(asset_name)
        encoded[asset_name] = encode_image_base64(image)
    return encoded
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Send one image by URL together with a question and expect some text.

    ``image_url`` is parametrized indirectly, so the value received here comes
    from an ``image_url`` fixture defined outside this chunk.
    """
    content_text = "What's in this image?"
    image_part = {
        "type": "input_image",
        "image_url": image_url,
        "detail": "auto",
    }
    text_part = {"type": "input_text", "text": content_text}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    # test image url
    response = await client.responses.create(model=model_name, input=messages)
    assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Send one image inline as a base64 data URL and expect some text."""
    content_text = "What's in this image?"
    # NOTE(review): the data URL always declares image/jpeg, including for the
    # .png assets in TEST_IMAGE_ASSETS.
    data_url = f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
    image_part = {
        "type": "input_image",
        "image_url": data_url,
        "detail": "auto",
    }
    text_part = {"type": "input_text", "text": content_text}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    # test image base64
    response = await client.responses.create(model=model_name, input=messages)
    assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_multi_image_input(
    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
):
    """Send several images in one message and honour the per-prompt limit.

    With at most ``MAXIMUM_IMAGES`` images the request must return text. With
    more, the server must reject it with a 400 and still answer a following
    text-only request.
    """
    image_parts = [
        {
            "type": "input_image",
            "image_url": url,
            "detail": "auto",
        }
        for url in image_urls
    ]
    question = {"type": "input_text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [*image_parts, question]}]

    if len(image_urls) <= MAXIMUM_IMAGES:
        response = await client.responses.create(
            model=model_name,
            input=messages,
        )
        assert len(response.output_text) > 0
        return

    with pytest.raises(openai.BadRequestError):  # test multi-image input
        await client.responses.create(
            model=model_name,
            input=messages,
        )

    # the server should still work afterwards
    text_only = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    response = await client.responses.create(model=model_name, input=text_only)
    assert len(response.output_text) > 0

View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import openai
import pytest
@pytest.mark.asyncio
async def test_store(client: openai.AsyncOpenAI):
    """Stored responses can be retrieved; ``store=False`` responses cannot."""
    # By default, store is True, so the response can be fetched again by id.
    stored = await client.responses.create(input="Hello!")
    assert stored.status == "completed"
    fetched = await client.responses.retrieve(stored.id)
    assert fetched.status == "completed"

    # Test store=False: the request completes but nothing is kept server-side.
    unstored = await client.responses.create(input="Hello!", store=False)
    assert unstored.status == "completed"

    # The response should not be found.
    with pytest.raises(openai.NotFoundError, match="Response with id .* not found."):
        await client.responses.retrieve(unstored.id)
@pytest.mark.asyncio
async def test_background(client: openai.AsyncOpenAI):
    """A background request starts queued and completes within the poll window.

    The response is polled once per second, up to ``max_retries`` times, until
    it leaves a non-terminal state.
    """
    # NOTE: This query should be easy enough for the model to answer
    # within the 10 seconds.
    response = await client.responses.create(
        input="Hello!",
        background=True,
    )
    assert response.status == "queued"

    max_retries = 10
    # Keep polling through every non-terminal state. Stopping as soon as the
    # status leaves "queued" would end early on an intermediate "in_progress"
    # status and make the final assertion fail spuriously.
    pending_statuses = ("queued", "in_progress")
    for _ in range(max_retries):
        await asyncio.sleep(1)
        response = await client.responses.retrieve(response.id)
        if response.status not in pending_statuses:
            break
    print(response)
    assert response.status == "completed"
@pytest.mark.asyncio
async def test_background_error(client: openai.AsyncOpenAI):
    """Combining ``background=True`` with ``store=False`` is rejected with a 400."""
    expected_message = "background can only be used when `store` is true"
    with pytest.raises(openai.BadRequestError, match=expected_message):
        _ = await client.responses.create(
            input="What is 13 * 24?",
            background=True,
            store=False,
        )
@pytest.mark.asyncio
async def test_background_cancel(client: openai.AsyncOpenAI):
    """Cancel a queued background response and check the status stays cancelled."""
    queued = await client.responses.create(
        input="Write a long story about a cat.",
        background=True,
    )
    assert queued.status == "queued"

    # Cancel the response before it is completed.
    # FIXME: This test can be flaky.
    await asyncio.sleep(0.5)
    cancelled = await client.responses.cancel(queued.id)
    assert cancelled.status == "cancelled"

    # Make sure the response status remains unchanged.
    await asyncio.sleep(5)
    rechecked = await client.responses.retrieve(cancelled.id)
    assert rechecked.status == "cancelled"
@pytest.mark.asyncio
async def test_cancel_completed(client: openai.AsyncOpenAI):
    """Cancelling a non-background response that already completed is a 400."""
    finished = await client.responses.create(input="Hello")
    assert finished.status == "completed"

    expected_message = "Cannot cancel a synchronous response."
    with pytest.raises(openai.BadRequestError, match=expected_message):
        await client.responses.cancel(finished.id)
@pytest.mark.asyncio
async def test_previous_response_id(client: openai.AsyncOpenAI):
    """Chain three responses via ``previous_response_id`` and check the recall.

    The second turn corrects the name given in the first; the third turn must
    answer with the corrected name only.
    """
    first = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.",
    )
    second = await client.responses.create(
        input="Actually, my name is not John. My real name is Mark.",
        previous_response_id=first.id,
    )
    third = await client.responses.create(
        input="What is my real name again? Answer in one word.",
        previous_response_id=second.id,
    )
    print(third)

    assert "Mark" in third.output[-1].content[0].text
    assert "John" not in third.output[-1].content[0].text
@pytest.mark.asyncio
async def test_two_responses_with_same_prev_id(client: openai.AsyncOpenAI):
    """Branch two follow-ups from the same parent response.

    The third request continues from the first response, not from the second,
    so it must not see the name correction made in the second request.
    """
    parent = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.",
    )

    # Both response 2 and 3 use response 1 as the previous response.
    # The coroutines are created first and then awaited one after the other.
    pending_correction = client.responses.create(
        input="Actually, my name is not John. My name is Mark.",
        previous_response_id=parent.id,
    )
    pending_question = client.responses.create(
        input="What is my name again? Answer in one word.",
        previous_response_id=parent.id,
    )
    _ = await pending_correction
    answer = await pending_question
    print(answer)

    assert "John" in answer.output[-1].content[0].text
    assert "Mark" not in answer.output[-1].content[0].text

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
from pydantic import BaseModel
@pytest.mark.asyncio
async def test_structured_output(client: openai.AsyncOpenAI):
    """Constrain the output with a JSON schema and validate the parsed event."""
    event_schema = {
        "type": "object",
        "properties": {
            "event_name": {"type": "string"},
            "date": {"type": "string"},
            "participants": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["event_name", "date", "participants"],
        "additionalProperties": False,
    }
    output_format = {
        "type": "json_schema",
        "name": "calendar_event",
        "schema": event_schema,
        "description": "A calendar event.",
        "strict": True,
    }
    conversation = [
        {"role": "system", "content": "Extract the event information."},
        {
            "role": "user",
            "content": "Alice and Bob are going to a science fair on Friday.",
        },
    ]

    response = await client.responses.create(
        input=conversation,
        text={"format": output_format},
    )
    print(response)

    # NOTE: The JSON schema is applied to the output text, not reasoning.
    event = json.loads(response.output[-1].content[0].text)
    assert event["event_name"].lower() == "science fair"
    assert event["date"] == "Friday"

    attendees = event["participants"]
    assert len(attendees) == 2
    assert attendees[0] == "Alice"
    assert attendees[1] == "Bob"
@pytest.mark.asyncio
async def test_structured_output_with_parse(client: openai.AsyncOpenAI):
    """Use ``responses.parse`` with a pydantic model as the output format."""

    class CalendarEvent(BaseModel):
        event_name: str
        date: str
        participants: list[str]

    response = await client.responses.parse(
        model=None,
        instructions="Extract the event information.",
        input="Alice and Bob are going to a science fair on Friday.",
        text_format=CalendarEvent,
    )
    print(response)

    # The output is successfully parsed.
    parsed_event = response.output_parsed
    assert parsed_event is not None

    # The output is correct.
    assert parsed_event.event_name.lower() == "science fair"
    assert parsed_event.date == "Friday"
    attendees = parsed_event.participants
    assert len(attendees) == 2
    assert attendees[0] == "Alice"
    assert attendees[1] == "Bob"

View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
# any model with a chat template defined in tokenizer_config should work here
# (the tests in this file only call the chat completions endpoint)
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def default_server_args():
    """Return the CLI arguments used to launch the server for this module."""
    # NOTE(review): an earlier comment here mentioned half precision, but no
    # dtype flag is passed; the server uses its default dtype.
    server_args = ["--max-model-len", "2048"]
    server_args += ["--max-num-seqs", "128"]
    server_args.append("--enforce-eager")
    return server_args
@pytest.fixture(scope="module")
def server(default_server_args):
    """Start one server per module and shut it down after the module's tests."""
    launcher = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with launcher as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the module's server."""
    client_context = server.get_async_client()
    async with client_context as openai_client:
        yield openai_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_invalid_json_schema(client: openai.AsyncOpenAI, model_name: str) -> None:
    """A malformed JSON schema in ``structured_outputs`` must be rejected.

    The schema is invalid because the ``foo`` property maps to a plain string
    instead of a schema object.
    """
    invalid_json_schema = {
        "$defs": {
            "CarType": {
                "enum": ["sedan", "SUV", "Truck", "Coupe"],
                "title": "CarType",
                "type": "string",
            }
        },
        "properties": {
            "brand": {"title": "Brand", "type": "string"},
            "model": {"title": "Model", "type": "string"},
            "car_type": {"$ref": "#/$defs/CarType"},
            "foo": "bar",  # not a schema object; this is the deliberate defect
        },
        "required": ["brand", "model", "car_type"],
        "title": "CarDescription",
        "type": "object",
    }
    # Adjacent string literals are concatenated verbatim, so the separating
    # space must be written explicitly (previously this read "ofthe").
    prompt = (
        "Generate a JSON with the brand, model and car_type of "
        "the most iconic car from the 90's"
    )
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            extra_body={"structured_outputs": {"json": invalid_json_schema}},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
    """An unparsable regex in ``structured_outputs`` must be rejected.

    The pattern ``[.*`` has an unterminated character class.
    """
    # Adjacent string literals are concatenated verbatim, so separating spaces
    # are written explicitly (previously "Enigma.End" and "result:alan...").
    prompt = (
        "Generate an email address for Alan Turing, who works in Enigma. "
        "End in .com and new line. Example result: "
        "alan.turing@enigma.com\n"
    )
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            extra_body={"structured_outputs": {"regex": r"[.*"}, "stop": ["\n"]},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
    """A grammar whose root references an undefined rule must be rejected.

    ``root`` points at ``select_statementinvalidsyntax``, which is never
    defined in the grammar text.
    """
    invalid_simplified_sql_grammar = """
root ::= select_statementinvalidsyntax
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
    # Adjacent string literals are concatenated verbatim, so the separating
    # space must be written explicitly (previously this read "'email'from").
    prompt = (
        "Generate an SQL query to show the 'username' and 'email' "
        "from the 'users' table."
    )
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            extra_body={
                "structured_outputs": {"grammar": invalid_simplified_sql_grammar}
            },
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_empty_grammar(client: openai.AsyncOpenAI, model_name: str) -> None:
    """An empty grammar string in ``structured_outputs`` must be rejected."""
    prompt = "Say hello"
    user_message = {"role": "user", "content": prompt}
    empty_grammar_request = {"structured_outputs": {"grammar": ""}}

    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.chat.completions.create(
            model=model_name,
            messages=[user_message],
            extra_body=empty_grammar_request,
        )

View File

@@ -0,0 +1,687 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import regex as re
from openai import BadRequestError
from tests.utils import RemoteOpenAIServer
from vllm.tokenizers import get_tokenizer
# any model with a chat template should work here
# NOTE(review): the tests visible below only call the plain completions
# endpoint, so a chat template may not actually be required — confirm.
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def default_server_args():
    """Return the base CLI arguments shared by every server variant."""
    server_args = ["--dtype", "float32"]
    server_args += ["--max-model-len", "2048"]
    server_args += ["--max-num-seqs", "128"]
    server_args += ["--enforce-eager", "--enable-prompt-tokens-details"]
    return server_args
@pytest.fixture(
    scope="module",
    params=[
        ["--no-enable-prefix-caching"],
        ["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
    ],
)
def server(default_server_args, request):
    """Start a server once per parameter set, adding that set's extra flags.

    The base argument list is not mutated; a new list is built when extra
    flags are present.
    """
    extra_flags = request.param
    launch_args = (
        default_server_args + extra_flags if extra_flags else default_server_args
    )
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the current server variant."""
    client_context = server.get_async_client()
    async with client_context as openai_client:
        yield openai_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str) -> None:
    """Run one greedy completion from text, then one from raw token IDs."""
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )
    assert text_completion.id is not None
    assert text_completion.choices is not None and len(text_completion.choices) == 1

    only_choice = text_completion.choices[0]
    assert len(only_choice.text) >= 5
    # Generation stops because max_tokens is reached.
    assert only_choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11
    )
    assert text_completion.usage == expected_usage

    # test using token IDs
    token_completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(token_completion.choices[0].text) >= 1
    assert token_completion.choices[0].prompt_logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=None`` the choice carries no logprobs."""
    # test using token IDs
    token_prompt = [0, 0, 0, 0, 0]
    completion = await client.completions.create(
        model=model_name,
        prompt=token_prompt,
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    first_choice = completion.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=0`` each position reports exactly one token."""
    # test using token IDs
    token_prompt = [0, 0, 0, 0, 0]
    completion = await client.completions.create(
        model=model_name,
        prompt=token_prompt,
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    first_choice = completion.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.token_logprobs is not None
    assert first_choice.logprobs.top_logprobs is not None
    assert len(first_choice.logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=5`` the first position reports five or six tokens."""
    # test using token IDs
    token_prompt = [0, 0, 0, 0, 0]
    completion = await client.completions.create(
        model=model_name,
        prompt=token_prompt,
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    first_choice = completion.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.token_logprobs is not None
    assert first_choice.logprobs.top_logprobs is not None
    # The test tolerates one entry beyond the five requested.
    candidate_count = len(first_choice.logprobs.top_logprobs[0])
    assert 5 <= candidate_count <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_too_many_completion_logprobs(
    client: openai.AsyncOpenAI, model_name: str
) -> None:
    """Requests above the logprobs limit fail and leave the server usable.

    Covers both the non-streaming and the streaming path, then sends a normal
    request to confirm the server still produces output.
    """
    with pytest.raises(
        (openai.BadRequestError, openai.APIError)
    ):  # test using token IDs
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=21,
        )
        ...
    with pytest.raises(
        (openai.BadRequestError, openai.APIError)
    ):  # test using token IDs
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=30,
            stream=True,
        )
        # Consume the stream so an error raised during iteration is caught too.
        async for chunk in stream:
            ...

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    # `>= 0` can never fail; require actual output, matching the identical
    # token-ID request asserted in test_single_completion.
    assert len(completion.choices[0].text) >= 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, prompt_logprobs",
    [(MODEL_NAME, -1), (MODEL_NAME, 0), (MODEL_NAME, 1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_completion(
    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
):
    """Exercise ``prompt_logprobs`` for omitted, negative and valid values.

    Omitted: no prompt logprobs are returned. Negative: the request is
    rejected. Otherwise both prompts get non-empty prompt logprobs.
    """
    request_kwargs: dict = {
        "prompt": ["A robot may not injure another robot", "My name is"],
        "model": model_name,
    }

    if prompt_logprobs is None:
        completion = await client.completions.create(**request_kwargs)
        assert completion.choices[0].prompt_logprobs is None
        return

    # prompt_logprobs is a vLLM extension, so it travels in extra_body.
    request_kwargs["extra_body"] = {"prompt_logprobs": prompt_logprobs}

    if prompt_logprobs < 0:
        with pytest.raises(BadRequestError):
            await client.completions.create(**request_kwargs)
        return

    completion = await client.completions.create(**request_kwargs)
    for prompt_index in (0, 1):
        returned = completion.choices[prompt_index].prompt_logprobs
        assert returned is not None
        assert len(returned) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completion_streaming(
    client: openai.AsyncOpenAI, model_name: str
) -> None:
    """Streaming and non-streaming greedy completions must produce equal text."""
    prompt = "What is an LLM?"

    reference = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    expected_text = reference.choices[0].text

    stream = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    pieces: list[str] = []
    finished_chunks = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        pieces.append(streamed_choice.text)
        if streamed_choice.finish_reason is not None:
            finished_chunks += 1

    # finish reason should only return in last block
    assert finished_chunks == 1
    # `chunk` still refers to the last chunk received from the stream.
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    assert "".join(pieces) == expected_text
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_parallel_no_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Parallel sampling without streaming.

    A single request output contains a list of completions.
    """
    prompt = "What is an LLM?"
    n = 3
    max_tokens = 50  # we want some to finish earlier than others

    # High temperature to maximize chance of unique completions.
    completion = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=max_tokens,
        n=n,
        temperature=1.0,
        stream=False,
        logprobs=0,
        seed=42,
    )

    # Assert `n` completions
    num_completions = len(completion.choices)
    assert num_completions == n, f"Num completions {num_completions} but expected {n}."

    text_counts: dict[str, int] = {}
    token_counts = set()
    for idx, choice in enumerate(completion.choices):
        # Assert correct completion index & some finish reason.
        assert choice.index == idx, f"Index {choice.index} but expected {idx}."
        assert choice.finish_reason is not None, "None finish_reason is invalid."
        text_counts[choice.text] = text_counts.get(choice.text, 0) + 1
        # logprobs=0 was requested, so the token list gives the output length.
        token_counts.add(len(choice.logprobs.tokens))

    # Assert subrequests finished at different times
    assert len(token_counts) > 1

    # Assert `n` unique completions
    num_unique = len(text_counts)
    if num_unique != n:
        repeats = {txt: num for (txt, num) in text_counts.items() if num > 1}
        raise AssertionError(
            f"Expected {n} unique completions, got {num_unique}; repeats: {repeats}."
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Streaming for parallel sampling.

    The tokens from multiple samples, are flattened into a single stream,
    with an index to indicate which sample the token belongs to.
    """
    prompt = "What is an LLM?"
    n = 3
    max_tokens = 50  # we want some to finish earlier than others

    stream = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=max_tokens,
        n=n,
        temperature=1.0,
        stream=True,
        seed=42,
    )

    # One list of text pieces per sample, filled by the index on each chunk.
    pieces_per_sample: list[list[str]] = [[] for _ in range(n)]
    finish_reason_count = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        pieces_per_sample[streamed_choice.index].append(streamed_choice.text)
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1

    # Assert `n` completions with correct finish reasons
    assert finish_reason_count == n, (
        f"Expected {n} completions with valid indices and finish_reason."
    )

    text_counts: dict[str, int] = {}
    piece_counts = set()
    for pieces in pieces_per_sample:
        chunk_len = len(pieces)
        # Assert correct number of completion tokens
        piece_counts.add(chunk_len)
        assert chunk_len <= max_tokens, (
            f"max_tokens={max_tokens} but chunk len is {chunk_len}."
        )
        text = "".join(pieces)
        text_counts[text] = text_counts.get(text, 0) + 1
        print(text)

    # Assert subrequests finished at different times
    assert len(piece_counts) > 1

    # Assert `n` unique completions
    num_unique = len(text_counts)
    if num_unique != n:
        repeats = {txt: num for (txt, num) in text_counts.items() if num > 1}
        raise AssertionError(
            f"{num_unique} unique completions, expected {n}; repeats: {repeats}"
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completion_stream_options(client: openai.AsyncOpenAI, model_name: str):
    """Cover every ``stream_options`` combination for the completions API.

    Streaming requests check where usage statistics appear. Non-streaming
    requests that pass ``stream_options`` must be rejected.
    """
    prompt = "What is the capital of France?"

    async def open_stream(include_usage: bool, continuous_usage_stats: bool):
        # Start a streaming completion with the given usage-reporting options.
        return await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={
                "include_usage": include_usage,
                "continuous_usage_stats": continuous_usage_stats,
            },
        )

    def check_final_usage_chunk(final_chunk) -> None:
        # The trailing chunk carries consistent usage totals and no choices.
        assert final_chunk.usage is not None
        assert final_chunk.usage.prompt_tokens > 0
        assert final_chunk.usage.completion_tokens > 0
        assert final_chunk.usage.total_tokens == (
            final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens
        )
        assert final_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    stream = await open_stream(False, False)
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": True}
    stream = await open_stream(False, True)
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await open_stream(True, False)
    async for chunk in stream:
        is_last_content_chunk = chunk.choices[0].finish_reason is not None
        # Content chunks never carry usage in this mode.
        assert chunk.usage is None
        if is_last_content_chunk:
            # Usage arrives in one extra chunk after the finishing chunk.
            check_final_usage_chunk(await anext(stream))

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await open_stream(True, True)
    async for chunk in stream:
        assert chunk.usage is not None
        assert chunk.usage.prompt_tokens > 0
        assert chunk.usage.completion_tokens > 0
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )
        if chunk.choices[0].finish_reason is not None:
            check_final_usage_chunk(await anext(stream))

    # Test stream=False with each of these stream_options in turn:
    #     {"include_usage": None}, {"include_usage": True},
    #     {"continuous_usage_stats": None}, {"continuous_usage_stats": True}
    rejected_options = (
        {"include_usage": None},
        {"include_usage": True},
        {"continuous_usage_stats": None},
        {"continuous_usage_stats": True},
    )
    for stream_options in rejected_options:
        with pytest.raises(BadRequestError):
            await client.completions.create(
                model=model_name,
                prompt=prompt,
                max_tokens=5,
                temperature=0.0,
                stream=False,
                stream_options=stream_options,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Send two identical prompts per request as text and as token IDs.

    For each prompt form the test covers a plain batch, a batch with ``n=2``
    using beam search, and a streamed batch.
    """
    text_batch = ["Hello, my name is"] * 2
    token_batch = [[0, 0, 0, 0, 0]] * 2

    # test both text and token IDs
    for prompts in (text_batch, token_batch):
        # test simple list
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(batch.choices) == 2
        assert batch.choices[0].text == batch.choices[1].text

        # test n = 2
        # NOTE: this has to be true for n > 1 in vLLM, but
        # not necessary for official client.
        beam_search_options = dict(use_beam_search=True)
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            extra_body=beam_search_options,
        )
        assert len(batch.choices) == 4
        # Choices 0-1 belong to the first prompt, choices 2-3 to the second.
        assert batch.choices[0].text != batch.choices[1].text, (
            "beam search should be different"
        )
        assert batch.choices[0].text == batch.choices[2].text, (
            "two copies of the same prompt should be the same"
        )
        assert batch.choices[1].text == batch.choices[3].text, (
            "two copies of the same prompt should be the same"
        )

        # test streaming
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        streamed_texts = [""] * 2
        async for chunk in batch:
            assert len(chunk.choices) == 1
            streamed_choice = chunk.choices[0]
            streamed_texts[streamed_choice.index] += streamed_choice.text
        assert streamed_texts[0] == streamed_texts[1]
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(
    client: openai.AsyncOpenAI, model_name: str, logprobs_arg: int
):
    """Check ``echo=True`` combined with ``logprobs`` for text and token-ID prompts.

    For each prompt the test asserts that the returned text starts with the
    prompt, that the logprob arrays have more than 5 entries, that the first
    ``token_logprobs`` / ``top_logprobs`` entry is ``None``, and that every
    later ``top_logprobs`` entry holds between ``max(logprobs_arg, 1)`` and
    ``logprobs_arg + 1`` candidates.

    Args:
        client: Async OpenAI client connected to the test server.
        model_name: Name of the model to query.
        logprobs_arg: Value passed as the ``logprobs`` request parameter.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            echo=True,
            logprobs=logprobs_arg,
        )

        # Token-ID prompts are decoded so they can be compared with the text.
        prompt_text = tokenizer.decode(prompt) if isinstance(prompt, list) else prompt
        # Compare as a literal prefix. The prompt is plain text, so using it
        # as a regex pattern would misinterpret any metacharacter it contains.
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert len(logprobs.token_logprobs) > 5 and logprobs.token_logprobs[0] is None
        assert len(logprobs.top_logprobs) > 5 and logprobs.top_logprobs[0] is None
        # The first entry is None (asserted above), so start from the second.
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_invalid_json_schema(client: openai.AsyncOpenAI, model_name: str) -> None:
    """Expect an API error when structured outputs get a malformed JSON schema.

    The request must raise ``openai.BadRequestError`` or ``openai.APIError``.

    Args:
        client: Async OpenAI client connected to the test server.
        model_name: Name of the model to query.
    """
    invalid_json_schema = {
        "$defs": {
            "CarType": {
                "enum": ["sedan", "SUV", "Truck", "Coupe"],
                "title": "CarType",
                "type": "string",
            }
        },
        "properties": {
            "brand": {"title": "Brand", "type": "string"},
            "model": {"title": "Model", "type": "string"},
            "car_type": {"$ref": "#/$defs/CarType"},
            # NOTE(review): presumably the deliberately invalid entry - a
            # property mapped to a bare string instead of a schema object.
            "foo": "bar",
        },
        "required": ["brand", "model", "car_type"],
        "title": "CarDescription",
        "type": "object",
    }
    # Adjacent literals are concatenated verbatim, so the separating space
    # must be written explicitly.
    prompt = (
        "Generate a JSON with the brand, model and car_type of "
        "the most iconic car from the 90's"
    )
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
            extra_body={"structured_outputs": {"json": invalid_json_schema}},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
    """Expect an API error when structured outputs get an invalid regex.

    The request must raise ``openai.BadRequestError`` or ``openai.APIError``.

    Args:
        client: Async OpenAI client connected to the test server.
        model_name: Name of the model to query.
    """
    # Adjacent literals are concatenated verbatim, so the separating spaces
    # must be written explicitly.
    prompt = (
        "Generate an email address for Alan Turing, who works in Enigma. "
        "End in .com and new line. Example result: "
        "alan.turing@enigma.com\n"
    )
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
            # "[.*" opens a character class that is never closed.
            extra_body={"structured_outputs": {"regex": r"[.*"}, "stop": ["\n"]},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
    """Expect an API error when structured outputs get an invalid grammar.

    The request must raise ``openai.BadRequestError`` or ``openai.APIError``.

    Args:
        client: Async OpenAI client connected to the test server.
        model_name: Name of the model to query.
    """
    # The root rule refers to `select_statementinvalidsyntax`, a symbol that
    # no rule in this grammar defines.
    invalid_simplified_sql_grammar = """
root ::= select_statementinvalidsyntax
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
    # Adjacent literals are concatenated verbatim, so the separating space
    # must be written explicitly.
    prompt = (
        "Generate an SQL query to show the 'username' and 'email' "
        "from the 'users' table."
    )
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
            extra_body={
                "structured_outputs": {"grammar": invalid_simplified_sql_grammar}
            },
        )

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import torch
from transformers import AutoConfig
from tests.conftest import ImageTestAssets
from tests.utils import RemoteOpenAIServer
from vllm.utils.serial_utils import tensor2base64
# any model with a chat template should work here
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
# Hugging Face config for MODEL_NAME, loaded when this module is imported.
# NOTE(review): not referenced by the fixtures or tests visible in this
# file - confirm whether it is still needed.
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
# Per-prompt image limit handed to the server via `--limit-mm-per-prompt`.
MAXIMUM_IMAGES = 2
@pytest.fixture(scope="module")
def default_image_embeds_server_args() -> list[str]:
    """Return the CLI arguments for the image-embeds test server."""
    image_limit = json.dumps({"image": MAXIMUM_IMAGES})

    cli_args: list[str] = []
    cli_args += ["--dtype", "bfloat16"]
    cli_args += ["--max-model-len", "2048"]
    cli_args += ["--max-num-seqs", "4"]
    cli_args.append("--enforce-eager")
    cli_args += ["--limit-mm-per-prompt", image_limit]
    cli_args.append("--enable-mm-embeds")
    return cli_args
@pytest.fixture(scope="module")
def server_with_image_embeds(default_image_embeds_server_args):
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` built from the given args.

    The server is used as a context manager, so it is exited once the
    module-scoped fixture is torn down.
    """
    server_ctx = RemoteOpenAIServer(
        MODEL_NAME,
        default_image_embeds_server_args,
        max_wait_seconds=600,
    )
    with server_ctx as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client_with_image_embeds(server_with_image_embeds):
    """Yield an async client obtained from the image-embeds server fixture."""
    client_ctx = server_with_image_embeds.get_async_client()
    async with client_ctx as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
# torch.half is an alias of torch.float16, so listing both would only run the
# same case twice.
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32])
async def test_completions_with_image_embeds(
    client_with_image_embeds: openai.AsyncOpenAI,
    model_name: str,
    image_assets: ImageTestAssets,
    dtype: torch.dtype,
):
    """Send one image as an ``image_embeds`` content part and expect text back.

    The first asset's embeddings are cast to ``dtype``, encoded with
    ``tensor2base64`` and attached to a chat request. The test passes when
    the first choice carries a non-empty string message.

    Args:
        client_with_image_embeds: Async client for the image-embeds server.
        model_name: Name of the model to query.
        image_assets: Test assets; only the first one is used.
        dtype: Floating-point dtype the embeddings are cast to before encoding.
    """
    # Test case: Single image embeds input
    image_embeds = image_assets[0].image_embeds.to(dtype=dtype)
    base64_image_embedding = tensor2base64(image_embeds)
    chat_completion = await client_with_image_embeds.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        # Adjacent literals are concatenated verbatim, so the
                        # separating space must be written explicitly.
                        "text": "Describe these images separately. For each image, "
                        "reply with a short sentence (no more than 10 words).",
                    },
                    {
                        "type": "image_embeds",
                        "image_embeds": base64_image_embedding,
                    },
                ],
            },
        ],
        model=model_name,
    )
    content = chat_completion.choices[0].message.content
    assert content is not None
    assert isinstance(content, str)
    assert len(content) > 0

View File

@@ -0,0 +1,175 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from tests.v1.utils import check_request_balancing
# Model launched by the `server` fixture and requested by the tests below.
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
# Data-parallel size read from the DP_SIZE environment variable (default "1").
# Kept as a string so it can be passed directly as a CLI argument; it is
# converted with int() where a number is needed.
DP_SIZE = os.getenv("DP_SIZE", "1")
@pytest.fixture(scope="module")
def default_server_args():
    """Return the CLI arguments for the multi-API-server test instance."""
    # use half precision for speed and memory savings in CI environment
    precision = ["--dtype", "bfloat16"]
    limits = ["--max-model-len", "2048", "--max-num-seqs", "128"]
    execution = ["--enforce-eager"]
    scaling = ["--api-server-count", "4", "--data_parallel_size", DP_SIZE]
    return [*precision, *limits, *execution, *scaling]
@pytest.fixture(scope="module")
def server(default_server_args):
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` built from the given args."""
    server_ctx = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with server_ctx as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture."""
    client_ctx = server.get_async_client()
    async with client_ctx as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_single_completion(
    client: openai.AsyncOpenAI, server: RemoteOpenAIServer, model_name: str
) -> None:
    """Issue one completion, then two concurrent bursts, then check balancing.

    Each request is validated individually inside ``make_request``. After
    both bursts ``check_request_balancing`` is called with the server and
    the integer DP size.
    """
    num_requests = 100

    async def make_request():
        """Send one sampled completion and validate its basic shape."""
        response = await client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=10,
            temperature=1.0,
        )
        assert response.id is not None
        assert response.choices is not None and len(response.choices) == 1

        only_choice = response.choices[0]
        # With temperature=1.0 the exact output length can vary, so only a
        # minimum length is required.
        assert len(only_choice.text) >= 1
        # The model may stop early, so both 'length' and 'stop' are accepted.
        assert only_choice.finish_reason in ("length", "stop")

        # Token counts can vary as well; they only need to be positive.
        usage = response.usage
        assert usage.completion_tokens > 0
        assert usage.prompt_tokens > 0
        assert usage.total_tokens > 0
        return response

    # Test single request
    assert await make_request() is not None

    # Send two bursts of requests, each preceded by a short pause.
    for _ in range(2):
        await asyncio.sleep(0.5)
        burst = await asyncio.gather(*(make_request() for _ in range(num_requests)))
        assert len(burst) == num_requests
        assert all(response is not None for response in burst)

    # Check request balancing via Prometheus metrics if DP_SIZE > 1
    check_request_balancing(server, int(DP_SIZE))
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_completion_streaming(
    client: openai.AsyncOpenAI, server: RemoteOpenAIServer, model_name: str
) -> None:
    """Compare streamed and non-streamed completions, singly and in bursts.

    One streaming round-trip is run alone, followed by two concurrent
    bursts. Afterwards ``check_request_balancing`` is called with the server
    and the integer DP size.
    """
    prompt = "What is an LLM?"
    num_requests = 100

    async def make_streaming_request():
        """Run one non-streaming and one streaming request and compare them."""
        # Non-streaming request: provides the expected full output.
        reference = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
        )
        single_output = reference.choices[0].text

        # Streaming request with the same parameters.
        stream = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )

        pieces: list[str] = []
        finish_reason_count = 0
        last_chunk = None
        async for chunk in stream:
            first_choice = chunk.choices[0]
            pieces.append(first_choice.text)
            if first_choice.finish_reason is not None:
                finish_reason_count += 1
            # Remember the most recent chunk so the final one can be inspected.
            last_chunk = chunk

        # finish reason should only return in the last block for OpenAI API
        assert finish_reason_count == 1, "Finish reason should appear exactly once."
        assert last_chunk is not None, "Stream should have yielded at least one chunk."
        assert last_chunk.choices[0].finish_reason == "length", (
            "Finish reason should be 'length'."
        )
        # The concatenated chunks must equal the non-streamed text.
        assert "".join(pieces) == single_output, (
            "Streamed output should match non-streamed output."
        )
        return True  # Indicate success for this request

    # Test single request
    assert await make_streaming_request() is not None

    # Send two bursts of requests, each preceded by a short pause.
    for _ in range(2):
        await asyncio.sleep(0.5)
        results = await asyncio.gather(
            *(make_streaming_request() for _ in range(num_requests))
        )
        assert len(results) == num_requests, (
            f"Expected {num_requests} results, got {len(results)}"
        )
        assert all(results), "Not all streaming requests completed successfully."

    # Check request balancing via Prometheus metrics if DP_SIZE > 1
    check_request_balancing(server, int(DP_SIZE))