Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,192 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from concurrent.futures import Future
import pytest
from transformers import AutoTokenizer
from vllm.config import StructuredOutputsConfig, VllmConfig
from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import StructuredOutputOptions
# Hugging Face model id whose tokenizer is used by every test in this module.
TOKENIZER = "gpt2"
def test_backend_guidance_rollback_terminated():
    """Verify that the guidance backend rolls back out of a terminated state.

    With speculative decoding the draft model may propose EOS and the
    guidance backend verifies it, putting the grammar in a stopped
    (terminated) state. If the target model later rejects the EOS, the
    grammar must be able to roll back and continue accepting tokens.
    """
    vllm_config = VllmConfig(
        # Consistent with the other tests in this file, which pass
        # structured_outputs_config= (not the legacy decoding_config=).
        structured_outputs_config=StructuredOutputsConfig(
            backend="guidance",
        )
    )
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
    backend = GuidanceBackend(
        vllm_config,
        tokenizer=tokenizer,
        vocab_size=50257,  # GPT-2 vocabulary size
    )
    grammar = backend.compile_grammar(
        StructuredOutputOptions.JSON, '{"type": "object"}'
    )
    prompt = tokenizer.encode('{"a": "b"}')
    assert len(prompt) > 1
    dummy_wrong = tokenizer.encode('{"a"}')
    # Feed the complete JSON object token by token; the grammar must accept
    # each one and must not terminate before EOS.
    for token in prompt:
        assert grammar.accept_tokens("", [token])
    assert not grammar.is_terminated()
    assert grammar.accept_tokens("", [tokenizer.eos_token_id])
    assert grammar.is_terminated()
    # Once terminated, giving any other token should also be accepted.
    assert grammar.accept_tokens("", dummy_wrong)
    # Rollback is done from where state was terminated, so from '}' not EOS.
    grammar.rollback(len(prompt) - 1)
    assert not grammar.is_terminated()
    # After rolling back, EOS is no longer valid and invalid continuations
    # are not fully accepted.
    assert grammar.validate_tokens([tokenizer.eos_token_id]) == []
    assert grammar.validate_tokens(dummy_wrong) != dummy_wrong
    assert grammar.accept_tokens("", prompt[1:])
    assert not grammar.is_terminated()
    assert grammar.accept_tokens("", [tokenizer.eos_token_id])
    assert grammar.is_terminated()
    # Rollback of <= 0 should not change the terminated state.
    grammar.rollback(0)
    assert grammar.is_terminated()
    grammar.rollback(-1)
    assert grammar.is_terminated()
def test_grammar_bitmask_with_specdec():
    """Exercise StructuredOutputManager.grammar_bitmask with speculative-decode
    token lists that contain EOS in different positions (final, mid-list,
    absent). After each call the grammar must have been rolled back, i.e. the
    request must not remain in a terminated state between calls.
    """
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
    prompt = tokenizer.encode('{"a": "b"}')
    vllm_config = VllmConfig(
        model_config=ModelConfig(tokenizer=TOKENIZER),
        structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
        # An ngram drafter with 3 speculative tokens enables the
        # speculative-decoding code path in the manager.
        speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
    )
    structured_output_manager = StructuredOutputManager(vllm_config)
    # NOTE(review): range(1, 2) runs exactly one iteration (i == 1) — confirm
    # whether additional prompt split points were intended here.
    for i in range(1, 2):
        sampling_params = SamplingParams(
            structured_outputs=StructuredOutputsParams(
                json='{"type": "object"}',
            ),
        )
        sampling_params.structured_outputs._backend = "guidance"
        my_req_id = f"my_req_id_{i}"
        request = Request(
            my_req_id,
            prompt_token_ids=prompt[:i],
            sampling_params=sampling_params,
            pooling_params=None,
            eos_token_id=tokenizer.eos_token_id,
        )
        structured_output_manager.grammar_init(request)
        def grammar_bitmask(req: Request, tokens: list[int]) -> None:
            # Helper: fill the bitmask for `req` with the given speculative
            # tokens, then check the grammar was rolled back afterwards.
            structured_output_manager.grammar_bitmask(
                requests={req.request_id: req},
                structured_output_request_ids={req.request_id: 0},
                scheduled_spec_decode_tokens={req.request_id: tokens},
            )
            # At this point, we rolled-back, so should not be terminated
            assert not req.structured_output_request.grammar.is_terminated()
        # The grammar might not yet be compiled, so we wait for it
        # (busy-wait: compilation happens in a background executor).
        while not request.structured_output_request._check_grammar_completion():
            continue
        assert request.structured_output_request.grammar.accept_tokens(
            request.request_id, prompt[:i]
        )
        # EOS as the final speculative token.
        grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
        grammar_bitmask(
            request, prompt[i:] + [tokenizer.eos_token_id] + prompt
        )  # EOS not the final token
        grammar_bitmask(request, prompt[i:])  # EOS not present
        grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
@pytest.mark.parametrize("async_grammar", [True, False])
def test_grammar_init_async_and_sync(async_grammar):
    """Grammar compilation mode must follow distributed_executor_backend.

    By default compilation is asynchronous (a Future is stored until it
    resolves). With "external_launcher" it is synchronous — the compiled
    grammar is stored directly — which avoids deadlocks with external
    launchers.
    """
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
    prompt = tokenizer.encode('{"a": "b"}')
    # "external_launcher" selects the synchronous path; None keeps async.
    backend = None if async_grammar else "external_launcher"
    vllm_config = VllmConfig(
        model_config=ModelConfig(tokenizer=TOKENIZER),
        structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
        parallel_config=ParallelConfig(distributed_executor_backend=backend),
    )
    manager = StructuredOutputManager(vllm_config)
    params = SamplingParams(
        structured_outputs=StructuredOutputsParams(
            json='{"type": "object"}',
        ),
    )
    params.structured_outputs._backend = "guidance"
    request = Request(
        "test_request",
        prompt_token_ids=prompt,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=tokenizer.eos_token_id,
    )
    manager.grammar_init(request)
    # Inspect the raw slot right after init, before _check_grammar_completion
    # has had a chance to resolve anything.
    raw_grammar = request.structured_output_request._grammar
    if async_grammar:
        assert isinstance(raw_grammar, Future), (
            "Async mode should store a Future before completion"
        )
    else:
        assert not isinstance(raw_grammar, Future), (
            "Sync mode should store the grammar directly, not a Future"
        )
    # Poll until compilation completes (covers both modes), with a timeout.
    deadline = time.time() + 5  # 5-second timeout
    while not request.structured_output_request._check_grammar_completion():
        if time.time() > deadline:
            pytest.fail("Grammar compilation timed out")
        time.sleep(0.01)
    # Once complete, the Future must have been replaced by the grammar.
    assert not isinstance(request.structured_output_request._grammar, Future)
    grammar = request.structured_output_request.grammar
    assert grammar is not None
    assert not grammar.is_terminated()
    # The compiled grammar accepts a valid JSON-object token sequence.
    assert grammar.accept_tokens(request.request_id, prompt)

View File

@@ -0,0 +1,172 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for GPT-OSS structural tag support in reasoning (PR #25515)."""
import json
from unittest.mock import Mock
import pytest
from vllm.entrypoints.tool_server import ToolServer
from vllm.reasoning.gptoss_reasoning_parser import (
GptOssReasoningParser,
from_builtin_tool_to_tag,
no_func_reaonsing_tag,
tag_with_builtin_funcs,
)
class TestGptOssReasoningParser:
    """Test cases for GptOssReasoningParser structural tag functionality."""
    @pytest.fixture
    def mock_tokenizer(self):
        """Create a mock tokenizer for testing."""
        tokenizer = Mock()
        # The concrete token ids are irrelevant to tag construction.
        tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
        return tokenizer
    @pytest.fixture
    def reasoning_parser(self, mock_tokenizer):
        """Create a GptOssReasoningParser instance."""
        return GptOssReasoningParser(mock_tokenizer)
    @pytest.fixture
    def mock_tool_server_empty(self):
        """Create a mock ToolServer with no tools."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(return_value=False)
        return tool_server
    @pytest.fixture
    def mock_tool_server_with_browser(self):
        """Create a mock ToolServer with browser tool."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "browser")
        return tool_server
    @pytest.fixture
    def mock_tool_server_with_all_tools(self):
        """Create a mock ToolServer with all builtin tools."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(
            side_effect=lambda tool: tool in ["browser", "python", "container"]
        )
        return tool_server
    def test_prepare_structured_tag_no_tool_server(self, reasoning_parser):
        """Test prepare_structured_tag with no tool server."""
        result = reasoning_parser.prepare_structured_tag(None, None)
        # NOTE: "reaonsing" is the (misspelled) name of the upstream symbol.
        expected = json.dumps(no_func_reaonsing_tag)
        assert result == expected
        # Verify the structure is correct
        parsed = json.loads(result)
        assert parsed["type"] == "structural_tag"
        assert parsed["format"]["type"] == "triggered_tags"
        assert len(parsed["format"]["tags"]) == 1
        assert parsed["format"]["tags"][0]["begin"] == "<|channel|>analysis<|message|>"
        assert parsed["format"]["triggers"] == ["<|channel|>analysis"]
    def test_prepare_structured_tag_with_all_tools(
        self, reasoning_parser, mock_tool_server_with_all_tools
    ):
        """Test prepare_structured_tag with all builtin tools."""
        result = reasoning_parser.prepare_structured_tag(
            None, mock_tool_server_with_all_tools
        )
        parsed = json.loads(result)
        # Should have analysis tag + tags for all 3 tools (2 tags each)
        assert len(parsed["format"]["tags"]) == 7  # 1 analysis + 6 tool tags
        # Check all tool tags are present
        tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]]
        for tool in ["browser", "python", "container"]:
            assert f"<|channel|>commentary to={tool}" in tag_begins
            assert f"<|channel|>analysis to={tool}" in tag_begins
    def test_prepare_structured_tag_with_original_tag(self, reasoning_parser):
        """Test prepare_structured_tag when original_tag is provided."""
        original_tag = '{"custom": "tag"}'
        result = reasoning_parser.prepare_structured_tag(original_tag, None)
        # Should return the original tag unchanged
        assert result == original_tag
    def test_from_builtin_tool_to_tag(self):
        """Test from_builtin_tool_to_tag function."""
        # Each builtin tool yields two tags: commentary and analysis.
        tags = from_builtin_tool_to_tag("python")
        assert len(tags) == 2
        assert tags[0]["begin"] == "<|channel|>commentary to=python"
        assert tags[0]["content"]["type"] == "any_text"
        assert tags[0]["end"] == "<|end|>"
        assert tags[1]["begin"] == "<|channel|>analysis to=python"
        assert tags[1]["content"]["type"] == "any_text"
        assert tags[1]["end"] == "<|end|>"
    def test_tag_with_builtin_funcs(self):
        """Test tag_with_builtin_funcs function."""
        builtin_tools = ["browser", "python"]
        result = tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tools)
        assert result["type"] == "structural_tag"
        # Should have original analysis tag + 2 tags per tool
        assert len(result["format"]["tags"]) == 5  # 1 + 2*2
        # Should have added commentary trigger
        assert "<|channel|>commentary to=" in result["format"]["triggers"]
        assert "<|channel|>analysis" in result["format"]["triggers"]
    def test_tag_structure_invariants(self):
        """Test that the basic tag structure follows expected format."""
        # Test the base no_func_reaonsing_tag structure
        assert no_func_reaonsing_tag["type"] == "structural_tag"
        assert no_func_reaonsing_tag["format"]["type"] == "triggered_tags"
        assert no_func_reaonsing_tag["format"]["stop_after_first"] is False
        # Verify analysis tag structure
        analysis_tag = no_func_reaonsing_tag["format"]["tags"][0]
        assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
        assert analysis_tag["content"]["type"] == "any_text"
        assert analysis_tag["end"] == "<|end|>"
    def test_json_serialization_valid(
        self, reasoning_parser, mock_tool_server_with_all_tools
    ):
        """Test that all generated tags produce valid JSON."""
        # Test with no tool server
        result1 = reasoning_parser.prepare_structured_tag(None, None)
        json.loads(result1)  # Should not raise
        # Test with empty tool server
        empty_server = Mock(spec=ToolServer)
        empty_server.has_tool = Mock(return_value=False)
        result2 = reasoning_parser.prepare_structured_tag(None, empty_server)
        json.loads(result2)  # Should not raise
        # Test with tools
        result3 = reasoning_parser.prepare_structured_tag(
            None, mock_tool_server_with_all_tools
        )
        json.loads(result3)  # Should not raise
    @pytest.mark.parametrize("tool_name", ["browser", "python", "container"])
    def test_single_tool_integration(self, reasoning_parser, tool_name):
        """Test integration with individual tools."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == tool_name)
        result = reasoning_parser.prepare_structured_tag(None, tool_server)
        parsed = json.loads(result)
        # Should have 1 analysis + 2 tool-specific tags
        assert len(parsed["format"]["tags"]) == 3
        tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]]
        assert f"<|channel|>commentary to={tool_name}" in tag_begins
        assert f"<|channel|>analysis to={tool_name}" in tag_begins

View File

@@ -0,0 +1,208 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for reasoning-aware structured output functionality (PR #25515)."""
from unittest.mock import Mock
import pytest
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.reasoning import ReasoningParser
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
class TestReasoningStructuredOutput:
    """Test reasoning-aware structured output functionality."""
    @pytest.fixture
    def mock_model_config(self):
        """Create a mock ModelConfig."""
        config = Mock(spec=ModelConfig)
        config.skip_tokenizer_init = True  # Skip tokenizer init to avoid network calls
        config.get_vocab_size = Mock(return_value=50000)
        # Add missing runner_type attribute that tokenizer initialization expects
        config.runner_type = "generate"
        # Add other attributes that tokenizer initialization might need
        config.tokenizer = "test-tokenizer"
        config.tokenizer_mode = "auto"
        config.trust_remote_code = False
        config.tokenizer_revision = None
        return config
    @pytest.fixture
    def mock_scheduler_config(self):
        """Create a mock SchedulerConfig."""
        config = Mock(spec=SchedulerConfig)
        config.max_num_seqs = 128
        return config
    @pytest.fixture
    def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
        """Create a mock VllmConfig."""
        config = Mock(spec=VllmConfig)
        config.model_config = mock_model_config
        config.scheduler_config = mock_scheduler_config
        # Defaults: no reasoning parser configured, structured output
        # disabled during the reasoning phase.
        config.structured_outputs_config = Mock()
        config.structured_outputs_config.reasoning_parser = None
        config.structured_outputs_config.enable_in_reasoning = False
        config.speculative_config = None
        return config
    @pytest.fixture
    def mock_reasoning_parser(self):
        """Create a mock ReasoningParser."""
        parser = Mock(spec=ReasoningParser)
        parser.is_reasoning_end = Mock(return_value=False)
        return parser
    @pytest.fixture
    def mock_request_with_structured_output(self):
        """Create a mock request with structured output."""
        request = Mock(spec=Request)
        request.structured_output_request = Mock()
        # reasoning_ended is None until the manager decides on it.
        request.structured_output_request.reasoning_ended = None
        request.structured_output_request.grammar = Mock()
        request.structured_output_request.grammar.is_terminated = Mock(
            return_value=False
        )
        request.use_structured_output = True
        request.prompt_token_ids = [1, 2, 3, 4, 5]
        request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
        request.num_computed_tokens = 5
        return request
    def test_should_fill_bitmask_with_enable_in_reasoning(
        self, mock_vllm_config, mock_request_with_structured_output
    ):
        """Test should_fill_bitmask when enable_in_reasoning is True."""
        # Enable enable_in_reasoning
        mock_vllm_config.structured_outputs_config.enable_in_reasoning = True
        manager = StructuredOutputManager(mock_vllm_config)
        # Should always return True when enable_in_reasoning is enabled
        result = manager.should_fill_bitmask(mock_request_with_structured_output)
        assert result is True
    def test_should_fill_bitmask_without_enable_in_reasoning(
        self,
        mock_vllm_config,
        mock_request_with_structured_output,
        mock_reasoning_parser,
    ):
        """Test should_fill_bitmask when enable_in_reasoning is False."""
        # Keep enable_in_reasoning as False (default)
        config = mock_vllm_config.structured_outputs_config
        assert config.enable_in_reasoning is False
        manager = StructuredOutputManager(mock_vllm_config)
        manager.reasoner = mock_reasoning_parser
        # Mock reasoning not ended
        mock_reasoning_parser.is_reasoning_end.return_value = False
        result = manager.should_fill_bitmask(mock_request_with_structured_output)
        # Should set reasoning_ended and return its value
        assert (
            mock_request_with_structured_output.structured_output_request.reasoning_ended
            is False
        )
        assert result is False
    def test_should_fill_bitmask_no_reasoner(
        self, mock_vllm_config, mock_request_with_structured_output
    ):
        """Test should_fill_bitmask when no reasoner is configured."""
        manager = StructuredOutputManager(mock_vllm_config)
        manager.reasoner = None
        result = manager.should_fill_bitmask(mock_request_with_structured_output)
        # Should default to True when no reasoner
        assert result is True
    def test_should_advance_with_enable_in_reasoning(
        self,
        mock_vllm_config,
        mock_request_with_structured_output,
        mock_reasoning_parser,
    ):
        """Test should_advance when enable_in_reasoning is True."""
        # Enable enable_in_reasoning
        mock_vllm_config.structured_outputs_config.enable_in_reasoning = True
        manager = StructuredOutputManager(mock_vllm_config)
        manager.reasoner = mock_reasoning_parser
        # Should always return True when enable_in_reasoning is enabled
        result = manager.should_advance(mock_request_with_structured_output)
        assert result is True
    def test_should_advance_reasoning_not_ended(
        self,
        mock_vllm_config,
        mock_request_with_structured_output,
        mock_reasoning_parser,
    ):
        """Test should_advance when reasoning has not ended."""
        manager = StructuredOutputManager(mock_vllm_config)
        manager.reasoner = mock_reasoning_parser
        # Set reasoning as not ended
        (
            mock_request_with_structured_output.structured_output_request
        ).reasoning_ended = False
        mock_reasoning_parser.is_reasoning_end.return_value = False
        result = manager.should_advance(mock_request_with_structured_output)
        # Should return False since reasoning hasn't ended
        assert result is False
    def test_should_advance_reasoning_just_ended(
        self,
        mock_vllm_config,
        mock_request_with_structured_output,
        mock_reasoning_parser,
    ):
        """Test should_advance when reasoning ends in current step."""
        manager = StructuredOutputManager(mock_vllm_config)
        manager.reasoner = mock_reasoning_parser
        # Set reasoning as not ended initially, but ends in this step
        (
            mock_request_with_structured_output.structured_output_request
        ).reasoning_ended = False
        mock_reasoning_parser.is_reasoning_end.return_value = True
        result = manager.should_advance(mock_request_with_structured_output)
        # Should set reasoning_ended to True but return False for this step
        assert (
            mock_request_with_structured_output.structured_output_request.reasoning_ended
            is True
        )
        assert result is False
    def test_should_advance_reasoning_already_ended(
        self,
        mock_vllm_config,
        mock_request_with_structured_output,
        mock_reasoning_parser,
    ):
        """Test should_advance when reasoning has already ended."""
        manager = StructuredOutputManager(mock_vllm_config)
        manager.reasoner = mock_reasoning_parser
        # Set reasoning as already ended
        (
            mock_request_with_structured_output.structured_output_request
        ).reasoning_ended = True
        result = manager.should_advance(mock_request_with_structured_output)
        # Should return True since reasoning has ended
        assert result is True

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.v1.structured_output.backend_xgrammar import (
has_xgrammar_unsupported_json_features,
)
# Every test in this module is marked as a CPU-only test.
pytestmark = pytest.mark.cpu_test
@pytest.fixture
def unsupported_string_schemas():
    """String schemas xgrammar rejects (unrecognized "format" keyword)."""
    return [dict(type="string", format="non_existing_format")]
@pytest.fixture
def unsupported_integer_schemas():
    """Integer schemas xgrammar rejects ("multipleOf" keyword)."""
    return [dict(type="integer", multipleOf=120)]
@pytest.fixture
def unsupported_number_schemas():
    """Number schemas xgrammar rejects ("multipleOf" keyword)."""
    return [dict(type="number", multipleOf=120)]
@pytest.fixture
def unsupported_array_schemas():
    """Array schemas xgrammar rejects, one per unsupported keyword."""
    unsupported_keywords = [
        ("uniqueItems", True),
        ("contains", {"type": "string"}),
        ("minContains", 1),
        ("maxContains", 5),
    ]
    return [{"type": "array", key: value} for key, value in unsupported_keywords]
@pytest.fixture
def unsupported_object_schemas():
    """Object schemas xgrammar rejects (name/pattern property constraints)."""
    return [
        dict(type="object", propertyNames={"pattern": "^[a-z]+$"}),
        dict(type="object", patternProperties={"^S": {"type": "string"}}),
    ]
@pytest.fixture
def supported_schema():
    """A composite schema that uses only xgrammar-supported JSON features."""
    # Nested object property, itself using only supported constructs.
    address = {
        "type": "object",
        "properties": {
            "street": {"type": "string"},
            "city": {"type": "string"},
        },
    }
    properties = {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "email": {"type": "string", "format": "email"},
        "status": {"type": "string"},
        "scores": {"type": "array", "items": {"type": "number"}},
        "car_type": {"type": "string", "enum": ["sedan", "suv", "truck"]},
        "car_brand": {"type": "string", "pattern": "^[a-zA-Z]+$"},
        "short_description": {"type": "string", "maxLength": 50},
        "mileage": {"type": "number", "minimum": 0, "maximum": 1000000},
        "model_year": {
            "type": "integer",
            "exclusiveMinimum": 1900,
            "exclusiveMaximum": 2100,
        },
        "long_description": {"type": "string", "minLength": 50, "maxLength": 2000},
        "address": address,
    }
    return {
        "type": "object",
        "properties": properties,
        "minProperties": 1,
        "maxProperties": 100,
    }
@pytest.mark.parametrize(
    "schema_type",
    [
        "unsupported_string_schemas",
        "unsupported_integer_schemas",
        "unsupported_number_schemas",
        "unsupported_array_schemas",
        "unsupported_object_schemas",
    ],
)
def test_unsupported_json_features_by_type(schema_type, request):
    """Every schema in each unsupported-feature fixture must be flagged."""
    schemas = request.getfixturevalue(schema_type)
    for schema in schemas:
        flagged = has_xgrammar_unsupported_json_features(schema)
        assert flagged, f"Schema should be unsupported: {schema}"
def test_supported_json_features(supported_schema):
    """The fully-supported composite schema must not be flagged."""
    flagged = has_xgrammar_unsupported_json_features(supported_schema)
    assert not flagged, "Schema should be supported"