forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
0  vllm-v0.6.2/tests/engine/__init__.py  (new file)
271  vllm-v0.6.2/tests/engine/output_processor/test_multi_step.py  (new file)
@@ -0,0 +1,271 @@
import random
from unittest.mock import MagicMock

import pytest
from transformers import PreTrainedTokenizer

from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.sampling_params import SamplingParams
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                           SequenceOutput, SequenceStatus)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter

from ...core.utils import create_seq_group


@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [1, 12])
@pytest.mark.skip_global_cleanup
def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
    """Verify multi-step decoding appends token ids correctly.

    We append token ids and verify all the token ids were appended correctly.
    Note that ignore_eos=True.
    """
    detokenizer = MagicMock(spec=Detokenizer)
    scheduler = MagicMock(spec=Scheduler)
    stop_checker = MagicMock(spec=StopChecker)
    seq_counter = Counter()

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
        stop_checker=stop_checker,
    )

    seq_group = create_seq_group(
        seq_prompt_len=1024,
        seq_output_lens=[seq_output_len],
        sampling_params=SamplingParams(max_tokens=seq_output_len +
                                       num_new_tokens,
                                       ignore_eos=True),
    )

    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    new_token_ids = list(range(num_new_tokens))

    outputs = [
        CompletionSequenceGroupOutput(
            samples=[
                SequenceOutput(
                    parent_seq_id=seq.seq_id,
                    output_token=output_token,
                    logprobs={output_token: Logprob(0.0)},
                )
            ],
            prompt_logprobs=None,
        ) for output_token in new_token_ids
    ]

    assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
    output_processor.process_outputs(seq_group, outputs)
    assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids


@pytest.mark.parametrize("seq_prompt_len", [1024])
@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
@pytest.mark.parametrize("max_tokens", [128 + 3])
@pytest.mark.skip_global_cleanup
def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
                             seq_output_len: int, max_tokens: int):
    """Verify tokens after max_tokens are dropped and not appended to the
    sequence.
    """
    detokenizer = MagicMock(spec=Detokenizer)
    scheduler = MagicMock(spec=Scheduler)
    stop_checker = MagicMock(spec=StopChecker)
    seq_counter = Counter()

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
        stop_checker=stop_checker,
    )

    seq_group = create_seq_group(
        seq_prompt_len=seq_prompt_len,
        seq_output_lens=[seq_output_len],
        sampling_params=SamplingParams(max_tokens=max_tokens),
    )

    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    new_token_ids = list(range(num_new_tokens))

    outputs = [
        CompletionSequenceGroupOutput(
            samples=[
                SequenceOutput(
                    parent_seq_id=seq.seq_id,
                    output_token=output_token,
                    logprobs={output_token: Logprob(0.0)},
                )
            ],
            prompt_logprobs=None,
        ) for output_token in new_token_ids
    ]

    assert seq.get_len() == seq_prompt_len + seq_output_len
    output_processor.process_outputs(seq_group, outputs)

    # Expect the processed sequence to not go over max tokens in len.
    assert seq.get_len() == seq_prompt_len + max_tokens

    # Expect the correct tokens were appended.
    expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
    assert seq.get_token_ids(
    )[-len(expected_appended_tokens):] == expected_appended_tokens


@pytest.mark.parametrize("seq_prompt_len", [1024])
@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [12])
@pytest.mark.parametrize("seed", list(range(6)))
@pytest.mark.skip_global_cleanup
def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
                               seq_output_len: int, seed: int):
    """Verify the eos token id is included in the sequence, but subsequent
    tokens are dropped (not appended to the sequence).
    """
    random.seed(seed)
    detokenizer = MagicMock(spec=Detokenizer)
    scheduler = MagicMock(spec=Scheduler)
    stop_checker = MagicMock(spec=StopChecker)
    seq_counter = Counter()

    eos_token_id = 100

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
        stop_checker=stop_checker,
    )

    seq_group = create_seq_group(
        seq_prompt_len=seq_prompt_len,
        seq_output_lens=[seq_output_len],
        sampling_params=SamplingParams(
            # Ensure enough space.
            max_tokens=seq_output_len + num_new_tokens),
    )

    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    new_token_ids = list(range(num_new_tokens))
    assert eos_token_id not in new_token_ids
    eos_index = random.randint(0, len(new_token_ids) - 1)
    new_token_ids[eos_index] = eos_token_id

    outputs = [
        CompletionSequenceGroupOutput(
            samples=[
                SequenceOutput(
                    parent_seq_id=seq.seq_id,
                    output_token=output_token,
                    logprobs={output_token: Logprob(0.0)},
                )
            ],
            prompt_logprobs=None,
        ) for output_token in new_token_ids
    ]

    assert seq.get_len() == seq_prompt_len + seq_output_len
    output_processor.process_outputs(seq_group, outputs)

    # Expect the processed sequence to not go beyond the provided eos.
    assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)

    # Expect the correct tokens were appended.
    expected_appended_tokens = new_token_ids[:eos_index + 1]
    assert seq.get_token_ids(
    )[-len(expected_appended_tokens):] == expected_appended_tokens


@pytest.mark.parametrize("seq_prompt_len", [1024])
@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [12])
@pytest.mark.parametrize("seed", list(range(6)))
@pytest.mark.skip_global_cleanup
def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
                              seq_output_len: int, seed: int):
    """When sampling parameters dictate that we should ignore the eos token
    id, ensure all token ids are appended even if the eos token id is emitted.
    """
    random.seed(seed)
    detokenizer = MagicMock(spec=Detokenizer)
    scheduler = MagicMock(spec=Scheduler)
    stop_checker = MagicMock(spec=StopChecker)
    seq_counter = Counter()

    eos_token_id = 100

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
        stop_checker=stop_checker,
    )

    seq_group = create_seq_group(
        seq_prompt_len=seq_prompt_len,
        seq_output_lens=[seq_output_len],
        sampling_params=SamplingParams(
            # Ensure enough space.
            max_tokens=seq_output_len + num_new_tokens,
            ignore_eos=True,
        ),
    )

    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    new_token_ids = list(range(num_new_tokens))
    assert eos_token_id not in new_token_ids
    eos_index = random.randint(0, len(new_token_ids) - 1)
    new_token_ids[eos_index] = eos_token_id

    outputs = [
        CompletionSequenceGroupOutput(
            samples=[
                SequenceOutput(
                    parent_seq_id=seq.seq_id,
                    output_token=output_token,
                    logprobs={output_token: Logprob(0.0)},
                )
            ],
            prompt_logprobs=None,
        ) for output_token in new_token_ids
    ]

    assert seq.get_len() == seq_prompt_len + seq_output_len
    output_processor.process_outputs(seq_group, outputs)

    # Expect the processed sequence to go beyond eos.
    assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens

    # Expect the correct tokens were appended.
    expected_appended_tokens = new_token_ids[:num_new_tokens]
    assert seq.get_token_ids(
    )[-len(expected_appended_tokens):] == expected_appended_tokens


def mock_tokenizer(eos_token_id=1000):
    tokenizer = MagicMock(spec=PreTrainedTokenizer)
    tokenizer.eos_token_id = eos_token_id
    return tokenizer
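The rule these tests pin down is that a multi-step batch of sampled tokens is clipped to whatever budget remains under max_tokens (and, without ignore_eos, to the position of the first EOS token). A minimal sketch of that clipping rule, in plain Python with a made-up helper name rather than vLLM's actual implementation:

def clip_new_tokens(new_token_ids, output_len, max_tokens,
                    eos_token_id=None):
    # Keep only the tokens that fit under the max_tokens budget.
    kept = new_token_ids[:max(0, max_tokens - output_len)]
    # Without ignore_eos, keep the EOS token itself but drop what follows.
    if eos_token_id is not None and eos_token_id in kept:
        kept = kept[:kept.index(eos_token_id) + 1]
    return kept

# Mirrors test_respects_max_tokens: 128 tokens emitted, budget 131 -> 3 kept.
assert clip_new_tokens(list(range(8)), 128, 131) == [0, 1, 2]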
86  vllm-v0.6.2/tests/engine/output_processor/test_stop_checker.py  (new file)
@@ -0,0 +1,86 @@
from unittest.mock import MagicMock

import pytest
from transformers import PreTrainedTokenizer

from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.inputs import token_inputs
from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, Sequence, SequenceStatus


def sequence_with_eos(text: str, eos_token: str,
                      eos_token_id: int) -> Sequence:
    """
    Create a Sequence that ends with an EOS token.
    """
    seq = Sequence(
        seq_id=0,
        inputs=token_inputs([]),
        block_size=16,
        eos_token_id=eos_token_id,
    )
    seq.output_text = text + eos_token

    offset = eos_token_id + 1
    for i in range(offset, len(text) + offset):
        seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)})
    seq.append_token_id(token_id=eos_token_id,
                        logprobs={eos_token_id: Logprob(0.0)})

    seq.status = SequenceStatus.RUNNING

    return seq


@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
    ("This text ends with EOS token", "</s>", 2),
])
@pytest.mark.parametrize("ignore_eos", [True, False])
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
@pytest.mark.skip_global_cleanup
def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
                           ignore_eos: bool, include_stop_str_in_output: bool):
    """
    Test the behavior of the StopChecker's maybe_stop_sequence method
    when an EOS token is encountered.

    This test covers:
    - When the EOS token should stop the sequence and be removed from the
      output
    - When the EOS token should stop the sequence and be included in the
      output
    - When the EOS token should be ignored, and the sequence continues
    """

    tokenizer = MagicMock(spec=PreTrainedTokenizer)
    get_tokenizer_for_seq = MagicMock(return_value=tokenizer)
    stop_checker = StopChecker(max_model_len=1024,
                               get_tokenizer_for_seq=get_tokenizer_for_seq)

    seq = sequence_with_eos(
        text=text_wo_eos,
        eos_token=eos_token,
        eos_token_id=eos_token_id,
    )
    new_char_count = len(eos_token)

    # Note that `stop` and `stop_token_ids` are not specified
    sampling_params = SamplingParams(
        min_tokens=1,
        ignore_eos=ignore_eos,
        include_stop_str_in_output=include_stop_str_in_output)

    stop_checker.maybe_stop_sequence(
        seq=seq,
        new_char_count=new_char_count,
        sampling_params=sampling_params,
    )

    if ignore_eos:
        assert seq.status == SequenceStatus.RUNNING
        assert seq.output_text == text_wo_eos + eos_token
    elif include_stop_str_in_output:
        assert seq.status == SequenceStatus.FINISHED_STOPPED
        assert seq.output_text == text_wo_eos + eos_token
    else:
        assert seq.status == SequenceStatus.FINISHED_STOPPED
        assert seq.output_text == text_wo_eos
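The three-way behavior asserted above amounts to a small decision table over (ignore_eos, include_stop_str_in_output). A sketch of just that table, illustrative only; the real StopChecker.maybe_stop_sequence also handles stop strings, stop token ids, min_tokens, and max_model_len:

def eos_decision(ignore_eos, include_stop_str_in_output, text, eos_token):
    """Return (still_running, output_text) for the EOS-only case."""
    if ignore_eos:
        return True, text + eos_token    # keep generating
    if include_stop_str_in_output:
        return False, text + eos_token   # stop, keep the EOS text
    return False, text                   # stop, strip the EOS text

assert eos_decision(False, False, "hi", "</s>") == (False, "hi")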
95  vllm-v0.6.2/tests/engine/test_arg_utils.py  (new file)
@@ -0,0 +1,95 @@
from argparse import ArgumentTypeError

import pytest

from vllm.config import PoolerConfig
from vllm.engine.arg_utils import EngineArgs, nullable_kvs
from vllm.utils import FlexibleArgumentParser


@pytest.mark.parametrize(("arg", "expected"), [
    (None, None),
    ("image=16", {
        "image": 16
    }),
    ("image=16,video=2", {
        "image": 16,
        "video": 2
    }),
    ("Image=16, Video=2", {
        "image": 16,
        "video": 2
    }),
])
def test_limit_mm_per_prompt_parser(arg, expected):
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    if arg is None:
        args = parser.parse_args([])
    else:
        args = parser.parse_args(["--limit-mm-per-prompt", arg])

    assert args.limit_mm_per_prompt == expected


def test_valid_pooling_config():
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    args = parser.parse_args([
        '--override-pooler-config',
        '{"pooling_type": "MEAN"}',
    ])
    engine_args = EngineArgs.from_cli_args(args=args)
    assert engine_args.override_pooler_config == PoolerConfig(
        pooling_type="MEAN")


@pytest.mark.parametrize(
    ("arg"),
    [
        "image",  # Missing =
        "image=4,image=5",  # Conflicting values
        "image=video=4"  # Too many = in tokenized arg
    ])
def test_bad_nullable_kvs(arg):
    with pytest.raises(ArgumentTypeError):
        nullable_kvs(arg)


# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
    (None, None, "mm-processor-kwargs"),
    ("{}", {}, "mm-processor-kwargs"),
    (
        '{"num_crops": 4}',
        {
            "num_crops": 4
        },
        "mm-processor-kwargs"
    ),
    (
        '{"foo": {"bar": "baz"}}',
        {
            "foo": {
                "bar": "baz"
            }
        },
        "mm-processor-kwargs"
    ),
    (
        '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
        {
            "cast_logits_dtype": "bfloat16",
            "sequence_parallel_norm": True,
            "sequence_parallel_norm_threshold": 2048,
        },
        "override-neuron-config"
    ),
])
# yapf: enable
def test_composite_arg_parser(arg, expected, option):
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    if arg is None:
        args = parser.parse_args([])
    else:
        args = parser.parse_args([f"--{option}", arg])
    assert getattr(args, option.replace("-", "_")) == expected
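The parsing behavior pinned down by these tests (comma-separated key=value pairs, case-insensitive keys, tolerated whitespace, ArgumentTypeError on a missing "=" or a repeated key) can be sketched in a few lines. This is an illustrative reimplementation, not vLLM's actual nullable_kvs:

from argparse import ArgumentTypeError
from typing import Dict, Optional

def kvs_sketch(val: str) -> Optional[Dict[str, int]]:
    # "Image=16, Video=2" -> {"image": 16, "video": 2}
    if len(val) == 0:
        return None
    out: Dict[str, int] = {}
    for item in val.split(","):
        parts = item.split("=")
        if len(parts) != 2:
            raise ArgumentTypeError("each item must be key=value")
        key = parts[0].strip().lower()
        if key in out:
            raise ArgumentTypeError(f"conflicting values for {key!r}")
        out[key] = int(parts[1])
    return out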
40  vllm-v0.6.2/tests/engine/test_computed_prefix_blocks.py  (new file)
@@ -0,0 +1,40 @@
import pytest

from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams


'''
=============================
Modified by vllm_mlu
=============================
@brief(enable_prefix_caching): Prefix caching is not supported yet; will be
fixed in VLLM-342.
'''
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion
    # without triggering asserts.
    # We are in a scenario where all blocks from the second request's prompt
    # are full and already computed when the second request arrives.
    prompt = (
        "You are a helpful assistant. How do I build a car from cardboard and "
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")
    prompt2 = (
        " Please recommend to me some resources where I can learn not only to "
        "handle technical difficulties of building a car, but also "
        "decoration.")

    engine_args = EngineArgs(model=model,
                             block_size=block_size,
                             enable_prefix_caching=False)

    engine = LLMEngine.from_engine_args(engine_args)
    sampling_params = SamplingParams()

    engine.add_request("0", prompt + prompt2, sampling_params)
    engine.step()
    engine.add_request("1", prompt, sampling_params)
    engine.step()
116  vllm-v0.6.2/tests/engine/test_custom_executor.py  (new file)
@@ -0,0 +1,116 @@
import asyncio
import os

import pytest

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.mlu_executor import MLUExecutor, MLUExecutorAsync
from vllm.sampling_params import SamplingParams


class Mock:
    ...


'''
=============================
Modified by vllm_mlu
=============================
@brief(GPUExecutor): Use the MLU executor on MLU devices.
'''
class CustomGPUExecutor(MLUExecutor):

    def execute_model(self, *args, **kwargs):
        # Drop a marker file to show that this was run.
        with open(".marker", "w"):
            ...
        return super().execute_model(*args, **kwargs)


'''
=============================
Modified by vllm_mlu
=============================
@brief(GPUExecutor): Use the MLU executor on MLU devices.
'''
class CustomGPUExecutorAsync(MLUExecutorAsync):

    async def execute_model_async(self, *args, **kwargs):
        with open(".marker", "w"):
            ...
        return await super().execute_model_async(*args, **kwargs)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_type_checking(model):
    with pytest.raises(ValueError):
        engine_args = EngineArgs(model=model,
                                 distributed_executor_backend=Mock)
        LLMEngine.from_engine_args(engine_args)
    with pytest.raises(ValueError):
        engine_args = AsyncEngineArgs(model=model,
                                      distributed_executor_backend=Mock)
        AsyncLLMEngine.from_engine_args(engine_args)
    with pytest.raises(TypeError):
        engine_args = AsyncEngineArgs(
            model=model, distributed_executor_backend=CustomGPUExecutor)
        AsyncLLMEngine.from_engine_args(engine_args)


'''
=============================
Modified by vllm_mlu
=============================
@brief(tmpdir): All test models are soft-linked into the tests dir; do not
change the working directory.
'''
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmp_path):
    cwd = os.path.abspath(".")
    # os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")

        engine_args = EngineArgs(
            model=model, distributed_executor_backend=CustomGPUExecutor)
        engine = LLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)

        engine.add_request("0", "foo", sampling_params)
        engine.step()

        assert os.path.exists(".marker")
        os.remove(".marker")
    finally:
        os.chdir(cwd)


'''
=============================
Modified by vllm_mlu
=============================
@brief(tmpdir): All test models are soft-linked into the tests dir; do not
change the working directory.
'''
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmp_path):
    cwd = os.path.abspath(".")
    # os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")

        engine_args = AsyncEngineArgs(
            model=model, distributed_executor_backend=CustomGPUExecutorAsync)
        engine = AsyncLLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)

        async def t():
            stream = await engine.add_request("0", "foo", sampling_params)
            async for x in stream:
                ...

        asyncio.run(t())

        assert os.path.exists(".marker")
        os.remove(".marker")
    finally:
        os.chdir(cwd)
32  vllm-v0.6.2/tests/engine/test_detokenization.py  (new file)
@@ -0,0 +1,32 @@
import pytest

from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_detokenization(model: str):
    # This test checks that the engine generates completions both with and
    # without optional detokenization, that detokenization includes text
    # and no-detokenization doesn't, and that both completions have the same
    # token_ids.
    prompt = (
        "You are a helpful assistant. How do I build a car from cardboard and "
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")

    llm = LLM(model=model)
    sampling_params = SamplingParams(max_tokens=10,
                                     temperature=0.0,
                                     detokenize=False)

    outputs_no_detokenization = llm.generate(prompt,
                                             sampling_params)[0].outputs[0]
    sampling_params.detokenize = True
    outputs_with_detokenization = llm.generate(prompt,
                                               sampling_params)[0].outputs[0]

    assert outputs_no_detokenization.text == ''
    assert outputs_with_detokenization.text != ''
    assert outputs_no_detokenization.token_ids == \
        outputs_with_detokenization.token_ids
176  vllm-v0.6.2/tests/engine/test_multiproc_workers.py  (new file)
@@ -0,0 +1,176 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from time import sleep
from typing import Any, List, Tuple

import pytest

from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                  ResultHandler, WorkerMonitor)


class DummyWorker:
    """Dummy version of vllm.worker.worker.Worker"""

    def __init__(self, rank: int):
        self.rank = rank

    def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
        sleep(0.05)

        if isinstance(worker_input, Exception):
            # simulate error case
            raise worker_input

        return self.rank, worker_input


def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
    result_handler = ResultHandler()
    workers = [
        ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
        for rank in range(8)
    ]

    worker_monitor = WorkerMonitor(workers, result_handler)
    assert not worker_monitor.is_alive()

    result_handler.start()
    worker_monitor.start()
    assert worker_monitor.is_alive()

    return workers, worker_monitor


def test_local_workers() -> None:
    """Test workers with sync task submission"""

    workers, worker_monitor = _start_workers()

    def execute_workers(worker_input: str) -> None:
        worker_outputs = [
            worker.execute_method("worker_method", worker_input)
            for worker in workers
        ]

        for rank, output in enumerate(worker_outputs):
            assert output.get() == (rank, worker_input)

    executor = ThreadPoolExecutor(max_workers=4)

    # Test concurrent submission from different threads
    futures = [
        executor.submit(partial(execute_workers, f"thread {thread_num}"))
        for thread_num in range(4)
    ]

    for future in futures:
        future.result()

    # Test error case
    exception = ValueError("fake error")
    result = workers[0].execute_method("worker_method", exception)
    try:
        result.get()
        pytest.fail("task should have failed")
    except Exception as e:
        assert isinstance(e, ValueError)
        assert str(e) == "fake error"

    # Test cleanup when a worker fails
    assert worker_monitor.is_alive()
    workers[3].process.kill()

    # Other workers should get shut down here
    worker_monitor.join(20)

    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
    assert all(not worker.process.is_alive() for worker in workers)

    # Further attempts to submit tasks should fail
    try:
        _result = workers[0].execute_method("worker_method", "test")
        pytest.fail("task should fail once workers have been shut down")
    except Exception as e:
        assert isinstance(e, ChildProcessError)


def test_local_workers_clean_shutdown() -> None:
    """Test clean shutdown"""

    workers, worker_monitor = _start_workers()

    assert worker_monitor.is_alive()
    assert all(worker.process.is_alive() for worker in workers)

    # Clean shutdown
    worker_monitor.close()

    worker_monitor.join(20)

    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
    assert all(not worker.process.is_alive() for worker in workers)

    # Further attempts to submit tasks should fail
    try:
        _result = workers[0].execute_method("worker_method", "test")
        pytest.fail("task should fail once workers have been shut down")
    except Exception as e:
        assert isinstance(e, ChildProcessError)


@pytest.mark.asyncio
async def test_local_workers_async() -> None:
    """Test local workers with async task submission"""

    workers, worker_monitor = _start_workers()

    async def execute_workers(worker_input: str) -> None:
        worker_coros = [
            worker.execute_method_async("worker_method", worker_input)
            for worker in workers
        ]

        results = await asyncio.gather(*worker_coros)
        for rank, result in enumerate(results):
            assert result == (rank, worker_input)

    tasks = [
        asyncio.create_task(execute_workers(f"task {task_num}"))
        for task_num in range(4)
    ]

    for task in tasks:
        await task

    # Test error case
    exception = ValueError("fake error")
    try:
        _result = await workers[0].execute_method_async(
            "worker_method", exception)
        pytest.fail("task should have failed")
    except Exception as e:
        assert isinstance(e, ValueError)
        assert str(e) == "fake error"

    # Test cleanup when a worker fails
    assert worker_monitor.is_alive()
    workers[3].process.kill()

    # Other workers should get shut down here
    worker_monitor.join(20)

    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
    assert all(not worker.process.is_alive() for worker in workers)

    # Further attempts to submit tasks should fail
    try:
        _result = await workers[0].execute_method_async(
            "worker_method", "test")
        pytest.fail("task should fail once workers have been shut down")
    except Exception as e:
        assert isinstance(e, ChildProcessError)
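The protocol these tests exercise is submit-and-wait: execute_method returns a future-like handle, and .get() re-raises any exception thrown inside the worker process. A toy single-process stand-in for that contract (illustrative; the real ProcessWorkerWrapper dispatches over multiprocessing queues through a shared ResultHandler):

from concurrent.futures import Future

class InlineWorkerHandle:
    """Runs the method inline and hands back a Future."""

    def __init__(self, worker):
        self.worker = worker

    def execute_method(self, method: str, *args) -> Future:
        fut: Future = Future()
        try:
            fut.set_result(getattr(self.worker, method)(*args))
        except Exception as e:
            fut.set_exception(e)  # re-raised by .result()
        return fut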
29  vllm-v0.6.2/tests/engine/test_short_mm_context.py  (new file)
@@ -0,0 +1,29 @@
import pytest

from ..conftest import IMAGE_ASSETS

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
    "cherry_blossom":
    "USER: <image>\nWhat is the season?\nASSISTANT:",
})

models = ["llava-hf/llava-1.5-7b-hf"]


@pytest.mark.parametrize("model", models)
def test_context_length_too_short(vllm_runner, image_assets, model):
    images = [asset.pil_image for asset in image_assets]

    with pytest.raises(ValueError, match="too long to fit into the model"):
        vllm_model = vllm_runner(
            model,
            max_model_len=128,  # LLaVA has a feature size of 576
            enforce_eager=True,
        )

        with vllm_model:
            vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
                                       max_tokens=1,
                                       images=[images[0]])
24  vllm-v0.6.2/tests/engine/test_skip_tokenizer_init.py  (new file)
@@ -0,0 +1,24 @@
import pytest

from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_skip_tokenizer_initialization(model: str):
    # This test checks if the flag skip_tokenizer_init skips the
    # initialization of the tokenizer and detokenizer. The generated output
    # is expected to contain token ids.
    llm = LLM(model=model, skip_tokenizer_init=True)
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        llm.generate("abc", sampling_params)

    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
                           sampling_params=sampling_params)
    assert len(outputs) > 0
    completions = outputs[0].outputs
    assert len(completions) > 0
    assert completions[0].text == ""
    assert completions[0].token_ids
62  vllm-v0.6.2/tests/engine/test_stop_reason.py  (new file)
@@ -0,0 +1,62 @@
"""Test the different finish_reason="stop" situations during generation:
1. One of the provided stop strings
2. One of the provided stop tokens
3. The EOS token

Run `pytest tests/engine/test_stop_reason.py`.
"""

import pytest
import transformers

from vllm import SamplingParams

MODEL = "facebook/opt-350m"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024


@pytest.fixture
def vllm_model(vllm_runner):
    with vllm_runner(MODEL) as vllm_model:
        yield vllm_model


def test_stop_reason(vllm_model, example_prompts):
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
    stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
    llm = vllm_model.model

    # test stop token
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               ignore_eos=True,
                               seed=SEED,
                               max_tokens=MAX_TOKENS,
                               stop_token_ids=[stop_token_id]))
    for output in outputs:
        output = output.outputs[0]
        assert output.finish_reason == "stop"
        assert output.stop_reason == stop_token_id

    # test stop string
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               ignore_eos=True,
                               seed=SEED,
                               max_tokens=MAX_TOKENS,
                               stop="."))
    for output in outputs:
        output = output.outputs[0]
        assert output.finish_reason == "stop"
        assert output.stop_reason == STOP_STR

    # test EOS token
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               seed=SEED, max_tokens=MAX_TOKENS))
    for output in outputs:
        output = output.outputs[0]
        assert output.finish_reason == "length" or (
            output.finish_reason == "stop" and output.stop_reason is None)
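Note the typing convention these assertions rely on: stop_reason is a str when a stop string fired, an int when a stop token id fired, and None when generation ended on EOS (while hitting max_tokens yields finish_reason "length" instead). A small illustrative helper, not part of vLLM's API:

def describe_stop(finish_reason, stop_reason):
    if finish_reason == "length":
        return "hit max_tokens"
    if isinstance(stop_reason, str):
        return f"matched stop string {stop_reason!r}"
    if isinstance(stop_reason, int):
        return f"matched stop token id {stop_reason}"
    return "hit the EOS token"  # finish_reason == "stop", stop_reason None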
163  vllm-v0.6.2/tests/engine/test_stop_strings.py  (new file)
@@ -0,0 +1,163 @@
from typing import Any, List, Optional

import pytest

from vllm import CompletionOutput, LLMEngine, SamplingParams

MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200

IS_ASYNC = False


@pytest.fixture(scope="session")
def vllm_model(vllm_runner):
    with vllm_runner(MODEL) as vllm_model:
        yield vllm_model


def _test_stopping(llm_engine: LLMEngine,
                   expected_output: str,
                   expected_reason: Any,
                   stop: Optional[List[str]] = None,
                   stop_token_ids: Optional[List[int]] = None,
                   include_in_output: bool = False,
                   use_async_output_proc: bool = False) -> None:
    llm_engine.add_request(
        "id", "A story about vLLM:\n",
        SamplingParams(
            temperature=0.0,
            max_tokens=MAX_TOKENS,
            stop=stop,
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_in_output,
        ), None)

    output: Optional[CompletionOutput] = None
    output_text = ""
    stop_reason = None

    if use_async_output_proc:
        llm_engine.step()

    while llm_engine.has_unfinished_requests():
        (request_output, ) = llm_engine.step()
        (output, ) = request_output.outputs

        # Ensure we don't backtrack
        assert output.text.startswith(output_text)
        output_text = output.text
        stop_reason = output.stop_reason

    assert output is not None
    assert output_text == expected_output
    assert stop_reason == expected_reason


def _set_async_mode(llm_engine, is_async):
    llm_engine.scheduler[0].use_async_output_proc = is_async


def _stop_basic(llm_engine, is_async):
    _test_stopping(llm_engine,
                   stop=["."],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=".",
                   use_async_output_proc=is_async)

    _test_stopping(llm_engine,
                   stop=["."],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization.",
                   expected_reason=".",
                   use_async_output_proc=is_async)


def _stop_multi_tokens(llm_engine, is_async):
    _test_stopping(
        llm_engine,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
        expected_reason="group of peo",
        use_async_output_proc=is_async)

    _test_stopping(
        llm_engine,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output=
        "VLLM is a 100% volunteer organization. We are a group of peo",
        expected_reason="group of peo",
        use_async_output_proc=is_async)


def _stop_partial_token(llm_engine, is_async):
    _test_stopping(llm_engine,
                   stop=["gani"],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer or",
                   expected_reason="gani",
                   use_async_output_proc=is_async)

    _test_stopping(llm_engine,
                   stop=["gani"],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organi",
                   expected_reason="gani",
                   use_async_output_proc=is_async)


def _stop_token_id(llm_engine, is_async):
    # token id 13013 => " organization"

    _test_stopping(llm_engine,
                   stop_token_ids=[13013],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer",
                   expected_reason=13013,
                   use_async_output_proc=is_async)

    _test_stopping(llm_engine,
                   stop_token_ids=[13013],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=13013,
                   use_async_output_proc=is_async)


@pytest.mark.skip_global_cleanup
def test_stop_basic(vllm_model):
    _set_async_mode(vllm_model.model.llm_engine, True)
    _stop_basic(vllm_model.model.llm_engine, is_async=True)

    _set_async_mode(vllm_model.model.llm_engine, False)
    _stop_basic(vllm_model.model.llm_engine, is_async=False)


@pytest.mark.skip_global_cleanup
def test_stop_multi_tokens(vllm_model):
    _set_async_mode(vllm_model.model.llm_engine, True)
    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)

    _set_async_mode(vllm_model.model.llm_engine, False)
    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)


@pytest.mark.skip_global_cleanup
def test_stop_partial_token(vllm_model):
    _set_async_mode(vllm_model.model.llm_engine, True)
    _stop_partial_token(vllm_model.model.llm_engine, is_async=True)

    _set_async_mode(vllm_model.model.llm_engine, False)
    _stop_partial_token(vllm_model.model.llm_engine, is_async=False)


@pytest.mark.skip_global_cleanup
def test_stop_token_id(vllm_model):
    _set_async_mode(vllm_model.model.llm_engine, True)
    _stop_token_id(vllm_model.model.llm_engine, is_async=True)

    _set_async_mode(vllm_model.model.llm_engine, False)
    _stop_token_id(vllm_model.model.llm_engine, is_async=False)
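The partial-token cases above ("gani" matching inside "organization") work because stop strings are matched against the detokenized text, not against token ids, so the output is truncated at the character level. A sketch of that truncation, illustrative rather than vLLM's implementation:

def apply_stop_string(text: str, stop: str, include: bool) -> str:
    idx = text.find(stop)
    if idx == -1:
        return text  # no stop match; generation continues
    return text[:idx + len(stop)] if include else text[:idx]

assert apply_stop_string("volunteer organization", "gani", False) == \
    "volunteer or"
assert apply_stop_string("volunteer organization", "gani", True) == \
    "volunteer organi"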