add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/model_executor/__init__.py
Normal file
0
vllm-v0.6.2/tests/model_executor/__init__.py
Normal file
49
vllm-v0.6.2/tests/model_executor/conftest.py
Normal file
49
vllm-v0.6.2/tests/model_executor/conftest.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def sample_regex():
    """Regex matching a dotted-quad IPv4 address (e.g. ``192.168.0.1``)."""
    # One octet: 250-255, 200-249, 100-199, 10-99, or 0-9.
    octet = r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
    # Three dot-terminated octets followed by a final bare octet.
    return r"(" + octet + r"\.){3}" + octet
|
||||
|
||||
|
||||
@pytest.fixture
def sample_json_schema():
    """JSON schema of an employee profile used by guided-decoding tests."""
    # At least three skill strings, each capped at 10 characters.
    skills_schema = {
        "type": "array",
        "items": {
            "type": "string",
            "maxLength": 10,
        },
        "minItems": 3,
    }
    # Each job entry requires a company and a position; duration is optional.
    work_history_schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "company": {
                    "type": "string"
                },
                "duration": {
                    "type": "number"
                },
                "position": {
                    "type": "string"
                },
            },
            "required": ["company", "position"],
        },
    }
    return {
        "type": "object",
        "properties": {
            "name": {
                "type": "string"
            },
            "age": {
                "type": "integer"
            },
            "skills": skills_schema,
            "work_history": work_history_schema,
        },
        "required": ["name", "age", "skills", "work_history"],
    }
|
||||
92
vllm-v0.6.2/tests/model_executor/test_enabled_custom_ops.py
Normal file
92
vllm-v0.6.2/tests/model_executor/test_enabled_custom_ops.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.layers.activation import (GeluAndMul,
|
||||
ReLUSquaredActivation,
|
||||
SiluAndMul)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
|
||||
|
||||
# Registered subclass for test
|
||||
@CustomOp.register("relu3")
class Relu3(ReLUSquaredActivation):
    """ReLU-squared activation registered under the custom-op name "relu3".

    Exists only so the tests below can verify that a *registered*
    CustomOp subclass is enabled/disabled by its own name rather than
    by its parent's name.
    """
    pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "env, torch_level, ops_enabled, default_on",
    [
        # Default values based on compile level
        ("", 0, [True] * 4, True),
        ("", 1, [True] * 4, True),
        ("", 2, [True] * 4, True),  # All by default
        ("", 3, [False] * 4, False),
        ("", 4, [False] * 4, False),  # None by default
        # Explicitly enabling/disabling
        #
        # Default: all
        #
        # All but SiluAndMul
        ("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True),
        # Only ReLU3
        ("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False),
        # All but SiluAndMul
        ("all,-silu_and_mul", 1, [1, 0, 1, 1], True),
        # All but ReLU3 (even if ReLU2 is on)
        ("-relu3,relu2", 1, [1, 1, 1, 0], True),
        # GeluAndMul and SiluAndMul
        ("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False),
        # All but RMSNorm
        ("-rms_norm", 2, [0, 1, 1, 1], True),
        #
        # Default: none
        #
        # Only ReLU3
        ("-silu_and_mul,+relu3", 3, [0, 0, 0, 1], False),
        # All but RMSNorm
        ("all,-rms_norm", 4, [0, 1, 1, 1], True),
    ])
def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
                     default_on: bool):
    """Check CustomOp enablement against VLLM_CUSTOM_OPS and compile level.

    ``ops_enabled`` lists the expected enabled state, in order, of:
    RMSNorm, SiluAndMul, GeluAndMul, Relu3.
    """
    # Remember prior values so this test does not leak environment state
    # into other tests running in the same process.
    prev_ops = os.environ.get("VLLM_CUSTOM_OPS")
    prev_level = os.environ.get("VLLM_TORCH_COMPILE_LEVEL")
    os.environ["VLLM_CUSTOM_OPS"] = env
    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level)
    try:
        # Reset default_on (computed once):
        CustomOp.default_on.cache_clear()

        assert CustomOp.default_on() == default_on

        expected = [bool(x) for x in ops_enabled]

        assert RMSNorm(1024).enabled() == expected[0]
        assert CustomOp.op_registry["rms_norm"].enabled() == expected[0]

        assert SiluAndMul().enabled() == expected[1]
        assert CustomOp.op_registry["silu_and_mul"].enabled() == expected[1]

        assert GeluAndMul().enabled() == expected[2]
        assert CustomOp.op_registry["gelu_and_mul"].enabled() == expected[2]

        # If registered, subclasses should follow their own name
        assert Relu3().enabled() == expected[3]
        assert CustomOp.op_registry["relu3"].enabled() == expected[3]

        # Unregistered subclass
        class SiluAndMul2(SiluAndMul):
            pass

        # Subclasses should not require registration
        assert SiluAndMul2().enabled() == SiluAndMul().enabled()
    finally:
        # Restore the prior environment (delete keys that were unset).
        if prev_ops is None:
            os.environ.pop("VLLM_CUSTOM_OPS", None)
        else:
            os.environ["VLLM_CUSTOM_OPS"] = prev_ops
        if prev_level is None:
            os.environ.pop("VLLM_TORCH_COMPILE_LEVEL", None)
        else:
            os.environ["VLLM_TORCH_COMPILE_LEVEL"] = prev_level
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"])
def test_enabled_ops_invalid(env: str):
    """Contradictory VLLM_CUSTOM_OPS specs must fail the enablement check."""
    # Remember the prior value so this test does not leak environment
    # state into other tests running in the same process.
    prev_ops = os.environ.get("VLLM_CUSTOM_OPS")
    os.environ["VLLM_CUSTOM_OPS"] = env
    try:
        CustomOp.default_on.cache_clear()

        with pytest.raises(AssertionError):
            RMSNorm(1024).enabled()
    finally:
        # Restore the prior environment (delete the key if it was unset).
        if prev_ops is None:
            os.environ.pop("VLLM_CUSTOM_OPS", None)
        else:
            os.environ["VLLM_CUSTOM_OPS"] = prev_ops
|
||||
85
vllm-v0.6.2/tests/model_executor/test_guided_processors.py
Normal file
85
vllm-v0.6.2/tests/model_executor/test_guided_processors.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.model_executor.guided_decoding import (
|
||||
get_guided_decoding_logits_processor)
|
||||
from vllm.model_executor.guided_decoding.outlines_logits_processors import (
|
||||
JSONLogitsProcessor, RegexLogitsProcessor)
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
def test_guided_logits_processors(sample_regex, sample_json_schema):
    """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')

    def check_modifies_logits(processor, prompt):
        # Shared assertion helper: the processor must keep the logits
        # tensor's shape but change at least one value.
        prompt_ids = tokenizer.encode(prompt)
        logits = torch.rand(32000)
        reference = torch.clone(logits)
        processor(prompt_ids, logits)
        assert logits.shape == reference.shape
        assert not torch.allclose(logits, reference)

    regex_processor = RegexLogitsProcessor(sample_regex, tokenizer)
    check_modifies_logits(
        regex_processor,
        f"Give an example IPv4 address with this regex: {sample_regex}")

    json_processor = JSONLogitsProcessor(sample_json_schema,
                                         tokenizer,
                                         whitespace_pattern=None)
    check_modifies_logits(
        json_processor,
        f"Give an employee profile that fits this schema: {sample_json_schema}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
async def test_guided_logits_processor_black_box(backend: str, sample_regex,
                                                 sample_json_schema):
    """Black-box check that each backend's processor perturbs the logits."""
    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')

    def run_and_check(processor, prompt_ids):
        # The processor must return a tensor of the same shape that
        # differs from the input scores.
        scores = torch.rand(32000)
        reference = torch.clone(scores)
        scores = processor(prompt_ids, scores)
        assert scores.shape == reference.shape
        assert not torch.allclose(scores, reference)

    regex_prompt_ids = tokenizer.encode(
        f"Give an example IPv4 address with this regex: {sample_regex}")
    regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
    regex_lp = await get_guided_decoding_logits_processor(
        regex_request, tokenizer)
    assert regex_lp is not None
    run_and_check(regex_lp, regex_prompt_ids)

    json_prompt_ids = tokenizer.encode(
        f"Give an employee profile that fits this schema: {sample_json_schema}"
    )
    json_request = GuidedDecodingParams(json=sample_json_schema,
                                        backend=backend)
    json_lp = await get_guided_decoding_logits_processor(
        json_request, tokenizer)
    assert json_lp is not None
    run_and_check(json_lp, json_prompt_ids)
|
||||
|
||||
|
||||
def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
    """Combining two guided-decoding options must raise ValueError."""
    # Each entry conflicts with the json= option passed alongside it.
    conflicting_kwargs = [
        dict(regex=sample_regex),
        dict(json_object=True),
        dict(choice=["a", "b"]),
        dict(grammar="test grammar"),
    ]
    for extra in conflicting_kwargs:
        with pytest.raises(ValueError,
                           match="You can only use one kind of guided"):
            GuidedDecodingParams(json=sample_json_schema, **extra)
|
||||
@@ -0,0 +1,94 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.layers.pooler import PoolingType
|
||||
from vllm.model_executor.models.bert import BertEmbeddingModel
|
||||
from vllm.model_executor.models.roberta import RobertaEmbeddingModel
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Cap the context length so the test engines start quickly.
MAX_MODEL_LEN = 128
# BERT embedding model under test; overridable from the environment.
MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5")
REVISION = os.environ.get("REVISION", "main")

# NOTE(review): these read the *same* MODEL_NAME/REVISION env vars as the
# BERT constants above, so overriding MODEL_NAME changes both models at
# once — and the hard-coded tokenizer-id asserts below would then fail.
# Presumably the vars should be MODEL_NAME_ROBERTA/REVISION_ROBERTA; confirm.
MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
                                    "intfloat/multilingual-e5-large")
REVISION_ROBERTA = os.environ.get("REVISION", "main")
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
def test_model_loading_with_params(vllm_runner):
    """
    Test parameter weight loading with tp>1.
    """
    with vllm_runner(model_name=MODEL_NAME,
                     revision=REVISION,
                     dtype="float16",
                     max_model_len=MAX_MODEL_LEN) as vllm_model:
        embedding_output = vllm_model.encode(
            "Write a short story about a robot that"
            " dreams for the first time.\n")

        engine = vllm_model.model.llm_engine
        loaded_config = engine.model_config
        loaded_tokenizer = engine.tokenizer

        # asserts on the bert model config file
        assert loaded_config.encoder_config["max_seq_length"] == 512
        assert loaded_config.encoder_config["do_lower_case"]

        # asserts on the pooling config files
        assert (loaded_config.pooler_config.pooling_type ==
                PoolingType.CLS.name)
        assert loaded_config.pooler_config.pooling_norm

        # asserts on the tokenizer loaded
        assert loaded_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5"
        assert loaded_tokenizer.tokenizer_config["do_lower_case"]
        assert loaded_tokenizer.tokenizer.model_max_length == 512

        loaded_model = (
            engine.model_executor.driver_worker.model_runner.model)
        assert isinstance(loaded_model, BertEmbeddingModel)
        assert loaded_model._pooler.pooling_type == PoolingType.CLS
        assert loaded_model._pooler.normalize
        # assert output
        assert embedding_output
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
def test_roberta_model_loading_with_params(vllm_runner):
    """
    Test parameter weight loading with tp>1.
    """
    with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                     revision=REVISION_ROBERTA,
                     dtype="float16",
                     max_model_len=MAX_MODEL_LEN) as vllm_model:
        embedding_output = vllm_model.encode(
            "Write a short story about a robot that"
            " dreams for the first time.\n")

        engine = vllm_model.model.llm_engine
        loaded_config = engine.model_config
        loaded_tokenizer = engine.tokenizer

        # asserts on the bert model config file
        assert loaded_config.encoder_config["max_seq_length"] == 512
        assert not loaded_config.encoder_config["do_lower_case"]

        # asserts on the pooling config files
        assert (loaded_config.pooler_config.pooling_type ==
                PoolingType.MEAN.name)
        assert loaded_config.pooler_config.pooling_norm

        # asserts on the tokenizer loaded
        assert loaded_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large"
        assert not loaded_tokenizer.tokenizer_config["do_lower_case"]

        loaded_model = (
            engine.model_executor.driver_worker.model_runner.model)
        assert isinstance(loaded_model, RobertaEmbeddingModel)
        assert loaded_model._pooler.pooling_type == PoolingType.MEAN
        assert loaded_model._pooler.normalize

        # assert output
        assert embedding_output
|
||||
54
vllm-v0.6.2/tests/model_executor/weight_utils.py
Normal file
54
vllm-v0.6.2/tests/model_executor/weight_utils.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import huggingface_hub.constants
|
||||
import pytest
|
||||
from huggingface_hub.utils import LocalEntryNotFoundError
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
download_weights_from_hf, enable_hf_transfer)
|
||||
|
||||
|
||||
def test_hf_transfer_auto_activation():
    """enable_hf_transfer() must mirror whether hf_transfer is installed."""
    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
        # in case it is already set, we can't test the auto activation
        pytest.skip(
            "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
    enable_hf_transfer()
    try:
        # enable hf hub transfer if available
        import hf_transfer  # type: ignore # noqa
        # Fixed typo: was HF_TRANFER_ACTIVE; also a local, not a constant.
        hf_transfer_active = True
    except ImportError:
        hf_transfer_active = False
    assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
            hf_transfer_active)
|
||||
|
||||
|
||||
def test_download_weights_from_hf():
    """download_weights_from_hf honors HF_HUB_OFFLINE and caches downloads."""
    # Remember the global flag so it can be restored: the original code
    # left HF_HUB_OFFLINE = True behind, breaking later tests that need
    # network access.
    original_offline = huggingface_hub.constants.HF_HUB_OFFLINE
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # assert LocalEntryNotFoundError error is thrown
            # if offline is set and model is not cached
            huggingface_hub.constants.HF_HUB_OFFLINE = True
            with pytest.raises(LocalEntryNotFoundError):
                download_weights_from_hf(
                    "facebook/opt-125m",
                    allow_patterns=["*.safetensors", "*.bin"],
                    cache_dir=tmpdir)

            # download the model
            huggingface_hub.constants.HF_HUB_OFFLINE = False
            download_weights_from_hf("facebook/opt-125m",
                                     allow_patterns=["*.safetensors", "*.bin"],
                                     cache_dir=tmpdir)

            # now it should work offline
            huggingface_hub.constants.HF_HUB_OFFLINE = True
            assert download_weights_from_hf(
                "facebook/opt-125m",
                allow_patterns=["*.safetensors", "*.bin"],
                cache_dir=tmpdir) is not None
    finally:
        # Restore the global flag for subsequent tests in this process.
        huggingface_hub.constants.HF_HUB_OFFLINE = original_offline
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this file directly (outside pytest) for a quick check.
    test_hf_transfer_auto_activation()
    test_download_weights_from_hf()
|
||||
Reference in New Issue
Block a user