Sync from v0.13
This commit is contained in:
@@ -1,136 +1,164 @@
|
||||
import contextlib
|
||||
import gc
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.distributed import destroy_model_parallel, initialize_model_parallel
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.distributed import (
|
||||
cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.models.interfaces import SupportsLoRA
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def cleanup():
|
||||
destroy_model_parallel()
|
||||
with contextlib.suppress(AssertionError):
|
||||
torch.distributed.destroy_process_group()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
ray.shutdown()
|
||||
@pytest.fixture()
|
||||
def should_do_global_cleanup_after_test(request) -> bool:
|
||||
"""Allow subdirectories to skip global cleanup by overriding this fixture.
|
||||
This can provide a ~10x speedup for non-GPU unit tests since they don't need
|
||||
to initialize torch.
|
||||
"""
|
||||
|
||||
return not request.node.get_closest_marker("skip_global_cleanup")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def cleanup_fixture():
|
||||
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
|
||||
yield
|
||||
cleanup()
|
||||
if should_do_global_cleanup_after_test:
|
||||
cleanup_dist_env_and_memory(shutdown_ray=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dist_init():
|
||||
if not torch.distributed.is_initialized():
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
torch.distributed.init_process_group(
|
||||
backend="nccl",
|
||||
world_size=1,
|
||||
rank=0,
|
||||
init_method=f"file://{temp_file}",
|
||||
)
|
||||
torch.distributed.all_reduce(torch.zeros(1).cuda())
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
|
||||
backend = "nccl"
|
||||
if current_platform.is_cpu() or current_platform.is_tpu():
|
||||
backend = "gloo"
|
||||
|
||||
init_distributed_environment(
|
||||
world_size=1,
|
||||
rank=0,
|
||||
distributed_init_method=f"file://{temp_file}",
|
||||
local_rank=0,
|
||||
backend=backend,
|
||||
)
|
||||
initialize_model_parallel(1, 1)
|
||||
yield
|
||||
cleanup()
|
||||
cleanup_dist_env_and_memory(shutdown_ray=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dist_init_torch_only():
|
||||
if torch.distributed.is_initialized():
|
||||
return
|
||||
backend = "nccl"
|
||||
if current_platform.is_cpu():
|
||||
backend = "gloo"
|
||||
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
torch.distributed.init_process_group(
|
||||
backend="nccl",
|
||||
world_size=1,
|
||||
rank=0,
|
||||
init_method=f"file://{temp_file}",
|
||||
world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend
|
||||
)
|
||||
|
||||
|
||||
class DummyLoRAModel(nn.Sequential, SupportsLoRA):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_model() -> nn.Module:
|
||||
model = nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", ColumnParallelLinear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model = DummyLoRAModel(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", ColumnParallelLinear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]
|
||||
)
|
||||
)
|
||||
model.config = MagicMock()
|
||||
model.embedding_modules = {"lm_head": "lm_head"}
|
||||
model.unpadded_vocab_size = 32000
|
||||
return model
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_model_gate_up() -> nn.Module:
|
||||
model = nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model = DummyLoRAModel(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]
|
||||
)
|
||||
)
|
||||
model.config = MagicMock()
|
||||
model.packed_modules_mapping = {
|
||||
"gate_up_proj": [
|
||||
"gate_proj",
|
||||
"up_proj",
|
||||
],
|
||||
}
|
||||
model.embedding_modules = {"lm_head": "lm_head"}
|
||||
model.unpadded_vocab_size = 32000
|
||||
|
||||
return model
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sql_lora_files():
|
||||
return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def mixtral_lora_files():
|
||||
return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def gemma_lora_files():
|
||||
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
|
||||
# Note: this module has incorrect adapter_config.json to test
|
||||
# https://github.com/vllm-project/vllm/pull/5909/files.
|
||||
return snapshot_download(repo_id="SangBinCho/mixtral-lora")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@@ -149,31 +177,85 @@ def baichuan_zero_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def baichuan_regex_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ilama_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/ilama-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def minicpmv_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen2vl_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen25vl_base_huggingface_id():
|
||||
# used as a base model for testing with qwen25vl lora adapter
|
||||
return "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen25vl_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tinyllama_lora_files():
|
||||
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
|
||||
cleanup()
|
||||
get_model_old = get_model
|
||||
@pytest.fixture(scope="session")
|
||||
def deepseekv2_lora_files():
|
||||
return snapshot_download(repo_id="wuchen01/DeepSeek-V2-Lite-Chat-All-LoRA")
|
||||
|
||||
def get_model_patched(*, model_config, device_config, **kwargs):
|
||||
kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
|
||||
return get_model_old(model_config=model_config,
|
||||
device_config=device_config,
|
||||
**kwargs)
|
||||
|
||||
with patch("vllm.worker.model_runner.get_model", get_model_patched):
|
||||
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
|
||||
yield engine.llm_engine
|
||||
del engine
|
||||
cleanup()
|
||||
@pytest.fixture(scope="session")
|
||||
def gptoss20b_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/gpt-oss-20b-lora-adapter-text2sql")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3moe_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/qwen3-moe-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def olmoe_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/olmoe-instruct-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_lora_files():
|
||||
return snapshot_download(repo_id="charent/self_cognition_Alice")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def llama32_lora_huggingface_id():
|
||||
# huggingface repo id is used to test lora runtime downloading.
|
||||
return "jeeejeee/llama32-3b-text2sql-spider"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def llama32_lora_files(llama32_lora_huggingface_id):
|
||||
return snapshot_download(repo_id=llama32_lora_huggingface_id)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def llama_2_7b_model_extra_embeddings(
|
||||
llama_2_7b_engine_extra_embeddings) -> nn.Module:
|
||||
yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
def reset_default_device():
|
||||
"""
|
||||
Some tests, such as `test_punica_ops.py`, explicitly set the
|
||||
default device, which can affect subsequent tests. Adding this fixture
|
||||
helps avoid this problem.
|
||||
"""
|
||||
original_device = torch.get_default_device()
|
||||
yield
|
||||
torch.set_default_device(original_device)
|
||||
|
||||
Reference in New Issue
Block a user