Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -1,69 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import tempfile
from unittest.mock import patch
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, ParallelConfig, SchedulerConfig)
from vllm.lora.models import LoRAMapping
from vllm.config import (
CacheConfig,
DeviceConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
)
from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.model_manager import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.worker.worker import Worker
from vllm.v1.worker.gpu_worker import Worker
# Model identifier handed to ModelConfig by the test in this file.
MODEL_PATH = "Qwen/Qwen3-0.6B"
# Number of LoRA requests the test creates; also used for the LoRAConfig
# capacity limits (max_loras / max_cpu_loras) and the number of random rounds.
NUM_LORAS = 16
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(qwen3_lora_files):
    """Check that a GPU worker reports the LoRA adapters activated on it.

    The test builds a single-process ``Worker`` for ``MODEL_PATH`` with a
    LoRA capacity of ``NUM_LORAS``, then:

    1. activates no adapters and expects ``list_loras()`` to be empty;
    2. activates ``NUM_LORAS`` distinct requests and expects exactly their
       integer ids to be listed;
    3. repeatedly activates a seeded random subset and expects every id in
       that subset to be listed.

    Args:
        qwen3_lora_files: Value forwarded as the third positional argument of
            every ``LoRARequest`` (presumably a pytest fixture yielding the
            adapter path; it is defined outside this file).
    """

    def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
        """Activate ``lora_requests`` on the worker using an empty mapping."""
        lora_mapping = LoRAMapping([], [])
        worker.model_runner.lora_manager.set_active_adapters(
            lora_requests, lora_mapping
        )

    model_config = ModelConfig(
        MODEL_PATH,
        seed=0,
        dtype="float16",
        max_model_len=127,
        enforce_eager=True,
    )

    vllm_config = VllmConfig(
        model_config=model_config,
        load_config=LoadConfig(
            download_dir=None,
            # NOTE(review): "dummy" presumably avoids loading real weights;
            # confirm against LoadConfig.
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(
            pipeline_parallel_size=1,
            tensor_parallel_size=1,
            data_parallel_size=1,
        ),
        scheduler_config=SchedulerConfig(
            max_model_len=model_config.max_model_len,
            is_encoder_decoder=model_config.is_encoder_decoder,
            runner_type="generate",
            max_num_batched_tokens=32,
            max_num_seqs=32,
            max_num_partial_prefills=32,
        ),
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(
            block_size=16,
            swap_space=0,
            cache_dtype="auto",
        ),
        lora_config=LoRAConfig(
            max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
        ),
    )

    # mkstemp() returns an *open* descriptor together with the path. Only the
    # path is needed for the init-method URL, so close the descriptor right
    # away instead of leaking it for the lifetime of the test process.
    init_fd, init_path = tempfile.mkstemp()
    os.close(init_fd)

    worker = Worker(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=f"file://{init_path}",
    )

    worker.init_device()
    worker.load_model()

    # Nothing activated yet: the worker must report no adapters.
    set_active_loras(worker, [])
    assert worker.list_loras() == set()

    # Ids start at 1 (i + 1) for both the request name and the integer id.
    lora_requests = [
        LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS)
    ]

    set_active_loras(worker, lora_requests)
    assert worker.list_loras() == {
        lora_request.lora_int_id for lora_request in lora_requests
    }

    for i in range(NUM_LORAS):
        # Seed per round so each sampled subset is reproducible.
        random.seed(i)
        iter_lora_requests = random.choices(
            lora_requests, k=random.randint(1, NUM_LORAS)
        )
        random.shuffle(iter_lora_requests)
        # Drop a random number of trailing requests. Computing the cut point
        # explicitly avoids the ``[:-0]`` pitfall, where a draw of 0 would
        # empty the list instead of dropping nothing.
        num_dropped = random.randint(0, NUM_LORAS)
        keep = max(0, len(iter_lora_requests) - num_dropped)
        iter_lora_requests = iter_lora_requests[:keep]
        # Activate the sampled subset (not the full list) so the assertion
        # below actually exercises re-activation of this round's requests.
        set_active_loras(worker, iter_lora_requests)
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id for lora_request in iter_lora_requests}
        )