Sync from v0.13
This commit is contained in:
187
tests/lora/test_llm_with_multi_loras.py
Normal file
187
tests/lora/test_llm_with_multi_loras.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
This script contains:
1. a test of serving multiple LoRA adapters with tensor parallelism (tp >= 2)
2. a test of passing multiple LoRA requests to a single `generate` call
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
# Base model loaded by every test in this file.
MODEL_PATH = "Qwen/Qwen3-0.6B"
# Adapter name -> value passed as `lora_path` when building a LoRARequest.
LORA_NAME_PATH_MAP = {
    "Alice": "charent/self_cognition_Alice",
    "Bob": "charent/self_cognition_Bob",
    "Cat": "charent/self_cognition_Bob",  # same as Bob
}

# Adapter name -> integer id most recently assigned to it by
# make_add_lora_request (overwritten each time the adapter is re-added).
LORA_NAME_ID_MAP = {}
# Last integer id handed out; make_add_lora_request increments it before use,
# so the first id issued is 1.
INCREASE_LORA_ID = 0
# Passed as `max_lora_rank` when constructing the LLM.
LORA_RANK = 8

# Prompts sent through the "Alice" adapter in the tensor-parallel test.
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
# Expected outputs, index-aligned with LORA_TEST_PROMPTS; compared with `==`
# against the text generated with temperature=0.
LORA_TEST_EXPECTED = [
    "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.",  # noqa: E501
    "I am Alice, an AI assistant developed by GitHub/Charent.",
]
|
||||
|
||||
|
||||
def format_chatml_messages(prompt: str):
    """Build a two-message chat: a fixed system message, then *prompt* as the user turn."""
    system_turn = dict(role="system", content="You are a helpful assistant.")
    user_turn = dict(role="user", content=prompt)
    return [system_turn, user_turn]
|
||||
|
||||
|
||||
def make_add_lora_request(name: str, path: str):
    """Allocate the next adapter id, record it under *name*, and build the request.

    Side effects: bumps the module-level ``INCREASE_LORA_ID`` counter and
    overwrites ``LORA_NAME_ID_MAP[name]`` with the newly issued id, so a
    repeated call for the same name yields a fresh id.

    Returns:
        A ``LoRARequest`` carrying *name*, the new id, and *path*.
    """
    global INCREASE_LORA_ID, LORA_NAME_ID_MAP

    new_id = INCREASE_LORA_ID + 1
    INCREASE_LORA_ID = new_id
    LORA_NAME_ID_MAP[name] = new_id

    return LoRARequest(lora_name=name, lora_int_id=new_id, lora_path=path)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multi_loras_with_tp_sync():
    """Check that the "Alice" adapter's output is stable under TP=2 while
    other adapters are used, removed and re-added.

    Three adapters are registered on an engine that allows fewer active
    adapters (``max_loras=2``) than CPU-cached ones (``max_cpu_loras=4``).
    For each test prompt, Alice's greedy output must equal the expected text:
    initially, after a request through Bob, after Bob is reloaded, and after
    Alice itself is reloaded.
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,  # ensure max_loras < max_cpu_loras
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
        tensor_parallel_size=2,  # ensure tp >= 2
        max_cpu_loras=4,  # ensure max_cpu_loras >= 2
    )

    def run_check_lora(fn, args, expected: list):
        """Call ``fn(args)`` and assert the engine then lists exactly *expected* ids."""
        fn(args)
        assert set(llm.llm_engine.list_loras()) == set(expected)

    # simulate adding loras with CLI args,
    # like: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob`
    #
    # The expected ids are taken from the requests themselves rather than
    # hard-coded as 1, 2, 3: ids come from a module-level counter, so literal
    # values would be wrong if any request had been created earlier in the
    # same process.
    registered_ids = []
    for adapter_name in ("Alice", "Bob", "Cat"):
        add_request = make_add_lora_request(
            adapter_name, LORA_NAME_PATH_MAP[adapter_name]
        )
        registered_ids.append(add_request.lora_int_id)
        run_check_lora(llm.llm_engine.add_lora, add_request, registered_ids)

    # set temperature = 0 for greedy search
    sampling_params = SamplingParams(temperature=0, max_tokens=64)

    def call_llm_get_outputs(prompt: str, lora_name: str):
        """Run one chat request through adapter *lora_name*; return the generated text."""
        lora_request = LoRARequest(
            lora_name=lora_name,
            # Looked up at call time so a reloaded adapter uses its newest id.
            lora_int_id=LORA_NAME_ID_MAP[lora_name],
            lora_path=LORA_NAME_PATH_MAP[lora_name],
        )
        messages = format_chatml_messages(prompt)
        outputs = llm.chat(
            [messages],
            sampling_params,
            chat_template_kwargs={
                "enable_thinking": False
            },  # for those loras, ensure enable_thinking=False
            lora_request=lora_request,
            use_tqdm=False,
        )
        return outputs[0].outputs[0].text

    def reload_lora(name: str):
        """
        reload a lora to simulate the case:
        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
        for dynamic lora loading and unloading

        The adapter is removed under its current id and re-added through
        make_add_lora_request, which assigns it a new id.
        """
        remove_lora_response = llm.llm_engine.remove_lora(
            lora_id=LORA_NAME_ID_MAP[name]
        )

        add_lora_response = llm.llm_engine.add_lora(
            make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
        )

        print(f"{remove_lora_response=}, {add_lora_response=}")

    def check_outputs(prompt: str, output_text: str, expected_output: str):
        """Print the comparison for debugging, then assert an exact match.

        All three values are explicit parameters; the helper does not read
        loop variables from the enclosing scope.
        """
        print(f"{prompt=}.\n{expected_output=}\n{output_text=}")
        print("\n----------------------------\n")
        assert output_text == expected_output

    for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(prompt, output_text, expected_output)

        # call Bob, ignoring its output
        call_llm_get_outputs(prompt, "Bob")
        print("After call Bob:")

        # call Alice
        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(prompt, output_text, expected_output)

        # reload Bob Lora
        reload_lora("Bob")
        print("After reload Bob:")

        # call Alice
        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(prompt, output_text, expected_output)

        # reload Alice Lora
        reload_lora("Alice")
        print("After reload Alice:")

        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(prompt, output_text, expected_output)
|
||||
|
||||
|
||||
def test_multiple_lora_requests():
    """Check how ``LLM.generate`` pairs ``lora_request`` values with prompts.

    Three cases are exercised: a list with one request per prompt, a list
    shorter than the prompt list (must raise ``ValueError``), and a single
    request given for several prompts.
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    prompts = ["Hello, my name is"] * 2
    adapter = "Alice"
    adapter_path = LORA_NAME_PATH_MAP[adapter]
    # One request per prompt: names "Alice0", "Alice1", ... with ids 1, 2, ...
    lora_request = [
        LoRARequest(adapter + str(idx), idx + 1, adapter_path)
        for idx in range(len(prompts))
    ]

    # A list of LoRARequests is matched one-to-one with the prompts
    outputs = llm.generate(prompts, lora_request=lora_request)
    assert len(outputs) == len(prompts)

    # A list whose length differs from the number of prompts is rejected
    with pytest.raises(ValueError):
        llm.generate(prompts, lora_request=lora_request[:1])

    # A single LoRARequest is accepted for the whole batch of prompts
    outputs = llm.generate(prompts, lora_request=lora_request[0])
    assert len(outputs) == len(prompts)
|
||||
Reference in New Issue
Block a user