Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/lora/test_llm_with_multi_loras.py
+++ b/tests/lora/test_llm_with_multi_loras.py
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script contains:
+1. test multi loras service with tp >= 2
+2. test multi loras request
+"""
+
+import pytest
+
+from tests.utils import multi_gpu_test
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "Qwen/Qwen3-0.6B"
+LORA_NAME_PATH_MAP = {
+    "Alice": "charent/self_cognition_Alice",
+    "Bob": "charent/self_cognition_Bob",
+    "Cat": "charent/self_cognition_Bob",  # same as Bob
+}
+
+LORA_NAME_ID_MAP = {}
+INCREASE_LORA_ID = 0
+LORA_RANK = 8
+
+LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
+LORA_TEST_EXPECTED = [
+    "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.",  # noqa: E501
+    "I am Alice, an AI assistant developed by GitHub/Charent.",
+]
+
+
+def format_chatml_messages(prompt: str):
+    return [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+
+
+def make_add_lora_request(name: str, path: str):
+    global INCREASE_LORA_ID, LORA_NAME_ID_MAP
+
+    INCREASE_LORA_ID += 1
+    LORA_NAME_ID_MAP[name] = INCREASE_LORA_ID
+
+    return LoRARequest(
+        lora_name=name,
+        lora_int_id=INCREASE_LORA_ID,
+        lora_path=path,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+def test_multi_loras_with_tp_sync():
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=2,  # ensure max_loras < max_cpu_loras
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+        tensor_parallel_size=2,  # ensure tp >= 2
+        max_cpu_loras=4,  # ensure max_cpu_loras >= 2
+    )
+
+    def run_check_lora(fn, args, expected: list):
+        fn(args)
+        assert set(llm.llm_engine.list_loras()) == set(expected)
+
+    # simulate add loras with CLI args
+    # likes: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob`
+    run_check_lora(
+        llm.llm_engine.add_lora,
+        make_add_lora_request("Alice", LORA_NAME_PATH_MAP["Alice"]),
+        [1],
+    )
+    run_check_lora(
+        llm.llm_engine.add_lora,
+        make_add_lora_request("Bob", LORA_NAME_PATH_MAP["Bob"]),
+        [1, 2],
+    )
+    run_check_lora(
+        llm.llm_engine.add_lora,
+        make_add_lora_request("Cat", LORA_NAME_PATH_MAP["Cat"]),
+        [1, 2, 3],
+    )
+
+    # set temperature = 0 for greedy search
+    sampling_params = SamplingParams(temperature=0, max_tokens=64)
+
+    def call_llm_get_outputs(prompt: str, lora_name: str):
+        lora_request = LoRARequest(
+            lora_name=lora_name,
+            lora_int_id=LORA_NAME_ID_MAP[lora_name],
+            lora_path=LORA_NAME_PATH_MAP[lora_name],
+        )
+        messages = format_chatml_messages(prompt)
+        outputs = llm.chat(
+            [messages],
+            sampling_params,
+            chat_template_kwargs={
+                "enable_thinking": False
+            },  # for those loras, ensure enable_thinking=False
+            lora_request=lora_request,
+            use_tqdm=False,
+        )
+        output_text = outputs[0].outputs[0].text
+        return output_text
+
+    def reload_lora(name: str):
+        """
+        reload a lora to simulate the case:
+        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
+        for dynamic lora loading and unloading
+        """
+        remove_lora_response = llm.llm_engine.remove_lora(
+            lora_id=LORA_NAME_ID_MAP[name]
+        )
+
+        add_lora_response = llm.llm_engine.add_lora(
+            make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
+        )
+
+        print(f"{remove_lora_response=}, {add_lora_response=}")
+
+    def check_outputs(outputs: str, expected: str):
+        print(f"{prompt=}.\n{expected_output=}\n{output_text=}")
+        print("\n----------------------------\n")
+        assert outputs == expected
+
+    for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
+
+        # call Bob, ignore what it is output
+        call_llm_get_outputs(prompt, "Bob")
+        print("After call Bob:")
+
+        # call Alice
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
+
+        # reload Bob Lora
+        reload_lora("Bob")
+        print("After reload Bob:")
+
+        # call Alice
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
+
+        # reload Alice Lora
+        reload_lora("Alice")
+        print("After reload Alice:")
+
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
+
+
+def test_multiple_lora_requests():
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    PROMPTS = ["Hello, my name is"] * 2
+    LORA_NAME = "Alice"
+    lora_request = [
+        LoRARequest(LORA_NAME + str(idx), idx + 1, LORA_NAME_PATH_MAP[LORA_NAME])
+        for idx in range(len(PROMPTS))
+    ]
+    # Multiple SamplingParams should be matched with each prompt
+    outputs = llm.generate(PROMPTS, lora_request=lora_request)
+    assert len(PROMPTS) == len(outputs)
+
+    # Exception raised, if the size of params does not match the size of prompts
+    with pytest.raises(ValueError):
+        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
+
+    # Single LoRARequest should be applied to every prompt
+    single_lora_request = lora_request[0]
+    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
+    assert len(PROMPTS) == len(outputs)