Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/tpu/lora/init.py
+++ b/tests/tpu/lora/init.py
--- a/tests/tpu/lora/test_lora.py
+++ b/tests/tpu/lora/test_lora.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from torch_xla._internal import tpu
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+# This file contains tests to ensure that LoRA works correctly on the TPU
+# backend. We use a series of custom trained adapters for Qwen2.5-3B-Instruct
+# for this. The adapters are:
+# Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter, where x ranges
+# from 1 to 4.
+
+# These adapters are trained using a standard huggingface peft training script,
+# where all the inputs are "What is 1+1? \n" and all the outputs are "x". We run
+# 100 training iterations with a training batch size of 100.
+
+
+def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
+    return vllm.LLM(
+        model="Qwen/Qwen2.5-3B-Instruct",
+        max_model_len=256,
+        max_num_seqs=8,
+        tensor_parallel_size=tp,
+        enable_lora=True,
+        max_loras=num_loras,
+        max_lora_rank=8,
+    )
+
+
+TPU_TENSOR_PARALLEL_SIZES = (
+    [1, tpu.num_available_chips()] if tpu.num_available_chips() > 1 else [1]
+)
+
+
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_single_lora(tp: int):
+    """
+    This test ensures we can run a single LoRA adapter on the TPU backend.
+    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
+    will force Qwen2.5-3B-Instruct to claim 1+1=1.
+    """
+
+    llm = setup_vllm(1, tp)
+
+    prompt = "What is 1+1? \n"
+
+    lora_request = LoRARequest(
+        "lora_adapter_1",
+        1,
+        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter",
+    )
+    output = (
+        llm.generate(
+            prompt,
+            sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
+            lora_request=lora_request,
+        )[0]
+        .outputs[0]
+        .text
+    )
+
+    answer = output.strip()[0]
+
+    assert answer.isdigit()
+    assert int(answer) == 1
+
+
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_lora_hotswapping(tp: int):
+    """
+    This test ensures we can run multiple LoRA adapters on the TPU backend, even
+    if we only have space to store 1.
+
+    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
+    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
+    """
+
+    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
+    lora_requests = [
+        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
+        for i in range(1, 5)
+    ]
+
+    llm = setup_vllm(1, tp)
+
+    prompt = "What is 1+1? \n"
+
+    for i, req in enumerate(lora_requests):
+        output = (
+            llm.generate(
+                prompt,
+                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
+                lora_request=req,
+            )[0]
+            .outputs[0]
+            .text
+        )
+        answer = output.strip()[0]
+
+        assert answer.isdigit()
+        assert int(answer) == i + 1
+
+
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_multi_lora(tp: int):
+    """
+    This test ensures we can run multiple LoRA adapters on the TPU backend, when
+    we have enough space to store all of them.
+
+    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
+    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
+    """
+    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
+    lora_requests = [
+        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
+        for i in range(1, 5)
+    ]
+
+    llm = setup_vllm(4, tp)
+
+    prompt = "What is 1+1? \n"
+
+    for i, req in enumerate(lora_requests):
+        output = (
+            llm.generate(
+                prompt,
+                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
+                lora_request=req,
+            )[0]
+            .outputs[0]
+            .text
+        )
+
+        answer = output.strip()[0]
+
+        assert answer.isdigit()
+        assert int(output.strip()[0]) == i + 1