xc-llm-ascend/tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
yupeng 830f39dd70 [Bugfix][LoRA] Fix the issue when enable LoRA + tp + fully_sharded_loras (#6650)
### What this PR does / why we need it?
Fix issue #6143.

### Does this PR introduce _any_ user-facing change?
Yes. The server can now be started with `--enable-lora`, `--fully-sharded-loras`,
and `--tensor_parallel_size 2` combined.
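
For reference, a minimal sketch of the equivalent offline-inference setup. It assumes the stock `vllm.LLM` entry point forwards these options as engine arguments (which the test's `VllmRunner` wrapper appears to do); the model path matches the one used in the test below.

```python
# Sketch only (not part of the PR): LoRA + fully-sharded LoRA layers + TP=2.
from vllm import LLM

llm = LLM(
    model="vllm-ascend/Llama-3.2-3B-Instruct",
    enable_lora=True,            # accept per-request LoRA adapters
    fully_sharded_loras=True,    # run the LoRA layers fully sharded across TP ranks
    tensor_parallel_size=2,      # split the model across 2 NPUs
)
```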

### How was this patch tested?
pytest -sv tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
- vLLM version: v0.15.0
- vLLM main: d7e17aaacd

---------

Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-03-11 15:43:15 +08:00


# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
from tests.e2e.singlecard.test_llama32_lora import generate_and_test
from vllm_ascend.utils import enable_custom_op

enable_custom_op()

# For hk region, we need to use the model from hf to avoid the network issue
MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"


@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@wait_until_npu_memory_free()
def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras):
    with VllmRunner(
            MODEL_PATH,
            enable_lora=True,
            # also test odd max_num_seqs
            max_num_seqs=7,
            max_model_len=1024,
            max_loras=4,
            tensor_parallel_size=2,
            fully_sharded_loras=fully_sharded_loras,
    ) as vllm_model:
        llm = vllm_model.model
        generate_and_test(llm, llama32_lora_files)
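
`generate_and_test` is imported from the single-card LoRA test and its body is not shown here. A rough, illustrative sketch of how such a helper typically exercises vLLM's LoRA API follows; the prompt, adapter name, and helper name are assumptions, not the actual implementation.

```python
# Illustrative only; the real helper lives in
# tests/e2e/singlecard/test_llama32_lora.py and may differ.
from vllm import SamplingParams
from vllm.lora.request import LoRARequest


def generate_with_lora(llm, lora_path):
    prompts = ["Write a SQL query listing all employees hired after 2020."]
    sampling_params = SamplingParams(temperature=0, max_tokens=64)
    # A LoRARequest pairs a readable name, a unique integer id, and the
    # adapter path; passing it per call activates that adapter for the batch.
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest("llama32-lora", 1, lora_path),
    )
    return [out.outputs[0].text for out in outputs]
```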