### What this PR does / why we need it?
Fixes issue #6143.
### Does this PR introduce _any_ user-facing change?
Yes. The server can now be started with "--enable-lora", "--fully-sharded-loras"
and "--tensor_parallel_size 2" combined.
### How was this patch tested?
pytest -sv tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
- vLLM version: v0.15.0
- vLLM main:
d7e17aaacd
---------
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
30 lines
977 B
Python
Executable File
30 lines
977 B
Python
Executable File
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
import pytest
|
|
|
|
from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
|
|
from tests.e2e.singlecard.test_llama32_lora import generate_and_test
|
|
from vllm_ascend.utils import enable_custom_op
|
|
|
|
# Invoked once at import time, before any test in this module runs.
# NOTE(review): presumably turns on vllm-ascend's custom operators — confirm
# against vllm_ascend.utils.enable_custom_op.
enable_custom_op()
|
|
|
|
# For the HK region, we need to use the model from hf to avoid network issues
|
|
# Model identifier passed to VllmRunner as its first positional argument.
MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"
|
|
|
|
|
|
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@wait_until_npu_memory_free()
def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras):
    """Run the shared Llama-3.2 LoRA checks with tensor_parallel_size=2.

    The test is parametrized so that it executes once with
    ``fully_sharded_loras`` disabled and once with it enabled.

    Args:
        llama32_lora_files: Fixture value forwarded unchanged to
            ``generate_and_test``.
        fully_sharded_loras: Parametrized flag forwarded to ``VllmRunner``.
    """
    runner_options = dict(
        enable_lora=True,
        # also test odd max_num_seqs
        max_num_seqs=7,
        max_model_len=1024,
        max_loras=4,
        tensor_parallel_size=2,
        fully_sharded_loras=fully_sharded_loras,
    )
    with VllmRunner(MODEL_PATH, **runner_options) as vllm_model:
        # The runner exposes the underlying engine object as `.model`.
        generate_and_test(vllm_model.model, llama32_lora_files)
|