xc-llm-ascend/tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
from tests.e2e.singlecard.test_llama32_lora import generate_and_test
from vllm_ascend.utils import enable_custom_op

enable_custom_op()

# For hk region, we need to use the model from hf to avoid the network issue
MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"


@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@wait_until_npu_memory_free()
def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras):
    with VllmRunner(
        MODEL_PATH,
        enable_lora=True,
        # also test odd max_num_seqs
        max_num_seqs=7,
        max_model_len=1024,
        max_loras=4,
        tensor_parallel_size=2,
        fully_sharded_loras=fully_sharded_loras,
    ) as vllm_model:
        llm = vllm_model.model
        generate_and_test(llm, llama32_lora_files)