# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free from tests.e2e.singlecard.test_llama32_lora import generate_and_test from vllm_ascend.utils import enable_custom_op enable_custom_op() # For hk region, we need to use the model from hf to avoid the network issue MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct" @pytest.mark.parametrize("fully_sharded_loras", [False, True]) @wait_until_npu_memory_free() def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras): with VllmRunner( MODEL_PATH, enable_lora=True, # also test odd max_num_seqs max_num_seqs=7, max_model_len=1024, max_loras=4, tensor_parallel_size=2, fully_sharded_loras=fully_sharded_loras, ) as vllm_model: llm = vllm_model.model generate_and_test(llm, llama32_lora_files)