Files
xc-llm-ascend/tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py

30 lines
977 B
Python
Raw Normal View History

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
from tests.e2e.singlecard.test_llama32_lora import generate_and_test
from vllm_ascend.utils import enable_custom_op
enable_custom_op()
# For hk region, we need to use the model from hf to avoid the network issue
MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@wait_until_npu_memory_free()
def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras):
with VllmRunner(
MODEL_PATH,
enable_lora=True,
# also test odd max_num_seqs
max_num_seqs=7,
max_model_len=1024,
max_loras=4,
tensor_parallel_size=2,
fully_sharded_loras=fully_sharded_loras,
) as vllm_model:
llm = vllm_model.model
generate_and_test(llm, llama32_lora_files)