xc-llm-ascend/tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
from tests.e2e.singlecard.test_llama32_lora import generate_and_test
from vllm_ascend.utils import enable_custom_op

# Invoked once at import time, before any test in this module runs.
# NOTE(review): presumably registers vllm-ascend's custom ops — confirm
# against vllm_ascend.utils.enable_custom_op.
enable_custom_op()

# For the hk region, we need to use the model from hf to avoid network issues.
MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"


@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@wait_until_npu_memory_free()
def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras):
    """Run the shared LoRA generation check with tensor_parallel_size=2.

    The test is parametrized so it runs once with ``fully_sharded_loras``
    set to ``False`` and once with ``True``. ``llama32_lora_files`` is
    resolved by pytest as a fixture (defined outside this file) and is
    forwarded unchanged to ``generate_and_test``.
    """
    engine_options = dict(
        enable_lora=True,
        # also test odd max_num_seqs
        max_num_seqs=7,
        max_model_len=1024,
        max_loras=4,
        tensor_parallel_size=2,
        fully_sharded_loras=fully_sharded_loras,
    )
    with VllmRunner(MODEL_PATH, **engine_options) as runner:
        # The runner exposes the underlying engine object as ``.model``.
        generate_and_test(runner.model, llama32_lora_files)
[Bugfix][LoRA] Fix the issue when enabling LoRA + tp + fully_sharded_loras (#6650) ### What this PR does / why we need it? Fixes issue #6143. ### Does this PR introduce _any_ user-facing change? Allows starting the server with "--enable-lora && --fully-sharded-loras && --tensor_parallel_size 2". ### How was this patch tested? pytest -sv tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a --------- Signed-off-by: paulyu12 <507435917@qq.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com> 2026-03-11 15:43:15 +08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`import pytest`

			`from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free`
			`from tests.e2e.singlecard.test_llama32_lora import generate_and_test`
			`from vllm_ascend.utils import enable_custom_op`

			`enable_custom_op()`

			`# For hk region, we need to use the model from hf to avoid the network issue`
			`MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"`


			`@pytest.mark.parametrize("fully_sharded_loras", [False, True])`
			`@wait_until_npu_memory_free()`
			`def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras):`
			`with VllmRunner(`
			`MODEL_PATH,`
			`enable_lora=True,`
			`# also test odd max_num_seqs`
			`max_num_seqs=7,`
			`max_model_len=1024,`
			`max_loras=4,`
			`tensor_parallel_size=2,`
			`fully_sharded_loras=fully_sharded_loras,`
			`) as vllm_model:`
			`llm = vllm_model.model`
			`generate_and_test(llm, llama32_lora_files)`