diff --git a/tests/e2e/singlecard/test_llama32_lora.py b/tests/e2e/singlecard/test_llama32_lora.py index ab7015b2..9ce3e684 100644 --- a/tests/e2e/singlecard/test_llama32_lora.py +++ b/tests/e2e/singlecard/test_llama32_lora.py @@ -3,7 +3,6 @@ from unittest.mock import patch -import pytest import vllm import vllm.config from vllm.lora.request import LoRARequest @@ -126,7 +125,6 @@ def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | No print("removing lora") -@pytest.mark.skip(reason="fix me") @patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"}) def test_llama_lora(llama32_lora_files): vllm_model = VllmRunner( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a2e49373..1bb40291 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1848,6 +1848,7 @@ class NPUModelRunner(GPUModelRunner): # be improved in model runner v2) force_uniform_decode: bool | None = None, force_has_lora: bool | None = None, + force_num_active_loras: int | None = None, num_encoder_reqs: int = 0, ) -> tuple[CUDAGraphMode, BatchDescriptor, bool, torch.Tensor | None, CUDAGraphStat | None]: num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens) @@ -1864,7 +1865,12 @@ class NPUModelRunner(GPUModelRunner): # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output # is present). Also, chunked-prefill is disabled, so batch are uniform. has_encoder_output = self.model_config.is_encoder_decoder and num_encoder_reqs > 0 - has_lora = len(self.input_batch.lora_id_to_lora_request) > 0 if force_has_lora is None else force_has_lora + num_active_loras = ( + force_num_active_loras + if force_num_active_loras is not None + else len(self.input_batch.lora_id_to_lora_request) + ) + has_lora = num_active_loras > 0 if force_has_lora is None else force_has_lora # ruff: noqa: E731 def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None): @@ -1877,6 +1883,7 @@ class NPUModelRunner(GPUModelRunner): has_lora=has_lora, uniform_decode=uniform_decode, disable_full=disable_full, + num_active_loras=num_active_loras, ) else: return self.cudagraph_dispatcher.dispatch( @@ -1885,6 +1892,7 @@ class NPUModelRunner(GPUModelRunner): uniform_decode=uniform_decode, valid_modes=valid_modes, invalid_modes={CUDAGraphMode.FULL} if disable_full else None, + num_active_loras=num_active_loras, ) cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output) @@ -2196,7 +2204,6 @@ class NPUModelRunner(GPUModelRunner): allow_microbatching: bool = True, skip_eplb: bool = False, remove_lora: bool = True, - activate_lora: bool = False, is_graph_capturing: bool = False, num_active_loras: int = 0, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -2260,7 +2267,8 @@ class NPUModelRunner(GPUModelRunner): # `force_has_lora` is used for cudagraph capture; because LoRA is # activated later in the context manager, but we need to know the # LoRA state when determining the batch descriptor for capture - force_has_lora=activate_lora, + force_has_lora=num_active_loras > 0, + force_num_active_loras=num_active_loras, ) if self.use_cp: self.pcp_manager.init_batch_info( @@ -2334,6 +2342,11 @@ class NPUModelRunner(GPUModelRunner): self.lora_config, num_scheduled_tokens, num_sampled_tokens, + remove_lora, + # TODO: The next line is a temporary workaround + # to fix the accuracy issue of test_llama32_lora.py, + # which is introduced by vllm-project/vllm#32005 + num_active_loras=(self.lora_config.max_loras if self.lora_config is not None else num_active_loras), ): # Make sure padding doesn't exceed max_num_tokens assert num_tokens_padded <= self.max_num_tokens