[bugfix][LoRA] Fix the LoRA accuracy issue introduced by an upstream vLLM change. (#6958)
### What this PR does / why we need it?
Fix the LoRA e2e test accuracy issue that was introduced by the upstream PR
https://github.com/vllm-project/vllm/pull/32005
### How was this patch tested?
pytest -sv tests/e2e/singlecard/test_llama32_lora.py
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
---------
Signed-off-by: paulyu12 <507435917@qq.com>
Signed-off-by: yupeng <507435917@qq.com>
This commit is contained in:
@@ -3,7 +3,6 @@
|
|||||||
|
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
|
||||||
import vllm
|
import vllm
|
||||||
import vllm.config
|
import vllm.config
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
@@ -126,7 +125,6 @@ def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | No
|
|||||||
print("removing lora")
|
print("removing lora")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="fix me")
|
|
||||||
@patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
|
@patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
|
||||||
def test_llama_lora(llama32_lora_files):
|
def test_llama_lora(llama32_lora_files):
|
||||||
vllm_model = VllmRunner(
|
vllm_model = VllmRunner(
|
||||||
|
|||||||
@@ -1848,6 +1848,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# be improved in model runner v2)
|
# be improved in model runner v2)
|
||||||
force_uniform_decode: bool | None = None,
|
force_uniform_decode: bool | None = None,
|
||||||
force_has_lora: bool | None = None,
|
force_has_lora: bool | None = None,
|
||||||
|
force_num_active_loras: int | None = None,
|
||||||
num_encoder_reqs: int = 0,
|
num_encoder_reqs: int = 0,
|
||||||
) -> tuple[CUDAGraphMode, BatchDescriptor, bool, torch.Tensor | None, CUDAGraphStat | None]:
|
) -> tuple[CUDAGraphMode, BatchDescriptor, bool, torch.Tensor | None, CUDAGraphStat | None]:
|
||||||
num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
|
num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
|
||||||
@@ -1864,7 +1865,12 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
|
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
|
||||||
# is present). Also, chunked-prefill is disabled, so batch are uniform.
|
# is present). Also, chunked-prefill is disabled, so batch are uniform.
|
||||||
has_encoder_output = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
has_encoder_output = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
||||||
has_lora = len(self.input_batch.lora_id_to_lora_request) > 0 if force_has_lora is None else force_has_lora
|
num_active_loras = (
|
||||||
|
force_num_active_loras
|
||||||
|
if force_num_active_loras is not None
|
||||||
|
else len(self.input_batch.lora_id_to_lora_request)
|
||||||
|
)
|
||||||
|
has_lora = num_active_loras > 0 if force_has_lora is None else force_has_lora
|
||||||
|
|
||||||
# ruff: noqa: E731
|
# ruff: noqa: E731
|
||||||
def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None):
|
def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None):
|
||||||
@@ -1877,6 +1883,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
has_lora=has_lora,
|
has_lora=has_lora,
|
||||||
uniform_decode=uniform_decode,
|
uniform_decode=uniform_decode,
|
||||||
disable_full=disable_full,
|
disable_full=disable_full,
|
||||||
|
num_active_loras=num_active_loras,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return self.cudagraph_dispatcher.dispatch(
|
return self.cudagraph_dispatcher.dispatch(
|
||||||
@@ -1885,6 +1892,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
uniform_decode=uniform_decode,
|
uniform_decode=uniform_decode,
|
||||||
valid_modes=valid_modes,
|
valid_modes=valid_modes,
|
||||||
invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
|
invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
|
||||||
|
num_active_loras=num_active_loras,
|
||||||
)
|
)
|
||||||
|
|
||||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output)
|
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output)
|
||||||
@@ -2196,7 +2204,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
allow_microbatching: bool = True,
|
allow_microbatching: bool = True,
|
||||||
skip_eplb: bool = False,
|
skip_eplb: bool = False,
|
||||||
remove_lora: bool = True,
|
remove_lora: bool = True,
|
||||||
activate_lora: bool = False,
|
|
||||||
is_graph_capturing: bool = False,
|
is_graph_capturing: bool = False,
|
||||||
num_active_loras: int = 0,
|
num_active_loras: int = 0,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
@@ -2260,7 +2267,8 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# `force_has_lora` is used for cudagraph capture; because LoRA is
|
# `force_has_lora` is used for cudagraph capture; because LoRA is
|
||||||
# activated later in the context manager, but we need to know the
|
# activated later in the context manager, but we need to know the
|
||||||
# LoRA state when determining the batch descriptor for capture
|
# LoRA state when determining the batch descriptor for capture
|
||||||
force_has_lora=activate_lora,
|
force_has_lora=num_active_loras > 0,
|
||||||
|
force_num_active_loras=num_active_loras,
|
||||||
)
|
)
|
||||||
if self.use_cp:
|
if self.use_cp:
|
||||||
self.pcp_manager.init_batch_info(
|
self.pcp_manager.init_batch_info(
|
||||||
@@ -2334,6 +2342,11 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
self.lora_config,
|
self.lora_config,
|
||||||
num_scheduled_tokens,
|
num_scheduled_tokens,
|
||||||
num_sampled_tokens,
|
num_sampled_tokens,
|
||||||
|
remove_lora,
|
||||||
|
# TODO: The next line is a temporary workaround
|
||||||
|
# to fix the accuracy issue of test_llama32_lora.py,
|
||||||
|
# which is introduced by vllm-project/vllm#32005
|
||||||
|
num_active_loras=(self.lora_config.max_loras if self.lora_config is not None else num_active_loras),
|
||||||
):
|
):
|
||||||
# Make sure padding doesn't exceed max_num_tokens
|
# Make sure padding doesn't exceed max_num_tokens
|
||||||
assert num_tokens_padded <= self.max_num_tokens
|
assert num_tokens_padded <= self.max_num_tokens
|
||||||
|
|||||||
Reference in New Issue
Block a user