From 29f195a91c4642207112cf9d5cb5fd9eef646584 Mon Sep 17 00:00:00 2001 From: yupeng <507435917@qq.com> Date: Sun, 15 Mar 2026 17:55:42 +0800 Subject: [PATCH] [Bugfix][LoRA] Fix the bug when runs Qwen3-Reranker-0.6B with LoRA. (#7156) ### What this PR does / why we need it? Fix the error that reports while initializing qwen3-reranker-0.6b model with `--enable-lora`. And add a testcase to verify the fix. - vLLM version: v0.17.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d --------- Signed-off-by: paulyu12 <507435917@qq.com> Co-authored-by: Mengqing Cao --- .github/workflows/scripts/config.yaml | 2 + .../pooling/template/qwen3_reranker.jinja | 11 +++ .../pooling/test_qwen3_reranker_lora.py | 73 +++++++++++++++++++ vllm_ascend/lora/utils.py | 17 +++++ vllm_ascend/ops/linear.py | 2 +- vllm_ascend/ops/linear_op.py | 7 +- 6 files changed, 108 insertions(+), 4 deletions(-) create mode 100755 tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja create mode 100755 tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml index 7cd9396d..25c49465 100644 --- a/.github/workflows/scripts/config.yaml +++ b/.github/workflows/scripts/config.yaml @@ -49,6 +49,8 @@ e2e-singlecard: estimated_time: 270 - name: tests/e2e/singlecard/pooling/test_scoring.py estimated_time: 500 + - name: tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py + estimated_time: 235 - name: tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py estimated_time: 1500 - name: tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py diff --git a/tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja b/tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja new file mode 100755 index 00000000..f33f526d --- /dev/null +++ b/tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja @@ -0,0 +1,11 @@ +<|im_start|>system +Judge whether the Document meets 
the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }}
+<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }}
+<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|>
+<|im_start|>assistant
+<think>
+
+</think>
+
diff --git a/tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py b/tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
new file mode 100755
index 00000000..e01d7b6b
--- /dev/null
+++ b/tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
@@ -0,0 +1,73 @@
+from pathlib import Path
+
+from vllm import LLM
+
+model_name = "Qwen/Qwen3-Reranker-0.6B"
+
+
+def get_llm() -> LLM:
+    """
+    Initializes and returns the LLM model for Qwen3-Reranker.
+
+    Returns:
+        LLM: Configured vLLM instance for reranking tasks.
+
+    Note:
+        This function loads the ORIGINAL Qwen3-Reranker model with specific
+        overrides to make it compatible with vLLM's score API.
+ """ + return LLM( + # Specify the original model from HuggingFace + model=model_name, + # Use pooling runner for score task + runner="pooling", + # HuggingFace model configuration overrides required for compatibility + hf_overrides={ + # Manually route to sequence classification architecture + # This tells vLLM to use Qwen3ForSequenceClassification instead of + # the default Qwen3ForCausalLM + "architectures": ["Qwen3ForSequenceClassification"], + # Specify which token logits to extract from the language model head + # The original reranker uses "no" and "yes" token logits for scoring + "classifier_from_token": ["no", "yes"], + # Enable special handling for original Qwen3-Reranker models + # This flag triggers conversion logic that transforms the two token + # vectors into a single classification vector + "is_original_qwen3_reranker": True, + }, + enable_lora=True, + ) + + +def test_reranker_models_lora(): + # Load the Jinja template for formatting query-document pairs + # The template ensures proper formatting for the reranker model + template_home = Path(__file__).parent / "template" + template_path = "qwen3_reranker.jinja" + chat_template = (template_home / template_path).read_text() + + # Sample queries for testing the reranker + queries = [ + "What is the capital of China?", + "Explain gravity", + ] + + # Corresponding documents to be scored against each query + documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other. 
It gives weight to physical objects and is responsible for the movement of planets around the sun.", + ] + + # Initialize the LLM model with the original Qwen3-Reranker configuration + llm = get_llm() + + # Compute relevance scores for each query-document pair + # The score() method returns a relevance score for each pair + # Higher scores indicate better relevance + outputs = llm.score(queries, documents, chat_template=chat_template) + + # Extract and print the relevance scores from the outputs + # Each output contains a score representing query-document relevance + print("-" * 30) + print("Relevance scores:", [output.outputs.score for output in outputs]) + print("-" * 30) diff --git a/vllm_ascend/lora/utils.py b/vllm_ascend/lora/utils.py index 341a3ab4..d822b362 100755 --- a/vllm_ascend/lora/utils.py +++ b/vllm_ascend/lora/utils.py @@ -15,12 +15,14 @@ from vllm.lora.layers import ( RowParallelLinearWithShardedLoRA, VocabParallelEmbeddingWithLoRA, ) +from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA from vllm.lora.layers.utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace from vllm_ascend.ops.linear import ( AscendColumnParallelLinear, AscendMergedColumnParallelLinear, AscendQKVParallelLinear, + AscendReplicatedLinear, AscendRowParallelLinear, ) from vllm_ascend.ops.vocab_parallel_embedding import AscendVocabParallelEmbedding @@ -103,6 +105,20 @@ class AscendMergedQKVParallelLinearWithLoRA(MergedQKVParallelLinearWithLoRA): return type(source_layer) is AscendQKVParallelLinear and len(packed_modules_list) == 3 +class AscendReplicatedLinearWithLoRA(ReplicatedLinearWithLoRA): + # ReplicatedLinear should always be replaced, regardless of the fully + # sharded LoRAs setting, because it is, by definition, copied per GPU. 
+ @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: PretrainedConfig | None = None, + ) -> bool: + return type(source_layer) is AscendReplicatedLinear + + class AscendColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithShardedLoRA): @classmethod @_fully_sharded_can_replace @@ -180,3 +196,4 @@ def refresh_all_lora_classes(): vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA) vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA) vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA) + vllm.lora.utils._all_lora_classes.add(AscendReplicatedLinearWithLoRA) diff --git a/vllm_ascend/ops/linear.py b/vllm_ascend/ops/linear.py index b2118606..9209d331 100644 --- a/vllm_ascend/ops/linear.py +++ b/vllm_ascend/ops/linear.py @@ -433,7 +433,7 @@ class AscendReplicatedLinear(ReplicatedLinear): return_bias: bool = True, disable_tp: bool = False, ): - self.custom_op = get_replicated_op(disable_tp, prefix, self) + self.custom_op, self.tp_rank, self.tp_size = get_replicated_op(disable_tp, prefix, self) # If MergedReplicatedLinear, use output size of each partition. 
if hasattr(self, "output_sizes"): self.output_partition_sizes = self.output_sizes diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py index db898645..77c71b40 100644 --- a/vllm_ascend/ops/linear_op.py +++ b/vllm_ascend/ops/linear_op.py @@ -734,11 +734,12 @@ def get_parallel_op(disable_tp, prefix, layer, direct): return None, get_tp_group().rank_in_group, get_tp_group().world_size -def get_replicated_op(disable_tp, prefix, layer) -> CustomReplicatedOp | None: +def get_replicated_op(disable_tp, prefix, layer) -> tuple[CustomReplicatedOp | None, int | None, int | None]: if disable_tp: - return None + return None, None, None - return CustomReplicatedOp(layer) + custom_op = CustomReplicatedOp(layer) + return custom_op, custom_op.tp_rank, custom_op.tp_size def is_moe_layer(prefix: str) -> bool: