[Bugfix][LoRA] Fix the bug when running Qwen3-Reranker-0.6B with LoRA. (#7156)
### What this PR does / why we need it?
Fix the error reported while initializing the Qwen3-Reranker-0.6B model
with `--enable-lora`, and add a test case to verify the fix.
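For reference, a minimal reproduction sketch of the failure (the arguments mirror the e2e test added below; before this fix, the crash happened during `LLM(...)` construction, before any request was issued):

    # Reproduction sketch: initialize the original Qwen3-Reranker with LoRA
    # enabled. hf_overrides mirror the new e2e test; prior to this fix, this
    # constructor call failed on Ascend.
    from vllm import LLM

    llm = LLM(
        model="Qwen/Qwen3-Reranker-0.6B",
        runner="pooling",
        hf_overrides={
            "architectures": ["Qwen3ForSequenceClassification"],
            "classifier_from_token": ["no", "yes"],
            "is_original_qwen3_reranker": True,
        },
        enable_lora=True,
    )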
- vLLM version: v0.17.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
.github/workflows/scripts/config.yaml (vendored, 2 changes)
@@ -49,6 +49,8 @@ e2e-singlecard:
     estimated_time: 270
   - name: tests/e2e/singlecard/pooling/test_scoring.py
     estimated_time: 500
+  - name: tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
+    estimated_time: 235
   - name: tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
     estimated_time: 1500
   - name: tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja (new executable file, 11 lines)
@@ -0,0 +1,11 @@
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }}
+<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }}
+<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|>
+<|im_start|>assistant
+<think>
+
+</think>
+
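To see the exact prompt this template produces, it can be rendered directly with jinja2 outside vLLM. A minimal sketch, run from the repository root, assuming the score API supplies the query and document as messages with roles "query" and "document" as the template's filters expect:

    # Render the reranker chat template standalone to inspect the prompt.
    # No system message is passed here, so the template's default
    # instruction string kicks in.
    from pathlib import Path

    from jinja2 import Template

    template = Template(
        Path("tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja").read_text()
    )
    print(template.render(messages=[
        {"role": "query", "content": "What is the capital of China?"},
        {"role": "document", "content": "The capital of China is Beijing."},
    ]))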
tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py (new executable file, 73 lines)
@@ -0,0 +1,73 @@
+from pathlib import Path
+
+from vllm import LLM
+
+model_name = "Qwen/Qwen3-Reranker-0.6B"
+
+
+def get_llm() -> LLM:
+    """
+    Initializes and returns the LLM model for Qwen3-Reranker.
+
+    Returns:
+        LLM: Configured vLLM instance for reranking tasks.
+
+    Note:
+        This function loads the ORIGINAL Qwen3-Reranker model with specific
+        overrides to make it compatible with vLLM's score API.
+    """
+    return LLM(
+        # Specify the original model from HuggingFace
+        model=model_name,
+        # Use pooling runner for score task
+        runner="pooling",
+        # HuggingFace model configuration overrides required for compatibility
+        hf_overrides={
+            # Manually route to sequence classification architecture
+            # This tells vLLM to use Qwen3ForSequenceClassification instead of
+            # the default Qwen3ForCausalLM
+            "architectures": ["Qwen3ForSequenceClassification"],
+            # Specify which token logits to extract from the language model head
+            # The original reranker uses "no" and "yes" token logits for scoring
+            "classifier_from_token": ["no", "yes"],
+            # Enable special handling for original Qwen3-Reranker models
+            # This flag triggers conversion logic that transforms the two token
+            # vectors into a single classification vector
+            "is_original_qwen3_reranker": True,
+        },
+        enable_lora=True,
+    )
+
+
+def test_reranker_models_lora():
+    # Load the Jinja template for formatting query-document pairs
+    # The template ensures proper formatting for the reranker model
+    template_home = Path(__file__).parent / "template"
+    template_path = "qwen3_reranker.jinja"
+    chat_template = (template_home / template_path).read_text()
+
+    # Sample queries for testing the reranker
+    queries = [
+        "What is the capital of China?",
+        "Explain gravity",
+    ]
+
+    # Corresponding documents to be scored against each query
+    documents = [
+        "The capital of China is Beijing.",
+        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+    ]
+
+    # Initialize the LLM model with the original Qwen3-Reranker configuration
+    llm = get_llm()
+
+    # Compute relevance scores for each query-document pair
+    # The score() method returns a relevance score for each pair
+    # Higher scores indicate better relevance
+    outputs = llm.score(queries, documents, chat_template=chat_template)
+
+    # Extract and print the relevance scores from the outputs
+    # Each output contains a score representing query-document relevance
+    print("-" * 30)
+    print("Relevance scores:", [output.outputs.score for output in outputs])
+    print("-" * 30)
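Note that the test only verifies that construction with `enable_lora=True` succeeds and that scoring runs with no adapter attached. A sketch of scoring with an actual adapter, assuming `LLM.score` forwards `lora_request` the way the other request APIs do (the adapter path is a placeholder, not a shipped checkpoint):

    # Hypothetical follow-up: apply a trained LoRA adapter while scoring.
    from vllm.lora.request import LoRARequest

    lora = LoRARequest("reranker-lora", 1, "/path/to/reranker-lora")  # placeholder path
    outputs = llm.score(
        queries,
        documents,
        chat_template=chat_template,
        lora_request=lora,  # assumption: score() accepts lora_request like generate()
    )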
@@ -15,12 +15,14 @@ from vllm.lora.layers import (
     RowParallelLinearWithShardedLoRA,
     VocabParallelEmbeddingWithLoRA,
 )
+from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA
 from vllm.lora.layers.utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace

 from vllm_ascend.ops.linear import (
     AscendColumnParallelLinear,
     AscendMergedColumnParallelLinear,
     AscendQKVParallelLinear,
+    AscendReplicatedLinear,
     AscendRowParallelLinear,
 )
 from vllm_ascend.ops.vocab_parallel_embedding import AscendVocabParallelEmbedding
@@ -103,6 +105,20 @@ class AscendMergedQKVParallelLinearWithLoRA(MergedQKVParallelLinearWithLoRA):
         return type(source_layer) is AscendQKVParallelLinear and len(packed_modules_list) == 3


+class AscendReplicatedLinearWithLoRA(ReplicatedLinearWithLoRA):
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None = None,
+    ) -> bool:
+        return type(source_layer) is AscendReplicatedLinear
+
+
 class AscendColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithShardedLoRA):
     @classmethod
     @_fully_sharded_can_replace
@@ -180,3 +196,4 @@ def refresh_all_lora_classes():
     vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA)
     vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA)
     vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendReplicatedLinearWithLoRA)
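The registration above is the crux of the fix: vLLM picks a LoRA wrapper for each module by scanning `_all_lora_classes` and asking each candidate `can_replace_layer`, so a layer type with no registered wrapper cannot be LoRA-enabled. A simplified sketch of that dispatch (the helper name is hypothetical; the real logic lives in `vllm.lora.utils`):

    # Simplified sketch of vLLM's LoRA wrapper dispatch.
    def resolve_lora_wrapper(layer, lora_config, packed_modules_list, all_lora_classes):
        for lora_cls in all_lora_classes:
            if lora_cls.can_replace_layer(
                source_layer=layer,
                lora_config=lora_config,
                packed_modules_list=packed_modules_list,
                model_config=None,
            ):
                return lora_cls
        # Before this PR, AscendReplicatedLinear matched no registered class,
        # which is why initializing the reranker with LoRA failed.
        return None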
@@ -433,7 +433,7 @@ class AscendReplicatedLinear(ReplicatedLinear):
         return_bias: bool = True,
         disable_tp: bool = False,
     ):
-        self.custom_op = get_replicated_op(disable_tp, prefix, self)
+        self.custom_op, self.tp_rank, self.tp_size = get_replicated_op(disable_tp, prefix, self)
         # If MergedReplicatedLinear, use output size of each partition.
         if hasattr(self, "output_sizes"):
             self.output_partition_sizes = self.output_sizes
@@ -734,11 +734,12 @@ def get_parallel_op(disable_tp, prefix, layer, direct):
     return None, get_tp_group().rank_in_group, get_tp_group().world_size


-def get_replicated_op(disable_tp, prefix, layer) -> CustomReplicatedOp | None:
+def get_replicated_op(disable_tp, prefix, layer) -> tuple[CustomReplicatedOp | None, int | None, int | None]:
     if disable_tp:
-        return None
+        return None, None, None

-    return CustomReplicatedOp(layer)
+    custom_op = CustomReplicatedOp(layer)
+    return custom_op, custom_op.tp_rank, custom_op.tp_size


 def is_moe_layer(prefix: str) -> bool:
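The constructor change above pairs with this new return signature so that AscendReplicatedLinear exposes `tp_rank`/`tp_size` like the other parallel linear layers. A plausible reading of the diff is that the LoRA path reads tensor-parallel metadata off the wrapped layer, which previously raised AttributeError; a minimal sketch of that consuming side (simplified, not vLLM's actual wrapper code):

    # Sketch of why the base layer must expose tp_rank/tp_size: LoRA-side
    # bookkeeping consults the wrapped layer's TP metadata. Before this fix,
    # AscendReplicatedLinear never set these attributes.
    class ReplicatedLoRAWrapperSketch:
        def __init__(self, base_layer):
            self.base_layer = base_layer
            self.tp_rank = base_layer.tp_rank  # AttributeError pre-fix
            self.tp_size = base_layer.tp_size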