[Bugfix][LoRA] Fix the bug when running Qwen3-Reranker-0.6B with LoRA. (#7156)
### What this PR does / why we need it?
Fix the error raised while initializing the Qwen3-Reranker-0.6B model
with `--enable-lora`.
Also add a test case to verify the fix.
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
2
.github/workflows/scripts/config.yaml
vendored
2
.github/workflows/scripts/config.yaml
vendored
@@ -49,6 +49,8 @@ e2e-singlecard:
|
|||||||
estimated_time: 270
|
estimated_time: 270
|
||||||
- name: tests/e2e/singlecard/pooling/test_scoring.py
|
- name: tests/e2e/singlecard/pooling/test_scoring.py
|
||||||
estimated_time: 500
|
estimated_time: 500
|
||||||
|
- name: tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
|
||||||
|
estimated_time: 235
|
||||||
- name: tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
|
- name: tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
|
||||||
estimated_time: 1500
|
estimated_time: 1500
|
||||||
- name: tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
|
- name: tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
|
||||||
|
|||||||
11
tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja
Executable file
11
tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja
Executable file
@@ -0,0 +1,11 @@
|
|||||||
|
<|im_start|>system
|
||||||
|
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
|
||||||
|
<|im_start|>user
|
||||||
|
<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }}
|
||||||
|
<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }}
|
||||||
|
<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|>
|
||||||
|
<|im_start|>assistant
|
||||||
|
<think>
|
||||||
|
|
||||||
|
</think>
|
||||||
|
|
||||||
73
tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
Executable file
73
tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
Executable file
@@ -0,0 +1,73 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from vllm import LLM
|
||||||
|
|
||||||
|
model_name = "Qwen/Qwen3-Reranker-0.6B"
|
||||||
|
|
||||||
|
|
||||||
|
def get_llm() -> LLM:
    """Build the vLLM engine used by the Qwen3-Reranker LoRA test.

    The checkpoint named by ``model_name`` is the ORIGINAL Qwen3-Reranker
    release, so a few HuggingFace config overrides are supplied to make it
    usable through vLLM's score API. LoRA support is switched on so that
    engine initialization exercises the LoRA code path.

    Returns:
        LLM: A vLLM instance configured for reranking with LoRA enabled.
    """
    reranker_overrides = {
        # Route the checkpoint to Qwen3ForSequenceClassification rather than
        # the default Qwen3ForCausalLM architecture.
        "architectures": ["Qwen3ForSequenceClassification"],
        # The original reranker scores with the "no" and "yes" token logits
        # taken from the language-model head.
        "classifier_from_token": ["no", "yes"],
        # Triggers the conversion logic that folds the two token vectors
        # into a single classification vector.
        "is_original_qwen3_reranker": True,
    }

    return LLM(
        model=model_name,
        # The score task runs on the pooling runner.
        runner="pooling",
        hf_overrides=reranker_overrides,
        enable_lora=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_reranker_models_lora():
    """Check that Qwen3-Reranker initializes and scores with LoRA enabled.

    The engine returned by ``get_llm()`` is built with ``enable_lora=True``;
    the test then scores a small set of query-document pairs using the
    reranker chat template stored next to this file. It fails if the engine
    cannot be created, if the number of scores does not match the number of
    pairs, or if any score is not a finite number.
    """
    # Imported locally so the block does not depend on file-level imports.
    import math

    # Load the Jinja template that formats each query-document pair.
    template_home = Path(__file__).parent / "template"
    template_path = "qwen3_reranker.jinja"
    chat_template = (template_home / template_path).read_text(encoding="utf-8")

    # Sample queries for testing the reranker.
    queries = [
        "What is the capital of China?",
        "Explain gravity",
    ]

    # Documents scored pairwise against the queries above.
    documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
    ]

    # Initializing the engine is the step that exercises the LoRA fix.
    llm = get_llm()

    # One relevance score is expected per query-document pair.
    outputs = llm.score(queries, documents, chat_template=chat_template)
    scores = [output.outputs.score for output in outputs]

    print("-" * 30)
    print("Relevance scores:", scores)
    print("-" * 30)

    # Printing alone would let a silently broken scoring path pass, so pin the
    # minimum contract: one finite score for every pair.
    assert len(scores) == len(queries), (
        f"expected {len(queries)} scores, got {len(scores)}"
    )
    assert all(math.isfinite(score) for score in scores), (
        f"non-finite relevance score in {scores}"
    )
|
||||||
@@ -15,12 +15,14 @@ from vllm.lora.layers import (
|
|||||||
RowParallelLinearWithShardedLoRA,
|
RowParallelLinearWithShardedLoRA,
|
||||||
VocabParallelEmbeddingWithLoRA,
|
VocabParallelEmbeddingWithLoRA,
|
||||||
)
|
)
|
||||||
|
from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA
|
||||||
from vllm.lora.layers.utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
|
from vllm.lora.layers.utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
|
||||||
|
|
||||||
from vllm_ascend.ops.linear import (
|
from vllm_ascend.ops.linear import (
|
||||||
AscendColumnParallelLinear,
|
AscendColumnParallelLinear,
|
||||||
AscendMergedColumnParallelLinear,
|
AscendMergedColumnParallelLinear,
|
||||||
AscendQKVParallelLinear,
|
AscendQKVParallelLinear,
|
||||||
|
AscendReplicatedLinear,
|
||||||
AscendRowParallelLinear,
|
AscendRowParallelLinear,
|
||||||
)
|
)
|
||||||
from vllm_ascend.ops.vocab_parallel_embedding import AscendVocabParallelEmbedding
|
from vllm_ascend.ops.vocab_parallel_embedding import AscendVocabParallelEmbedding
|
||||||
@@ -103,6 +105,20 @@ class AscendMergedQKVParallelLinearWithLoRA(MergedQKVParallelLinearWithLoRA):
|
|||||||
return type(source_layer) is AscendQKVParallelLinear and len(packed_modules_list) == 3
|
return type(source_layer) is AscendQKVParallelLinear and len(packed_modules_list) == 3
|
||||||
|
|
||||||
|
|
||||||
|
class AscendReplicatedLinearWithLoRA(ReplicatedLinearWithLoRA):
    """LoRA wrapper selected for ``AscendReplicatedLinear`` layers.

    A replicated linear layer is, by definition, copied on every device, so
    it is always eligible for replacement regardless of the fully sharded
    LoRAs setting.
    """

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None = None,
    ) -> bool:
        """Return True only when ``source_layer`` is exactly an ``AscendReplicatedLinear``.

        ``lora_config``, ``packed_modules_list`` and ``model_config`` are
        accepted for interface compatibility and are not consulted.
        """
        # Exact type match on purpose: subclasses are not claimed here.
        layer_type = type(source_layer)
        return layer_type is AscendReplicatedLinear
|
||||||
|
|
||||||
|
|
||||||
class AscendColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithShardedLoRA):
|
class AscendColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithShardedLoRA):
|
||||||
@classmethod
|
@classmethod
|
||||||
@_fully_sharded_can_replace
|
@_fully_sharded_can_replace
|
||||||
@@ -180,3 +196,4 @@ def refresh_all_lora_classes():
|
|||||||
vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA)
|
vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA)
|
||||||
vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA)
|
vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA)
|
||||||
vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA)
|
vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA)
|
||||||
|
vllm.lora.utils._all_lora_classes.add(AscendReplicatedLinearWithLoRA)
|
||||||
|
|||||||
@@ -433,7 +433,7 @@ class AscendReplicatedLinear(ReplicatedLinear):
|
|||||||
return_bias: bool = True,
|
return_bias: bool = True,
|
||||||
disable_tp: bool = False,
|
disable_tp: bool = False,
|
||||||
):
|
):
|
||||||
self.custom_op = get_replicated_op(disable_tp, prefix, self)
|
self.custom_op, self.tp_rank, self.tp_size = get_replicated_op(disable_tp, prefix, self)
|
||||||
# If MergedReplicatedLinear, use output size of each partition.
|
# If MergedReplicatedLinear, use output size of each partition.
|
||||||
if hasattr(self, "output_sizes"):
|
if hasattr(self, "output_sizes"):
|
||||||
self.output_partition_sizes = self.output_sizes
|
self.output_partition_sizes = self.output_sizes
|
||||||
|
|||||||
@@ -734,11 +734,12 @@ def get_parallel_op(disable_tp, prefix, layer, direct):
|
|||||||
return None, get_tp_group().rank_in_group, get_tp_group().world_size
|
return None, get_tp_group().rank_in_group, get_tp_group().world_size
|
||||||
|
|
||||||
|
|
||||||
def get_replicated_op(disable_tp, prefix, layer) -> CustomReplicatedOp | None:
|
def get_replicated_op(disable_tp, prefix, layer) -> tuple[CustomReplicatedOp | None, int | None, int | None]:
|
||||||
if disable_tp:
|
if disable_tp:
|
||||||
return None
|
return None, None, None
|
||||||
|
|
||||||
return CustomReplicatedOp(layer)
|
custom_op = CustomReplicatedOp(layer)
|
||||||
|
return custom_op, custom_op.tp_rank, custom_op.tp_size
|
||||||
|
|
||||||
|
|
||||||
def is_moe_layer(prefix: str) -> bool:
|
def is_moe_layer(prefix: str) -> bool:
|
||||||
|
|||||||
Reference in New Issue
Block a user