[Bugfix][LoRA] Fix the bug when runs Qwen3-Reranker-0.6B with LoRA. (#7156)
### What this PR does / why we need it?
Fix the error that is reported while initializing the qwen3-reranker-0.6b model
with `--enable-lora`.
And add a testcase to verify the fix.
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
11
tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja
Executable file
11
tests/e2e/singlecard/pooling/template/qwen3_reranker.jinja
Executable file
@@ -0,0 +1,11 @@
{#- Score-API chat template for Qwen3-Reranker: renders the system instruction
    plus one query/document pair, then pre-fills an empty <think> block before
    the assistant turn. The system prompt constrains the answer to "yes"/"no".
    The `-` markers trim surrounding whitespace, so this comment does not
    change the rendered output. -#}
<|im_start|>system
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
<|im_start|>user
<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }}
<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }}
<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|>
<|im_start|>assistant
<think>

</think>

73
tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
Executable file
73
tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
Executable file
@@ -0,0 +1,73 @@
|
||||
import math
from pathlib import Path

from vllm import LLM
||||
# HuggingFace model ID under test: the ORIGINAL Qwen3-Reranker-0.6B checkpoint
# (a causal LM that get_llm() reroutes to sequence classification).
model_name = "Qwen/Qwen3-Reranker-0.6B"
||||
def get_llm() -> LLM:
    """Build a vLLM instance configured for Qwen3-Reranker scoring.

    Returns:
        LLM: Configured vLLM instance for reranking tasks, with LoRA enabled.

    Note:
        This loads the ORIGINAL Qwen3-Reranker model with specific HF-config
        overrides to make it compatible with vLLM's score API.
    """
    # HuggingFace configuration overrides required for compatibility:
    #   architectures           — manually route to Qwen3ForSequenceClassification
    #                             instead of the default Qwen3ForCausalLM.
    #   classifier_from_token   — the original reranker scores with the "no" and
    #                             "yes" token logits from the LM head.
    #   is_original_qwen3_reranker — triggers the conversion that collapses the
    #                             two token vectors into one classification vector.
    overrides = {
        "architectures": ["Qwen3ForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    }
    return LLM(
        model=model_name,
        runner="pooling",  # pooling runner is required for the score task
        hf_overrides=overrides,
        enable_lora=True,  # the LoRA + reranker init path under test
    )
|
||||
|
||||
|
||||
def test_reranker_models_lora():
    """Regression test: Qwen3-Reranker-0.6B initializes and scores with LoRA on.

    Exercises the full init + pooling path that previously failed when the
    reranker was loaded with ``--enable-lora``, then sanity-checks the
    relevance scores produced by ``LLM.score``.
    """
    # Load the Jinja chat template that formats each query/document pair
    # the way the reranker model expects.
    template_home = Path(__file__).parent / "template"
    chat_template = (template_home / "qwen3_reranker.jinja").read_text()

    # Sample pairs: documents[i] is scored against queries[i].
    queries = [
        "What is the capital of China?",
        "Explain gravity",
    ]
    documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
    ]

    # Initialize the LLM with the original Qwen3-Reranker configuration.
    llm = get_llm()

    # score() returns one relevance score per query/document pair;
    # higher scores indicate better relevance.
    outputs = llm.score(queries, documents, chat_template=chat_template)
    scores = [output.outputs.score for output in outputs]

    print("-" * 30)
    print("Relevance scores:", scores)
    print("-" * 30)

    # The original version only printed the scores, so the test could never
    # fail on bad output. Assert basic sanity: one finite float per pair.
    assert len(scores) == len(queries)
    assert all(isinstance(score, float) and math.isfinite(score) for score in scores)
|
||||
Reference in New Issue
Block a user