Files
xc-llm-ascend/tests/e2e/singlecard/pooling/test_qwen3_reranker_lora.py
yupeng 29f195a91c [Bugfix][LoRA] Fix the bug when runs Qwen3-Reranker-0.6B with LoRA. (#7156)
### What this PR does / why we need it?
Fix the error reported while initializing the qwen3-reranker-0.6b model
with `--enable-lora`.
And add a testcase to verify the fix.

- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2026-03-15 17:55:42 +08:00

74 lines
2.8 KiB
Python
Executable File

from pathlib import Path
from vllm import LLM
# HuggingFace model ID of the original Qwen3 reranker exercised by this test.
model_name = "Qwen/Qwen3-Reranker-0.6B"
def get_llm() -> LLM:
    """
    Build and return a vLLM instance configured for Qwen3-Reranker scoring.

    Returns:
        LLM: Configured vLLM instance for reranking tasks.

    Note:
        This loads the ORIGINAL Qwen3-Reranker checkpoint and applies
        HuggingFace config overrides so it is compatible with vLLM's
        score API.
    """
    # Config overrides that adapt the causal-LM checkpoint to a
    # sequence-classification head usable by the score API.
    hf_config_overrides = {
        # Route to Qwen3ForSequenceClassification instead of the default
        # Qwen3ForCausalLM architecture.
        "architectures": ["Qwen3ForSequenceClassification"],
        # The original reranker scores pairs using the logits of the
        # "no" and "yes" tokens from the language-model head.
        "classifier_from_token": ["no", "yes"],
        # Triggers the conversion logic that folds the two token vectors
        # into a single classification vector.
        "is_original_qwen3_reranker": True,
    }
    return LLM(
        # Original model from HuggingFace.
        model=model_name,
        # Pooling runner handles the score task.
        runner="pooling",
        hf_overrides=hf_config_overrides,
        # The LoRA path is what this test's bugfix exercises.
        enable_lora=True,
    )
def test_reranker_models_lora():
    """E2E check that Qwen3-Reranker-0.6B scores query/document pairs with LoRA enabled."""
    # Jinja template that formats each query-document pair for the reranker.
    template_file = Path(__file__).parent / "template" / "qwen3_reranker.jinja"
    chat_template = template_file.read_text()

    # Sample queries and their corresponding documents; documents[i] is
    # scored against queries[i].
    queries = [
        "What is the capital of China?",
        "Explain gravity",
    ]
    documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
    ]

    # Initialize the LLM with the original Qwen3-Reranker configuration.
    llm = get_llm()

    # score() returns one relevance score per query-document pair; higher
    # scores indicate better relevance.
    outputs = llm.score(queries, documents, chat_template=chat_template)

    # Print the relevance score extracted from each output.
    separator = "-" * 30
    print(separator)
    print("Relevance scores:", [result.outputs.score for result in outputs])
    print(separator)