Files
ColBERT-Zero/README.md
ModelHub XC b2698c18ef 初始化项目,由ModelHub XC社区提供模型
Model: lightonai/ColBERT-Zero
Source: Original Platform
2026-05-14 16:59:45 +08:00

149 KiB
Raw Permalink Blame History

tags, pipeline_tag, library_name, license, language, metrics, model-index
tags pipeline_tag library_name license language metrics model-index
ColBERT
PyLate
sentence-transformers
sentence-similarity
feature-extraction
generated_from_trainer
dataset_size:640000
loss:Distillation
sentence-similarity PyLate apache-2.0
en
MaxSim_accuracy@1
MaxSim_accuracy@3
MaxSim_accuracy@5
MaxSim_accuracy@10
MaxSim_precision@1
MaxSim_precision@3
MaxSim_precision@5
MaxSim_precision@10
MaxSim_recall@1
MaxSim_recall@3
MaxSim_recall@5
MaxSim_recall@10
MaxSim_ndcg@10
MaxSim_mrr@10
MaxSim_map@100
name results
PyLate
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoClimateFEVER NanoClimateFEVER
type value name
MaxSim_accuracy@1 0.36 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.68 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.76 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.88 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.36 Maxsim Precision@1
type value name
MaxSim_precision@3 0.2866666666666666 Maxsim Precision@3
type value name
MaxSim_precision@5 0.21999999999999997 Maxsim Precision@5
type value name
MaxSim_precision@10 0.148 Maxsim Precision@10
type value name
MaxSim_recall@1 0.18 Maxsim Recall@1
type value name
MaxSim_recall@3 0.35999999999999993 Maxsim Recall@3
type value name
MaxSim_recall@5 0.429 Maxsim Recall@5
type value name
MaxSim_recall@10 0.5536666666666666 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.4511316943880545 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.5352619047619046 Maxsim Mrr@10
type value name
MaxSim_map@100 0.35707500469760434 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoDBPedia NanoDBPedia
type value name
MaxSim_accuracy@1 0.86 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.94 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.94 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.98 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.86 Maxsim Precision@1
type value name
MaxSim_precision@3 0.7333333333333333 Maxsim Precision@3
type value name
MaxSim_precision@5 0.66 Maxsim Precision@5
type value name
MaxSim_precision@10 0.5840000000000001 Maxsim Precision@10
type value name
MaxSim_recall@1 0.10798996781634018 Maxsim Recall@1
type value name
MaxSim_recall@3 0.21610834839667603 Maxsim Recall@3
type value name
MaxSim_recall@5 0.29328648273572205 Maxsim Recall@5
type value name
MaxSim_recall@10 0.4273378391765384 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.7325830538365519 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.8995238095238095 Maxsim Mrr@10
type value name
MaxSim_map@100 0.5805986129726132 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoFEVER NanoFEVER
type value name
MaxSim_accuracy@1 0.96 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 1.0 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 1.0 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 1.0 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.96 Maxsim Precision@1
type value name
MaxSim_precision@3 0.3533333333333333 Maxsim Precision@3
type value name
MaxSim_precision@5 0.21199999999999997 Maxsim Precision@5
type value name
MaxSim_precision@10 0.10999999999999999 Maxsim Precision@10
type value name
MaxSim_recall@1 0.8966666666666667 Maxsim Recall@1
type value name
MaxSim_recall@3 0.9633333333333333 Maxsim Recall@3
type value name
MaxSim_recall@5 0.9633333333333333 Maxsim Recall@5
type value name
MaxSim_recall@10 0.98 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.9624259972128165 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.9766666666666666 Maxsim Mrr@10
type value name
MaxSim_map@100 0.9478155706727135 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoFiQA2018 NanoFiQA2018
type value name
MaxSim_accuracy@1 0.58 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.66 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.72 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.82 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.58 Maxsim Precision@1
type value name
MaxSim_precision@3 0.32666666666666666 Maxsim Precision@3
type value name
MaxSim_precision@5 0.24799999999999997 Maxsim Precision@5
type value name
MaxSim_precision@10 0.14799999999999996 Maxsim Precision@10
type value name
MaxSim_recall@1 0.35257936507936505 Maxsim Recall@1
type value name
MaxSim_recall@3 0.47423809523809524 Maxsim Recall@3
type value name
MaxSim_recall@5 0.5460079365079364 Maxsim Recall@5
type value name
MaxSim_recall@10 0.6425317460317461 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.5786162417612232 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.643436507936508 Maxsim Mrr@10
type value name
MaxSim_map@100 0.5234035855771078 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoHotpotQA NanoHotpotQA
type value name
MaxSim_accuracy@1 0.98 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 1.0 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 1.0 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 1.0 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.98 Maxsim Precision@1
type value name
MaxSim_precision@3 0.6 Maxsim Precision@3
type value name
MaxSim_precision@5 0.3679999999999999 Maxsim Precision@5
type value name
MaxSim_precision@10 0.18599999999999994 Maxsim Precision@10
type value name
MaxSim_recall@1 0.49 Maxsim Recall@1
type value name
MaxSim_recall@3 0.9 Maxsim Recall@3
type value name
MaxSim_recall@5 0.92 Maxsim Recall@5
type value name
MaxSim_recall@10 0.93 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.924329868595787 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.99 Maxsim Mrr@10
type value name
MaxSim_map@100 0.8944956212370004 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoMSMARCO NanoMSMARCO
type value name
MaxSim_accuracy@1 0.6 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.68 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.78 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.9 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.6 Maxsim Precision@1
type value name
MaxSim_precision@3 0.22666666666666668 Maxsim Precision@3
type value name
MaxSim_precision@5 0.15600000000000003 Maxsim Precision@5
type value name
MaxSim_precision@10 0.09 Maxsim Precision@10
type value name
MaxSim_recall@1 0.6 Maxsim Recall@1
type value name
MaxSim_recall@3 0.68 Maxsim Recall@3
type value name
MaxSim_recall@5 0.78 Maxsim Recall@5
type value name
MaxSim_recall@10 0.9 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.7242459443760582 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.671047619047619 Maxsim Mrr@10
type value name
MaxSim_map@100 0.6766320575975747 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoNFCorpus NanoNFCorpus
type value name
MaxSim_accuracy@1 0.58 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.68 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.72 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.76 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.58 Maxsim Precision@1
type value name
MaxSim_precision@3 0.42666666666666664 Maxsim Precision@3
type value name
MaxSim_precision@5 0.396 Maxsim Precision@5
type value name
MaxSim_precision@10 0.316 Maxsim Precision@10
type value name
MaxSim_recall@1 0.06598420757312619 Maxsim Recall@1
type value name
MaxSim_recall@3 0.10355307905498773 Maxsim Recall@3
type value name
MaxSim_recall@5 0.1296680186177352 Maxsim Recall@5
type value name
MaxSim_recall@10 0.1635498250401139 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.4054849783640007 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.6303888888888889 Maxsim Mrr@10
type value name
MaxSim_map@100 0.195854964801369 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoNQ NanoNQ
type value name
MaxSim_accuracy@1 0.62 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.84 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.88 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.9 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.62 Maxsim Precision@1
type value name
MaxSim_precision@3 0.28 Maxsim Precision@3
type value name
MaxSim_precision@5 0.176 Maxsim Precision@5
type value name
MaxSim_precision@10 0.09599999999999997 Maxsim Precision@10
type value name
MaxSim_recall@1 0.59 Maxsim Recall@1
type value name
MaxSim_recall@3 0.78 Maxsim Recall@3
type value name
MaxSim_recall@5 0.81 Maxsim Recall@5
type value name
MaxSim_recall@10 0.86 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.7474767067573468 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.7341904761904762 Maxsim Mrr@10
type value name
MaxSim_map@100 0.7035987374595623 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoQuoraRetrieval NanoQuoraRetrieval
type value name
MaxSim_accuracy@1 0.92 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.98 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 1.0 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 1.0 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.92 Maxsim Precision@1
type value name
MaxSim_precision@3 0.3933333333333333 Maxsim Precision@3
type value name
MaxSim_precision@5 0.24799999999999997 Maxsim Precision@5
type value name
MaxSim_precision@10 0.128 Maxsim Precision@10
type value name
MaxSim_recall@1 0.7973333333333332 Maxsim Recall@1
type value name
MaxSim_recall@3 0.932 Maxsim Recall@3
type value name
MaxSim_recall@5 0.9626666666666668 Maxsim Recall@5
type value name
MaxSim_recall@10 0.9726666666666667 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.9376063901029283 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.9540000000000001 Maxsim Mrr@10
type value name
MaxSim_map@100 0.9156057922958499 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoSCIDOCS NanoSCIDOCS
type value name
MaxSim_accuracy@1 0.48 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.74 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.76 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.9 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.48 Maxsim Precision@1
type value name
MaxSim_precision@3 0.4066666666666666 Maxsim Precision@3
type value name
MaxSim_precision@5 0.30400000000000005 Maxsim Precision@5
type value name
MaxSim_precision@10 0.204 Maxsim Precision@10
type value name
MaxSim_recall@1 0.10266666666666666 Maxsim Recall@1
type value name
MaxSim_recall@3 0.25066666666666665 Maxsim Recall@3
type value name
MaxSim_recall@5 0.3106666666666667 Maxsim Recall@5
type value name
MaxSim_recall@10 0.41666666666666663 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.41240108229211636 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.6183888888888889 Maxsim Mrr@10
type value name
MaxSim_map@100 0.3293535579753635 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoArguAna NanoArguAna
type value name
MaxSim_accuracy@1 0.24 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.64 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.7 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.9 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.24 Maxsim Precision@1
type value name
MaxSim_precision@3 0.21333333333333335 Maxsim Precision@3
type value name
MaxSim_precision@5 0.14 Maxsim Precision@5
type value name
MaxSim_precision@10 0.08999999999999998 Maxsim Precision@10
type value name
MaxSim_recall@1 0.24 Maxsim Recall@1
type value name
MaxSim_recall@3 0.64 Maxsim Recall@3
type value name
MaxSim_recall@5 0.7 Maxsim Recall@5
type value name
MaxSim_recall@10 0.9 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.5619950169581177 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.4556587301587301 Maxsim Mrr@10
type value name
MaxSim_map@100 0.4583679653679654 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoSciFact NanoSciFact
type value name
MaxSim_accuracy@1 0.7 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.82 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.88 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.92 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.7 Maxsim Precision@1
type value name
MaxSim_precision@3 0.2866666666666667 Maxsim Precision@3
type value name
MaxSim_precision@5 0.19599999999999998 Maxsim Precision@5
type value name
MaxSim_precision@10 0.10199999999999998 Maxsim Precision@10
type value name
MaxSim_recall@1 0.675 Maxsim Recall@1
type value name
MaxSim_recall@3 0.79 Maxsim Recall@3
type value name
MaxSim_recall@5 0.87 Maxsim Recall@5
type value name
MaxSim_recall@10 0.91 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.8019869692829787 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.7716666666666667 Maxsim Mrr@10
type value name
MaxSim_map@100 0.7651960954534442 Maxsim Map@100
task dataset metrics
type name
py-late-information-retrieval Py Late Information Retrieval
name type
NanoTouche2020 NanoTouche2020
type value name
MaxSim_accuracy@1 0.8163265306122449 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.9795918367346939 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.9795918367346939 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.9795918367346939 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.8163265306122449 Maxsim Precision@1
type value name
MaxSim_precision@3 0.727891156462585 Maxsim Precision@3
type value name
MaxSim_precision@5 0.6653061224489795 Maxsim Precision@5
type value name
MaxSim_precision@10 0.5387755102040817 Maxsim Precision@10
type value name
MaxSim_recall@1 0.05638641704555484 Maxsim Recall@1
type value name
MaxSim_recall@3 0.1492928448908377 Maxsim Recall@3
type value name
MaxSim_recall@5 0.2240629902771357 Maxsim Recall@5
type value name
MaxSim_recall@10 0.3474561127492143 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.6176094809857532 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.8775510204081632 Maxsim Mrr@10
type value name
MaxSim_map@100 0.4570510040327342 Maxsim Map@100
task dataset metrics
type name
nano-beir Nano BEIR
name type
NanoBEIR mean NanoBEIR_mean
type value name
MaxSim_accuracy@1 0.6689481946624803 Maxsim Accuracy@1
type value name
MaxSim_accuracy@3 0.8184301412872842 Maxsim Accuracy@3
type value name
MaxSim_accuracy@5 0.855353218210361 Maxsim Accuracy@5
type value name
MaxSim_accuracy@10 0.9184301412872842 Maxsim Accuracy@10
type value name
MaxSim_precision@1 0.6689481946624803 Maxsim Precision@1
type value name
MaxSim_precision@3 0.4047095761381475 Maxsim Precision@3
type value name
MaxSim_precision@5 0.3068697017268446 Maxsim Precision@5
type value name
MaxSim_precision@10 0.210828885400314 Maxsim Precision@10
type value name
MaxSim_recall@1 0.39650820186008107 Maxsim Recall@1
type value name
MaxSim_recall@3 0.5568609513523536 Maxsim Recall@3
type value name
MaxSim_recall@5 0.6106686226773229 Maxsim Recall@5
type value name
MaxSim_recall@10 0.6926058094613547 Maxsim Recall@10
type value name
MaxSim_ndcg@10 0.6813764173010564 Maxsim Ndcg@10
type value name
MaxSim_mrr@10 0.7505985522414094 Maxsim Mrr@10
type value name
MaxSim_map@100 0.6003883515493001 Maxsim Map@100

Website LinkedIn X

📄 Paper | 📝 Blog | 📚 Collection

ColBERT-Zero

🎯 TL;DR: First large-scale fully pre-trained ColBERT model using only public data. Achieves 55.43 nDCG@10 on BEIR benchmark, outperforming GTE-ModernColBERT and GTE-ModernBERT trained on closed and stronger data. New SOTA on BEIR for models <150M parameters.

Why ColBERT-Zero?

Late interaction (ColBERT / multi-vector) models have clear advantages in out-of-domain generalization, long-context handling, and reasoning-intensive retrieval. Yet they remain undertrained: current state-of-the-art ColBERT models (e.g., GTE-ModernColBERT and ColBERT-small) are simply built by bolting a small knowledge distillation step onto a strong dense (single-vector) model. Even recent efforts like mxbai-edge-colbert-v0 perform all early training stages in a single-vector setting, only switching to the multi-vector objective at the very end.

This leaves a lot of performance on the table. ColBERT-Zero demonstrates that performing contrastive pre-training directly in the multi-vector setting, rather than treating it as an afterthought, unlocks a significantly higher performance ceiling. Trained exclusively on public data (Nomic-embed dataset mixture), ColBERT-Zero overcomes a 2.4-point data quality disadvantage to outperform models trained on proprietary, closed-source data. For detailed results, please have a look at our blogpost and the paper. All the models (including intermediate checkpoints) as well as the training code are released under an Apache 2.0 license.

Controlled Comparison Design

We deliberately trained on the public Nomic-embed data mixture for a strategic reason: Nomic has already trained a dense ModernBERT model (ModernBERT-embed) on this exact data. This lets us compare dense vs. multi-vector training with the same data, same base model (ModernBERT), and same pipeline. The only variable is whether the contrastive phases are performed in the dense or multi-vector setting.

This design reveals a striking result: the dense baseline trained on Nomic data scores 52.89, while the one trained on GTE's proprietary data scores 55.33: a 2.4-point data quality gap. Despite this disadvantage, ColBERT-Zero's full multi-vector pre-training pipeline closes and surpasses this gap, reaching 55.43 nDCG@10.

The Three-Phase Training Pipeline

The development followed a three-phase pipeline, each providing a different type of learning signal:

Phase 1 - Unsupervised Contrastive Pre-training

We began with the nomic-embed-unsupervised-data dataset. Using PyLate's GradCache implementation to scale per-GPU batch size without VRAM constraints, combined with cross-GPU gathering of representations, we reached effective batch sizes of ~16k, required for unsupervised training to produce plausible in-batch hard negatives. Unlike dense training, the multi-vector objective allows the encoder to learn fine-grained token importance from the very first phase.

Phase 2 - Supervised Contrastive Fine-tuning

We refined the model using the nomic-embed-supervised-data. This stage introduced mined hard negatives: documents that are superficially similar to the query but not actually relevant. This allows teaching the model to handle nuance by prioritizing specific keywords and contextual tokens most indicative of a true match.

Phase 3 - Knowledge Distillation (KD)

The final stage used the ms-marco-en-bge dataset. We leveraged a powerful Gemma-based model as a teacher, allowing our student models to learn to replicate complex reasoning scores via the efficient MaxSim operator.

Key Findings

1. The Standard Recipe Leaves Performance on the Table

The KD-only approach (the current industry standard) scores 54.09, lagging behind full pre-training by 1.3 points. A simple distillation step is insufficient for optimal multi-vector performance.

2. Supervised + KD Is the Efficiency Sweet Spot

By running a supervised contrastive step in the multi-vector setting before distillation, we reach 55.12 nDCG@10, closing most of the gap with the fully pre-trained model (55.43). This costs ~40 GH200-hours instead of ~408: roughly 10× cheaper for 99.4% of the performance.

3. Prompt Alignment Is Non-Negotiable

Nomic's base models are pre-trained with asymmetric prompts (search_query: and search_document:). While ColBERT has its own asymmetric mechanism via [Q] and [D] markers, we found:

  • Stripping pre-training prompts during fine-tuning causes significant performance degradation.
  • Adding prompts to a model not pre-trained with them also hurts performance.
  • Even with perfect alignment, prompts provide an intrinsic benefit: full ColBERT pre-training with prompts (55.43) vs. without prompts (54.61), no mismatch in either case, shows a meaningful 0.82-point gap.

Why do prompts help? Our leading hypothesis is that prompt tokens act as implicit query expansion: extra slots that don't carry specific meaning but let the model store global information about the sequence. The original ColBERT used [PAD] tokens for this purpose, but modern Flash Attention implementations broke this trick (masked tokens no longer produce usable embeddings). Explicit prompt tokens may be quietly re-enabling it.

Practical takeaway: Always align your prompts with the base model's pre-training setup. Misalignment is one of the easiest ways to silently lose performance. Note that this sensitivity decreases with stronger downstream fine-tuning: with enough training, the model can adapt to an initial mismatch.

Model Lineup

The Main Models (ColBERT-Zero)

ColBERT-Zero utilizes the full 3-phase pipeline with strict prompt alignment, achieving 55.43 nDCG@10 on BEIR, setting a new SOTA for models <150M parameters. We also provide ColBERT-Zero-noprompts, the same pipeline without asymmetric prompts, to study the impact of query expansion on multi-vector performance.

The cheap-to-train ones (ModernColBERT-embed-base)

These models represent the practical sweet spot. By skipping the expensive unsupervised phase, ModernColBERT-embed-base (Supervised + KD) achieves ~99% of the flagship's performance (55.12 vs. 55.43 nDCG@10) at only ~10% of the compute cost. For reference, ModernColBERT-embed-base-kd performs only the distillation step on a supervised dense base.

Intermediate Checkpoints

For researchers studying the incremental impact of each phase and prompt alignment, we release several ablation variants: ColBERT-Zero-supervised, ColBERT-Zero-unsupervised (and their -noprompts versions), and ModernColBERT-embed-base-supervised.

Full Performance on BEIR

<html lang="en"> <head> <style> .beir-wrap { overflow-x: auto; font-family: system-ui, sans-serif; width: 100%; display: block; -webkit-overflow-scrolling: touch; } .beir-wrap table { border-collapse: collapse; font-size: 0.70rem; white-space: nowrap; background: #fff; box-shadow: 0 1px 4px rgba(0,0,0,.1); border-radius: 8px; min-width: max-content; } .beir-wrap th, .beir-wrap td { padding: 7px 10px; text-align: center; border-bottom: 1px solid #e9ecef; } .beir-wrap td:first-child, .beir-wrap th:first-child { text-align: left; min-width: 260px; } .beir-wrap th { background: #1e293b; color: #fff; font-weight: 600; } .beir-wrap th.avg-col { background: #f59e0b; color: #1e293b; font-weight: 700; } .beir-wrap td.avg-col { font-weight: 700; font-size: 0.78rem; color: #1e293b; background: #fef3c7; border-left: 2px solid #f59e0b; border-right: 2px solid #f59e0b; } .beir-wrap tr:last-child td.avg-col { border-bottom: 2px solid #f59e0b; } .beir-wrap .section-row td { background: #334155; color: #94a3b8; font-weight: 600; font-size: 0.72rem; letter-spacing: .05em; text-transform: uppercase; padding: 5px 10px; } .beir-wrap strong { color: #0f172a; } .beir-wrap tbody tr:not(.section-row):hover td { background: #f1f5f9; } .beir-wrap tbody tr:not(.section-row):hover td.avg-col { background: #fde68a; } .beir-wrap a { color: #3b82f6; text-decoration: none; } .beir-wrap a:hover { text-decoration: underline; } </style> </head>
Model Avg FiQA NFCorpus TREC-COVID Touche ArguAna Quora SCIDOCS SciFact NQ ClimateFEVER HotpotQA DBPedia CQADupstack FEVER MSMARCO
Baselines
ModernBERT-embed-unsupervised 47.05 42.53 35.33 68.44 18.58 48.82 88.63 19.83 72.30 46.32 22.97 60.00 37.97 42.40 67.39 34.23
ModernBERT-embed-supervised 52.89 40.59 33.40 84.15 31.91 48.96 88.85 18.59 69.63 62.15 35.67 67.11 41.50 42.08 87.35 41.47
GTE-ModernColBERT 54.67 45.28 37.93 83.59 31.23 48.51 86.61 19.06 76.34 61.80 30.62 77.32 48.03 41.00 87.44 45.32
gte-modernbert-base 55.33 48.81 36.44 81.95 21.68 72.68 88.55 21.29 77.40 57.62 37.74 69.47 41.79 42.63 91.03 40.90
KD from dense supervised
ModernColBERT-embed-base-kd-only 54.09 42.51 37.01 79.52 34.58 51.75 87.67 18.15 75.04 61.45 28.31 76.70 47.54 40.68 84.82 45.57
Supervised + KD from dense unsupervised
ModernColBERT-embed-base-supervised 50.72 40.09 35.56 71.12 25.53 44.27 86.96 18.19 73.78 58.89 32.95 71.49 43.23 42.55 70.51 45.72
ModernColBERT-embed-base 55.12 41.50 36.51 77.46 33.77 52.45 86.26 18.66 74.90 62.24 37.27 80.07 48.27 41.60 89.71 46.17
ColBERT-Zero
Unsupervised 51.44 45.38 36.88 67.82 22.59 51.53 87.78 22.30 76.76 58.80 24.24 68.29 43.16 45.76 81.58 38.78
Supervised 51.81 42.45 35.60 74.72 23.83 41.81 87.19 19.85 73.71 61.95 35.01 71.37 46.20 45.16 72.61 45.68
Distilled 55.43 42.62 37.28 78.69 36.13 53.07 85.24 19.88 76.50 61.66 35.72 79.41 47.48 41.34 90.59 45.80
ColBERT-Zero-noprompts
Unsupervised 51.70 45.31 34.72 73.55 23.26 52.56 88.15 22.63 76.10 59.18 24.24 66.66 42.61 45.56 81.88 39.15
Supervised 52.39 43.36 36.01 72.42 23.79 47.42 87.79 21.30 73.85 62.25 31.61 70.32 44.07 44.03 85.54 42.11
Distilled 54.61 43.14 36.60 78.60 36.36 49.49 88.05 19.13 76.42 61.73 32.70 76.99 47.69 40.21 85.97 46.01
</html>

Limitations & Discussion

  • Data-specific findings. We deliberately used the Nomic Embed data mixture for controlled comparison. Some observations (particularly around prompt sensitivity) may not generalize to different or stronger training configurations.
  • Scale vs. objective. The gains from multi-vector pre-training likely reflect more training time in the multi-vector setting, rather than the contrastive objective itself. Performing KD alone at a larger scale might yield similar or superior results due to the higher quality of the distillation signal. Our study uses the conventional setup where training scale is inversely proportional to signal quality, reflecting the higher cost of generating high-quality labels.
  • Prompt sensitivity decreases with stronger fine-tuning. When experimenting with stronger fine-tuning data (e.g., NV-Retriever), adding prompts on top of a model pre-trained without them did not degrade results the way it did with ColBERT-Zero. With enough downstream training, the model can adapt to an initial mismatch.

Serving at Scale

For production deployment of ColBERT-Zero and other multi-vector models, check out NextPlaid and FastPlaid, our production-grade engines for multi-vector retrieval.

Resources

Model Details

Model Description

  • Model Type: PyLate model
  • Document Length: 519 tokens
  • Query Length: 39 tokens
  • Output Dimensionality: 128 dimensions
  • Similarity Function: MaxSim
  • Training Dataset:
    • train

Model Sources

Full Model Architecture

ColBERT(
  (0): Transformer({'max_seq_length': 518, 'do_lower_case': False, 'architecture': 'ModernBertModel'})
  (1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False})
)

Usage

First install the PyLate library:

pip install -U pylate

Warning

Prompt alignment is critical for ColBERT-Zero models. You must use prompt_name="query" when encoding queries and prompt_name="document" when encoding documents. ColBERT-Zero was pre-trained with asymmetric prompts (search_query: / search_document:), and stripping them causes significant performance degradation.

Retrieval

Use this model with PyLate to index and retrieve documents. The index uses FastPLAID for efficient similarity search.

Indexing documents

Load the ColBERT model and initialize the PLAID index, then encode and index your documents:

from pylate import indexes, models, retrieve

# Step 1: Load the ColBERT model
model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

# Step 2: Initialize the PLAID index
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
    override=True,  # This overwrites the existing index if any
)

# Step 3: Encode the documents
documents_ids = ["1", "2", "3"]
documents = ["document 1 text", "document 2 text", "document 3 text"]

documents_embeddings = model.encode(
    documents,
    batch_size=32,
    is_query=False,  # Ensure that it is set to False to indicate that these are documents, not queries
    prompt_name="document", # ⚠️ Required for ColBERT-Zero! Do not omit.
    show_progress_bar=True,
)

# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids
index.add_documents(
    documents_ids=documents_ids,
    documents_embeddings=documents_embeddings,
)

Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it:

# To load an index, simply instantiate it with the correct folder/name and without overriding it
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
)

Retrieving top-k documents for queries

Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries. To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries, and then retrieve the top-k documents to get the ids and relevance scores of the top matches:

Warning: Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.

# Step 1: Initialize the ColBERT retriever
retriever = retrieve.ColBERT(index=index)

# Step 2: Encode the queries
queries_embeddings = model.encode(
    ["query for document 3", "query for document 1"],
    batch_size=32,
    is_query=True,  # Ensure that it is set to True to indicate that these are queries
    prompt_name="query", # ⚠️ Required for ColBERT-Zero! Do not omit.
    show_progress_bar=True,
)

# Step 3: Retrieve top-k documents
scores = retriever.retrieve(
    queries_embeddings=queries_embeddings,
    k=10,  # Retrieve the top 10 matches for each query
)

Reranking

Warning

Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.

If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the rank.rerank function and pass it the queries and documents to rerank:

from pylate import rank, models

# Queries whose candidate documents should be reranked.
queries = [
    "query A",
    "query B",
]

# Candidate documents: one inner list per query, in the same order as `queries`.
documents = [
    ["document A", "document B"],
    ["document 1", "document C", "document B"],
]

# Ids of the candidates: same nesting and order as `documents`.
documents_ids = [
    [1, 2],
    [1, 3, 2],
]

# NOTE: "pylate_model_id" is a placeholder; replace it with the id or path of
# the model to load (presumably "lightonai/ColBERT-Zero" for this model card).
model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

# Encode the queries (is_query=True) with the query prompt.
queries_embeddings = model.encode(
    queries,
    is_query=True,
    prompt_name="query" # ⚠️ Required for ColBERT-Zero! Do not omit.
)

# Encode the candidate documents (is_query=False) with the document prompt.
documents_embeddings = model.encode(
    documents,
    is_query=False,
    prompt_name="document" # ⚠️ Required for ColBERT-Zero! Do not omit.
)

# Rerank each query's candidates from the embeddings computed above.
reranked_documents = rank.rerank(
    documents_ids=documents_ids,
    queries_embeddings=queries_embeddings,
    documents_embeddings=documents_embeddings,
)

Evaluation

Metrics

PyLate Information Retrieval

  • Dataset: ['NanoClimateFEVER', 'NanoDBPedia', 'NanoFEVER', 'NanoFiQA2018', 'NanoHotpotQA', 'NanoMSMARCO', 'NanoNFCorpus', 'NanoNQ', 'NanoQuoraRetrieval', 'NanoSCIDOCS', 'NanoArguAna', 'NanoSciFact', 'NanoTouche2020']
  • Evaluated with pylate.evaluation.pylate_information_retrieval_evaluator.PyLateInformationRetrievalEvaluator
Metric NanoClimateFEVER NanoDBPedia NanoFEVER NanoFiQA2018 NanoHotpotQA NanoMSMARCO NanoNFCorpus NanoNQ NanoQuoraRetrieval NanoSCIDOCS NanoArguAna NanoSciFact NanoTouche2020
MaxSim_accuracy@1 0.36 0.86 0.96 0.58 0.98 0.6 0.58 0.62 0.92 0.48 0.24 0.7 0.8163
MaxSim_accuracy@3 0.68 0.94 1.0 0.66 1.0 0.68 0.68 0.84 0.98 0.74 0.64 0.82 0.9796
MaxSim_accuracy@5 0.76 0.94 1.0 0.72 1.0 0.78 0.72 0.88 1.0 0.76 0.7 0.88 0.9796
MaxSim_accuracy@10 0.88 0.98 1.0 0.82 1.0 0.9 0.76 0.9 1.0 0.9 0.9 0.92 0.9796
MaxSim_precision@1 0.36 0.86 0.96 0.58 0.98 0.6 0.58 0.62 0.92 0.48 0.24 0.7 0.8163
MaxSim_precision@3 0.2867 0.7333 0.3533 0.3267 0.6 0.2267 0.4267 0.28 0.3933 0.4067 0.2133 0.2867 0.7279
MaxSim_precision@5 0.22 0.66 0.212 0.248 0.368 0.156 0.396 0.176 0.248 0.304 0.14 0.196 0.6653
MaxSim_precision@10 0.148 0.584 0.11 0.148 0.186 0.09 0.316 0.096 0.128 0.204 0.09 0.102 0.5388
MaxSim_recall@1 0.18 0.108 0.8967 0.3526 0.49 0.6 0.066 0.59 0.7973 0.1027 0.24 0.675 0.0564
MaxSim_recall@3 0.36 0.2161 0.9633 0.4742 0.9 0.68 0.1036 0.78 0.932 0.2507 0.64 0.79 0.1493
MaxSim_recall@5 0.429 0.2933 0.9633 0.546 0.92 0.78 0.1297 0.81 0.9627 0.3107 0.7 0.87 0.2241
MaxSim_recall@10 0.5537 0.4273 0.98 0.6425 0.93 0.9 0.1635 0.86 0.9727 0.4167 0.9 0.91 0.3475
MaxSim_ndcg@10 0.4511 0.7326 0.9624 0.5786 0.9243 0.7242 0.4055 0.7475 0.9376 0.4124 0.562 0.802 0.6176
MaxSim_mrr@10 0.5353 0.8995 0.9767 0.6434 0.99 0.671 0.6304 0.7342 0.954 0.6184 0.4557 0.7717 0.8776
MaxSim_map@100 0.3571 0.5806 0.9478 0.5234 0.8945 0.6766 0.1959 0.7036 0.9156 0.3294 0.4584 0.7652 0.4571

NanoBEIR

  • Dataset: NanoBEIR_mean
  • Evaluated with pylate.evaluation.nano_beir_evaluator.NanoBEIREvaluator
Metric Value
MaxSim_accuracy@1 0.6689
MaxSim_accuracy@3 0.8184
MaxSim_accuracy@5 0.8554
MaxSim_accuracy@10 0.9184
MaxSim_precision@1 0.6689
MaxSim_precision@3 0.4047
MaxSim_precision@5 0.3069
MaxSim_precision@10 0.2108
MaxSim_recall@1 0.3965
MaxSim_recall@3 0.5569
MaxSim_recall@5 0.6107
MaxSim_recall@10 0.6926
MaxSim_ndcg@10 0.6814
MaxSim_mrr@10 0.7506
MaxSim_map@100 0.6004

Training Details

Training Dataset

train

  • Dataset: train
  • Size: 640,000 training samples
  • Columns: query_id, document_ids, and scores
  • Approximate statistics based on the first 1000 samples:
    query_id document_ids scores
    type int list list
    details [836: ~0.10%; 3582: ~0.10%; ...] [size: 32 elements] [size: 32 elements]
  • Samples:
    query_id document_ids scores
    685613 [7546874, 1176459, 197677, 2306318, 8541504, ...] [0.9999999992804947, 0.24845418756716053, 0.7594154013647826, 0.26644182105618575, 0.390668914839766, ...]
    237784 [6366584, 4034101, 2325374, 6914618, 6042146, ...] [0.9999999991784339, 0.42233632827946693, 0.5956354295491569, 0.12644415907455164, 0.6636713730105909, ...]
    904294 [448408, 8743975, 49600, 7339401, 2714261, ...] [0.9999999991841937, 0.877629062381539, 0.8330146583389045, 0.3116634796692611, 0.4633524534142185, ...]
  • Loss: pylate.losses.distillation.Distillation

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 4
  • per_device_eval_batch_size: 4
  • gradient_accumulation_steps: 2
  • learning_rate: 1e-05
  • num_train_epochs: 1.0
  • bf16: True
  • dataloader_num_workers: 4
  • ddp_find_unused_parameters: False

All Hyperparameters

Click to expand
  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 4
  • per_device_eval_batch_size: 4
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 2
  • eval_accumulation_steps: None
  • torch_empty_cache_steps: None
  • learning_rate: 1e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 1.0
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.0
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: True
  • fp16: False
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 3
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: True
  • dataloader_num_workers: 4
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: False
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: None
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • include_for_metrics: []
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • use_liger_kernel: False
  • eval_use_gather_object: False
  • average_tokens_across_devices: False
  • prompts: None
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: proportional
  • router_mapping: {}
  • learning_rate_mapping: {}

Training Logs

Click to expand
Epoch Step Training Loss NanoClimateFEVER_MaxSim_ndcg@10 NanoDBPedia_MaxSim_ndcg@10 NanoFEVER_MaxSim_ndcg@10 NanoFiQA2018_MaxSim_ndcg@10 NanoHotpotQA_MaxSim_ndcg@10 NanoMSMARCO_MaxSim_ndcg@10 NanoNFCorpus_MaxSim_ndcg@10 NanoNQ_MaxSim_ndcg@10 NanoQuoraRetrieval_MaxSim_ndcg@10 NanoSCIDOCS_MaxSim_ndcg@10 NanoArguAna_MaxSim_ndcg@10 NanoSciFact_MaxSim_ndcg@10 NanoTouche2020_MaxSim_ndcg@10 NanoBEIR_mean_MaxSim_ndcg@10
0.0025 50 0.0187 - - - - - - - - - - - - - -
0.0275 550 0.0155 - - - - - - - - - - - - - -
0.0525 1050 0.0146 - - - - - - - - - - - - - -
0.075 1500 0.0141 0.4530 0.7263 0.9670 0.5786 0.9313 0.7349 0.3994 0.7587 0.9506 0.4292 0.5152 0.8059 0.6139 0.6818
0.0775 1550 0.0139 - - - - - - - - - - - - - -
0.1025 2050 0.0138 - - - - - - - - - - - - - -
0.1275 2550 0.0132 - - - - - - - - - - - - - -
0.15 3000 0.0132 0.4562 0.7260 0.9738 0.5756 0.9221 0.7378 0.4021 0.7555 0.9473 0.4276 0.5376 0.8082 0.6206 0.6839
0.1525 3050 0.013 - - - - - - - - - - - - - -
0.1775 3550 0.0129 - - - - - - - - - - - - - -
0.2025 4050 0.0129 - - - - - - - - - - - - - -
0.225 4500 0.0126 0.4551 0.7381 0.9624 0.5890 0.9238 0.7381 0.3978 0.7522 0.9400 0.4206 0.5455 0.8141 0.6184 0.6842
0.2275 4550 0.0124 - - - - - - - - - - - - - -
0.2525 5050 0.0126 - - - - - - - - - - - - - -
0.2775 5550 0.0123 - - - - - - - - - - - - - -
0.3 6000 0.012 0.4474 0.7375 0.9635 0.5908 0.9282 0.7416 0.4064 0.7551 0.9424 0.4198 0.5592 0.8074 0.6191 0.6860
0.3025 6050 0.0125 - - - - - - - - - - - - - -
0.3275 6550 0.012 - - - - - - - - - - - - - -
0.3525 7050 0.0122 - - - - - - - - - - - - - -
0.375 7500 0.0123 0.4534 0.7266 0.9631 0.5875 0.9294 0.7349 0.4012 0.7459 0.9417 0.4195 0.5608 0.8060 0.6205 0.6839
0.3775 7550 0.0118 - - - - - - - - - - - - - -
0.4025 8050 0.0118 - - - - - - - - - - - - - -
0.4275 8550 0.0119 - - - - - - - - - - - - - -
0.45 9000 0.0114 0.4537 0.7219 0.9631 0.5837 0.9290 0.7374 0.4032 0.7522 0.9496 0.4134 0.5572 0.8113 0.6190 0.6842
0.4525 9050 0.0117 - - - - - - - - - - - - - -
0.4775 9550 0.0119 - - - - - - - - - - - - - -
0.5025 10050 0.0112 - - - - - - - - - - - - - -
0.525 10500 0.0117 0.4541 0.7325 0.9653 0.5803 0.9243 0.7357 0.4092 0.7566 0.9468 0.4169 0.5596 0.8040 0.6177 0.6849
0.5275 10550 0.0116 - - - - - - - - - - - - - -
0.5525 11050 0.0115 - - - - - - - - - - - - - -
0.5775 11550 0.0112 - - - - - - - - - - - - - -
0.6 12000 0.0112 0.4606 0.7310 0.9624 0.5862 0.9243 0.7341 0.4085 0.7523 0.9463 0.4192 0.5708 0.8086 0.6201 0.6865
0.6025 12050 0.0116 - - - - - - - - - - - - - -
0.6275 12550 0.0113 - - - - - - - - - - - - - -
0.6525 13050 0.0115 - - - - - - - - - - - - - -
0.675 13500 0.0111 0.4505 0.7294 0.9653 0.5796 0.9289 0.7348 0.4063 0.7553 0.9451 0.4205 0.5627 0.8034 0.6173 0.6845
0.6775 13550 0.0112 - - - - - - - - - - - - - -
0.7025 14050 0.0112 - - - - - - - - - - - - - -
0.7275 14550 0.0109 - - - - - - - - - - - - - -
0.75 15000 0.0113 0.4544 0.7281 0.9624 0.5785 0.9227 0.7241 0.4081 0.7495 0.9391 0.4158 0.5639 0.8020 0.6195 0.6822
0.7525 15050 0.0112 - - - - - - - - - - - - - -
0.7775 15550 0.011 - - - - - - - - - - - - - -
0.8025 16050 0.0106 - - - - - - - - - - - - - -
0.825 16500 0.0113 0.4520 0.7354 0.9624 0.5784 0.9279 0.7340 0.4042 0.7505 0.9388 0.4117 0.5630 0.8020 0.6204 0.6831
0.8275 16550 0.0107 - - - - - - - - - - - - - -
0.8525 17050 0.0109 - - - - - - - - - - - - - -
0.8775 17550 0.011 - - - - - - - - - - - - - -
0.9 18000 0.0109 0.4548 0.7336 0.9624 0.5791 0.9243 0.7313 0.4067 0.7475 0.9376 0.4132 0.5625 0.8094 0.6214 0.6834
0.9025 18050 0.011 - - - - - - - - - - - - - -
0.9275 18550 0.0109 - - - - - - - - - - - - - -
0.9525 19050 0.0107 - - - - - - - - - - - - - -
0.975 19500 0.0111 0.4511 0.7326 0.9624 0.5786 0.9243 0.7242 0.4055 0.7475 0.9376 0.4124 0.5620 0.8020 0.6176 0.6814
0.9775 19550 0.0112 - - - - - - - - - - - - - -

Framework Versions

  • Python: 3.13.0
  • Sentence Transformers: 5.1.1
  • PyLate: 1.3.4
  • Transformers: 4.48.3
  • PyTorch: 2.6.0
  • Accelerate: 1.12.0
  • Datasets: 4.4.1
  • Tokenizers: 0.21.0

Citation

BibTeX

ColBERT-Zero

@misc{chaffin2026colbertzeropretrainpretraincolbert,
  title         = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models}, 
  author        = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala},
  year          = {2026},
  eprint        = {2602.16609},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2602.16609}, 
}

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084"
}

PyLate

@inproceedings{DBLP:conf/cikm/ChaffinS25,
  author       = {Antoine Chaffin and
                  Rapha{\"{e}}l Sourty},
  editor       = {Meeyoung Cha and
                  Chanyoung Park and
                  Noseong Park and
                  Carl Yang and
                  Senjuti Basu Roy and
                  Jessie Li and
                  Jaap Kamps and
                  Kijung Shin and
                  Bryan Hooi and
                  Lifang He},
  title        = {PyLate: Flexible Training and Retrieval for Late Interaction Models},
  booktitle    = {Proceedings of the 34th {ACM} International Conference on Information
                  and Knowledge Management, {CIKM} 2025, Seoul, Republic of Korea, November
                  10-14, 2025},
  pages        = {6334--6339},
  publisher    = {{ACM}},
  year         = {2025},
  url          = {https://github.com/lightonai/pylate},
  doi          = {10.1145/3746252.3761608},
}

Nomic Embed

@article{DBLP:journals/tmlr/NussbaumMMD25,
  author       = {Zach Nussbaum and
                  John Xavier Morris and
                  Andriy Mulyar and
                  Brandon Duderstadt},
  title        = {Nomic Embed: Training a Reproducible Long Context Text Embedder},
  journal      = {Trans. Mach. Learn. Res.},
  volume       = {2025},
  year         = {2025},
  url          = {https://openreview.net/forum?id=IPmzyQSiQE},
  timestamp    = {Fri, 20 Jun 2025 14:19:48 +0200},
  biburl       = {https://dblp.org/rec/journals/tmlr/NussbaumMMD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}