初始化项目,由ModelHub XC社区提供模型
Model: 81melody/algerianME5 Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
10
1_Pooling/config.json
Normal file
10
1_Pooling/config.json
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"word_embedding_dimension": 768,
|
||||||
|
"pooling_mode_cls_token": false,
|
||||||
|
"pooling_mode_mean_tokens": true,
|
||||||
|
"pooling_mode_max_tokens": false,
|
||||||
|
"pooling_mode_mean_sqrt_len_tokens": false,
|
||||||
|
"pooling_mode_weightedmean_tokens": false,
|
||||||
|
"pooling_mode_lasttoken": false,
|
||||||
|
"include_prompt": true
|
||||||
|
}
|
||||||
360
README.md
Normal file
360
README.md
Normal file
@@ -0,0 +1,360 @@
|
|||||||
|
---
|
||||||
|
tags:
|
||||||
|
- sentence-transformers
|
||||||
|
- sentence-similarity
|
||||||
|
- feature-extraction
|
||||||
|
- dense
|
||||||
|
- Algerian AI
|
||||||
|
- Algerian
|
||||||
|
- algeria
|
||||||
|
- darja
|
||||||
|
- darija
|
||||||
|
- algerian darija
|
||||||
|
- algerian dialect
|
||||||
|
- rag
|
||||||
|
- ar
|
||||||
|
- multilingual-e5
|
||||||
|
- generated_from_trainer
|
||||||
|
- loss:MultipleNegativesRankingLoss
|
||||||
|
base_model: intfloat/multilingual-e5-base
|
||||||
|
widget:
|
||||||
|
- source_sentence: 'query: Renault Kangoo 2019'
|
||||||
|
sentences:
|
||||||
|
- >-
|
||||||
|
passage: سيارة Renault Kangoo 2019 Confort · مازوت · يدوية · 1.5 DCI 90ch ·
|
||||||
|
المسافة: 199,000 كم · السعر: 420 مليون دج · سيسبونسيو 10/10
|
||||||
|
|
||||||
|
موتور 10/10
|
||||||
|
|
||||||
|
سبيغة 0
|
||||||
|
|
||||||
|
كلشي معاود فيها جديد
|
||||||
|
- >-
|
||||||
|
passage: سيارة Dfsq Dfsq 2013 · بنزين · يدوية · 1.1 · المسافة: 280 كم ·
|
||||||
|
السعر: 140 مليون دج · باتنة · مفيهش معاود
|
||||||
|
|
||||||
|
موتور محطوط جديد
|
||||||
|
- >-
|
||||||
|
passage: بيع فيلا تيبازة بوسماعيل · فيلا · السعر: 8 مليون دج · تيبازة ·
|
||||||
|
agence immobilier LABID agrée par l'état met en vente trés bel villa r+2 de
|
||||||
|
sup 250 m² deux facade dans un résidence clôturé et gardée jour et nuit
|
||||||
|
libre de suite l'villa avec toute commanditée :
|
||||||
|
|
||||||
|
- rdc : deux garage pour 7 véhicule + studio + jardain
|
||||||
|
|
||||||
|
- 1ére étage : salon de chambre + cuisine + salle de bain + sanitaire
|
||||||
|
|
||||||
|
- 2éme étage : salon +3 chambre + sanitaire + Hammam
|
||||||
|
|
||||||
|
- 3éme étage : grand salon + deux terrasse
|
||||||
|
|
||||||
|
- chauffage centrale
|
||||||
|
|
||||||
|
- climatisation
|
||||||
|
|
||||||
|
- caméra de surveillance
|
||||||
|
|
||||||
|
- bâché d'eau
|
||||||
|
|
||||||
|
- acte et livret foncier
|
||||||
|
|
||||||
|
- les prix : 8 milliards nég lég
|
||||||
|
|
||||||
|
- pour plus d'informations consultéz agence labid au :
|
||||||
|
|
||||||
|
-
|
||||||
|
- source_sentence: 'query: location terrain Oran'
|
||||||
|
sentences:
|
||||||
|
- >-
|
||||||
|
passage: كراء عمارة وهران وهران · ارض · 90 م² · السعر: 6 مليون دج · وهران ·
|
||||||
|
location plusieurs appartements dans un immeuble de 5 étages et avec
|
||||||
|
ascenseur
|
||||||
|
|
||||||
|
les appartements sont neuf jamais habité
|
||||||
|
|
||||||
|
merci de nous contacter pour savoir plus de détails .
|
||||||
|
- >-
|
||||||
|
passage: سيارة Kia Seltos 2025 LUXuRY · بنزين · اوتوماتيك · 1.5 · السعر: 545
|
||||||
|
مليون دج · الوادي
|
||||||
|
- >-
|
||||||
|
passage: سيارة Peugeot 308 2015 Active · مازوت · يدوية · 1.6 e HDI 112ch ·
|
||||||
|
المسافة: 375,000 كم · وهران · Je vente 308 jdida machya 375000
|
||||||
|
- source_sentence: 'query: villa Alger avec jardin'
|
||||||
|
sentences:
|
||||||
|
- >-
|
||||||
|
passage: بيع شقة 3 غرف الجزائر العاشور · شقة · 3 غرف · السعر: 3 مليون دج ·
|
||||||
|
الجزائر ·vente une appartement a el3achour Hawch chawech De 96m F3 en 3 em
|
||||||
|
etg avec la scenseur tout comoditie chauffage central climatisation cuisine
|
||||||
|
équipée boxe pour stationnement les caméras de surveillance avec act et
|
||||||
|
livret foncièr
|
||||||
|
- >-
|
||||||
|
passage: كراء شقة دوبلكس 4 غرف الجزائر العاشور · شقة · 4 غرف · مطبخ مجهز ·
|
||||||
|
تدفئة مركزية · تكييف · تيراس · مفروش · جناح غرفة النوم · السعر: 29 مليون دج
|
||||||
|
· الجزائر · El Achour Location d’un Duplex F4 meublé de haut standing
|
||||||
|
superficie 213 m²
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Le Duplex se compose :
|
||||||
|
|
||||||
|
|
||||||
|
Niveau 1: une entrée, un joli séjour avec une salle à manger, une cuisine
|
||||||
|
équipée haute gamme, sanitaire + hammame, terrasse.
|
||||||
|
|
||||||
|
|
||||||
|
Niveau 2 : 3 chambres dont une master bed room, une salle de bain avec
|
||||||
|
jacuzzi, espace bureau, 2 balcons.
|
||||||
|
|
||||||
|
|
||||||
|
Équipements : climatisation, chauffage central, double vitrage, stores
|
||||||
|
électriques, visiophone, 1 place de parking.
|
||||||
|
|
||||||
|
|
||||||
|
Commodités de la résidence : ascenseur, parking, gardiennage 24h/24, aire de
|
||||||
|
jeux pour enfants, espaces verts pour vos moments de détente.
|
||||||
|
- >-
|
||||||
|
passage: كراء شقة 5 غرف البليدة البليدة · شقة · 5 غرف · السعر: 5 مليون دج ·
|
||||||
|
البليدة · 203m plus ascenseur
|
||||||
|
- source_sentence: 'query: Cuxi Cuxi 2025'
|
||||||
|
sentences:
|
||||||
|
- >-
|
||||||
|
passage: سيارة Volkswagen Golf 7 2016 Trendline + · مازوت · يدوية · 2.0 TDI
|
||||||
|
110ch · المسافة: 280,000 كم
|
||||||
|
- >-
|
||||||
|
passage: سيارة Opel Corsa 2001 Corsa · مازوت · يدوية · 1.7 D 60ch · المسافة:
|
||||||
|
350,000 كم · السعر: 65 مليون دج · موتور نعاود يدور شهرة السبيغة فيها سوباسمو
|
||||||
|
- >-
|
||||||
|
passage: سيارة Cuxi Cuxi 2025 · بنزين · اوتوماتيك · Yamaha 110 · المسافة:
|
||||||
|
9,250 كم · السعر: 28 مليون دج · قسنطينة · Cuxi 2025 jdida état 10/10
|
||||||
|
- source_sentence: 'query: Rani nhawes 3la tonobil Hyundai i10'
|
||||||
|
sentences:
|
||||||
|
- 'passage: بيع شقة غرفتين 3 غرف 4 غرف وهران بئر الجير · شقة · 3 غرف · وهران'
|
||||||
|
- >-
|
||||||
|
passage: سيارة Kia Cerato 2008 · مازوت · يدوية · المسافة: 230,000 كم ·
|
||||||
|
السعر: 135 مليون دج · سوق اهراس · Problem də terage
|
||||||
|
- >-
|
||||||
|
passage: سيارة Hyundai i10 2014 GLS · بنزين · يدوية · 1.1 · المسافة: 300,000
|
||||||
|
كم · عين تموشنت · Fiha bantoura
|
||||||
|
pipeline_tag: sentence-similarity
|
||||||
|
library_name: sentence-transformers
|
||||||
|
license: mit
|
||||||
|
language:
|
||||||
|
- ar
|
||||||
|
- fr
|
||||||
|
---
|
||||||
|
|
||||||
|
# AlgerianME5
|
||||||
|
|
||||||
|
**algerianME5** is a specialized **Sentence-Transformer** model designed to map Algerian search queries to a 768-dimensional dense vector space. It is specifically fine-tuned to understand the nuances and vocabulary of the Algerian car and real estate markets, where listings often mix Arabic, French, and Darja in both Arabic and Latin script.
|
||||||
|
|
||||||
|
Note: For more details about the methodology, data synthesis, and evaluation, [please visit the full Medium Story](https://medium.com/@ayoubhimeur/building-a-semantic-search-engine-for-algerian-marketplaces-cc04a0008346)
|
||||||
|
|
||||||
|
## Key Features :
|
||||||
|
- **Domain Specific**: Optimized for Algerian real estate and automotive vocabulary such as “sbigha,” “f3,” and “livret foncier”.
|
||||||
|
|
||||||
|
- **Cross-lingual Retrieval**: Maps informal Latin-script queries such as "tonobil mliha" to formal Arabic or French listing descriptions.
|
||||||
|
|
||||||
|
- **Robust Embeddings**: Based on the powerful intfloat/multilingual-e5-base architecture.
|
||||||
|
|
||||||
|
## Use cases :
|
||||||
|
|
||||||
|
- **Semantic Search**: Find relevant listings even if keywords don't match exactly (use it as a second layer).
|
||||||
|
|
||||||
|
- **Textual Similarity**: Compare two listings to find duplicates or similar items.
|
||||||
|
|
||||||
|
- **Clustering**: Group listings by sub-market or vehicle/property type.
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
### Model Description
|
||||||
|
- **Model Type:** Sentence Transformer
|
||||||
|
- **Base model:** [intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) <!-- at revision 835193815a3936a24a0ee7dc9e3d48c1fbb19c55 -->
|
||||||
|
- **Maximum Sequence Length:** 256 tokens
|
||||||
|
- **Output Dimensionality:** 768 dimensions
|
||||||
|
- **Similarity Function:** Cosine Similarity
|
||||||
|
|
||||||
|
### Model Sources
|
||||||
|
|
||||||
|
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
||||||
|
- **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
|
||||||
|
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
|
||||||
|
|
||||||
|
### Full Model Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
SentenceTransformer(
|
||||||
|
(0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'XLMRobertaModel'})
|
||||||
|
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
||||||
|
(2): Normalize()
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Direct Usage (Sentence Transformers)
|
||||||
|
|
||||||
|
First install the Sentence Transformers library:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -U sentence-transformers
|
||||||
|
```
|
||||||
|
|
||||||
|
Then you can load this model and run inference.
|
||||||
|
```python
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
|
||||||
|
model = SentenceTransformer("81melody/algerianME5")
|
||||||
|
sentences = [
|
||||||
|
'query: Rani nhawes 3la tonobil Hyundai i10',
|
||||||
|
'passage: سيارة Hyundai i10 2014 GLS · بنزين · يدوية · 1.1 · المسافة: 300,000 كم · عين تموشنت · Fiha bantoura',
|
||||||
|
'passage: سيارة Kia Cerato 2008 · مازوت · يدوية · المسافة: 230,000 كم · السعر: 135 مليون دج · سوق اهراس · Problem də terage',
|
||||||
|
]
|
||||||
|
embeddings = model.encode(sentences)
|
||||||
|
print(embeddings.shape)
|
||||||
|
# [3, 768]
|
||||||
|
|
||||||
|
# Get the similarity scores for the embeddings
|
||||||
|
similarities = model.similarity(embeddings, embeddings)
|
||||||
|
print(similarities)
|
||||||
|
```
|
||||||
|
**OR**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from sentence_transformers import SentenceTransformer , util
|
||||||
|
model = SentenceTransformer("81melody/algerianME5")
|
||||||
|
listings = [
|
||||||
|
# REAL ESTATE
|
||||||
|
"بيع شقة 4 غرف الجزائر شراقة · شقة · 4 غرف · السعر: 4 مليون دج · Appartement Composé De 1 Suite Parentale... Résidence sécurisée",
|
||||||
|
"كراء شقة 4 غرف وهران وهران · شقة · 4 غرف · Location appartement par jour pour familles",
|
||||||
|
"بيع ارض تلمسان مغنية · ارض · الجزائر · بلان فالسانك مليح",
|
||||||
|
"كراء محل الجزائر الابيار · محل تجاري · 105 م² · Local avec Deux rideaux",
|
||||||
|
|
||||||
|
# CARS
|
||||||
|
"سيارة MG Zs Ev 2024 Comfort · بنزين · يدوية · 1.5 VTi-Tech 106ch · المسافة: 67,000 كم · Très beau SUV comme neuf",
|
||||||
|
"سيارة Hyundai Grand i10 2018 Restylée DZ · بنزين · يدوية · 1.2 ess 87ch · السعر: 265 مليون دج · صبيغة فيها لال و لامان",
|
||||||
|
"سيارة Renault Clio 4 2018 GT Line + · مازوت · يدوية · 1.5 DCI 85ch · السعر: 330 مليون دج"
|
||||||
|
]
|
||||||
|
queries = [
|
||||||
|
"شقة 4 غرف الجزائر",
|
||||||
|
"dar lel bi3 fi Alger centre",
|
||||||
|
"ard lel bi3 telemcan" ,
|
||||||
|
"chhal souma MG Zs Ev",
|
||||||
|
"Grand I10 2018 Restylée DZ",
|
||||||
|
"tonobil mliha fiha sbigha shwia"
|
||||||
|
]
|
||||||
|
q_prefix = "query: "
|
||||||
|
p_prefix = "passage: "
|
||||||
|
|
||||||
|
encoded_listings = model.encode(
|
||||||
|
[f"{p_prefix}{l}" for l in listings],
|
||||||
|
convert_to_tensor=True,
|
||||||
|
show_progress_bar=False
|
||||||
|
)
|
||||||
|
for query in queries:
|
||||||
|
print(f"\nQuery: '{query}'")
|
||||||
|
|
||||||
|
|
||||||
|
query_emb = model.encode(f"{q_prefix}{query}", convert_to_tensor=True)
|
||||||
|
hits = util.semantic_search(query_emb, encoded_listings, top_k=3)[0]
|
||||||
|
|
||||||
|
|
||||||
|
for i, hit in enumerate(hits):
|
||||||
|
score = hit['score']
|
||||||
|
doc_id = hit['corpus_id']
|
||||||
|
display_text = listings[doc_id][:100] + "..." if len(listings[doc_id]) > 100 else listings[doc_id]
|
||||||
|
print(f"[Score: {score:.3f}] {display_text}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Training Details
|
||||||
|
|
||||||
|
### Training Dataset
|
||||||
|
|
||||||
|
* Size: 100,000 training samples
|
||||||
|
* Columns: <code>sentence_0</code> and <code>sentence_1</code>
|
||||||
|
* Approximate statistics based on the first 1000 samples:
|
||||||
|
| | sentence_0 | sentence_1 |
|
||||||
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|
|
||||||
|
| type | string | string |
|
||||||
|
| details | <ul><li>min: 7 tokens</li><li>mean: 11.07 tokens</li><li>max: 22 tokens</li></ul> | <ul><li>min: 17 tokens</li><li>mean: 82.2 tokens</li><li>max: 256 tokens</li></ul> |
|
||||||
|
* Samples:
|
||||||
|
| sentence_0 | sentence_1 |
|
||||||
|
|:----------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| <code>query: بيع محل وهران بئر</code> | <code>passage: بيع محل وهران بئر الجير · محل تجاري · 750 م² · السعر: 20 مليار دج · وهران · On vous propose en vente un local de 750 m² (550 m² en rez-de-chaussée et 200 m² sous pente) , avec deux rideaux électriques , pour le prix de : 20 Milliards fixe .<br><br>Pour plus de détails veuillez nous contacter</code> |
|
||||||
|
| <code>query: شقة الجزائر برج</code> | <code>passage: بيع شقة الجزائر برج الكيفان · شقة · 1 غرف · 64 م² · وثائق: دفتر عقاري · عقد موثق · الجزائر · 🔔OPPORTUNITÉ EN OR 🔔<br>– T2 à vendre +paiement par tranche dans 24mois<br><br>❄️À seulement quelques pas de la piscine, dans une site sécurisée et bien située, ce T2 en semi-finis une valeur sûre pour tout investisseur avisé.<br><br>Pourquoi ce bien est exceptionnel ?<br>✅️Localisation stratégique, très demandée<br>✅️Retour sur investissement rapide<br>✅️Prêt à être exploité dès l’achat !<br>✅️Un petit prix pour un grand potentiel.<br>✅️Les bonnes affaires ne durent jamais longtemps…<br>Saisissez cette opportunité maintenant !</code> |
|
||||||
|
| <code>query: GX3 PRO 2025 X3 Pro</code> | <code>passage: سيارة Geely GX3 PRO 2025 X3 pro livane · بنزين · اوتوماتيك · 1.5 · المسافة: جديدة · بجاية · Vent une livane x3pro neuf carte grise Safia</code> |
|
||||||
|
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"scale": 20.0,
|
||||||
|
"similarity_fct": "cos_sim",
|
||||||
|
"gather_across_devices": false,
|
||||||
|
"directions": [
|
||||||
|
"query_to_doc"
|
||||||
|
],
|
||||||
|
"partition_mode": "joint",
|
||||||
|
"hardness_mode": null,
|
||||||
|
"hardness_strength": 0.0
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Training Hyperparameters
|
||||||
|
#### Non-Default Hyperparameters
|
||||||
|
|
||||||
|
- `per_device_train_batch_size`: 16
|
||||||
|
- `per_device_eval_batch_size`: 16
|
||||||
|
- `fp16`: True
|
||||||
|
- `multi_dataset_batch_sampler`: round_robin
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### Training Logs
|
||||||
|
| Epoch | Step | Training Loss |
|
||||||
|
|:------:|:-----:|:-------------:|
|
||||||
|
| ... | ... | ... |
|
||||||
|
| 2.32 | 14500 | 0.2827 |
|
||||||
|
| 2.4 | 15000 | 0.3062 |
|
||||||
|
| 2.48 | 15500 | 0.3045 |
|
||||||
|
| 2.56 | 16000 | 0.2841 |
|
||||||
|
|
||||||
|
|
||||||
|
### Framework Versions
|
||||||
|
- Python: 3.12.13
|
||||||
|
- Sentence Transformers: 5.3.0
|
||||||
|
- Transformers: 5.0.0
|
||||||
|
- PyTorch: 2.10.0+cu128
|
||||||
|
- Accelerate: 1.13.0
|
||||||
|
- Datasets: 4.0.0
|
||||||
|
- Tokenizers: 0.22.2
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
### BibTeX
|
||||||
|
|
||||||
|
#### Sentence Transformers
|
||||||
|
```bibtex
|
||||||
|
@inproceedings{reimers-2019-sentence-bert,
|
||||||
|
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
||||||
|
author = "Reimers, Nils and Gurevych, Iryna",
|
||||||
|
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
||||||
|
month = "11",
|
||||||
|
year = "2019",
|
||||||
|
publisher = "Association for Computational Linguistics",
|
||||||
|
url = "https://arxiv.org/abs/1908.10084",
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### MultipleNegativesRankingLoss
|
||||||
|
```bibtex
|
||||||
|
@misc{oord2019representationlearningcontrastivepredictive,
|
||||||
|
title={Representation Learning with Contrastive Predictive Coding},
|
||||||
|
author={Aaron van den Oord and Yazhe Li and Oriol Vinyals},
|
||||||
|
year={2019},
|
||||||
|
eprint={1807.03748},
|
||||||
|
archivePrefix={arXiv},
|
||||||
|
primaryClass={cs.LG},
|
||||||
|
url={https://arxiv.org/abs/1807.03748},
|
||||||
|
}
|
||||||
|
```
|
||||||
|
### Contact
|
||||||
|
I am interested in any further related work. Feel free to contact me at mohamed.himeur@student.unamur.be
|
||||||
30
config.json
Normal file
30
config.json
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"add_cross_attention": false,
|
||||||
|
"architectures": [
|
||||||
|
"XLMRobertaModel"
|
||||||
|
],
|
||||||
|
"attention_probs_dropout_prob": 0.1,
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"classifier_dropout": null,
|
||||||
|
"dtype": "float32",
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"hidden_act": "gelu",
|
||||||
|
"hidden_dropout_prob": 0.1,
|
||||||
|
"hidden_size": 768,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 3072,
|
||||||
|
"is_decoder": false,
|
||||||
|
"layer_norm_eps": 1e-05,
|
||||||
|
"max_position_embeddings": 514,
|
||||||
|
"model_type": "xlm-roberta",
|
||||||
|
"num_attention_heads": 12,
|
||||||
|
"num_hidden_layers": 12,
|
||||||
|
"output_past": true,
|
||||||
|
"pad_token_id": 1,
|
||||||
|
"position_embedding_type": "absolute",
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"transformers_version": "5.0.0",
|
||||||
|
"type_vocab_size": 1,
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 250002
|
||||||
|
}
|
||||||
14
config_sentence_transformers.json
Normal file
14
config_sentence_transformers.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"model_type": "SentenceTransformer",
|
||||||
|
"__version__": {
|
||||||
|
"sentence_transformers": "5.3.0",
|
||||||
|
"transformers": "5.0.0",
|
||||||
|
"pytorch": "2.10.0+cu128"
|
||||||
|
},
|
||||||
|
"prompts": {
|
||||||
|
"query": "",
|
||||||
|
"document": ""
|
||||||
|
},
|
||||||
|
"default_prompt_name": null,
|
||||||
|
"similarity_fn_name": "cosine"
|
||||||
|
}
|
||||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:3b342b6314310b1fdd89c60c05e81b0212c743f54d30430d2968fe9f4667afb3
|
||||||
|
size 1112197064
|
||||||
20
modules.json
Normal file
20
modules.json
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"idx": 0,
|
||||||
|
"name": "0",
|
||||||
|
"path": "",
|
||||||
|
"type": "sentence_transformers.models.Transformer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 1,
|
||||||
|
"name": "1",
|
||||||
|
"path": "1_Pooling",
|
||||||
|
"type": "sentence_transformers.models.Pooling"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 2,
|
||||||
|
"name": "2",
|
||||||
|
"path": "2_Normalize",
|
||||||
|
"type": "sentence_transformers.models.Normalize"
|
||||||
|
}
|
||||||
|
]
|
||||||
3
rng_state.pth
Normal file
3
rng_state.pth
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:913a5368a7577d2be0e6c0babf983ef3b480fcd6823e79a547c9a24735c8a300
|
||||||
|
size 14645
|
||||||
3
scaler.pt
Normal file
3
scaler.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:492bc78bd83b826eedbe67547210706d1c9a2b14604e5b97f861b508daf02d5c
|
||||||
|
size 1383
|
||||||
3
scheduler.pt
Normal file
3
scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:216aaa27aba2a207d4db80b37b87abe496b983a01e4691c445bcc1f4502f7dbf
|
||||||
|
size 1465
|
||||||
4
sentence_bert_config.json
Normal file
4
sentence_bert_config.json
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"max_seq_length": 256,
|
||||||
|
"do_lower_case": false
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:0c16d8a2bff758ba6e009849c31b8ffc8ba92bfc907e0bcee96a09f1818fe2da
|
||||||
|
size 16766387
|
||||||
15
tokenizer_config.json
Normal file
15
tokenizer_config.json
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"add_prefix_space": true,
|
||||||
|
"backend": "tokenizers",
|
||||||
|
"bos_token": "<s>",
|
||||||
|
"clean_up_tokenization_spaces": true,
|
||||||
|
"cls_token": "<s>",
|
||||||
|
"eos_token": "</s>",
|
||||||
|
"is_local": false,
|
||||||
|
"mask_token": "<mask>",
|
||||||
|
"model_max_length": 512,
|
||||||
|
"pad_token": "<pad>",
|
||||||
|
"sep_token": "</s>",
|
||||||
|
"tokenizer_class": "XLMRobertaTokenizer",
|
||||||
|
"unk_token": "<unk>"
|
||||||
|
}
|
||||||
258
trainer_state.json
Normal file
258
trainer_state.json
Normal file
@@ -0,0 +1,258 @@
|
|||||||
|
{
|
||||||
|
"best_global_step": null,
|
||||||
|
"best_metric": null,
|
||||||
|
"best_model_checkpoint": null,
|
||||||
|
"epoch": 2.56,
|
||||||
|
"eval_steps": 0,
|
||||||
|
"global_step": 16000,
|
||||||
|
"is_hyper_param_search": false,
|
||||||
|
"is_local_process_zero": true,
|
||||||
|
"is_world_process_zero": true,
|
||||||
|
"log_history": [
|
||||||
|
{
|
||||||
|
"epoch": 0.08,
|
||||||
|
"grad_norm": 9.120098114013672,
|
||||||
|
"learning_rate": 5.2906666666666675e-06,
|
||||||
|
"loss": 0.772038330078125,
|
||||||
|
"step": 500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.16,
|
||||||
|
"grad_norm": 16.881511688232422,
|
||||||
|
"learning_rate": 1.0624e-05,
|
||||||
|
"loss": 0.37112783813476563,
|
||||||
|
"step": 1000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.24,
|
||||||
|
"grad_norm": 9.139080047607422,
|
||||||
|
"learning_rate": 1.5957333333333334e-05,
|
||||||
|
"loss": 0.36207400512695315,
|
||||||
|
"step": 1500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.32,
|
||||||
|
"grad_norm": 3.8323566913604736,
|
||||||
|
"learning_rate": 1.9856592592592595e-05,
|
||||||
|
"loss": 0.34735809326171874,
|
||||||
|
"step": 2000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.4,
|
||||||
|
"grad_norm": 4.640134811401367,
|
||||||
|
"learning_rate": 1.9265185185185186e-05,
|
||||||
|
"loss": 0.329142578125,
|
||||||
|
"step": 2500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.48,
|
||||||
|
"grad_norm": 7.571471214294434,
|
||||||
|
"learning_rate": 1.8672592592592594e-05,
|
||||||
|
"loss": 0.3232558288574219,
|
||||||
|
"step": 3000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.56,
|
||||||
|
"grad_norm": 5.5788984298706055,
|
||||||
|
"learning_rate": 1.8080000000000003e-05,
|
||||||
|
"loss": 0.33851583862304685,
|
||||||
|
"step": 3500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.64,
|
||||||
|
"grad_norm": 4.823075771331787,
|
||||||
|
"learning_rate": 1.748740740740741e-05,
|
||||||
|
"loss": 0.3265293884277344,
|
||||||
|
"step": 4000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.72,
|
||||||
|
"grad_norm": 5.9052886962890625,
|
||||||
|
"learning_rate": 1.6897185185185187e-05,
|
||||||
|
"loss": 0.3228313903808594,
|
||||||
|
"step": 4500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8,
|
||||||
|
"grad_norm": 2.530646562576294,
|
||||||
|
"learning_rate": 1.6304592592592593e-05,
|
||||||
|
"loss": 0.3212389831542969,
|
||||||
|
"step": 5000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.88,
|
||||||
|
"grad_norm": 2.003970146179199,
|
||||||
|
"learning_rate": 1.5712e-05,
|
||||||
|
"loss": 0.3108310546875,
|
||||||
|
"step": 5500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.96,
|
||||||
|
"grad_norm": 2.259843111038208,
|
||||||
|
"learning_rate": 1.511940740740741e-05,
|
||||||
|
"loss": 0.33287890625,
|
||||||
|
"step": 6000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.04,
|
||||||
|
"grad_norm": 1.1002967357635498,
|
||||||
|
"learning_rate": 1.4526814814814815e-05,
|
||||||
|
"loss": 0.3126535949707031,
|
||||||
|
"step": 6500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.12,
|
||||||
|
"grad_norm": 2.696305751800537,
|
||||||
|
"learning_rate": 1.3934222222222222e-05,
|
||||||
|
"loss": 0.3110744323730469,
|
||||||
|
"step": 7000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.2,
|
||||||
|
"grad_norm": 3.0759758949279785,
|
||||||
|
"learning_rate": 1.3341629629629631e-05,
|
||||||
|
"loss": 0.307027099609375,
|
||||||
|
"step": 7500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.28,
|
||||||
|
"grad_norm": 1.2770161628723145,
|
||||||
|
"learning_rate": 1.2749037037037038e-05,
|
||||||
|
"loss": 0.31455657958984373,
|
||||||
|
"step": 8000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.3599999999999999,
|
||||||
|
"grad_norm": 2.4967329502105713,
|
||||||
|
"learning_rate": 1.2156444444444447e-05,
|
||||||
|
"loss": 0.31456546020507814,
|
||||||
|
"step": 8500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.44,
|
||||||
|
"grad_norm": 5.275321006774902,
|
||||||
|
"learning_rate": 1.1565037037037039e-05,
|
||||||
|
"loss": 0.3131004943847656,
|
||||||
|
"step": 9000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.52,
|
||||||
|
"grad_norm": 2.145164966583252,
|
||||||
|
"learning_rate": 1.0972444444444446e-05,
|
||||||
|
"loss": 0.30567279052734375,
|
||||||
|
"step": 9500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.6,
|
||||||
|
"grad_norm": 2.0739190578460693,
|
||||||
|
"learning_rate": 1.0379851851851853e-05,
|
||||||
|
"loss": 0.28998117065429685,
|
||||||
|
"step": 10000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.6800000000000002,
|
||||||
|
"grad_norm": 3.1562881469726562,
|
||||||
|
"learning_rate": 9.78725925925926e-06,
|
||||||
|
"loss": 0.29778146362304686,
|
||||||
|
"step": 10500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.76,
|
||||||
|
"grad_norm": 3.498109817504883,
|
||||||
|
"learning_rate": 9.194666666666667e-06,
|
||||||
|
"loss": 0.2988756103515625,
|
||||||
|
"step": 11000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.8399999999999999,
|
||||||
|
"grad_norm": 3.3291115760803223,
|
||||||
|
"learning_rate": 8.602074074074076e-06,
|
||||||
|
"loss": 0.2985075988769531,
|
||||||
|
"step": 11500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.92,
|
||||||
|
"grad_norm": 1.631378173828125,
|
||||||
|
"learning_rate": 8.009481481481483e-06,
|
||||||
|
"loss": 0.2991393737792969,
|
||||||
|
"step": 12000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.0,
|
||||||
|
"grad_norm": 1.3230953216552734,
|
||||||
|
"learning_rate": 7.416888888888889e-06,
|
||||||
|
"loss": 0.30236148071289065,
|
||||||
|
"step": 12500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.08,
|
||||||
|
"grad_norm": 2.339695930480957,
|
||||||
|
"learning_rate": 6.825481481481482e-06,
|
||||||
|
"loss": 0.295860595703125,
|
||||||
|
"step": 13000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.16,
|
||||||
|
"grad_norm": 1.0685478448867798,
|
||||||
|
"learning_rate": 6.234074074074075e-06,
|
||||||
|
"loss": 0.2980207824707031,
|
||||||
|
"step": 13500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.24,
|
||||||
|
"grad_norm": 0.947058379650116,
|
||||||
|
"learning_rate": 5.6414814814814825e-06,
|
||||||
|
"loss": 0.29257803344726563,
|
||||||
|
"step": 14000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.32,
|
||||||
|
"grad_norm": 2.2130205631256104,
|
||||||
|
"learning_rate": 5.0488888888888895e-06,
|
||||||
|
"loss": 0.2826576843261719,
|
||||||
|
"step": 14500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.4,
|
||||||
|
"grad_norm": 6.699328422546387,
|
||||||
|
"learning_rate": 4.4562962962962965e-06,
|
||||||
|
"loss": 0.30620053100585937,
|
||||||
|
"step": 15000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.48,
|
||||||
|
"grad_norm": 0.5939074158668518,
|
||||||
|
"learning_rate": 3.863703703703704e-06,
|
||||||
|
"loss": 0.3044532470703125,
|
||||||
|
"step": 15500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.56,
|
||||||
|
"grad_norm": 2.3366057872772217,
|
||||||
|
"learning_rate": 3.2711111111111117e-06,
|
||||||
|
"loss": 0.28407180786132813,
|
||||||
|
"step": 16000
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"logging_steps": 500,
|
||||||
|
"max_steps": 18750,
|
||||||
|
"num_input_tokens_seen": 0,
|
||||||
|
"num_train_epochs": 3,
|
||||||
|
"save_steps": 1000,
|
||||||
|
"stateful_callbacks": {
|
||||||
|
"TrainerControl": {
|
||||||
|
"args": {
|
||||||
|
"should_epoch_stop": false,
|
||||||
|
"should_evaluate": false,
|
||||||
|
"should_log": false,
|
||||||
|
"should_save": true,
|
||||||
|
"should_training_stop": false
|
||||||
|
},
|
||||||
|
"attributes": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total_flos": 0.0,
|
||||||
|
"train_batch_size": 16,
|
||||||
|
"trial_name": null,
|
||||||
|
"trial_params": null
|
||||||
|
}
|
||||||
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:546a615de05e8adf2749ffb2c7c65e652fb8eff7d18b9af7b4f23b45741d3fb4
|
||||||
|
size 5521
|
||||||
Reference in New Issue
Block a user