初始化项目,由ModelHub XC社区提供模型

Model: nanopass/test-model-fe
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-06-05 21:42:44 +08:00
commit 5f5187a825
15 changed files with 32200 additions and 0 deletions

27
.gitattributes vendored Normal file
View File

@@ -0,0 +1,27 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

7
1_Pooling/config.json Executable file
View File

@@ -0,0 +1,7 @@
{
"word_embedding_dimension": 384,
"pooling_mode_cls_token": false,
"pooling_mode_mean_tokens": true,
"pooling_mode_max_tokens": false,
"pooling_mode_mean_sqrt_len_tokens": false
}

179
README.md Normal file
View File

@@ -0,0 +1,179 @@
---
pipeline_tag: feature-extraction
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
---
# multi-qa-MiniLM-L6-cos-v1
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for **semantic search**. It has been trained on 215M (question, answer) pairs from diverse sources. For an introduction to semantic search, have a look at: [SBERT.net - Semantic Search](https://www.sbert.net/examples/applications/semantic-search/README.html)
## Usage (Sentence-Transformers)
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
```
pip install -U sentence-transformers
```
Then you can use the model like this:
```python
from sentence_transformers import SentenceTransformer, util
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
#Load the model
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
#Encode query and documents
query_emb = model.encode(query)
doc_emb = model.encode(docs)
#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))
#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
#Output passages & scores
for doc, score in doc_score_pairs:
print(score, doc)
```
## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the correct pooling operation on top of the contextualized word embeddings.
```python
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
#Mean Pooling - Take average of all tokens
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output.last_hidden_state #First element of model_output contains all token embeddings
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
#Encode text
def encode(texts):
# Tokenize sentences
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
# Compute token embeddings
with torch.no_grad():
model_output = model(**encoded_input, return_dict=True)
# Perform pooling
embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
# Normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
return embeddings
# Sentences we want sentence embeddings for
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
#Encode query and docs
query_emb = encode(query)
doc_emb = encode(docs)
#Compute dot score between query and all document embeddings
scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))
#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
#Output passages & scores
for doc, score in doc_score_pairs:
print(score, doc)
```
## Technical Details
The following are some technical details on how this model must be used:
| Setting | Value |
| --- | :---: |
| Dimensions | 384 |
| Produces normalized embeddings | Yes |
| Pooling-Method | Mean pooling |
| Suitable score functions | dot-product (`util.dot_score`), cosine-similarity (`util.cos_sim`), or euclidean distance |
Note: When loaded with `sentence-transformers`, this model produces normalized embeddings with length 1. In that case, dot-product and cosine-similarity are equivalent. dot-product is preferred as it is faster. For normalized embeddings, Euclidean distance is monotonically related to dot-product (the squared distance equals 2 − 2 · dot-product), so it yields the same ranking and can also be used.
----
## Background
The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
contrastive learning objective: given a sentence from a pair, the model should predict which of a set of randomly sampled other sentences was actually paired with it in our dataset.
We developed this model during the
[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
organized by Hugging Face. We developed this model as part of the project:
[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as input from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
## Intended uses
Our model is intended to be used for semantic search: It encodes queries / questions and text paragraphs in a dense vector space. It finds relevant documents for the given passages.
Note that there is a limit of 512 word pieces: Text longer than that will be truncated. Further note that the model was just trained on input text up to 250 word pieces. It might not work well for longer text.
## Training procedure
The full training script is accessible in this current repository: `train_script.py`.
### Pre-training
We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
#### Training
We use the concatenation from multiple datasets to fine-tune our model. In total we have about 215M (question, answer) pairs.
We sampled each dataset with a weighted probability; the configuration is detailed in the `data_config.json` file.
The model was trained with [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) using Mean-pooling, cosine-similarity as similarity function, and a scale of 20.
| Dataset | Number of training tuples |
|--------------------------------------------------------|:--------------------------:|
| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs from WikiAnswers | 77,427,422 |
| [PAQ](https://github.com/facebookresearch/PAQ) Automatically generated (Question, Paragraph) pairs for each paragraph in Wikipedia | 64,371,441 |
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs from all StackExchanges | 25,316,456 |
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs from all StackExchanges | 21,396,559 |
| [MS MARCO](https://microsoft.github.io/msmarco/) Triplets (query, answer, hard_negative) for 500k queries from Bing search engine | 17,579,773 |
| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) (query, answer) pairs for 3M Google queries and Google featured snippet | 3,012,496 |
| [Amazon-QA](http://jmcauley.ucsd.edu/data/amazon/qa/) (Question, Answer) pairs from Amazon product pages | 2,448,839 |
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) pairs from Yahoo Answers | 1,198,260 |
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) pairs from Yahoo Answers | 681,164 |
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) pairs from Yahoo Answers | 659,896 |
| [SearchQA](https://huggingface.co/datasets/search_qa) (Question, Answer) pairs for 140k questions, each with Top5 Google snippets on that question | 582,261 |
| [ELI5](https://huggingface.co/datasets/eli5) (Question, Answer) pairs from Reddit ELI5 (explainlikeimfive) | 325,475 |
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions pairs (titles) | 304,525 |
| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Question, Duplicate_Question, Hard_Negative) triplets for Quora Questions Pairs dataset | 103,663 |
| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) (Question, Paragraph) pairs for 100k real Google queries with relevant Wikipedia paragraph | 100,231 |
| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) (Question, Paragraph) pairs from SQuAD2.0 dataset | 87,599 |
| [TriviaQA](https://huggingface.co/datasets/trivia_qa) (Question, Evidence) pairs | 73,346 |
| **Total** | **214,988,242** |

24
config.json Normal file
View File

@@ -0,0 +1,24 @@
{
"_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.8.2",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}

View File

@@ -0,0 +1,7 @@
{
"__version__": {
"sentence_transformers": "2.0.0",
"transformers": "4.6.1",
"pytorch": "1.8.1"
}
}

942
data_config.json Normal file
View File

@@ -0,0 +1,942 @@
[
{
"name": "stackexchange_title_body/skeptics.stackexchange.com.jsonl.gz",
"lines": 10009,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/islam.stackexchange.com.jsonl.gz",
"lines": 10052,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/anime.stackexchange.com.jsonl.gz",
"lines": 10131,
"weight": 3
},
{
"name": "stackexchange_title_body/writers.stackexchange.com.jsonl.gz",
"lines": 10157,
"weight": 3
},
{
"name": "stackexchange_title_body/astronomy.stackexchange.com.jsonl.gz",
"lines": 10462,
"weight": 3
},
{
"name": "stackexchange_title_body/vi.stackexchange.com.jsonl.gz",
"lines": 10551,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/french.stackexchange.com.jsonl.gz",
"lines": 10578,
"weight": 3
},
{
"name": "stackexchange_title_body/cstheory.stackexchange.com.jsonl.gz",
"lines": 10642,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/civicrm.stackexchange.com.jsonl.gz",
"lines": 10648,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/expressionengine.stackexchange.com.jsonl.gz",
"lines": 10742,
"weight": 3
},
{
"name": "stackexchange_title_body/engineering.stackexchange.com.jsonl.gz",
"lines": 10753,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/history.stackexchange.com.jsonl.gz",
"lines": 10766,
"weight": 3
},
{
"name": "stackexchange_title_body/french.stackexchange.com.jsonl.gz",
"lines": 10794,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/politics.stackexchange.com.jsonl.gz",
"lines": 11047,
"weight": 3
},
{
"name": "stackexchange_title_body/economics.stackexchange.com.jsonl.gz",
"lines": 11115,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/craftcms.stackexchange.com.jsonl.gz",
"lines": 11236,
"weight": 3
},
{
"name": "stackexchange_title_body/anime.stackexchange.com.jsonl.gz",
"lines": 11444,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/christianity.stackexchange.com.jsonl.gz",
"lines": 11498,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/softwarerecs.stackexchange.com.jsonl.gz",
"lines": 11761,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/boardgames.stackexchange.com.jsonl.gz",
"lines": 11805,
"weight": 3
},
{
"name": "stackexchange_title_body/islam.stackexchange.com.jsonl.gz",
"lines": 11853,
"weight": 3
},
{
"name": "stackexchange_title_body/expressionengine.stackexchange.com.jsonl.gz",
"lines": 11866,
"weight": 3
},
{
"name": "stackexchange_title_body/politics.stackexchange.com.jsonl.gz",
"lines": 11894,
"weight": 3
},
{
"name": "stackexchange_title_body/history.stackexchange.com.jsonl.gz",
"lines": 12021,
"weight": 3
},
{
"name": "stackexchange_title_body/christianity.stackexchange.com.jsonl.gz",
"lines": 12108,
"weight": 3
},
{
"name": "stackexchange_title_body/boardgames.stackexchange.com.jsonl.gz",
"lines": 12149,
"weight": 3
},
{
"name": "stackexchange_title_body/civicrm.stackexchange.com.jsonl.gz",
"lines": 12543,
"weight": 3
},
{
"name": "stackexchange_title_body/craftcms.stackexchange.com.jsonl.gz",
"lines": 12574,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/networkengineering.stackexchange.com.jsonl.gz",
"lines": 12590,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/space.stackexchange.com.jsonl.gz",
"lines": 12893,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/quant.stackexchange.com.jsonl.gz",
"lines": 12933,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/philosophy.stackexchange.com.jsonl.gz",
"lines": 13114,
"weight": 3
},
{
"name": "stackexchange_Title_Answer/gardening.stackexchange.com.jsonl.gz",
"lines": 13246,
"weight": 3
},
{
"name": "stackexchange_title_body/hinduism.stackexchange.com.jsonl.gz",
"lines": 13450,
"weight": 4
},
{
"name": "stackexchange_title_body/networkengineering.stackexchange.com.jsonl.gz",
"lines": 13454,
"weight": 4
},
{
"name": "stackexchange_Title_Answer/german.stackexchange.com.jsonl.gz",
"lines": 13733,
"weight": 4
},
{
"name": "stackexchange_title_body/german.stackexchange.com.jsonl.gz",
"lines": 13950,
"weight": 4
},
{
"name": "stackexchange_title_body/philosophy.stackexchange.com.jsonl.gz",
"lines": 14829,
"weight": 4
},
{
"name": "stackexchange_title_body/gardening.stackexchange.com.jsonl.gz",
"lines": 15136,
"weight": 4
},
{
"name": "stackexchange_title_body/space.stackexchange.com.jsonl.gz",
"lines": 15142,
"weight": 4
},
{
"name": "stackexchange_Title_Answer/bicycles.stackexchange.com.jsonl.gz",
"lines": 15708,
"weight": 4
},
{
"name": "stackexchange_Title_Answer/law.stackexchange.com.jsonl.gz",
"lines": 16133,
"weight": 4
},
{
"name": "stackexchange_Title_Answer/arduino.stackexchange.com.jsonl.gz",
"lines": 16281,
"weight": 4
},
{
"name": "stackexchange_title_body/bicycles.stackexchange.com.jsonl.gz",
"lines": 16353,
"weight": 4
},
{
"name": "stackexchange_Title_Answer/emacs.stackexchange.com.jsonl.gz",
"lines": 16830,
"weight": 4
},
{
"name": "stackexchange_title_body/quant.stackexchange.com.jsonl.gz",
"lines": 17261,
"weight": 4
},
{
"name": "stackexchange_Title_Answer/dsp.stackexchange.com.jsonl.gz",
"lines": 17430,
"weight": 4
},
{
"name": "stackexchange_Title_Answer/puzzling.stackexchange.com.jsonl.gz",
"lines": 17448,
"weight": 4
},
{
"name": "stackexchange_title_body/puzzling.stackexchange.com.jsonl.gz",
"lines": 17851,
"weight": 5
},
{
"name": "stackexchange_title_body/law.stackexchange.com.jsonl.gz",
"lines": 17941,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/movies.stackexchange.com.jsonl.gz",
"lines": 18243,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/mechanics.stackexchange.com.jsonl.gz",
"lines": 18613,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/aviation.stackexchange.com.jsonl.gz",
"lines": 18755,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/biology.stackexchange.com.jsonl.gz",
"lines": 19277,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/crypto.stackexchange.com.jsonl.gz",
"lines": 19404,
"weight": 5
},
{
"name": "stackexchange_title_body/arduino.stackexchange.com.jsonl.gz",
"lines": 19553,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/music.stackexchange.com.jsonl.gz",
"lines": 19936,
"weight": 5
},
{
"name": "stackexchange_title_body/aviation.stackexchange.com.jsonl.gz",
"lines": 20139,
"weight": 5
},
{
"name": "stackexchange_title_body/softwarerecs.stackexchange.com.jsonl.gz",
"lines": 20142,
"weight": 5
},
{
"name": "stackexchange_title_body/movies.stackexchange.com.jsonl.gz",
"lines": 20181,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/datascience.stackexchange.com.jsonl.gz",
"lines": 20503,
"weight": 5
},
{
"name": "stackexchange_title_body/music.stackexchange.com.jsonl.gz",
"lines": 20636,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/japanese.stackexchange.com.jsonl.gz",
"lines": 20948,
"weight": 5
},
{
"name": "stackexchange_title_body/emacs.stackexchange.com.jsonl.gz",
"lines": 21055,
"weight": 5
},
{
"name": "stackexchange_title_body/dsp.stackexchange.com.jsonl.gz",
"lines": 21252,
"weight": 5
},
{
"name": "stackexchange_title_body/japanese.stackexchange.com.jsonl.gz",
"lines": 22056,
"weight": 5
},
{
"name": "stackexchange_Title_Answer/bitcoin.stackexchange.com.jsonl.gz",
"lines": 22474,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/cooking.stackexchange.com.jsonl.gz",
"lines": 22641,
"weight": 6
},
{
"name": "stackexchange_title_body/mechanics.stackexchange.com.jsonl.gz",
"lines": 22868,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/photo.stackexchange.com.jsonl.gz",
"lines": 23204,
"weight": 6
},
{
"name": "stackexchange_title_body/crypto.stackexchange.com.jsonl.gz",
"lines": 23231,
"weight": 6
},
{
"name": "stackexchange_title_body/cooking.stackexchange.com.jsonl.gz",
"lines": 23705,
"weight": 6
},
{
"name": "stackexchange_title_body/photo.stackexchange.com.jsonl.gz",
"lines": 23753,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/workplace.stackexchange.com.jsonl.gz",
"lines": 24012,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/meta.stackoverflow.com.jsonl.gz",
"lines": 24044,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/raspberrypi.stackexchange.com.jsonl.gz",
"lines": 24143,
"weight": 6
},
{
"name": "stackexchange_title_body/workplace.stackexchange.com.jsonl.gz",
"lines": 24189,
"weight": 6
},
{
"name": "stackexchange_title_body/biology.stackexchange.com.jsonl.gz",
"lines": 24447,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/webapps.stackexchange.com.jsonl.gz",
"lines": 24867,
"weight": 6
},
{
"name": "stackexchange_title_body/bitcoin.stackexchange.com.jsonl.gz",
"lines": 25374,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/judaism.stackexchange.com.jsonl.gz",
"lines": 26085,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/ethereum.stackexchange.com.jsonl.gz",
"lines": 26124,
"weight": 6
},
{
"name": "stackexchange_Title_Answer/worldbuilding.stackexchange.com.jsonl.gz",
"lines": 26210,
"weight": 6
},
{
"name": "stackexchange_title_body/worldbuilding.stackexchange.com.jsonl.gz",
"lines": 26763,
"weight": 7
},
{
"name": "stackexchange_Title_Answer/chemistry.stackexchange.com.jsonl.gz",
"lines": 27061,
"weight": 7
},
{
"name": "stackexchange_title_body/datascience.stackexchange.com.jsonl.gz",
"lines": 27397,
"weight": 7
},
{
"name": "stackexchange_Title_Answer/graphicdesign.stackexchange.com.jsonl.gz",
"lines": 28083,
"weight": 7
},
{
"name": "stackexchange_Title_Answer/ux.stackexchange.com.jsonl.gz",
"lines": 28901,
"weight": 7
},
{
"name": "stackexchange_title_body/ux.stackexchange.com.jsonl.gz",
"lines": 29403,
"weight": 7
},
{
"name": "stackexchange_Title_Answer/money.stackexchange.com.jsonl.gz",
"lines": 29404,
"weight": 7
},
{
"name": "stackexchange_title_body/webapps.stackexchange.com.jsonl.gz",
"lines": 29697,
"weight": 7
},
{
"name": "stackexchange_Title_Answer/cs.stackexchange.com.jsonl.gz",
"lines": 30010,
"weight": 7
},
{
"name": "stackexchange_title_body/graphicdesign.stackexchange.com.jsonl.gz",
"lines": 30233,
"weight": 7
},
{
"name": "stackexchange_Title_Answer/webmasters.stackexchange.com.jsonl.gz",
"lines": 30370,
"weight": 7
},
{
"name": "stackexchange_title_body/raspberrypi.stackexchange.com.jsonl.gz",
"lines": 30625,
"weight": 7
},
{
"name": "stackexchange_title_body/money.stackexchange.com.jsonl.gz",
"lines": 32021,
"weight": 8
},
{
"name": "stackexchange_title_body/judaism.stackexchange.com.jsonl.gz",
"lines": 32028,
"weight": 8
},
{
"name": "stackexchange_Title_Answer/academia.stackexchange.com.jsonl.gz",
"lines": 32137,
"weight": 8
},
{
"name": "stackexchange_title_body/ethereum.stackexchange.com.jsonl.gz",
"lines": 32760,
"weight": 8
},
{
"name": "stackexchange_title_body/academia.stackexchange.com.jsonl.gz",
"lines": 34331,
"weight": 8
},
{
"name": "stackexchange_title_body/chemistry.stackexchange.com.jsonl.gz",
"lines": 34506,
"weight": 8
},
{
"name": "stackexchange_title_body/webmasters.stackexchange.com.jsonl.gz",
"lines": 34559,
"weight": 8
},
{
"name": "stackexchange_title_body/meta.stackoverflow.com.jsonl.gz",
"lines": 36456,
"weight": 9
},
{
"name": "stackexchange_Title_Answer/travel.stackexchange.com.jsonl.gz",
"lines": 36533,
"weight": 9
},
{
"name": "stackexchange_Title_Answer/android.stackexchange.com.jsonl.gz",
"lines": 38077,
"weight": 9
},
{
"name": "stackexchange_title_body/cs.stackexchange.com.jsonl.gz",
"lines": 38314,
"weight": 9
},
{
"name": "stackexchange_Title_Answer/gamedev.stackexchange.com.jsonl.gz",
"lines": 40154,
"weight": 10
},
{
"name": "stackexchange_Title_Answer/rpg.stackexchange.com.jsonl.gz",
"lines": 40435,
"weight": 10
},
{
"name": "stackexchange_title_body/travel.stackexchange.com.jsonl.gz",
"lines": 41227,
"weight": 10
},
{
"name": "stackexchange_Title_Answer/codereview.stackexchange.com.jsonl.gz",
"lines": 41748,
"weight": 10
},
{
"name": "stackexchange_title_body/rpg.stackexchange.com.jsonl.gz",
"lines": 42303,
"weight": 10
},
{
"name": "stackexchange_title_body/codereview.stackexchange.com.jsonl.gz",
"lines": 45765,
"weight": 11
},
{
"name": "stackexchange_title_body/gamedev.stackexchange.com.jsonl.gz",
"lines": 46485,
"weight": 11
},
{
"name": "stackexchange_Title_Answer/softwareengineering.stackexchange.com.jsonl.gz",
"lines": 51326,
"weight": 12
},
{
"name": "stackexchange_Title_Answer/security.stackexchange.com.jsonl.gz",
"lines": 51355,
"weight": 12
},
{
"name": "stackexchange_title_body/android.stackexchange.com.jsonl.gz",
"lines": 51608,
"weight": 12
},
{
"name": "stackexchange_Title_Answer/diy.stackexchange.com.jsonl.gz",
"lines": 52896,
"weight": 12
},
{
"name": "stackexchange_title_body/softwareengineering.stackexchange.com.jsonl.gz",
"lines": 53942,
"weight": 13
},
{
"name": "stackexchange_Title_Answer/blender.stackexchange.com.jsonl.gz",
"lines": 54153,
"weight": 13
},
{
"name": "stackexchange_Title_Answer/scifi.stackexchange.com.jsonl.gz",
"lines": 54805,
"weight": 13
},
{
"name": "stackexchange_title_body/security.stackexchange.com.jsonl.gz",
"lines": 58000,
"weight": 14
},
{
"name": "stackexchange_Title_Answer/mathematica.stackexchange.com.jsonl.gz",
"lines": 59895,
"weight": 14
},
{
"name": "stackexchange_title_body/diy.stackexchange.com.jsonl.gz",
"lines": 60083,
"weight": 14
},
{
"name": "stackexchange_Title_Answer/meta.stackexchange.com.jsonl.gz",
"lines": 60744,
"weight": 14
},
{
"name": "stackexchange_title_body/scifi.stackexchange.com.jsonl.gz",
"lines": 61528,
"weight": 14
},
{
"name": "stackexchange_Title_Answer/drupal.stackexchange.com.jsonl.gz",
"lines": 67817,
"weight": 16
},
{
"name": "stackexchange_Title_Answer/dba.stackexchange.com.jsonl.gz",
"lines": 71449,
"weight": 17
},
{
"name": "stackexchange_title_body/mathematica.stackexchange.com.jsonl.gz",
"lines": 73131,
"weight": 17
},
{
"name": "stackexchange_Title_Answer/ell.stackexchange.com.jsonl.gz",
"lines": 77892,
"weight": 18
},
{
"name": "stackexchange_Title_Answer/magento.stackexchange.com.jsonl.gz",
"lines": 79241,
"weight": 18
},
{
"name": "stackexchange_title_body/drupal.stackexchange.com.jsonl.gz",
"lines": 79717,
"weight": 18
},
{
"name": "stackexchange_Title_Answer/sharepoint.stackexchange.com.jsonl.gz",
"lines": 80420,
"weight": 19
},
{
"name": "stackexchange_title_body/blender.stackexchange.com.jsonl.gz",
"lines": 80766,
"weight": 19
},
{
"name": "stackexchange_title_body/dba.stackexchange.com.jsonl.gz",
"lines": 81871,
"weight": 19
},
{
"name": "stackexchange_Title_Answer/gaming.stackexchange.com.jsonl.gz",
"lines": 82887,
"weight": 19
},
{
"name": "stackexchange_title_body/ell.stackexchange.com.jsonl.gz",
"lines": 83271,
"weight": 19
},
{
"name": "stackexchange_title_body/meta.stackexchange.com.jsonl.gz",
"lines": 83510,
"weight": 19
},
{
"name": "stackexchange_Title_Answer/wordpress.stackexchange.com.jsonl.gz",
"lines": 83621,
"weight": 19
},
{
"name": "stackexchange_Title_Answer/mathoverflow.net.jsonl.gz",
"lines": 85289,
"weight": 20
},
{
"name": "stackexchange_Title_Answer/salesforce.stackexchange.com.jsonl.gz",
"lines": 87272,
"weight": 20
},
{
"name": "stackexchange_title_body/gaming.stackexchange.com.jsonl.gz",
"lines": 88912,
"weight": 21
},
{
"name": "stackexchange_Title_Answer/apple.stackexchange.com.jsonl.gz",
"lines": 92487,
"weight": 21
},
{
"name": "stackexchange_title_body/sharepoint.stackexchange.com.jsonl.gz",
"lines": 94011,
"weight": 22
},
{
"name": "stackexchange_title_body/magento.stackexchange.com.jsonl.gz",
"lines": 99991,
"weight": 23
},
{
"name": "stackexchange_Title_Answer/gis.stackexchange.com.jsonl.gz",
"lines": 100254,
"weight": 23
},
{
"name": "stackexchange_title_body/wordpress.stackexchange.com.jsonl.gz",
"lines": 100474,
"weight": 23
},
{
"name": "stackexchange_Title_Answer/english.stackexchange.com.jsonl.gz",
"lines": 100640,
"weight": 23
},
{
"name": "stackexchange_title_body/salesforce.stackexchange.com.jsonl.gz",
"lines": 105260,
"weight": 24
},
{
"name": "stackexchange_title_body/english.stackexchange.com.jsonl.gz",
"lines": 109522,
"weight": 25
},
{
"name": "stackexchange_title_body/apple.stackexchange.com.jsonl.gz",
"lines": 110622,
"weight": 25
},
{
"name": "stackexchange_Title_Answer/stats.stackexchange.com.jsonl.gz",
"lines": 115679,
"weight": 27
},
{
"name": "stackexchange_title_body/mathoverflow.net.jsonl.gz",
"lines": 120851,
"weight": 28
},
{
"name": "stackexchange_Title_Answer/electronics.stackexchange.com.jsonl.gz",
"lines": 129494,
"weight": 30
},
{
"name": "stackexchange_title_body/gis.stackexchange.com.jsonl.gz",
"lines": 131000,
"weight": 30
},
{
"name": "stackexchange_Title_Answer/physics.stackexchange.com.jsonl.gz",
"lines": 141230,
"weight": 32
},
{
"name": "stackexchange_title_body/electronics.stackexchange.com.jsonl.gz",
"lines": 143582,
"weight": 33
},
{
"name": "TriviaQA_pairs.jsonl.gz",
"lines": 73346,
"weight": 34
},
{
"name": "stackexchange_Title_Answer/unix.stackexchange.com.jsonl.gz",
"lines": 155414,
"weight": 36
},
{
"name": "stackexchange_Title_Answer/tex.stackexchange.com.jsonl.gz",
"lines": 171628,
"weight": 39
},
{
"name": "squad_pairs.jsonl.gz",
"lines": 87599,
"weight": 40
},
{
"name": "stackexchange_title_body/physics.stackexchange.com.jsonl.gz",
"lines": 173307,
"weight": 40
},
{
"name": "stackexchange_title_body/stats.stackexchange.com.jsonl.gz",
"lines": 173466,
"weight": 40
},
{
"name": "stackexchange_title_body/unix.stackexchange.com.jsonl.gz",
"lines": 185997,
"weight": 42
},
{
"name": "NQ-train_pairs.jsonl.gz",
"lines": 100231,
"weight": 46
},
{
"name": "stackexchange_title_body/tex.stackexchange.com.jsonl.gz",
"lines": 202954,
"weight": 46
},
{
"name": "quora_duplicates_triplets.jsonl.gz",
"lines": 103663,
"weight": 47
},
{
"name": "stackexchange_Title_Answer/serverfault.com.jsonl.gz",
"lines": 238507,
"weight": 54
},
{
"name": "stackexchange_Title_Answer/askubuntu.com.jsonl.gz",
"lines": 267135,
"weight": 61
},
{
"name": "stackexchange_title_body/serverfault.com.jsonl.gz",
"lines": 270904,
"weight": 62
},
{
"name": "stackexchange_duplicate_questions_title_title.jsonl.gz",
"lines": 304525,
"weight": 69
},
{
"name": "stackexchange_title_body/askubuntu.com.jsonl.gz",
"lines": 347925,
"weight": 79
},
{
"name": "stackexchange_Title_Answer/superuser.com.jsonl.gz",
"lines": 352610,
"weight": 80
},
{
"name": "stackexchange_title_body/superuser.com.jsonl.gz",
"lines": 435463,
"weight": 99
},
{
"name": "stackexchange_title_body/small_stackexchanges.jsonl.gz",
"lines": 448146,
"weight": 102
},
{
"name": "stackexchange_Title_Answer/small_stackexchanges.jsonl.gz",
"lines": 460256,
"weight": 104
},
{
"name": "eli5_question_answer.jsonl.gz",
"lines": 325475,
"weight": 147
},
{
"name": "yahoo_answers_title_question.jsonl.gz",
"lines": 659896,
"weight": 149
},
{
"name": "PAQ_pairs.jsonl.gz",
"lines": 64371441,
"weight": 150
},
{
"name": "WikiAnswers_pairs.jsonl.gz",
"lines": 77427422,
"weight": 150
},
{
"name": "stackexchange_Title_Answer/math.stackexchange.com.jsonl.gz",
"lines": 1100953,
"weight": 226
},
{
"name": "yahoo_answers_title_answer.jsonl.gz",
"lines": 1198260,
"weight": 226
},
{
"name": "stackexchange_title_body/math.stackexchange.com.jsonl.gz",
"lines": 1338443,
"weight": 226
},
{
"name": "stackexchange_Title_Answer/stackoverflow.com-Posts.jsonl.gz",
"lines": 15768211,
"weight": 226
},
{
"name": "stackexchange_title_body/stackoverflow.com-Posts.jsonl.gz",
"lines": 18562443,
"weight": 226
},
{
"name": "searchQA_question_top5_snippets_merged.jsonl.gz",
"lines": 582261,
"weight": 263
},
{
"name": "amazon-qa-train-pairs.jsonl.gz",
"lines": 2448839,
"weight": 451
},
{
"name": "gooaq_pairs.jsonl.gz",
"lines": 3012496,
"weight": 451
},
{
"name": "msmarco-query_passage_negative_v2.jsonl.gz",
"lines": 17579773,
"weight": 1000
}
]

20
modules.json Executable file
View File

@@ -0,0 +1,20 @@
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
},
{
"idx": 2,
"name": "2",
"path": "2_Normalize",
"type": "sentence_transformers.models.Normalize"
}
]

101
push.ipynb Normal file
View File

@@ -0,0 +1,101 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import create_repo"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://huggingface.co/haqishen/test-model'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"repo_url = create_repo(name=\"test-model\", exist_ok=True)\n",
"repo_url"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import Repository"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"ename": "OSError",
"evalue": "Tried to clone a repository in a non-empty folder that isn't a git repository. If you really want to do this, do it manually:\ngit init && git remote add origin && git pull origin main\n or clone repo to a new folder and move your existing files there afterwards.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-12-0104f78c3353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrepo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mRepository\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"./\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclone_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"haqishen/test-model\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/miniconda3/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, local_dir, clone_from, repo_type, use_auth_token, git_user, git_email, revision, private, skip_lfs_files)\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 420\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mclone_from\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 421\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclone_from\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrepo_url\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mclone_from\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 422\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 423\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_git_repo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlocal_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mclone_from\u001b[0;34m(self, repo_url, use_auth_token)\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0min_repository\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 621\u001b[0m raise EnvironmentError(\n\u001b[0;32m--> 622\u001b[0;31m \u001b[0;34m\"Tried to clone a repository in a non-empty folder that isn't a git repository. If you really \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 623\u001b[0m \u001b[0;34m\"want to do this, do it manually:\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;34m\"git init && git remote add origin && git pull origin main\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mOSError\u001b[0m: Tried to clone a repository in a non-empty folder that isn't a git repository. If you really want to do this, do it manually:\ngit init && git remote add origin && git pull origin main\n or clone repo to a new folder and move your existing files there afterwards."
]
}
],
"source": [
"repo = Repository(local_dir=\"./test-model\", clone_from=\"haqishen/test-model\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

3
pytorch_model.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df507ec1743de52aa0a1b401183c5fd8ca18e846689421ea6c94cae014d9b26b
size 90888945

4
sentence_bert_config.json Executable file
View File

@@ -0,0 +1,4 @@
{
"max_seq_length": 512,
"do_lower_case": false
}

1
special_tokens_map.json Normal file
View File

@@ -0,0 +1 @@
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}

1
tokenizer.json Normal file

File diff suppressed because one or more lines are too long

1
tokenizer_config.json Normal file
View File

@@ -0,0 +1 @@
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}

361
train_script.py Normal file
View File

@@ -0,0 +1,361 @@
"""
Train script for a single file
Need to set the TPU address first:
export XRT_TPU_CONFIG="localservice;0;localhost:51011"
"""
import torch.multiprocessing as mp
import threading
import time
import random
import sys
import argparse
import gzip
import json
import logging
import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch
import torch_xla
import torch_xla.core
import torch_xla.core.functions
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl
import os
from shutil import copyfile
from transformers import (
AdamW,
AutoModel,
AutoTokenizer,
get_linear_schedule_with_warmup,
set_seed,
)
class AutoModelForSentenceEmbedding(nn.Module):
    """
    Sentence encoder built on top of a Hugging Face ``AutoModel``.

    The wrapped transformer produces token embeddings; these are reduced to one
    vector per input with either mean pooling or CLS pooling and, unless
    ``args.no_normalize`` is set, scaled to unit L2 length.
    """

    def __init__(self, model_name, tokenizer, args):
        """
        :param model_name: name or path handed to ``AutoModel.from_pretrained``
        :param tokenizer: tokenizer stored next to the model by ``save_pretrained``
        :param args: namespace providing ``pooling`` ('mean' or 'cls') and ``no_normalize``
        """
        super().__init__()
        # Reject unknown pooling modes before the (expensive) model load
        assert args.pooling in ('mean', 'cls')

        self.model = AutoModel.from_pretrained(model_name)
        self.normalize = not args.no_normalize
        self.tokenizer = tokenizer
        self.pooling = args.pooling

    def forward(self, **kwargs):
        """Run the transformer on the tokenizer output and return the pooled embeddings."""
        encoder_output = self.model(**kwargs)

        if self.pooling == 'mean':
            pooled = self.mean_pooling(encoder_output, kwargs['attention_mask'])
        elif self.pooling == 'cls':
            pooled = self.cls_pooling(encoder_output, kwargs['attention_mask'])

        if not self.normalize:
            return pooled
        return torch.nn.functional.normalize(pooled, p=2, dim=1)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over the positions selected by ``attention_mask``."""
        token_vectors = model_output[0]  # First element of model_output contains all token embeddings
        weights = attention_mask.unsqueeze(-1).expand(token_vectors.size()).float()
        weighted_sum = torch.sum(token_vectors * weights, 1)
        # Clamp the divisor so a fully masked row does not divide by zero
        token_counts = torch.clamp(weights.sum(1), min=1e-9)
        return weighted_sum / token_counts

    def cls_pooling(self, model_output, attention_mask):
        """Return the embedding at the first token position; ``attention_mask`` is not used."""
        token_vectors = model_output[0]
        return token_vectors[:, 0]

    def save_pretrained(self, output_path):
        """Store tokenizer, model config and model weights in ``output_path``."""
        # Tokenizer and config are written by the master ordinal only
        if xm.is_master_ordinal():
            self.tokenizer.save_pretrained(output_path)
            self.model.config.save_pretrained(output_path)

        weights_path = os.path.join(output_path, "pytorch_model.bin")
        xm.save(self.model.state_dict(), weights_path)
def train_function(index, args, queue):
    """
    Training loop executed by every spawned process.

    Each step takes one batch from ``queue``, embeds its text columns, gathers the
    embeddings with ``all_gather`` and optimises a cross-entropy loss over the
    scaled similarity matrix, where row i has to select column i.

    :param index: first positional argument supplied by the spawner; not used here
    :param args: parsed command line arguments
    :param queue: queue filled by the data producer; every item is a list of
        2-column (anchor, positive) or 3-column (anchor, positive, negative) samples
    """
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForSentenceEmbedding(args.model, tokenizer, args)

    ### Train Loop
    device = xm.xla_device()
    model = model.to(device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=True)

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=500,
        num_training_steps=args.steps,
    )

    # Now we train the model
    cross_entropy_loss = nn.CrossEntropyLoss()
    max_grad_norm = 1

    def encode_column(batch, column, max_length):
        # Always pad to max_length so that every step sees tensors of the same shape
        return tokenizer(
            [sample[column] for sample in batch],
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )

    def gather(local_embeddings):
        return torch_xla.core.functions.all_gather(local_embeddings)

    model.train()

    for global_step in tqdm.trange(args.steps, disable=not xm.is_master_ordinal()):
        #### Get the batch data
        batch = queue.get()
        # Everything that is not a 2-column batch is handled as (anchor, positive, negative)
        has_negatives = len(batch[0]) != 2

        anchor_inputs = encode_column(batch, 0, args.max_length_a)
        positive_inputs = encode_column(batch, 1, args.max_length_b)
        if has_negatives:
            negative_inputs = encode_column(batch, 2, args.max_length_b)

        ### Compute embeddings
        local_anchors = model(**anchor_inputs.to(device))
        local_positives = model(**positive_inputs.to(device))
        if has_negatives:
            local_negatives = model(**negative_inputs.to(device))

        ### Gather all embeddings
        embeddings_a = gather(local_anchors)
        embeddings_b = gather(local_positives)
        if has_negatives:
            # Negatives are appended behind the positives as additional candidates
            embeddings_b = torch.cat([embeddings_b, gather(local_negatives)])

        ### Compute similarity scores (anchors x candidates)
        scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale

        ### Compute cross-entropy loss
        # Example a[i] should match with b[i]
        labels = torch.tensor(range(len(scores)), dtype=torch.long, device=embeddings_a.device)

        loss = cross_entropy_loss(scores, labels)
        if not has_negatives:
            ## Symmetric loss as in CLIP
            loss = (loss + cross_entropy_loss(scores.transpose(0, 1), labels)) / 2

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        xm.optimizer_step(optimizer, barrier=True)
        lr_scheduler.step()

        # Save model
        finished_steps = global_step + 1
        if finished_steps % args.save_steps == 0:
            output_path = os.path.join(args.output, str(finished_steps))
            xm.master_print("save model: "+output_path)
            model.save_pretrained(output_path)

    output_path = os.path.join(args.output, "final")
    xm.master_print("save model final: "+ output_path)
    model.save_pretrained(output_path)
def produce_data(args, queue, filepaths, dataset_indices):
    """
    Endless producer loop that fills ``queue`` with per-device batches.

    :param args: parsed command line arguments (``batch_size``, ``batch_size_triplets``,
        ``nprocs`` and ``datasets_per_batch`` are read)
    :param queue: queue the device batches are put on
    :param filepaths: one data file per dataset
    :param dataset_indices: dataset positions to draw from with ``random.choice``;
        a position that is listed more often is drawn more often
    """
    global_batch_size = args.batch_size * args.nprocs  # Global batch size
    num_same_dataset = int(args.nprocs / args.datasets_per_batch)
    print("producer", "global_batch_size", global_batch_size)
    print("producer", "num_same_dataset", num_same_dataset)

    # Reddit files have their own reader class
    streams = [
        iter(RedditDataset(path) if "reddit_" in path else Dataset(path))
        for path in filepaths
    ]

    # Store if dataset is in a 2 col or 3 col format.
    # The sample that is read for this check is not put on the queue.
    num_cols = {}
    for position, stream in enumerate(streams):
        num_cols[position] = len(next(stream))

    while True:
        seen_texts = set()
        batch_format = None  # 2 vs 3 col format for this batch

        # Add data from several sub datasets
        for _ in range(args.datasets_per_batch):
            # Draw until a dataset matches the 2/3 col format of this round;
            # the first draw of a round defines that format
            while True:
                data_idx = random.choice(dataset_indices)
                if batch_format is None:
                    batch_format = num_cols[data_idx]
                    break
                if num_cols[data_idx] == batch_format:
                    break

            # Get data from this dataset
            stream = streams[data_idx]
            use_triplet_size = batch_format == 3 and args.batch_size_triplets is not None
            local_batch_size = args.batch_size_triplets if use_triplet_size else args.batch_size

            for _ in range(num_same_dataset):
                for _ in range(args.nprocs):
                    device_batch = []  # A batch for one device
                    while len(device_batch) < local_batch_size:
                        sample = next(stream)
                        # Skip samples that share a text with something already picked in this round
                        if any(text in seen_texts for text in sample):
                            continue
                        seen_texts.update(sample)
                        device_batch.append(sample)

                    queue.put(device_batch)
class RedditDataset:
    """
    A class that handles the reddit data files.

    The file is a gzip-compressed JSON-lines file. Iterating yields
    ``[response, context]`` for every line that has both keys and starts over
    from the beginning of the file once the end is reached, so the iterator
    never ends.
    """
    def __init__(self, filepath):
        """
        :param filepath: path to the gzipped JSON-lines file
        """
        self.filepath = filepath

    def __iter__(self):
        """
        Yield ``[response, context]`` pairs forever.

        :raises ValueError: if a full pass over the file yields no pair at all
        """
        while True:
            found_pair = False
            # JSON text is UTF-8; do not depend on the locale's default encoding
            with gzip.open(self.filepath, "rt", encoding="utf-8") as fIn:
                for line in fIn:
                    data = json.loads(line)
                    if "response" in data and "context" in data:
                        found_pair = True
                        yield [data["response"], data["context"]]

            # Without this check a file without usable lines would be re-read
            # forever and the consumer would block without any error message
            if not found_pair:
                raise ValueError("No (response, context) pairs found in " + str(self.filepath))
class Dataset:
    """
    A class that handles one dataset.

    The file is a gzip-compressed JSON-lines file. Every line is either a JSON
    list of texts or a JSON object whose ``texts`` entry holds that list. All
    lines must have the same number of texts (2 or 3 column format).

    Iterating never ends: datasets with fewer than ``max_dataset_size`` rows are
    kept in memory after the first pass and then served in a freshly shuffled
    order for every further pass; larger datasets are re-read from disk in file
    order again and again.
    """
    def __init__(self, filepath, max_dataset_size=20*1000*1000):
        """
        :param filepath: path to the gzipped JSON-lines file
        :param max_dataset_size: number of rows from which on the dataset is no
            longer cached in memory
        """
        self.filepath = filepath
        self.max_dataset_size = max_dataset_size

    def __iter__(self):
        """
        Yield the rows of the dataset forever.

        :raises ValueError: if the file contains no rows, or if a row does not
            have the same number of texts as the first row
        """
        max_dataset_size = self.max_dataset_size  # Cache small datasets in memory
        dataset = []  # Set to None once the dataset turns out to be too large to cache
        data_format = None

        while dataset is None or len(dataset) == 0:
            found_row = False
            # JSON text is UTF-8; do not depend on the locale's default encoding
            with gzip.open(self.filepath, "rt", encoding="utf-8") as fIn:
                for line in fIn:
                    data = json.loads(line)
                    if isinstance(data, dict):
                        data = data['texts']

                    if data_format is None:
                        data_format = len(data)

                    # Ensure that all entries are of the same 2/3 col format.
                    # A real exception is used so the check also runs with "python -O".
                    if len(data) != data_format:
                        raise ValueError(
                            "Row with {} columns in {}, expected {}".format(len(data), self.filepath, data_format)
                        )

                    if dataset is not None:
                        dataset.append(data)
                        if len(dataset) >= max_dataset_size:
                            dataset = None

                    found_row = True
                    yield data

            # An empty file would otherwise be re-opened in an endless loop
            if not found_row:
                raise ValueError("No rows found in " + str(self.filepath))

        # Data loaded. Now stream to the queue
        # Shuffle for each epoch
        while True:
            random.shuffle(dataset)
            for data in dataset:
                yield data
if __name__ == "__main__":
    # Command line entry point: parse the arguments, archive the configuration and
    # this script in the output folder, start the data producer process and run the
    # training processes.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased')
    parser.add_argument('--steps', type=int, default=2000)
    parser.add_argument('--save_steps', type=int, default=10000)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--batch_size_triplets', type=int, default=None)
    parser.add_argument('--max_length_a', type=int, default=128)
    parser.add_argument('--max_length_b', type=int, default=128)
    parser.add_argument('--nprocs', type=int, default=8)
    parser.add_argument('--datasets_per_batch', type=int, default=2, help="Number of datasets per batch")
    parser.add_argument('--scale', type=float, default=20, help="Use 20 for cossim, and 1 when you work with unnormalized embeddings with dot product")
    parser.add_argument('--no_normalize', action="store_true", default=False, help="If set: Embeddings are not normalized")
    # Same set that AutoModelForSentenceEmbedding accepts; checked here so that a typo
    # is reported before the output folder is touched and the processes are started
    parser.add_argument('--pooling', default='mean', choices=['mean', 'cls'])
    parser.add_argument('--data_folder', default="/data", help="Folder with your dataset files")
    parser.add_argument('data_config', help="A data_config.json file")
    parser.add_argument('output')
    args = parser.parse_args()

    # Ensure num proc is divisible by datasets_per_batch.
    # Reported as a usage error so that the check is not skipped with "python -O".
    if (args.nprocs % args.datasets_per_batch) != 0:
        parser.error("--nprocs must be divisible by --datasets_per_batch")

    logging.info("Output: "+args.output)
    if os.path.exists(args.output):
        print("Output folder already exists.")
        # Any input continues; abort with Ctrl+C
        input("Continue?")

    # Write train script to output path
    os.makedirs(args.output, exist_ok=True)

    data_config_path = os.path.join(args.output, 'data_config.json')
    copyfile(args.data_config, data_config_path)

    train_script_path = os.path.join(args.output, 'train_script.py')
    copyfile(__file__, train_script_path)
    with open(train_script_path, 'a') as fOut:
        fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))

    # Load data config
    with open(args.data_config, encoding="utf-8") as fIn:
        data_config = json.load(fIn)

    queue = mp.Queue(maxsize=100*args.nprocs)

    filepaths = []
    dataset_indices = []
    for idx, data in enumerate(data_config):
        filepaths.append(os.path.join(os.path.expanduser(args.data_folder), data['name']))
        # A dataset position is listed 'weight' times, which makes random.choice
        # in the producer pick it proportionally more often
        dataset_indices.extend([idx]*data['weight'])

    # Start producer
    p = mp.Process(target=produce_data, args=(args, queue, filepaths, dataset_indices))
    p.start()

    try:
        # Run training
        print("Start processes:", args.nprocs)
        xmp.spawn(train_function, args=(args, queue), nprocs=args.nprocs, start_method='fork')
        print("Training done")
        print("It might be that not all processes exit automatically. In that case you must manually kill this process.")
        print("With 'pkill python' you can kill all remaining python processes")
    finally:
        # The producer loops forever; stop it also when training fails or is interrupted
        p.kill()

    sys.exit()
# Script was called via:
#python train_many_data_files_v2.py --steps 200000 --batch_size 128 --model nreimers/MiniLM-L6-H384-uncased --max_length_a 64 --max_length_b 250 train_data_configs/multi-qa_v1.json output/multi-qa_v1-MiniLM-L6-mean_cos

30522
vocab.txt Normal file

File diff suppressed because it is too large Load Diff