初始化项目,由ModelHub XC社区提供模型
Model: flax-sentence-embeddings/st-codesearch-distilroberta-base Source: Original Platform
This commit is contained in:
120
train_script.py
Executable file
120
train_script.py
Executable file
@@ -0,0 +1,120 @@
|
||||
import math
|
||||
from sentence_transformers import models, losses, datasets
|
||||
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
|
||||
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import sys
|
||||
import os
|
||||
import gzip
|
||||
import csv
|
||||
from MultiDatasetDataLoader import MultiDatasetDataLoader
|
||||
from shutil import copyfile
|
||||
import json
|
||||
import argparse
|
||||
|
||||
#### Just some code to print debug information
# Configure the root logger once, at INFO level, with a timestamped format.
# Records are emitted through the LoggingHandler imported from
# sentence_transformers above.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
#### /print debug information
|
||||
|
||||
|
||||
# Run configuration comes from the command line. Earlier revisions hard-coded
# these values (distilroberta-base, batch sizes of 200, 10000 steps per epoch).
parser = argparse.ArgumentParser()
parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased',
                    help='Name or path of the transformer model to fine-tune.')
parser.add_argument('--steps', type=int, default=2000,
                    help='Number of training steps per epoch.')
parser.add_argument('--batch_size_pairs', type=int, default=256,
                    help='Batch size passed to the data loader for pair examples.')
parser.add_argument('--batch_size_triplets', type=int, default=256,
                    help='Batch size passed to the data loader for triplet examples.')
parser.add_argument('--data', nargs='+', default=[],
                    help='One or more gzip-compressed JSON-lines training files.')
parser.add_argument('--name',
                    help='Run name; it becomes the suffix of the output directory.')
args = parser.parse_args()

# Module-level aliases used by the rest of the script.
model_name = args.model
batch_size_pairs = args.batch_size_pairs
batch_size_triplets = args.batch_size_triplets
steps_per_epoch = args.steps

# Fixed hyper-parameters (not exposed on the command line).
num_epochs = 1          # number of training epochs
max_seq_length = 128    # passed to models.Transformer as its max_seq_length
use_amp = True          # passed to model.fit(use_amp=...)
warmup_steps = 500      # passed to model.fit(warmup_steps=...)
|
||||
|
||||
#####
|
||||
|
||||
# Every (model, run name) combination gets its own output directory. Note that
# when --name is omitted, args.name is None and the path ends in "-None".
output_path = 'output/training_data_benchmark-{}-norm-{}'.format(model_name.replace("/", "-"), args.name)
logging.info("Output: "+output_path)
if os.path.exists(output_path):
    # The directory is already there from an earlier run: leave it untouched
    # and stop with a success status instead of overwriting it.
    logging.info("Output path already exists, skipping training: " + output_path)
    sys.exit()

# Write train script to output path
os.makedirs(output_path, exist_ok=True)

# Keep a copy of this script next to the model, then append the exact command
# line as a trailing comment so the run can be reproduced later.
train_script_path = os.path.join(output_path, 'train_script.py')
copyfile(__file__, train_script_path)
with open(train_script_path, 'a', encoding='utf8') as fOut:
    fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
|
||||
|
||||
## SentenceTransformer model
# Three modules are chained in order:
#   1. the transformer encoder, with inputs limited to max_seq_length,
#   2. a pooling layer sized to the encoder's word-embedding dimension
#      (library-default pooling settings),
#   3. a Normalize module applied to the pooled sentence embedding.
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
norm = models.Normalize()
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, norm])
|
||||
|
||||
# Load every file given via --data into its own list of InputExample objects.
# Each file is gzip-compressed JSON lines; a line is either a dict with a
# 'texts' entry (and optionally 'guid'), or any other JSON value, which is
# then used directly as the texts.
#
# NOTE: this rebinds the name `datasets`, which was imported from
# sentence_transformers at the top of the file; the module is not used
# anywhere after this point in the script.
datasets = []

# Per-file cap on the number of examples read; invariant across files and lines.
max_examples_per_file = steps_per_epoch * batch_size_pairs * 2

for filepath in args.data:
    filepath = filepath.strip()
    dataset = []

    with gzip.open(filepath, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of failing inside json.loads.
                continue

            data = json.loads(line)

            if not isinstance(data, dict):
                data = {'guid': None, 'texts': data}

            dataset.append(InputExample(guid=data.get('guid'), texts=data['texts']))
            if len(dataset) >= max_examples_per_file:
                break

    datasets.append(dataset)
    logging.info("{}: {}".format(filepath, len(dataset)))
|
||||
|
||||
|
||||
|
||||
# Combine all loaded datasets into a single loader.
train_dataloader = MultiDatasetDataLoader(
    datasets,
    batch_size_pairs=batch_size_pairs,
    batch_size_triplets=batch_size_triplets,
    random_batch_fraction=0.25,
)

# Our training loss
# Similarity is scored with a dot product and scaled by 20. The model's last
# module is Normalize, so the dot product is taken over normalized embeddings.
train_loss = losses.MultipleNegativesRankingLoss(model, scale=20, similarity_fct=util.dot_score)

# Configure the training
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
# No evaluator is used; the model is saved once after training finishes.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=None,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    steps_per_epoch=steps_per_epoch,
    scheduler='warmupconstant',
    use_amp=use_amp,
)

model.save(output_path)
|
||||
|
||||
# Script was called via:
#python training_data_benchmark_norm_cos.py --name codesearch-full --model distilroberta-base --steps 10000 --data data/codesearchnet.jsonl.gz
|
||||
Reference in New Issue
Block a user