<!doctype html>
<!-- Single-file training report: static head + dark dashboard styling.
     lang added for AT/search; void elements use no trailing slash per HTML5 convention. -->
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Training Report</title>
<style>
body { margin: 0; font-family: system-ui, Segoe UI, Arial; background: #0b0f17; color: #e6e6e6; }
header { padding: 14px 18px; border-bottom: 1px solid rgba(255,255,255,0.10); }
.wrap { padding: 14px 18px; display: grid; gap: 14px; }
details { background: rgba(255,255,255,0.04); padding: 10px 12px; border-radius: 10px; }
summary { cursor: pointer; font-weight: 600; }
pre { margin: 8px 0 0; white-space: pre-wrap; word-wrap: break-word; }
.muted { opacity: .8; font-size: 12px; }
</style>
</head>
<body>
<header>
<div style="font-size:18px;font-weight:700;">Training Report</div>
<div class="muted">Single-file HTML (dashboard + run payload)</div>
</header>
<div class="wrap">
<details open><summary>Notes</summary><pre></pre></details>
|
||
|
|
|
||
|
|
<div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
|
||
|
|
<script charset="utf-8" src="https://cdn.plot.ly/plotly-2.35.2.min.js"></script> <div id="fbea6609-4680-44c8-9218-05ec099740b1" class="plotly-graph-div" style="height:900px; width:100%;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("fbea6609-4680-44c8-9218-05ec099740b1")) { Plotly.newPlot( "fbea6609-4680-44c8-9218-05ec099740b1", [{"mode":"lines","name":"train_loss (raw)","opacity":0.35,"x":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413
,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600],"y":[2.2041,2.6901,2.6774,2.4478,1.9988,2.0358,2.5824,2.3479,2.6387,2.6083,2.3015,2.7042,2.5386,2.5886,2.7168,2.1886,2.4191,2.2736,2.7336,2.5288,2.0214,2.3708,2.5341,2.4852,2.4959,2.243,2.1622,2.1827,2.3346,1.9959,2.5963,2.3312,2.2653,2.6019,2.2858,2.1479,2.3149,2.4299,2.1583,2.1631,2.0198,2.3086,2.1388,1.8651,1.8407,1.9392,2.4167,2.2915,1.7683,2.0858,2.1761,2.145,1.829,2.345,1.9989,2.3336,2.0979,2.4041,2.1076,2.2672,2.1081,2.034,2.1147,1.9639,1.7557,1.8861,2.272,1.7173,1.8545,2.0492,1.9203,2.1317,2.282,2.2023,2.3545,2.1984,1.9843,2.2899,1.8448,1.857,2.1387,1.9587,2.5776,2.1138,2.0632,2.1885,2.2515,2.3173,1.9536,1.9134,2.2808,2.2055,1.9373,1.7517,1.9707,1.9629,1.8158,2.5133,2.263,2.1035,1.8114,2.3733,1.929,1.9718,2.1026,1.7999,2.0349,2.2571,2.0794,2.1671,2.1292,2.2534,2.0333,2.1495,1.5522,2.2221,2.4227,2.3886,2.0651,1.9465,2.588,1.7602,2.2738,1.8145,1.7572,2.0206,2.1292,2.1207,2.1515,1.7264,1.6036,2.132,2.061,1.8765,1.8337,2.0664,1.9651,1.9243,2.0289,1.7206,1.7212,1.559,1.9442,1.8913,2.1248,2.1745,2.0572,1.8028,2.2202,1.8407,2.3419,2.0321,1.7054,1.9304,2.1154,2.0367,1.5693,1.726,1.7872,1.9831,2.0571,1.8674,1.7982,1.7992,2.2013,1.7908,2.1308,1.735,2.1536,1.86
<details><summary>Run meta (quick)</summary><pre>{
  "model": "unsloth/Phi-4-unsloth-bnb-4bit",
  "dataset": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl",
  "examples_total": 2872,
  "examples_train": 1436,
  "examples_eval": 1436,
  "world_size": 1,
  "effective_batch_size": 8,
  "steps_per_epoch_approx": 179.5,
  "max_steps": 2000,
  "eval_steps": 50,
  "save_steps": 50,
  "learning_rate": 9.95267419777795e-06,
  "warmup_steps": 10,
  "lr_scheduler_type": "linear",
  "weight_decay": 0.009206070410847844,
  "lora_r": 32,
  "lora_alpha": 64,
  "lora_dropout": 0.0,
  "best_checkpoint": "outputs/continue_r1_from_350_20260112_073729/checkpoint-100",
  "LR_AUTO_ENABLED": true,
  "LR_AUTO_USE_N": "train",
  "LR_AUTO_N_REF": 1436,
  "LR_AUTO_BASE": 1e-05,
  "LR_AUTO_MULT": 0.5,
  "LR_AUTO_FINAL": 5e-06,
  "best_step": 100,
  "best_eval_loss": 2.2380564212799072,
  "best_blended": 1.3520409573791146,
  "best_blended_step": 600
}</pre></details>
|
||
|
|
<details><summary>config_snapshot</summary><pre>{
  "MODEL_NAME": "unsloth/Phi-4-unsloth-bnb-4bit",
  "CHAT_TEMPLATE": "phi-4",
  "MAX_SEQ_LENGTH": 2048,
  "LOAD_IN_4BIT": true,
  "DATASET_NAME": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl",
  "DATASET_SPLIT": "train",
  "PER_DEVICE_TRAIN_BATCH_SIZE": 2,
  "GRADIENT_ACCUMULATION_STEPS": 4,
  "WARMUP_STEPS": 10,
  "MAX_STEPS": 2000,
  "LEARNING_RATE": 9.95267419777795e-06,
  "WEIGHT_DECAY": 0.009206070410847844,
  "LR_SCHEDULER_TYPE": "linear",
  "SEED": 3407,
  "PLOTLY_DARK_MODE": true,
  "PLOTLY_BASE_COLOR": "#00CC96",
  "PLOTLY_EMA_SPAN": 25,
  "LR_AUTO_ENABLED": true,
  "LR_AUTO_USE_N": "train",
  "LR_AUTO_N_REF": 1436,
  "LR_AUTO_BASE": 1e-05,
  "LR_AUTO_MULT": 0.5,
  "LR_AUTO_FINAL": 5e-06
}</pre></details>
|
||
|
|
<details><summary>run_manifest</summary><pre>{
  "model_name": "unsloth/Phi-4-unsloth-bnb-4bit",
  "dataset": {
    "name": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl",
    "split": "train"
  },
  "training": {
    "max_steps": 2000,
    "learning_rate": 9.95267419777795e-06,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "max_seq_length": 2048,
    "seed": 3407,
    "optimizer": "adamw_8bit",
    "lr_scheduler_type": "linear"
  },
  "auto_lr": {
    "enabled": true,
    "use_n": "train",
    "n_ref": 1436,
    "base": 1e-05,
    "mult": 0.5,
    "final": 5e-06
  },
  "best": {
    "checkpoint": "/content/outputs/continue_r1_from_350_20260112_073729/checkpoint-100",
    "metric": 2.2380564212799072,
    "metric_name": "eval_loss"
  },
  "plotly": {
    "html": "training_loss_step.html"
  }
}</pre></details>
|
||
|
|
<details><summary>trainer.state.log_history</summary><pre>[
|
||
|
|
{
|
||
|
|
"loss": 2.2041,
|
||
|
|
"grad_norm": 4.026190757751465,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"epoch": 0.005571030640668524,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.6901,
|
||
|
|
"grad_norm": 1.616629719734192,
|
||
|
|
"learning_rate": 1.4636285584967574e-07,
|
||
|
|
"epoch": 0.011142061281337047,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.6774,
|
||
|
|
"grad_norm": 13.836981773376465,
|
||
|
|
"learning_rate": 2.927257116993515e-07,
|
||
|
|
"epoch": 0.016713091922005572,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4478,
|
||
|
|
"grad_norm": 1.857710361480713,
|
||
|
|
"learning_rate": 4.3908856754902726e-07,
|
||
|
|
"epoch": 0.022284122562674095,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9988,
|
||
|
|
"grad_norm": 1.4818029403686523,
|
||
|
|
"learning_rate": 5.85451423398703e-07,
|
||
|
|
"epoch": 0.027855153203342618,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0358,
|
||
|
|
"grad_norm": 1.726440191268921,
|
||
|
|
"learning_rate": 7.318142792483787e-07,
|
||
|
|
"epoch": 0.033426183844011144,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5824,
|
||
|
|
"grad_norm": 2.0604233741760254,
|
||
|
|
"learning_rate": 8.781771350980545e-07,
|
||
|
|
"epoch": 0.03899721448467967,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3479,
|
||
|
|
"grad_norm": 1.7288694381713867,
|
||
|
|
"learning_rate": 1.0245399909477302e-06,
|
||
|
|
"epoch": 0.04456824512534819,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.6387,
|
||
|
|
"grad_norm": 1.9069620370864868,
|
||
|
|
"learning_rate": 1.170902846797406e-06,
|
||
|
|
"epoch": 0.05013927576601671,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.6083,
|
||
|
|
"grad_norm": 1.4719465970993042,
|
||
|
|
"learning_rate": 1.3172657026470817e-06,
|
||
|
|
"epoch": 0.055710306406685235,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3015,
|
||
|
|
"grad_norm": 1.6306267976760864,
|
||
|
|
"learning_rate": 1.4636285584967574e-06,
|
||
|
|
"epoch": 0.06128133704735376,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.7042,
|
||
|
|
"grad_norm": 1.4724116325378418,
|
||
|
|
"learning_rate": 1.6099914143464333e-06,
|
||
|
|
"epoch": 0.06685236768802229,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5386,
|
||
|
|
"grad_norm": 1.5470020771026611,
|
||
|
|
"learning_rate": 1.756354270196109e-06,
|
||
|
|
"epoch": 0.07242339832869081,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5886,
|
||
|
|
"grad_norm": 2.022662401199341,
|
||
|
|
"learning_rate": 1.9027171260457846e-06,
|
||
|
|
"epoch": 0.07799442896935933,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.7168,
|
||
|
|
"grad_norm": 1.8387386798858643,
|
||
|
|
"learning_rate": 2.0490799818954605e-06,
|
||
|
|
"epoch": 0.08356545961002786,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1886,
|
||
|
|
"grad_norm": 1.9359395503997803,
|
||
|
|
"learning_rate": 2.195442837745136e-06,
|
||
|
|
"epoch": 0.08913649025069638,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4191,
|
||
|
|
"grad_norm": 1.5662318468093872,
|
||
|
|
"learning_rate": 2.341805693594812e-06,
|
||
|
|
"epoch": 0.0947075208913649,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2736,
|
||
|
|
"grad_norm": 1.7207640409469604,
|
||
|
|
"learning_rate": 2.4881685494444876e-06,
|
||
|
|
"epoch": 0.10027855153203342,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.7336,
|
||
|
|
"grad_norm": 1.6225577592849731,
|
||
|
|
"learning_rate": 2.6345314052941634e-06,
|
||
|
|
"epoch": 0.10584958217270195,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5288,
|
||
|
|
"grad_norm": 1.6348892450332642,
|
||
|
|
"learning_rate": 2.780894261143839e-06,
|
||
|
|
"epoch": 0.11142061281337047,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0214,
|
||
|
|
"grad_norm": 1.5059679746627808,
|
||
|
|
"learning_rate": 2.927257116993515e-06,
|
||
|
|
"epoch": 0.116991643454039,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3708,
|
||
|
|
"grad_norm": 1.3699105978012085,
|
||
|
|
"learning_rate": 3.073619972843191e-06,
|
||
|
|
"epoch": 0.12256267409470752,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5341,
|
||
|
|
"grad_norm": 2.241403341293335,
|
||
|
|
"learning_rate": 3.2199828286928667e-06,
|
||
|
|
"epoch": 0.12813370473537605,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4852,
|
||
|
|
"grad_norm": 1.7692517042160034,
|
||
|
|
"learning_rate": 3.3663456845425424e-06,
|
||
|
|
"epoch": 0.13370473537604458,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4959,
|
||
|
|
"grad_norm": 1.9559876918792725,
|
||
|
|
"learning_rate": 3.512708540392218e-06,
|
||
|
|
"epoch": 0.1392757660167131,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.243,
|
||
|
|
"grad_norm": 1.7536145448684692,
|
||
|
|
"learning_rate": 3.659071396241894e-06,
|
||
|
|
"epoch": 0.14484679665738162,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1622,
|
||
|
|
"grad_norm": 1.8103671073913574,
|
||
|
|
"learning_rate": 3.805434252091569e-06,
|
||
|
|
"epoch": 0.15041782729805014,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1827,
|
||
|
|
"grad_norm": 1.5473895072937012,
|
||
|
|
"learning_rate": 3.951797107941245e-06,
|
||
|
|
"epoch": 0.15598885793871867,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3346,
|
||
|
|
"grad_norm": 1.7137048244476318,
|
||
|
|
"learning_rate": 4.098159963790921e-06,
|
||
|
|
"epoch": 0.1615598885793872,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9959,
|
||
|
|
"grad_norm": 1.7803088426589966,
|
||
|
|
"learning_rate": 4.244522819640596e-06,
|
||
|
|
"epoch": 0.1671309192200557,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5963,
|
||
|
|
"grad_norm": 1.3565785884857178,
|
||
|
|
"learning_rate": 4.390885675490272e-06,
|
||
|
|
"epoch": 0.17270194986072424,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3312,
|
||
|
|
"grad_norm": 1.4962913990020752,
|
||
|
|
"learning_rate": 4.537248531339948e-06,
|
||
|
|
"epoch": 0.17827298050139276,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2653,
|
||
|
|
"grad_norm": 1.4864503145217896,
|
||
|
|
"learning_rate": 4.683611387189624e-06,
|
||
|
|
"epoch": 0.18384401114206128,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.6019,
|
||
|
|
"grad_norm": 2.2222468852996826,
|
||
|
|
"learning_rate": 4.829974243039299e-06,
|
||
|
|
"epoch": 0.1894150417827298,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2858,
|
||
|
|
"grad_norm": 1.6111881732940674,
|
||
|
|
"learning_rate": 4.976337098888975e-06,
|
||
|
|
"epoch": 0.19498607242339833,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1479,
|
||
|
|
"grad_norm": 2.307185173034668,
|
||
|
|
"learning_rate": 5.1226999547386506e-06,
|
||
|
|
"epoch": 0.20055710306406685,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3149,
|
||
|
|
"grad_norm": 1.9714685678482056,
|
||
|
|
"learning_rate": 5.269062810588327e-06,
|
||
|
|
"epoch": 0.20612813370473537,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4299,
|
||
|
|
"grad_norm": 1.6915550231933594,
|
||
|
|
"learning_rate": 5.415425666438002e-06,
|
||
|
|
"epoch": 0.2116991643454039,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1583,
|
||
|
|
"grad_norm": 1.9084649085998535,
|
||
|
|
"learning_rate": 5.561788522287678e-06,
|
||
|
|
"epoch": 0.21727019498607242,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1631,
|
||
|
|
"grad_norm": 1.882629632949829,
|
||
|
|
"learning_rate": 5.7081513781373534e-06,
|
||
|
|
"epoch": 0.22284122562674094,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0198,
|
||
|
|
"grad_norm": 1.335666537284851,
|
||
|
|
"learning_rate": 5.85451423398703e-06,
|
||
|
|
"epoch": 0.22841225626740946,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3086,
|
||
|
|
"grad_norm": 2.620265007019043,
|
||
|
|
"learning_rate": 6.000877089836705e-06,
|
||
|
|
"epoch": 0.233983286908078,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1388,
|
||
|
|
"grad_norm": 2.0138227939605713,
|
||
|
|
"learning_rate": 6.147239945686382e-06,
|
||
|
|
"epoch": 0.2395543175487465,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8651,
|
||
|
|
"grad_norm": 1.6108520030975342,
|
||
|
|
"learning_rate": 6.293602801536056e-06,
|
||
|
|
"epoch": 0.24512534818941503,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8407,
|
||
|
|
"grad_norm": 1.5935970544815063,
|
||
|
|
"learning_rate": 6.439965657385733e-06,
|
||
|
|
"epoch": 0.25069637883008355,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9392,
|
||
|
|
"grad_norm": 1.3794289827346802,
|
||
|
|
"learning_rate": 6.586328513235409e-06,
|
||
|
|
"epoch": 0.2562674094707521,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4167,
|
||
|
|
"grad_norm": 1.23729407787323,
|
||
|
|
"learning_rate": 6.732691369085085e-06,
|
||
|
|
"epoch": 0.2618384401114206,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2915,
|
||
|
|
"grad_norm": 1.4265947341918945,
|
||
|
|
"learning_rate": 6.87905422493476e-06,
|
||
|
|
"epoch": 0.26740947075208915,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7683,
|
||
|
|
"grad_norm": 1.696736216545105,
|
||
|
|
"learning_rate": 7.025417080784436e-06,
|
||
|
|
"epoch": 0.27298050139275765,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0858,
|
||
|
|
"grad_norm": 1.5071961879730225,
|
||
|
|
"learning_rate": 7.1717799366341115e-06,
|
||
|
|
"epoch": 0.2785515320334262,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.315699815750122,
|
||
|
|
"eval_runtime": 35.9132,
|
||
|
|
"eval_samples_per_second": 39.985,
|
||
|
|
"eval_steps_per_second": 2.005,
|
||
|
|
"epoch": 0.2785515320334262,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1761,
|
||
|
|
"grad_norm": 1.5999361276626587,
|
||
|
|
"learning_rate": 7.318142792483788e-06,
|
||
|
|
"epoch": 0.2841225626740947,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.145,
|
||
|
|
"grad_norm": 2.0915520191192627,
|
||
|
|
"learning_rate": 7.464505648333463e-06,
|
||
|
|
"epoch": 0.28969359331476324,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.829,
|
||
|
|
"grad_norm": 4.090714931488037,
|
||
|
|
"learning_rate": 7.610868504183138e-06,
|
||
|
|
"epoch": 0.29526462395543174,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.345,
|
||
|
|
"grad_norm": 1.6347575187683105,
|
||
|
|
"learning_rate": 7.757231360032815e-06,
|
||
|
|
"epoch": 0.3008356545961003,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9989,
|
||
|
|
"grad_norm": 1.5609041452407837,
|
||
|
|
"learning_rate": 7.90359421588249e-06,
|
||
|
|
"epoch": 0.3064066852367688,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3336,
|
||
|
|
"grad_norm": 1.6561325788497925,
|
||
|
|
"learning_rate": 8.049957071732166e-06,
|
||
|
|
"epoch": 0.31197771587743733,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0979,
|
||
|
|
"grad_norm": 1.6579258441925049,
|
||
|
|
"learning_rate": 8.196319927581842e-06,
|
||
|
|
"epoch": 0.31754874651810583,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4041,
|
||
|
|
"grad_norm": 1.8354761600494385,
|
||
|
|
"learning_rate": 8.342682783431518e-06,
|
||
|
|
"epoch": 0.3231197771587744,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1076,
|
||
|
|
"grad_norm": 1.7043126821517944,
|
||
|
|
"learning_rate": 8.489045639281193e-06,
|
||
|
|
"epoch": 0.3286908077994429,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2672,
|
||
|
|
"grad_norm": 1.5663846731185913,
|
||
|
|
"learning_rate": 8.635408495130869e-06,
|
||
|
|
"epoch": 0.3342618384401114,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1081,
|
||
|
|
"grad_norm": 1.8134770393371582,
|
||
|
|
"learning_rate": 8.781771350980545e-06,
|
||
|
|
"epoch": 0.3398328690807799,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.034,
|
||
|
|
"grad_norm": 1.3617796897888184,
|
||
|
|
"learning_rate": 8.928134206830221e-06,
|
||
|
|
"epoch": 0.34540389972144847,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1147,
|
||
|
|
"grad_norm": 1.7525910139083862,
|
||
|
|
"learning_rate": 9.074497062679895e-06,
|
||
|
|
"epoch": 0.35097493036211697,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9639,
|
||
|
|
"grad_norm": 1.7186285257339478,
|
||
|
|
"learning_rate": 9.220859918529572e-06,
|
||
|
|
"epoch": 0.3565459610027855,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7557,
|
||
|
|
"grad_norm": 1.9141530990600586,
|
||
|
|
"learning_rate": 9.367222774379248e-06,
|
||
|
|
"epoch": 0.362116991643454,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8861,
|
||
|
|
"grad_norm": 1.696165680885315,
|
||
|
|
"learning_rate": 9.513585630228924e-06,
|
||
|
|
"epoch": 0.36768802228412256,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.272,
|
||
|
|
"grad_norm": 1.24228036403656,
|
||
|
|
"learning_rate": 9.659948486078598e-06,
|
||
|
|
"epoch": 0.3732590529247911,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7173,
|
||
|
|
"grad_norm": 1.9760662317276,
|
||
|
|
"learning_rate": 9.806311341928276e-06,
|
||
|
|
"epoch": 0.3788300835654596,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8545,
|
||
|
|
"grad_norm": 1.3207972049713135,
|
||
|
|
"learning_rate": 9.95267419777795e-06,
|
||
|
|
"epoch": 0.38440111420612816,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0492,
|
||
|
|
"grad_norm": 1.5849637985229492,
|
||
|
|
"learning_rate": 1.0099037053627625e-05,
|
||
|
|
"epoch": 0.38997214484679665,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9203,
|
||
|
|
"grad_norm": 2.7006468772888184,
|
||
|
|
"learning_rate": 1.0245399909477301e-05,
|
||
|
|
"epoch": 0.3955431754874652,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1317,
|
||
|
|
"grad_norm": 1.9178322553634644,
|
||
|
|
"learning_rate": 1.0391762765326979e-05,
|
||
|
|
"epoch": 0.4011142061281337,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.282,
|
||
|
|
"grad_norm": 1.5044149160385132,
|
||
|
|
"learning_rate": 1.0538125621176653e-05,
|
||
|
|
"epoch": 0.40668523676880225,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2023,
|
||
|
|
"grad_norm": 1.9386659860610962,
|
||
|
|
"learning_rate": 1.068448847702633e-05,
|
||
|
|
"epoch": 0.41225626740947074,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3545,
|
||
|
|
"grad_norm": 1.3408238887786865,
|
||
|
|
"learning_rate": 1.0830851332876004e-05,
|
||
|
|
"epoch": 0.4178272980501393,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1984,
|
||
|
|
"grad_norm": 2.221109390258789,
|
||
|
|
"learning_rate": 1.0977214188725682e-05,
|
||
|
|
"epoch": 0.4233983286908078,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9843,
|
||
|
|
"grad_norm": 1.7843296527862549,
|
||
|
|
"learning_rate": 1.1123577044575356e-05,
|
||
|
|
"epoch": 0.42896935933147634,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2899,
|
||
|
|
"grad_norm": 1.6259101629257202,
|
||
|
|
"learning_rate": 1.1269939900425032e-05,
|
||
|
|
"epoch": 0.43454038997214484,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8448,
|
||
|
|
"grad_norm": 1.718583345413208,
|
||
|
|
"learning_rate": 1.1416302756274707e-05,
|
||
|
|
"epoch": 0.4401114206128134,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.857,
|
||
|
|
"grad_norm": 1.8396937847137451,
|
||
|
|
"learning_rate": 1.1562665612124385e-05,
|
||
|
|
"epoch": 0.4456824512534819,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1387,
|
||
|
|
"grad_norm": 1.808605670928955,
|
||
|
|
"learning_rate": 1.170902846797406e-05,
|
||
|
|
"epoch": 0.45125348189415043,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9587,
|
||
|
|
"grad_norm": 2.590714931488037,
|
||
|
|
"learning_rate": 1.1855391323823735e-05,
|
||
|
|
"epoch": 0.4568245125348189,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5776,
|
||
|
|
"grad_norm": 1.550307273864746,
|
||
|
|
"learning_rate": 1.200175417967341e-05,
|
||
|
|
"epoch": 0.4623955431754875,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1138,
|
||
|
|
"grad_norm": 1.7622662782669067,
|
||
|
|
"learning_rate": 1.2148117035523088e-05,
|
||
|
|
"epoch": 0.467966573816156,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0632,
|
||
|
|
"grad_norm": 2.1933865547180176,
|
||
|
|
"learning_rate": 1.2294479891372764e-05,
|
||
|
|
"epoch": 0.4735376044568245,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1885,
|
||
|
|
"grad_norm": 1.6188870668411255,
|
||
|
|
"learning_rate": 1.2440842747222438e-05,
|
||
|
|
"epoch": 0.479108635097493,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2515,
|
||
|
|
"grad_norm": 1.6533507108688354,
|
||
|
|
"learning_rate": 1.2587205603072113e-05,
|
||
|
|
"epoch": 0.48467966573816157,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3173,
|
||
|
|
"grad_norm": 1.295457363128662,
|
||
|
|
"learning_rate": 1.2733568458921789e-05,
|
||
|
|
"epoch": 0.49025069637883006,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9536,
|
||
|
|
"grad_norm": 1.5764713287353516,
|
||
|
|
"learning_rate": 1.2879931314771467e-05,
|
||
|
|
"epoch": 0.4958217270194986,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9134,
|
||
|
|
"grad_norm": 1.8399816751480103,
|
||
|
|
"learning_rate": 1.3026294170621141e-05,
|
||
|
|
"epoch": 0.5013927576601671,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2808,
|
||
|
|
"grad_norm": 1.7519652843475342,
|
||
|
|
"learning_rate": 1.3172657026470817e-05,
|
||
|
|
"epoch": 0.5069637883008357,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2055,
|
||
|
|
"grad_norm": 1.4549530744552612,
|
||
|
|
"learning_rate": 1.3319019882320492e-05,
|
||
|
|
"epoch": 0.5125348189415042,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9373,
|
||
|
|
"grad_norm": 2.0461559295654297,
|
||
|
|
"learning_rate": 1.346538273817017e-05,
|
||
|
|
"epoch": 0.5181058495821727,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7517,
|
||
|
|
"grad_norm": 1.5427114963531494,
|
||
|
|
"learning_rate": 1.3611745594019844e-05,
|
||
|
|
"epoch": 0.5236768802228412,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9707,
|
||
|
|
"grad_norm": 1.5442962646484375,
|
||
|
|
"learning_rate": 1.375810844986952e-05,
|
||
|
|
"epoch": 0.5292479108635098,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9629,
|
||
|
|
"grad_norm": 1.939523458480835,
|
||
|
|
"learning_rate": 1.3904471305719195e-05,
|
||
|
|
"epoch": 0.5348189415041783,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8158,
|
||
|
|
"grad_norm": 1.9389022588729858,
|
||
|
|
"learning_rate": 1.4050834161568872e-05,
|
||
|
|
"epoch": 0.5403899721448467,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.5133,
|
||
|
|
"grad_norm": 1.9970468282699585,
|
||
|
|
"learning_rate": 1.4197197017418547e-05,
|
||
|
|
"epoch": 0.5459610027855153,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.263,
|
||
|
|
"grad_norm": 1.5786551237106323,
|
||
|
|
"learning_rate": 1.4343559873268223e-05,
|
||
|
|
"epoch": 0.5515320334261838,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1035,
|
||
|
|
"grad_norm": 2.2139763832092285,
|
||
|
|
"learning_rate": 1.4489922729117897e-05,
|
||
|
|
"epoch": 0.5571030640668524,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.2380564212799072,
|
||
|
|
"eval_runtime": 35.9418,
|
||
|
|
"eval_samples_per_second": 39.954,
|
||
|
|
"eval_steps_per_second": 2.003,
|
||
|
|
"epoch": 0.5571030640668524,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8114,
|
||
|
|
"grad_norm": 2.2652857303619385,
|
||
|
|
"learning_rate": 1.4636285584967575e-05,
|
||
|
|
"epoch": 0.5626740947075209,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3733,
|
||
|
|
"grad_norm": 1.688621997833252,
|
||
|
|
"learning_rate": 1.4782648440817251e-05,
|
||
|
|
"epoch": 0.5682451253481894,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.929,
|
||
|
|
"grad_norm": 2.500704765319824,
|
||
|
|
"learning_rate": 1.4929011296666926e-05,
|
||
|
|
"epoch": 0.5738161559888579,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9718,
|
||
|
|
"grad_norm": 1.492704153060913,
|
||
|
|
"learning_rate": 1.50753741525166e-05,
|
||
|
|
"epoch": 0.5793871866295265,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1026,
|
||
|
|
"grad_norm": 1.6980139017105103,
|
||
|
|
"learning_rate": 1.5221737008366276e-05,
|
||
|
|
"epoch": 0.584958217270195,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7999,
|
||
|
|
"grad_norm": 1.7127199172973633,
|
||
|
|
"learning_rate": 1.5368099864215953e-05,
|
||
|
|
"epoch": 0.5905292479108635,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0349,
|
||
|
|
"grad_norm": 1.8260376453399658,
|
||
|
|
"learning_rate": 1.551446272006563e-05,
|
||
|
|
"epoch": 0.596100278551532,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2571,
|
||
|
|
"grad_norm": 1.8122572898864746,
|
||
|
|
"learning_rate": 1.5660825575915305e-05,
|
||
|
|
"epoch": 0.6016713091922006,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0794,
|
||
|
|
"grad_norm": 2.299410343170166,
|
||
|
|
"learning_rate": 1.580718843176498e-05,
|
||
|
|
"epoch": 0.6072423398328691,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1671,
|
||
|
|
"grad_norm": 1.4942196607589722,
|
||
|
|
"learning_rate": 1.5953551287614657e-05,
|
||
|
|
"epoch": 0.6128133704735376,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1292,
|
||
|
|
"grad_norm": 1.6794716119766235,
|
||
|
|
"learning_rate": 1.609991414346433e-05,
|
||
|
|
"epoch": 0.6183844011142061,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2534,
|
||
|
|
"grad_norm": 1.8196300268173218,
|
||
|
|
"learning_rate": 1.6246276999314006e-05,
|
||
|
|
"epoch": 0.6239554317548747,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0333,
|
||
|
|
"grad_norm": 1.5703504085540771,
|
||
|
|
"learning_rate": 1.6392639855163684e-05,
|
||
|
|
"epoch": 0.6295264623955432,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1495,
|
||
|
|
"grad_norm": 1.766376256942749,
|
||
|
|
"learning_rate": 1.6539002711013358e-05,
|
||
|
|
"epoch": 0.6350974930362117,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5522,
|
||
|
|
"grad_norm": 2.6598968505859375,
|
||
|
|
"learning_rate": 1.6685365566863036e-05,
|
||
|
|
"epoch": 0.6406685236768802,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2221,
|
||
|
|
"grad_norm": 10.731008529663086,
|
||
|
|
"learning_rate": 1.683172842271271e-05,
|
||
|
|
"epoch": 0.6462395543175488,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.4227,
|
||
|
|
"grad_norm": 2.2150168418884277,
|
||
|
|
"learning_rate": 1.6978091278562385e-05,
|
||
|
|
"epoch": 0.6518105849582173,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3886,
|
||
|
|
"grad_norm": 2.283031940460205,
|
||
|
|
"learning_rate": 1.7124454134412063e-05,
|
||
|
|
"epoch": 0.6573816155988857,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0651,
|
||
|
|
"grad_norm": 2.6018834114074707,
|
||
|
|
"learning_rate": 1.7270816990261737e-05,
|
||
|
|
"epoch": 0.6629526462395543,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9465,
|
||
|
|
"grad_norm": 1.8486937284469604,
|
||
|
|
"learning_rate": 1.7417179846111412e-05,
|
||
|
|
"epoch": 0.6685236768802229,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.588,
|
||
|
|
"grad_norm": 2.0970637798309326,
|
||
|
|
"learning_rate": 1.756354270196109e-05,
|
||
|
|
"epoch": 0.6740947075208914,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7602,
|
||
|
|
"grad_norm": 1.5886075496673584,
|
||
|
|
"learning_rate": 1.7709905557810764e-05,
|
||
|
|
"epoch": 0.6796657381615598,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2738,
|
||
|
|
"grad_norm": 2.4414422512054443,
|
||
|
|
"learning_rate": 1.7856268413660442e-05,
|
||
|
|
"epoch": 0.6852367688022284,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8145,
|
||
|
|
"grad_norm": 1.7890093326568604,
|
||
|
|
"learning_rate": 1.8002631269510116e-05,
|
||
|
|
"epoch": 0.6908077994428969,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7572,
|
||
|
|
"grad_norm": 1.7805349826812744,
|
||
|
|
"learning_rate": 1.814899412535979e-05,
|
||
|
|
"epoch": 0.6963788300835655,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0206,
|
||
|
|
"grad_norm": 1.9520258903503418,
|
||
|
|
"learning_rate": 1.829535698120947e-05,
|
||
|
|
"epoch": 0.7019498607242339,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1292,
|
||
|
|
"grad_norm": 1.6244016885757446,
|
||
|
|
"learning_rate": 1.8441719837059143e-05,
|
||
|
|
"epoch": 0.7075208913649025,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1207,
|
||
|
|
"grad_norm": 1.6681342124938965,
|
||
|
|
"learning_rate": 1.858808269290882e-05,
|
||
|
|
"epoch": 0.713091922005571,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1515,
|
||
|
|
"grad_norm": 2.1032838821411133,
|
||
|
|
"learning_rate": 1.8734445548758495e-05,
|
||
|
|
"epoch": 0.7186629526462396,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7264,
|
||
|
|
"grad_norm": 2.093341588973999,
|
||
|
|
"learning_rate": 1.888080840460817e-05,
|
||
|
|
"epoch": 0.724233983286908,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6036,
|
||
|
|
"grad_norm": 1.9431419372558594,
|
||
|
|
"learning_rate": 1.9027171260457848e-05,
|
||
|
|
"epoch": 0.7298050139275766,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.132,
|
||
|
|
"grad_norm": 3.0380795001983643,
|
||
|
|
"learning_rate": 1.9173534116307522e-05,
|
||
|
|
"epoch": 0.7353760445682451,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.061,
|
||
|
|
"grad_norm": 3.623516321182251,
|
||
|
|
"learning_rate": 1.9319896972157197e-05,
|
||
|
|
"epoch": 0.7409470752089137,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8765,
|
||
|
|
"grad_norm": 2.320667266845703,
|
||
|
|
"learning_rate": 1.9466259828006874e-05,
|
||
|
|
"epoch": 0.7465181058495822,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8337,
|
||
|
|
"grad_norm": 1.9040995836257935,
|
||
|
|
"learning_rate": 1.9612622683856552e-05,
|
||
|
|
"epoch": 0.7520891364902507,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0664,
|
||
|
|
"grad_norm": 1.8677185773849487,
|
||
|
|
"learning_rate": 1.9758985539706227e-05,
|
||
|
|
"epoch": 0.7576601671309192,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9651,
|
||
|
|
"grad_norm": 2.414144992828369,
|
||
|
|
"learning_rate": 1.99053483955559e-05,
|
||
|
|
"epoch": 0.7632311977715878,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9243,
|
||
|
|
"grad_norm": 2.5697357654571533,
|
||
|
|
"learning_rate": 1.989220087349807e-05,
|
||
|
|
"epoch": 0.7688022284122563,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0289,
|
||
|
|
"grad_norm": 2.384612560272217,
|
||
|
|
"learning_rate": 1.987905335144024e-05,
|
||
|
|
"epoch": 0.7743732590529248,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7206,
|
||
|
|
"grad_norm": 2.284605026245117,
|
||
|
|
"learning_rate": 1.986590582938241e-05,
|
||
|
|
"epoch": 0.7799442896935933,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7212,
|
||
|
|
"grad_norm": 2.3488142490386963,
|
||
|
|
"learning_rate": 1.985275830732458e-05,
|
||
|
|
"epoch": 0.7855153203342619,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.559,
|
||
|
|
"grad_norm": 1.849543809890747,
|
||
|
|
"learning_rate": 1.983961078526675e-05,
|
||
|
|
"epoch": 0.7910863509749304,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9442,
|
||
|
|
"grad_norm": 2.343719720840454,
|
||
|
|
"learning_rate": 1.9826463263208915e-05,
|
||
|
|
"epoch": 0.7966573816155988,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8913,
|
||
|
|
"grad_norm": 2.6115176677703857,
|
||
|
|
"learning_rate": 1.9813315741151084e-05,
|
||
|
|
"epoch": 0.8022284122562674,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1248,
|
||
|
|
"grad_norm": 2.703418731689453,
|
||
|
|
"learning_rate": 1.9800168219093254e-05,
|
||
|
|
"epoch": 0.807799442896936,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1745,
|
||
|
|
"grad_norm": 2.379194736480713,
|
||
|
|
"learning_rate": 1.9787020697035423e-05,
|
||
|
|
"epoch": 0.8133704735376045,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0572,
|
||
|
|
"grad_norm": 2.4916770458221436,
|
||
|
|
"learning_rate": 1.9773873174977593e-05,
|
||
|
|
"epoch": 0.8189415041782729,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8028,
|
||
|
|
"grad_norm": 3.7550413608551025,
|
||
|
|
"learning_rate": 1.9760725652919762e-05,
|
||
|
|
"epoch": 0.8245125348189415,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2202,
|
||
|
|
"grad_norm": 1.704113483428955,
|
||
|
|
"learning_rate": 1.974757813086193e-05,
|
||
|
|
"epoch": 0.83008356545961,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8407,
|
||
|
|
"grad_norm": 2.14805269241333,
|
||
|
|
"learning_rate": 1.9734430608804098e-05,
|
||
|
|
"epoch": 0.8356545961002786,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.264693021774292,
|
||
|
|
"eval_runtime": 35.9452,
|
||
|
|
"eval_samples_per_second": 39.95,
|
||
|
|
"eval_steps_per_second": 2.003,
|
||
|
|
"epoch": 0.8356545961002786,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3419,
|
||
|
|
"grad_norm": 2.3600826263427734,
|
||
|
|
"learning_rate": 1.972128308674627e-05,
|
||
|
|
"epoch": 0.841225626740947,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0321,
|
||
|
|
"grad_norm": 2.7362117767333984,
|
||
|
|
"learning_rate": 1.970813556468844e-05,
|
||
|
|
"epoch": 0.8467966573816156,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7054,
|
||
|
|
"grad_norm": 2.982322931289673,
|
||
|
|
"learning_rate": 1.9694988042630607e-05,
|
||
|
|
"epoch": 0.8523676880222841,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9304,
|
||
|
|
"grad_norm": 2.8210840225219727,
|
||
|
|
"learning_rate": 1.9681840520572776e-05,
|
||
|
|
"epoch": 0.8579387186629527,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1154,
|
||
|
|
"grad_norm": 2.412022113800049,
|
||
|
|
"learning_rate": 1.9668692998514946e-05,
|
||
|
|
"epoch": 0.8635097493036211,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0367,
|
||
|
|
"grad_norm": 2.439105987548828,
|
||
|
|
"learning_rate": 1.9655545476457115e-05,
|
||
|
|
"epoch": 0.8690807799442897,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5693,
|
||
|
|
"grad_norm": 2.276296854019165,
|
||
|
|
"learning_rate": 1.9642397954399285e-05,
|
||
|
|
"epoch": 0.8746518105849582,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.726,
|
||
|
|
"grad_norm": 2.12568998336792,
|
||
|
|
"learning_rate": 1.9629250432341454e-05,
|
||
|
|
"epoch": 0.8802228412256268,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7872,
|
||
|
|
"grad_norm": 2.1106767654418945,
|
||
|
|
"learning_rate": 1.9616102910283624e-05,
|
||
|
|
"epoch": 0.8857938718662952,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9831,
|
||
|
|
"grad_norm": 1.9893423318862915,
|
||
|
|
"learning_rate": 1.960295538822579e-05,
|
||
|
|
"epoch": 0.8913649025069638,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0571,
|
||
|
|
"grad_norm": 2.222038984298706,
|
||
|
|
"learning_rate": 1.958980786616796e-05,
|
||
|
|
"epoch": 0.8969359331476323,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8674,
|
||
|
|
"grad_norm": 2.5205395221710205,
|
||
|
|
"learning_rate": 1.957666034411013e-05,
|
||
|
|
"epoch": 0.9025069637883009,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7982,
|
||
|
|
"grad_norm": 2.212405204772949,
|
||
|
|
"learning_rate": 1.95635128220523e-05,
|
||
|
|
"epoch": 0.9080779944289693,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7992,
|
||
|
|
"grad_norm": 2.304945468902588,
|
||
|
|
"learning_rate": 1.9550365299994468e-05,
|
||
|
|
"epoch": 0.9136490250696379,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2013,
|
||
|
|
"grad_norm": 2.8349928855895996,
|
||
|
|
"learning_rate": 1.9537217777936638e-05,
|
||
|
|
"epoch": 0.9192200557103064,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7908,
|
||
|
|
"grad_norm": 2.2040209770202637,
|
||
|
|
"learning_rate": 1.9524070255878807e-05,
|
||
|
|
"epoch": 0.924791086350975,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1308,
|
||
|
|
"grad_norm": 2.550541400909424,
|
||
|
|
"learning_rate": 1.9510922733820977e-05,
|
||
|
|
"epoch": 0.9303621169916435,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.735,
|
||
|
|
"grad_norm": 2.9808292388916016,
|
||
|
|
"learning_rate": 1.9497775211763146e-05,
|
||
|
|
"epoch": 0.935933147632312,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1536,
|
||
|
|
"grad_norm": 2.4572677612304688,
|
||
|
|
"learning_rate": 1.9484627689705316e-05,
|
||
|
|
"epoch": 0.9415041782729805,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8616,
|
||
|
|
"grad_norm": 2.414435863494873,
|
||
|
|
"learning_rate": 1.9471480167647482e-05,
|
||
|
|
"epoch": 0.947075208913649,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9501,
|
||
|
|
"grad_norm": 2.490251064300537,
|
||
|
|
"learning_rate": 1.945833264558965e-05,
|
||
|
|
"epoch": 0.9526462395543176,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7965,
|
||
|
|
"grad_norm": 3.2512645721435547,
|
||
|
|
"learning_rate": 1.944518512353182e-05,
|
||
|
|
"epoch": 0.958217270194986,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8903,
|
||
|
|
"grad_norm": 2.0697317123413086,
|
||
|
|
"learning_rate": 1.943203760147399e-05,
|
||
|
|
"epoch": 0.9637883008356546,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9153,
|
||
|
|
"grad_norm": 2.869088888168335,
|
||
|
|
"learning_rate": 1.941889007941616e-05,
|
||
|
|
"epoch": 0.9693593314763231,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0043,
|
||
|
|
"grad_norm": 2.5188841819763184,
|
||
|
|
"learning_rate": 1.940574255735833e-05,
|
||
|
|
"epoch": 0.9749303621169917,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0106,
|
||
|
|
"grad_norm": 2.2558531761169434,
|
||
|
|
"learning_rate": 1.93925950353005e-05,
|
||
|
|
"epoch": 0.9805013927576601,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0356,
|
||
|
|
"grad_norm": 2.78887677192688,
|
||
|
|
"learning_rate": 1.9379447513242665e-05,
|
||
|
|
"epoch": 0.9860724233983287,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4849,
|
||
|
|
"grad_norm": 2.9200024604797363,
|
||
|
|
"learning_rate": 1.9366299991184838e-05,
|
||
|
|
"epoch": 0.9916434540389972,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6636,
|
||
|
|
"grad_norm": 2.443997621536255,
|
||
|
|
"learning_rate": 1.9353152469127007e-05,
|
||
|
|
"epoch": 0.9972144846796658,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4097,
|
||
|
|
"grad_norm": 3.399275779724121,
|
||
|
|
"learning_rate": 1.9340004947069174e-05,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7611,
|
||
|
|
"grad_norm": 2.312861442565918,
|
||
|
|
"learning_rate": 1.9326857425011343e-05,
|
||
|
|
"epoch": 1.0055710306406684,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0967,
|
||
|
|
"grad_norm": 2.799191951751709,
|
||
|
|
"learning_rate": 1.9313709902953513e-05,
|
||
|
|
"epoch": 1.011142061281337,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3108,
|
||
|
|
"grad_norm": 2.4845213890075684,
|
||
|
|
"learning_rate": 1.9300562380895682e-05,
|
||
|
|
"epoch": 1.0167130919220055,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6263,
|
||
|
|
"grad_norm": 2.72027325630188,
|
||
|
|
"learning_rate": 1.928741485883785e-05,
|
||
|
|
"epoch": 1.0222841225626742,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6228,
|
||
|
|
"grad_norm": 3.2783589363098145,
|
||
|
|
"learning_rate": 1.927426733678002e-05,
|
||
|
|
"epoch": 1.0278551532033426,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9384,
|
||
|
|
"grad_norm": 2.455291986465454,
|
||
|
|
"learning_rate": 1.926111981472219e-05,
|
||
|
|
"epoch": 1.033426183844011,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5646,
|
||
|
|
"grad_norm": 2.2230939865112305,
|
||
|
|
"learning_rate": 1.9247972292664357e-05,
|
||
|
|
"epoch": 1.0389972144846797,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8545,
|
||
|
|
"grad_norm": 2.596928119659424,
|
||
|
|
"learning_rate": 1.9234824770606526e-05,
|
||
|
|
"epoch": 1.0445682451253482,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8436,
|
||
|
|
"grad_norm": 2.5703697204589844,
|
||
|
|
"learning_rate": 1.9221677248548696e-05,
|
||
|
|
"epoch": 1.0501392757660166,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2574,
|
||
|
|
"grad_norm": 3.021871566772461,
|
||
|
|
"learning_rate": 1.920852972649087e-05,
|
||
|
|
"epoch": 1.0557103064066853,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8046,
|
||
|
|
"grad_norm": 2.35603404045105,
|
||
|
|
"learning_rate": 1.9195382204433035e-05,
|
||
|
|
"epoch": 1.0612813370473537,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6635,
|
||
|
|
"grad_norm": 2.453967809677124,
|
||
|
|
"learning_rate": 1.9182234682375204e-05,
|
||
|
|
"epoch": 1.0668523676880224,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9118,
|
||
|
|
"grad_norm": 3.2305331230163574,
|
||
|
|
"learning_rate": 1.9169087160317374e-05,
|
||
|
|
"epoch": 1.0724233983286908,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5529,
|
||
|
|
"grad_norm": 2.248871326446533,
|
||
|
|
"learning_rate": 1.9155939638259543e-05,
|
||
|
|
"epoch": 1.0779944289693593,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7932,
|
||
|
|
"grad_norm": 3.0331363677978516,
|
||
|
|
"learning_rate": 1.9142792116201713e-05,
|
||
|
|
"epoch": 1.083565459610028,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7632,
|
||
|
|
"grad_norm": 3.543948173522949,
|
||
|
|
"learning_rate": 1.9129644594143882e-05,
|
||
|
|
"epoch": 1.0891364902506964,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8788,
|
||
|
|
"grad_norm": 3.4173591136932373,
|
||
|
|
"learning_rate": 1.911649707208605e-05,
|
||
|
|
"epoch": 1.0947075208913648,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0881,
|
||
|
|
"grad_norm": 3.4639406204223633,
|
||
|
|
"learning_rate": 1.9103349550028218e-05,
|
||
|
|
"epoch": 1.1002785515320335,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9197,
|
||
|
|
"grad_norm": 3.6082725524902344,
|
||
|
|
"learning_rate": 1.9090202027970388e-05,
|
||
|
|
"epoch": 1.105849582172702,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4541,
|
||
|
|
"grad_norm": 2.834181070327759,
|
||
|
|
"learning_rate": 1.9077054505912557e-05,
|
||
|
|
"epoch": 1.1114206128133706,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.3124067783355713,
|
||
|
|
"eval_runtime": 35.9422,
|
||
|
|
"eval_samples_per_second": 39.953,
|
||
|
|
"eval_steps_per_second": 2.003,
|
||
|
|
"epoch": 1.1114206128133706,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8766,
|
||
|
|
"grad_norm": 2.44728422164917,
|
||
|
|
"learning_rate": 1.9063906983854727e-05,
|
||
|
|
"epoch": 1.116991643454039,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8877,
|
||
|
|
"grad_norm": 3.1577866077423096,
|
||
|
|
"learning_rate": 1.9050759461796896e-05,
|
||
|
|
"epoch": 1.1225626740947074,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6045,
|
||
|
|
"grad_norm": 3.5458521842956543,
|
||
|
|
"learning_rate": 1.9037611939739066e-05,
|
||
|
|
"epoch": 1.128133704735376,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.705,
|
||
|
|
"grad_norm": 2.496349811553955,
|
||
|
|
"learning_rate": 1.9024464417681232e-05,
|
||
|
|
"epoch": 1.1337047353760445,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6478,
|
||
|
|
"grad_norm": 3.2897088527679443,
|
||
|
|
"learning_rate": 1.9011316895623405e-05,
|
||
|
|
"epoch": 1.1392757660167132,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6703,
|
||
|
|
"grad_norm": 3.1694509983062744,
|
||
|
|
"learning_rate": 1.8998169373565574e-05,
|
||
|
|
"epoch": 1.1448467966573816,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0232,
|
||
|
|
"grad_norm": 2.8644907474517822,
|
||
|
|
"learning_rate": 1.8985021851507744e-05,
|
||
|
|
"epoch": 1.15041782729805,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.581,
|
||
|
|
"grad_norm": 2.930053472518921,
|
||
|
|
"learning_rate": 1.897187432944991e-05,
|
||
|
|
"epoch": 1.1559888579387188,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6617,
|
||
|
|
"grad_norm": 2.9067940711975098,
|
||
|
|
"learning_rate": 1.895872680739208e-05,
|
||
|
|
"epoch": 1.1615598885793872,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.173,
|
||
|
|
"grad_norm": 3.746903419494629,
|
||
|
|
"learning_rate": 1.894557928533425e-05,
|
||
|
|
"epoch": 1.1671309192200556,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5917,
|
||
|
|
"grad_norm": 4.83465576171875,
|
||
|
|
"learning_rate": 1.893243176327642e-05,
|
||
|
|
"epoch": 1.1727019498607243,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.531,
|
||
|
|
"grad_norm": 3.0352439880371094,
|
||
|
|
"learning_rate": 1.8919284241218588e-05,
|
||
|
|
"epoch": 1.1782729805013927,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.193,
|
||
|
|
"grad_norm": 2.738152027130127,
|
||
|
|
"learning_rate": 1.8906136719160758e-05,
|
||
|
|
"epoch": 1.1838440111420612,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9884,
|
||
|
|
"grad_norm": 3.005979061126709,
|
||
|
|
"learning_rate": 1.8892989197102927e-05,
|
||
|
|
"epoch": 1.1894150417827298,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8659,
|
||
|
|
"grad_norm": 3.930433750152588,
|
||
|
|
"learning_rate": 1.8879841675045093e-05,
|
||
|
|
"epoch": 1.1949860724233983,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.191,
|
||
|
|
"grad_norm": 3.3943190574645996,
|
||
|
|
"learning_rate": 1.8866694152987263e-05,
|
||
|
|
"epoch": 1.200557103064067,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1538,
|
||
|
|
"grad_norm": 3.4692654609680176,
|
||
|
|
"learning_rate": 1.8853546630929436e-05,
|
||
|
|
"epoch": 1.2061281337047354,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9939,
|
||
|
|
"grad_norm": 2.889341354370117,
|
||
|
|
"learning_rate": 1.8840399108871602e-05,
|
||
|
|
"epoch": 1.2116991643454038,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1667,
|
||
|
|
"grad_norm": 3.123650550842285,
|
||
|
|
"learning_rate": 1.882725158681377e-05,
|
||
|
|
"epoch": 1.2172701949860725,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9743,
|
||
|
|
"grad_norm": 2.6485071182250977,
|
||
|
|
"learning_rate": 1.881410406475594e-05,
|
||
|
|
"epoch": 1.222841225626741,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5679,
|
||
|
|
"grad_norm": 3.791811227798462,
|
||
|
|
"learning_rate": 1.880095654269811e-05,
|
||
|
|
"epoch": 1.2284122562674096,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1841,
|
||
|
|
"grad_norm": 3.286864757537842,
|
||
|
|
"learning_rate": 1.878780902064028e-05,
|
||
|
|
"epoch": 1.233983286908078,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1165,
|
||
|
|
"grad_norm": 2.930072784423828,
|
||
|
|
"learning_rate": 1.877466149858245e-05,
|
||
|
|
"epoch": 1.2395543175487465,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7816,
|
||
|
|
"grad_norm": 2.936857223510742,
|
||
|
|
"learning_rate": 1.876151397652462e-05,
|
||
|
|
"epoch": 1.2451253481894151,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.702,
|
||
|
|
"grad_norm": 2.3516695499420166,
|
||
|
|
"learning_rate": 1.8748366454466785e-05,
|
||
|
|
"epoch": 1.2506963788300836,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7222,
|
||
|
|
"grad_norm": 3.2817559242248535,
|
||
|
|
"learning_rate": 1.8735218932408955e-05,
|
||
|
|
"epoch": 1.2562674094707522,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7151,
|
||
|
|
"grad_norm": 2.987518548965454,
|
||
|
|
"learning_rate": 1.8722071410351124e-05,
|
||
|
|
"epoch": 1.2618384401114207,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0545,
|
||
|
|
"grad_norm": 3.132258415222168,
|
||
|
|
"learning_rate": 1.8708923888293294e-05,
|
||
|
|
"epoch": 1.267409470752089,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4732,
|
||
|
|
"grad_norm": 3.2233877182006836,
|
||
|
|
"learning_rate": 1.8695776366235463e-05,
|
||
|
|
"epoch": 1.2729805013927575,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7168,
|
||
|
|
"grad_norm": 3.2920405864715576,
|
||
|
|
"learning_rate": 1.8682628844177633e-05,
|
||
|
|
"epoch": 1.2785515320334262,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.497,
|
||
|
|
"grad_norm": 2.536219596862793,
|
||
|
|
"learning_rate": 1.8669481322119802e-05,
|
||
|
|
"epoch": 1.2841225626740946,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8177,
|
||
|
|
"grad_norm": 4.246109485626221,
|
||
|
|
"learning_rate": 1.865633380006197e-05,
|
||
|
|
"epoch": 1.2896935933147633,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9449,
|
||
|
|
"grad_norm": 2.6518428325653076,
|
||
|
|
"learning_rate": 1.864318627800414e-05,
|
||
|
|
"epoch": 1.2952646239554317,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4673,
|
||
|
|
"grad_norm": 3.7276058197021484,
|
||
|
|
"learning_rate": 1.863003875594631e-05,
|
||
|
|
"epoch": 1.3008356545961002,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.865,
|
||
|
|
"grad_norm": 3.2901997566223145,
|
||
|
|
"learning_rate": 1.8616891233888477e-05,
|
||
|
|
"epoch": 1.3064066852367688,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7449,
|
||
|
|
"grad_norm": 2.6417624950408936,
|
||
|
|
"learning_rate": 1.8603743711830646e-05,
|
||
|
|
"epoch": 1.3119777158774373,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0348,
|
||
|
|
"grad_norm": 3.81978702545166,
|
||
|
|
"learning_rate": 1.8590596189772816e-05,
|
||
|
|
"epoch": 1.317548746518106,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8765,
|
||
|
|
"grad_norm": 2.615661382675171,
|
||
|
|
"learning_rate": 1.8577448667714985e-05,
|
||
|
|
"epoch": 1.3231197771587744,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7559,
|
||
|
|
"grad_norm": 3.2889416217803955,
|
||
|
|
"learning_rate": 1.8564301145657155e-05,
|
||
|
|
"epoch": 1.3286908077994428,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9031,
|
||
|
|
"grad_norm": 4.006824970245361,
|
||
|
|
"learning_rate": 1.8551153623599324e-05,
|
||
|
|
"epoch": 1.3342618384401115,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6102,
|
||
|
|
"grad_norm": 3.3491599559783936,
|
||
|
|
"learning_rate": 1.8538006101541494e-05,
|
||
|
|
"epoch": 1.33983286908078,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0716,
|
||
|
|
"grad_norm": 3.2669260501861572,
|
||
|
|
"learning_rate": 1.852485857948366e-05,
|
||
|
|
"epoch": 1.3454038997214486,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0298,
|
||
|
|
"grad_norm": 4.218564510345459,
|
||
|
|
"learning_rate": 1.851171105742583e-05,
|
||
|
|
"epoch": 1.350974930362117,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9911,
|
||
|
|
"grad_norm": 3.5515315532684326,
|
||
|
|
"learning_rate": 1.8498563535368003e-05,
|
||
|
|
"epoch": 1.3565459610027855,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3477,
|
||
|
|
"grad_norm": 4.0060343742370605,
|
||
|
|
"learning_rate": 1.848541601331017e-05,
|
||
|
|
"epoch": 1.362116991643454,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4686,
|
||
|
|
"grad_norm": 3.574927568435669,
|
||
|
|
"learning_rate": 1.8472268491252338e-05,
|
||
|
|
"epoch": 1.3676880222841226,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.617,
|
||
|
|
"grad_norm": 3.4316840171813965,
|
||
|
|
"learning_rate": 1.8459120969194508e-05,
|
||
|
|
"epoch": 1.3732590529247912,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6593,
|
||
|
|
"grad_norm": 3.2629754543304443,
|
||
|
|
"learning_rate": 1.8445973447136677e-05,
|
||
|
|
"epoch": 1.3788300835654597,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2608,
|
||
|
|
"grad_norm": 3.133815050125122,
|
||
|
|
"learning_rate": 1.8432825925078847e-05,
|
||
|
|
"epoch": 1.384401114206128,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8523,
|
||
|
|
"grad_norm": 3.742141008377075,
|
||
|
|
"learning_rate": 1.8419678403021016e-05,
|
||
|
|
"epoch": 1.3899721448467965,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.335228443145752,
|
||
|
|
"eval_runtime": 35.9668,
|
||
|
|
"eval_samples_per_second": 39.926,
|
||
|
|
"eval_steps_per_second": 2.002,
|
||
|
|
"epoch": 1.3899721448467965,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7768,
|
||
|
|
"grad_norm": 3.9163429737091064,
|
||
|
|
"learning_rate": 1.8406530880963186e-05,
|
||
|
|
"epoch": 1.3955431754874652,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7455,
|
||
|
|
"grad_norm": 3.3456947803497314,
|
||
|
|
"learning_rate": 1.8393383358905352e-05,
|
||
|
|
"epoch": 1.4011142061281336,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7103,
|
||
|
|
"grad_norm": 4.220420837402344,
|
||
|
|
"learning_rate": 1.838023583684752e-05,
|
||
|
|
"epoch": 1.4066852367688023,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0054,
|
||
|
|
"grad_norm": 4.233839511871338,
|
||
|
|
"learning_rate": 1.836708831478969e-05,
|
||
|
|
"epoch": 1.4122562674094707,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7175,
|
||
|
|
"grad_norm": 3.703934669494629,
|
||
|
|
"learning_rate": 1.8353940792731864e-05,
|
||
|
|
"epoch": 1.4178272980501392,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7225,
|
||
|
|
"grad_norm": 4.210822105407715,
|
||
|
|
"learning_rate": 1.834079327067403e-05,
|
||
|
|
"epoch": 1.4233983286908078,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6882,
|
||
|
|
"grad_norm": 3.8861896991729736,
|
||
|
|
"learning_rate": 1.83276457486162e-05,
|
||
|
|
"epoch": 1.4289693593314763,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0721,
|
||
|
|
"grad_norm": 4.4140424728393555,
|
||
|
|
"learning_rate": 1.831449822655837e-05,
|
||
|
|
"epoch": 1.434540389972145,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6198,
|
||
|
|
"grad_norm": 3.1098673343658447,
|
||
|
|
"learning_rate": 1.830135070450054e-05,
|
||
|
|
"epoch": 1.4401114206128134,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9632,
|
||
|
|
"grad_norm": 2.9485561847686768,
|
||
|
|
"learning_rate": 1.8288203182442708e-05,
|
||
|
|
"epoch": 1.4456824512534818,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9262,
|
||
|
|
"grad_norm": 3.842655658721924,
|
||
|
|
"learning_rate": 1.8275055660384878e-05,
|
||
|
|
"epoch": 1.4512534818941505,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0807,
|
||
|
|
"grad_norm": 4.122529983520508,
|
||
|
|
"learning_rate": 1.8261908138327047e-05,
|
||
|
|
"epoch": 1.456824512534819,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0099,
|
||
|
|
"grad_norm": 3.6181795597076416,
|
||
|
|
"learning_rate": 1.8248760616269213e-05,
|
||
|
|
"epoch": 1.4623955431754876,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7435,
|
||
|
|
"grad_norm": 3.9433975219726562,
|
||
|
|
"learning_rate": 1.8235613094211383e-05,
|
||
|
|
"epoch": 1.467966573816156,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4648,
|
||
|
|
"grad_norm": 5.496665000915527,
|
||
|
|
"learning_rate": 1.8222465572153552e-05,
|
||
|
|
"epoch": 1.4735376044568245,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.106,
|
||
|
|
"grad_norm": 3.3920114040374756,
|
||
|
|
"learning_rate": 1.8209318050095722e-05,
|
||
|
|
"epoch": 1.479108635097493,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4486,
|
||
|
|
"grad_norm": 4.195888519287109,
|
||
|
|
"learning_rate": 1.819617052803789e-05,
|
||
|
|
"epoch": 1.4846796657381616,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4996,
|
||
|
|
"grad_norm": 3.5301265716552734,
|
||
|
|
"learning_rate": 1.818302300598006e-05,
|
||
|
|
"epoch": 1.49025069637883,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8247,
|
||
|
|
"grad_norm": 3.3157520294189453,
|
||
|
|
"learning_rate": 1.8169875483922227e-05,
|
||
|
|
"epoch": 1.4958217270194987,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6092,
|
||
|
|
"grad_norm": 4.3797383308410645,
|
||
|
|
"learning_rate": 1.8156727961864397e-05,
|
||
|
|
"epoch": 1.501392757660167,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6071,
|
||
|
|
"grad_norm": 3.3917229175567627,
|
||
|
|
"learning_rate": 1.814358043980657e-05,
|
||
|
|
"epoch": 1.5069637883008355,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9553,
|
||
|
|
"grad_norm": 3.171808958053589,
|
||
|
|
"learning_rate": 1.813043291774874e-05,
|
||
|
|
"epoch": 1.5125348189415042,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8105,
|
||
|
|
"grad_norm": 3.1904940605163574,
|
||
|
|
"learning_rate": 1.8117285395690905e-05,
|
||
|
|
"epoch": 1.5181058495821727,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5718,
|
||
|
|
"grad_norm": 3.7544777393341064,
|
||
|
|
"learning_rate": 1.8104137873633075e-05,
|
||
|
|
"epoch": 1.5236768802228413,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9999,
|
||
|
|
"grad_norm": 4.143693923950195,
|
||
|
|
"learning_rate": 1.8090990351575244e-05,
|
||
|
|
"epoch": 1.5292479108635098,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0393,
|
||
|
|
"grad_norm": 3.505359411239624,
|
||
|
|
"learning_rate": 1.8077842829517414e-05,
|
||
|
|
"epoch": 1.5348189415041782,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6101,
|
||
|
|
"grad_norm": 4.118677139282227,
|
||
|
|
"learning_rate": 1.8064695307459583e-05,
|
||
|
|
"epoch": 1.5403899721448466,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6718,
|
||
|
|
"grad_norm": 4.947996139526367,
|
||
|
|
"learning_rate": 1.8051547785401753e-05,
|
||
|
|
"epoch": 1.5459610027855153,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2007,
|
||
|
|
"grad_norm": 4.226828575134277,
|
||
|
|
"learning_rate": 1.8038400263343922e-05,
|
||
|
|
"epoch": 1.551532033426184,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7025,
|
||
|
|
"grad_norm": 4.085235118865967,
|
||
|
|
"learning_rate": 1.802525274128609e-05,
|
||
|
|
"epoch": 1.5571030640668524,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7632,
|
||
|
|
"grad_norm": 3.5451292991638184,
|
||
|
|
"learning_rate": 1.8012105219228258e-05,
|
||
|
|
"epoch": 1.5626740947075208,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4975,
|
||
|
|
"grad_norm": 5.2698540687561035,
|
||
|
|
"learning_rate": 1.799895769717043e-05,
|
||
|
|
"epoch": 1.5682451253481893,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2189,
|
||
|
|
"grad_norm": 3.662693738937378,
|
||
|
|
"learning_rate": 1.7985810175112597e-05,
|
||
|
|
"epoch": 1.573816155988858,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1889,
|
||
|
|
"grad_norm": 3.9369843006134033,
|
||
|
|
"learning_rate": 1.7972662653054766e-05,
|
||
|
|
"epoch": 1.5793871866295266,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.782,
|
||
|
|
"grad_norm": 5.153691291809082,
|
||
|
|
"learning_rate": 1.7959515130996936e-05,
|
||
|
|
"epoch": 1.584958217270195,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7055,
|
||
|
|
"grad_norm": 3.5153331756591797,
|
||
|
|
"learning_rate": 1.7946367608939105e-05,
|
||
|
|
"epoch": 1.5905292479108635,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0713,
|
||
|
|
"grad_norm": 3.8740577697753906,
|
||
|
|
"learning_rate": 1.7933220086881275e-05,
|
||
|
|
"epoch": 1.596100278551532,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6159,
|
||
|
|
"grad_norm": 2.977501153945923,
|
||
|
|
"learning_rate": 1.7920072564823445e-05,
|
||
|
|
"epoch": 1.6016713091922006,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0388,
|
||
|
|
"grad_norm": 4.873539447784424,
|
||
|
|
"learning_rate": 1.7906925042765614e-05,
|
||
|
|
"epoch": 1.6072423398328692,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7656,
|
||
|
|
"grad_norm": 3.6297993659973145,
|
||
|
|
"learning_rate": 1.789377752070778e-05,
|
||
|
|
"epoch": 1.6128133704735377,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9818,
|
||
|
|
"grad_norm": 2.868178367614746,
|
||
|
|
"learning_rate": 1.788062999864995e-05,
|
||
|
|
"epoch": 1.6183844011142061,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6421,
|
||
|
|
"grad_norm": 4.532885551452637,
|
||
|
|
"learning_rate": 1.786748247659212e-05,
|
||
|
|
"epoch": 1.6239554317548746,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.653,
|
||
|
|
"grad_norm": 5.63344669342041,
|
||
|
|
"learning_rate": 1.785433495453429e-05,
|
||
|
|
"epoch": 1.6295264623955432,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8727,
|
||
|
|
"grad_norm": 4.235146999359131,
|
||
|
|
"learning_rate": 1.7841187432476458e-05,
|
||
|
|
"epoch": 1.6350974930362117,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3509,
|
||
|
|
"grad_norm": 4.512764930725098,
|
||
|
|
"learning_rate": 1.7828039910418628e-05,
|
||
|
|
"epoch": 1.6406685236768803,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7836,
|
||
|
|
"grad_norm": 3.72898268699646,
|
||
|
|
"learning_rate": 1.7814892388360797e-05,
|
||
|
|
"epoch": 1.6462395543175488,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6315,
|
||
|
|
"grad_norm": 3.1936659812927246,
|
||
|
|
"learning_rate": 1.7801744866302963e-05,
|
||
|
|
"epoch": 1.6518105849582172,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9805,
|
||
|
|
"grad_norm": 3.1188321113586426,
|
||
|
|
"learning_rate": 1.7788597344245136e-05,
|
||
|
|
"epoch": 1.6573816155988856,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8716,
|
||
|
|
"grad_norm": 4.88875150680542,
|
||
|
|
"learning_rate": 1.7775449822187306e-05,
|
||
|
|
"epoch": 1.6629526462395543,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4669,
|
||
|
|
"grad_norm": 4.494915962219238,
|
||
|
|
"learning_rate": 1.7762302300129472e-05,
|
||
|
|
"epoch": 1.668523676880223,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.3116097450256348,
|
||
|
|
"eval_runtime": 35.9294,
|
||
|
|
"eval_samples_per_second": 39.967,
|
||
|
|
"eval_steps_per_second": 2.004,
|
||
|
|
"epoch": 1.668523676880223,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3418,
|
||
|
|
"grad_norm": 4.365106582641602,
|
||
|
|
"learning_rate": 1.774915477807164e-05,
|
||
|
|
"epoch": 1.6740947075208914,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4561,
|
||
|
|
"grad_norm": 4.683363914489746,
|
||
|
|
"learning_rate": 1.773600725601381e-05,
|
||
|
|
"epoch": 1.6796657381615598,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8321,
|
||
|
|
"grad_norm": 4.195693492889404,
|
||
|
|
"learning_rate": 1.772285973395598e-05,
|
||
|
|
"epoch": 1.6852367688022283,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8932,
|
||
|
|
"grad_norm": 4.681265830993652,
|
||
|
|
"learning_rate": 1.770971221189815e-05,
|
||
|
|
"epoch": 1.690807799442897,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0071,
|
||
|
|
"grad_norm": 5.034351348876953,
|
||
|
|
"learning_rate": 1.769656468984032e-05,
|
||
|
|
"epoch": 1.6963788300835656,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9824,
|
||
|
|
"grad_norm": 3.9581334590911865,
|
||
|
|
"learning_rate": 1.768341716778249e-05,
|
||
|
|
"epoch": 1.701949860724234,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.2225,
|
||
|
|
"grad_norm": 3.9467825889587402,
|
||
|
|
"learning_rate": 1.7670269645724655e-05,
|
||
|
|
"epoch": 1.7075208913649025,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.671,
|
||
|
|
"grad_norm": 3.7253997325897217,
|
||
|
|
"learning_rate": 1.7657122123666825e-05,
|
||
|
|
"epoch": 1.713091922005571,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7876,
|
||
|
|
"grad_norm": 4.8212480545043945,
|
||
|
|
"learning_rate": 1.7643974601608998e-05,
|
||
|
|
"epoch": 1.7186629526462396,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1102,
|
||
|
|
"grad_norm": 4.235992431640625,
|
||
|
|
"learning_rate": 1.7630827079551164e-05,
|
||
|
|
"epoch": 1.724233983286908,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7577,
|
||
|
|
"grad_norm": 3.5870513916015625,
|
||
|
|
"learning_rate": 1.7617679557493333e-05,
|
||
|
|
"epoch": 1.7298050139275767,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3948,
|
||
|
|
"grad_norm": 4.27365779876709,
|
||
|
|
"learning_rate": 1.7604532035435503e-05,
|
||
|
|
"epoch": 1.7353760445682451,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5507,
|
||
|
|
"grad_norm": 4.927708625793457,
|
||
|
|
"learning_rate": 1.7591384513377672e-05,
|
||
|
|
"epoch": 1.7409470752089136,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5299,
|
||
|
|
"grad_norm": 4.702437877655029,
|
||
|
|
"learning_rate": 1.7578236991319842e-05,
|
||
|
|
"epoch": 1.7465181058495822,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6187,
|
||
|
|
"grad_norm": 4.205385684967041,
|
||
|
|
"learning_rate": 1.756508946926201e-05,
|
||
|
|
"epoch": 1.7520891364902507,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6467,
|
||
|
|
"grad_norm": 3.724274158477783,
|
||
|
|
"learning_rate": 1.755194194720418e-05,
|
||
|
|
"epoch": 1.7576601671309193,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.48,
|
||
|
|
"grad_norm": 5.0788187980651855,
|
||
|
|
"learning_rate": 1.7538794425146347e-05,
|
||
|
|
"epoch": 1.7632311977715878,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2413,
|
||
|
|
"grad_norm": 4.211026191711426,
|
||
|
|
"learning_rate": 1.7525646903088517e-05,
|
||
|
|
"epoch": 1.7688022284122562,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2792,
|
||
|
|
"grad_norm": 4.383068561553955,
|
||
|
|
"learning_rate": 1.7512499381030686e-05,
|
||
|
|
"epoch": 1.7743732590529246,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0635,
|
||
|
|
"grad_norm": 5.2455668449401855,
|
||
|
|
"learning_rate": 1.7499351858972856e-05,
|
||
|
|
"epoch": 1.7799442896935933,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9011,
|
||
|
|
"grad_norm": 4.73854398727417,
|
||
|
|
"learning_rate": 1.7486204336915025e-05,
|
||
|
|
"epoch": 1.785515320334262,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9017,
|
||
|
|
"grad_norm": 5.136256217956543,
|
||
|
|
"learning_rate": 1.7473056814857195e-05,
|
||
|
|
"epoch": 1.7910863509749304,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7304,
|
||
|
|
"grad_norm": 5.707761764526367,
|
||
|
|
"learning_rate": 1.7459909292799364e-05,
|
||
|
|
"epoch": 1.7966573816155988,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9703,
|
||
|
|
"grad_norm": 4.81571102142334,
|
||
|
|
"learning_rate": 1.744676177074153e-05,
|
||
|
|
"epoch": 1.8022284122562673,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6825,
|
||
|
|
"grad_norm": 6.157602310180664,
|
||
|
|
"learning_rate": 1.7433614248683703e-05,
|
||
|
|
"epoch": 1.807799442896936,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7945,
|
||
|
|
"grad_norm": 5.200462818145752,
|
||
|
|
"learning_rate": 1.7420466726625873e-05,
|
||
|
|
"epoch": 1.8133704735376046,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7701,
|
||
|
|
"grad_norm": 5.342528820037842,
|
||
|
|
"learning_rate": 1.7407319204568042e-05,
|
||
|
|
"epoch": 1.818941504178273,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8,
|
||
|
|
"grad_norm": 4.419646739959717,
|
||
|
|
"learning_rate": 1.739417168251021e-05,
|
||
|
|
"epoch": 1.8245125348189415,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3064,
|
||
|
|
"grad_norm": 5.106484889984131,
|
||
|
|
"learning_rate": 1.7381024160452378e-05,
|
||
|
|
"epoch": 1.83008356545961,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0357,
|
||
|
|
"grad_norm": 4.221576690673828,
|
||
|
|
"learning_rate": 1.7367876638394547e-05,
|
||
|
|
"epoch": 1.8356545961002786,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6015,
|
||
|
|
"grad_norm": 6.323553562164307,
|
||
|
|
"learning_rate": 1.7354729116336717e-05,
|
||
|
|
"epoch": 1.841225626740947,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5858,
|
||
|
|
"grad_norm": 4.978970527648926,
|
||
|
|
"learning_rate": 1.7341581594278887e-05,
|
||
|
|
"epoch": 1.8467966573816157,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9489,
|
||
|
|
"grad_norm": 3.1882030963897705,
|
||
|
|
"learning_rate": 1.7328434072221056e-05,
|
||
|
|
"epoch": 1.8523676880222841,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1722,
|
||
|
|
"grad_norm": 4.047868251800537,
|
||
|
|
"learning_rate": 1.7315286550163222e-05,
|
||
|
|
"epoch": 1.8579387186629526,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5027,
|
||
|
|
"grad_norm": 4.2307448387146,
|
||
|
|
"learning_rate": 1.730213902810539e-05,
|
||
|
|
"epoch": 1.863509749303621,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.481,
|
||
|
|
"grad_norm": 6.048774242401123,
|
||
|
|
"learning_rate": 1.7288991506047565e-05,
|
||
|
|
"epoch": 1.8690807799442897,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8746,
|
||
|
|
"grad_norm": 5.389241695404053,
|
||
|
|
"learning_rate": 1.7275843983989734e-05,
|
||
|
|
"epoch": 1.8746518105849583,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0807,
|
||
|
|
"grad_norm": 4.036198139190674,
|
||
|
|
"learning_rate": 1.72626964619319e-05,
|
||
|
|
"epoch": 1.8802228412256268,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7448,
|
||
|
|
"grad_norm": 5.005743503570557,
|
||
|
|
"learning_rate": 1.724954893987407e-05,
|
||
|
|
"epoch": 1.8857938718662952,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9092,
|
||
|
|
"grad_norm": 4.462837219238281,
|
||
|
|
"learning_rate": 1.723640141781624e-05,
|
||
|
|
"epoch": 1.8913649025069637,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7032,
|
||
|
|
"grad_norm": 4.945067405700684,
|
||
|
|
"learning_rate": 1.722325389575841e-05,
|
||
|
|
"epoch": 1.8969359331476323,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9141,
|
||
|
|
"grad_norm": 3.7232062816619873,
|
||
|
|
"learning_rate": 1.721010637370058e-05,
|
||
|
|
"epoch": 1.902506963788301,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8258,
|
||
|
|
"grad_norm": 3.8830628395080566,
|
||
|
|
"learning_rate": 1.7196958851642748e-05,
|
||
|
|
"epoch": 1.9080779944289694,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7998,
|
||
|
|
"grad_norm": 4.693456649780273,
|
||
|
|
"learning_rate": 1.7183811329584917e-05,
|
||
|
|
"epoch": 1.9136490250696379,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0583,
|
||
|
|
"grad_norm": 4.737421989440918,
|
||
|
|
"learning_rate": 1.7170663807527083e-05,
|
||
|
|
"epoch": 1.9192200557103063,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.494,
|
||
|
|
"grad_norm": 2.78582501411438,
|
||
|
|
"learning_rate": 1.7157516285469253e-05,
|
||
|
|
"epoch": 1.924791086350975,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7167,
|
||
|
|
"grad_norm": 4.305075168609619,
|
||
|
|
"learning_rate": 1.7144368763411423e-05,
|
||
|
|
"epoch": 1.9303621169916436,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7753,
|
||
|
|
"grad_norm": 3.9957072734832764,
|
||
|
|
"learning_rate": 1.7131221241353592e-05,
|
||
|
|
"epoch": 1.935933147632312,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8852,
|
||
|
|
"grad_norm": 4.9537434577941895,
|
||
|
|
"learning_rate": 1.711807371929576e-05,
|
||
|
|
"epoch": 1.9415041782729805,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8729,
|
||
|
|
"grad_norm": 3.9404208660125732,
|
||
|
|
"learning_rate": 1.710492619723793e-05,
|
||
|
|
"epoch": 1.947075208913649,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.3213632106781006,
|
||
|
|
"eval_runtime": 35.9387,
|
||
|
|
"eval_samples_per_second": 39.957,
|
||
|
|
"eval_steps_per_second": 2.003,
|
||
|
|
"epoch": 1.947075208913649,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1419,
|
||
|
|
"grad_norm": 3.202141046524048,
|
||
|
|
"learning_rate": 1.70917786751801e-05,
|
||
|
|
"epoch": 1.9526462395543176,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.83,
|
||
|
|
"grad_norm": 4.432948112487793,
|
||
|
|
"learning_rate": 1.707863115312227e-05,
|
||
|
|
"epoch": 1.958217270194986,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.3556,
|
||
|
|
"grad_norm": 5.213648796081543,
|
||
|
|
"learning_rate": 1.706548363106444e-05,
|
||
|
|
"epoch": 1.9637883008356547,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1396,
|
||
|
|
"grad_norm": 4.155479431152344,
|
||
|
|
"learning_rate": 1.705233610900661e-05,
|
||
|
|
"epoch": 1.9693593314763231,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4222,
|
||
|
|
"grad_norm": 5.146358013153076,
|
||
|
|
"learning_rate": 1.7039188586948775e-05,
|
||
|
|
"epoch": 1.9749303621169916,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9362,
|
||
|
|
"grad_norm": 3.264761447906494,
|
||
|
|
"learning_rate": 1.7026041064890945e-05,
|
||
|
|
"epoch": 1.98050139275766,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9471,
|
||
|
|
"grad_norm": 3.308243989944458,
|
||
|
|
"learning_rate": 1.7012893542833114e-05,
|
||
|
|
"epoch": 1.9860724233983287,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0193,
|
||
|
|
"grad_norm": 4.1630859375,
|
||
|
|
"learning_rate": 1.6999746020775284e-05,
|
||
|
|
"epoch": 1.9916434540389973,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.1048,
|
||
|
|
"grad_norm": 4.196152210235596,
|
||
|
|
"learning_rate": 1.6986598498717453e-05,
|
||
|
|
"epoch": 1.9972144846796658,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0755,
|
||
|
|
"grad_norm": 4.194087028503418,
|
||
|
|
"learning_rate": 1.6973450976659623e-05,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4388,
|
||
|
|
"grad_norm": 4.208454132080078,
|
||
|
|
"learning_rate": 1.6960303454601792e-05,
|
||
|
|
"epoch": 2.0055710306406684,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7819,
|
||
|
|
"grad_norm": 3.549447774887085,
|
||
|
|
"learning_rate": 1.694715593254396e-05,
|
||
|
|
"epoch": 2.011142061281337,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5135,
|
||
|
|
"grad_norm": 3.6767420768737793,
|
||
|
|
"learning_rate": 1.693400841048613e-05,
|
||
|
|
"epoch": 2.0167130919220058,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7713,
|
||
|
|
"grad_norm": 3.816209554672241,
|
||
|
|
"learning_rate": 1.69208608884283e-05,
|
||
|
|
"epoch": 2.022284122562674,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6624,
|
||
|
|
"grad_norm": 3.2220561504364014,
|
||
|
|
"learning_rate": 1.6907713366370467e-05,
|
||
|
|
"epoch": 2.0278551532033426,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9059,
|
||
|
|
"grad_norm": 3.4210987091064453,
|
||
|
|
"learning_rate": 1.6894565844312637e-05,
|
||
|
|
"epoch": 2.033426183844011,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.155,
|
||
|
|
"grad_norm": 4.348776817321777,
|
||
|
|
"learning_rate": 1.6881418322254806e-05,
|
||
|
|
"epoch": 2.0389972144846795,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4513,
|
||
|
|
"grad_norm": 4.143118858337402,
|
||
|
|
"learning_rate": 1.6868270800196976e-05,
|
||
|
|
"epoch": 2.0445682451253484,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8148,
|
||
|
|
"grad_norm": 4.118925094604492,
|
||
|
|
"learning_rate": 1.6855123278139145e-05,
|
||
|
|
"epoch": 2.050139275766017,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6325,
|
||
|
|
"grad_norm": 4.060324668884277,
|
||
|
|
"learning_rate": 1.6841975756081315e-05,
|
||
|
|
"epoch": 2.0557103064066853,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.694,
|
||
|
|
"grad_norm": 4.604481220245361,
|
||
|
|
"learning_rate": 1.6828828234023484e-05,
|
||
|
|
"epoch": 2.0612813370473537,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4905,
|
||
|
|
"grad_norm": 5.273688316345215,
|
||
|
|
"learning_rate": 1.681568071196565e-05,
|
||
|
|
"epoch": 2.066852367688022,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1557,
|
||
|
|
"grad_norm": 6.0254387855529785,
|
||
|
|
"learning_rate": 1.680253318990782e-05,
|
||
|
|
"epoch": 2.0724233983286906,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.999,
|
||
|
|
"grad_norm": 5.017882823944092,
|
||
|
|
"learning_rate": 1.678938566784999e-05,
|
||
|
|
"epoch": 2.0779944289693595,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4159,
|
||
|
|
"grad_norm": 6.874935626983643,
|
||
|
|
"learning_rate": 1.6776238145792162e-05,
|
||
|
|
"epoch": 2.083565459610028,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9789,
|
||
|
|
"grad_norm": 6.245709419250488,
|
||
|
|
"learning_rate": 1.676309062373433e-05,
|
||
|
|
"epoch": 2.0891364902506964,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3929,
|
||
|
|
"grad_norm": 6.976832866668701,
|
||
|
|
"learning_rate": 1.6749943101676498e-05,
|
||
|
|
"epoch": 2.094707520891365,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5721,
|
||
|
|
"grad_norm": 7.426636695861816,
|
||
|
|
"learning_rate": 1.6736795579618668e-05,
|
||
|
|
"epoch": 2.1002785515320332,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4603,
|
||
|
|
"grad_norm": 8.876333236694336,
|
||
|
|
"learning_rate": 1.6723648057560837e-05,
|
||
|
|
"epoch": 2.105849582172702,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2115,
|
||
|
|
"grad_norm": 5.889682769775391,
|
||
|
|
"learning_rate": 1.6710500535503007e-05,
|
||
|
|
"epoch": 2.1114206128133706,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1689,
|
||
|
|
"grad_norm": 6.435322284698486,
|
||
|
|
"learning_rate": 1.6697353013445176e-05,
|
||
|
|
"epoch": 2.116991643454039,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1904,
|
||
|
|
"grad_norm": 6.061446666717529,
|
||
|
|
"learning_rate": 1.6684205491387342e-05,
|
||
|
|
"epoch": 2.1225626740947074,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3799,
|
||
|
|
"grad_norm": 7.56770658493042,
|
||
|
|
"learning_rate": 1.6671057969329512e-05,
|
||
|
|
"epoch": 2.128133704735376,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5787,
|
||
|
|
"grad_norm": 8.942233085632324,
|
||
|
|
"learning_rate": 1.665791044727168e-05,
|
||
|
|
"epoch": 2.1337047353760448,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4084,
|
||
|
|
"grad_norm": 7.448763847351074,
|
||
|
|
"learning_rate": 1.664476292521385e-05,
|
||
|
|
"epoch": 2.139275766016713,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3685,
|
||
|
|
"grad_norm": 5.792154312133789,
|
||
|
|
"learning_rate": 1.663161540315602e-05,
|
||
|
|
"epoch": 2.1448467966573816,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5465,
|
||
|
|
"grad_norm": 7.226157188415527,
|
||
|
|
"learning_rate": 1.661846788109819e-05,
|
||
|
|
"epoch": 2.15041782729805,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1914,
|
||
|
|
"grad_norm": 5.6042022705078125,
|
||
|
|
"learning_rate": 1.660532035904036e-05,
|
||
|
|
"epoch": 2.1559888579387185,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6443,
|
||
|
|
"grad_norm": 5.619427680969238,
|
||
|
|
"learning_rate": 1.6592172836982525e-05,
|
||
|
|
"epoch": 2.1615598885793874,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5371,
|
||
|
|
"grad_norm": 4.770148754119873,
|
||
|
|
"learning_rate": 1.65790253149247e-05,
|
||
|
|
"epoch": 2.167130919220056,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5124,
|
||
|
|
"grad_norm": 7.61703634262085,
|
||
|
|
"learning_rate": 1.6565877792866868e-05,
|
||
|
|
"epoch": 2.1727019498607243,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6248,
|
||
|
|
"grad_norm": 4.498234272003174,
|
||
|
|
"learning_rate": 1.6552730270809037e-05,
|
||
|
|
"epoch": 2.1782729805013927,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4621,
|
||
|
|
"grad_norm": 4.0563063621521,
|
||
|
|
"learning_rate": 1.6539582748751204e-05,
|
||
|
|
"epoch": 2.183844011142061,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4315,
|
||
|
|
"grad_norm": 6.069952964782715,
|
||
|
|
"learning_rate": 1.6526435226693373e-05,
|
||
|
|
"epoch": 2.1894150417827296,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4308,
|
||
|
|
"grad_norm": 6.728673458099365,
|
||
|
|
"learning_rate": 1.6513287704635543e-05,
|
||
|
|
"epoch": 2.1949860724233985,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2975,
|
||
|
|
"grad_norm": 14.551620483398438,
|
||
|
|
"learning_rate": 1.6500140182577712e-05,
|
||
|
|
"epoch": 2.200557103064067,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4624,
|
||
|
|
"grad_norm": 6.782831192016602,
|
||
|
|
"learning_rate": 1.648699266051988e-05,
|
||
|
|
"epoch": 2.2061281337047354,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5891,
|
||
|
|
"grad_norm": 6.513261795043945,
|
||
|
|
"learning_rate": 1.647384513846205e-05,
|
||
|
|
"epoch": 2.211699164345404,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3152,
|
||
|
|
"grad_norm": 6.3476433753967285,
|
||
|
|
"learning_rate": 1.646069761640422e-05,
|
||
|
|
"epoch": 2.2172701949860723,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3129,
|
||
|
|
"grad_norm": 4.936390399932861,
|
||
|
|
"learning_rate": 1.6447550094346387e-05,
|
||
|
|
"epoch": 2.222841225626741,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.531832218170166,
|
||
|
|
"eval_runtime": 35.95,
|
||
|
|
"eval_samples_per_second": 39.944,
|
||
|
|
"eval_steps_per_second": 2.003,
|
||
|
|
"epoch": 2.222841225626741,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2283,
|
||
|
|
"grad_norm": 8.302631378173828,
|
||
|
|
"learning_rate": 1.6434402572288556e-05,
|
||
|
|
"epoch": 2.2284122562674096,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1884,
|
||
|
|
"grad_norm": 5.8890886306762695,
|
||
|
|
"learning_rate": 1.642125505023073e-05,
|
||
|
|
"epoch": 2.233983286908078,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3971,
|
||
|
|
"grad_norm": 6.417287349700928,
|
||
|
|
"learning_rate": 1.6408107528172895e-05,
|
||
|
|
"epoch": 2.2395543175487465,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5501,
|
||
|
|
"grad_norm": 6.351545810699463,
|
||
|
|
"learning_rate": 1.6394960006115065e-05,
|
||
|
|
"epoch": 2.245125348189415,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1685,
|
||
|
|
"grad_norm": 5.121798992156982,
|
||
|
|
"learning_rate": 1.6381812484057234e-05,
|
||
|
|
"epoch": 2.2506963788300833,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3617,
|
||
|
|
"grad_norm": 5.293002128601074,
|
||
|
|
"learning_rate": 1.6368664961999404e-05,
|
||
|
|
"epoch": 2.256267409470752,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3164,
|
||
|
|
"grad_norm": 6.6434431076049805,
|
||
|
|
"learning_rate": 1.6355517439941573e-05,
|
||
|
|
"epoch": 2.2618384401114207,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4339,
|
||
|
|
"grad_norm": 6.383541584014893,
|
||
|
|
"learning_rate": 1.6342369917883743e-05,
|
||
|
|
"epoch": 2.267409470752089,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3699,
|
||
|
|
"grad_norm": 5.989224433898926,
|
||
|
|
"learning_rate": 1.6329222395825913e-05,
|
||
|
|
"epoch": 2.2729805013927575,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4938,
|
||
|
|
"grad_norm": 6.49315881729126,
|
||
|
|
"learning_rate": 1.631607487376808e-05,
|
||
|
|
"epoch": 2.2785515320334264,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0902,
|
||
|
|
"grad_norm": 4.942923069000244,
|
||
|
|
"learning_rate": 1.6302927351710248e-05,
|
||
|
|
"epoch": 2.284122562674095,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0282,
|
||
|
|
"grad_norm": 5.219899654388428,
|
||
|
|
"learning_rate": 1.6289779829652418e-05,
|
||
|
|
"epoch": 2.2896935933147633,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3465,
|
||
|
|
"grad_norm": 5.91557502746582,
|
||
|
|
"learning_rate": 1.6276632307594587e-05,
|
||
|
|
"epoch": 2.2952646239554317,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4312,
|
||
|
|
"grad_norm": 7.332894325256348,
|
||
|
|
"learning_rate": 1.6263484785536757e-05,
|
||
|
|
"epoch": 2.3008356545961,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1921,
|
||
|
|
"grad_norm": 6.784351825714111,
|
||
|
|
"learning_rate": 1.6250337263478926e-05,
|
||
|
|
"epoch": 2.3064066852367686,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3644,
|
||
|
|
"grad_norm": 6.222668647766113,
|
||
|
|
"learning_rate": 1.6237189741421096e-05,
|
||
|
|
"epoch": 2.3119777158774375,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3318,
|
||
|
|
"grad_norm": 6.7379841804504395,
|
||
|
|
"learning_rate": 1.6224042219363265e-05,
|
||
|
|
"epoch": 2.317548746518106,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3955,
|
||
|
|
"grad_norm": 7.218482494354248,
|
||
|
|
"learning_rate": 1.6210894697305435e-05,
|
||
|
|
"epoch": 2.3231197771587744,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5949,
|
||
|
|
"grad_norm": 6.676080226898193,
|
||
|
|
"learning_rate": 1.6197747175247604e-05,
|
||
|
|
"epoch": 2.328690807799443,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2428,
|
||
|
|
"grad_norm": 6.974861145019531,
|
||
|
|
"learning_rate": 1.618459965318977e-05,
|
||
|
|
"epoch": 2.3342618384401113,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2438,
|
||
|
|
"grad_norm": 7.018064975738525,
|
||
|
|
"learning_rate": 1.617145213113194e-05,
|
||
|
|
"epoch": 2.33983286908078,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4979,
|
||
|
|
"grad_norm": 6.781156063079834,
|
||
|
|
"learning_rate": 1.615830460907411e-05,
|
||
|
|
"epoch": 2.3454038997214486,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4914,
|
||
|
|
"grad_norm": 6.291943550109863,
|
||
|
|
"learning_rate": 1.614515708701628e-05,
|
||
|
|
"epoch": 2.350974930362117,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1937,
|
||
|
|
"grad_norm": 6.769220352172852,
|
||
|
|
"learning_rate": 1.613200956495845e-05,
|
||
|
|
"epoch": 2.3565459610027855,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4428,
|
||
|
|
"grad_norm": 7.461434841156006,
|
||
|
|
"learning_rate": 1.6118862042900618e-05,
|
||
|
|
"epoch": 2.362116991643454,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0756,
|
||
|
|
"grad_norm": 5.971315860748291,
|
||
|
|
"learning_rate": 1.6105714520842788e-05,
|
||
|
|
"epoch": 2.3676880222841223,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1709,
|
||
|
|
"grad_norm": 6.632075786590576,
|
||
|
|
"learning_rate": 1.6092566998784954e-05,
|
||
|
|
"epoch": 2.3732590529247912,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2953,
|
||
|
|
"grad_norm": 6.03197717666626,
|
||
|
|
"learning_rate": 1.6079419476727123e-05,
|
||
|
|
"epoch": 2.3788300835654597,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1653,
|
||
|
|
"grad_norm": 7.393289089202881,
|
||
|
|
"learning_rate": 1.6066271954669296e-05,
|
||
|
|
"epoch": 2.384401114206128,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.542,
|
||
|
|
"grad_norm": 9.518671989440918,
|
||
|
|
"learning_rate": 1.6053124432611462e-05,
|
||
|
|
"epoch": 2.3899721448467965,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0957,
|
||
|
|
"grad_norm": 7.086347579956055,
|
||
|
|
"learning_rate": 1.6039976910553632e-05,
|
||
|
|
"epoch": 2.3955431754874654,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1408,
|
||
|
|
"grad_norm": 5.21544885635376,
|
||
|
|
"learning_rate": 1.60268293884958e-05,
|
||
|
|
"epoch": 2.401114206128134,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1708,
|
||
|
|
"grad_norm": 7.537359237670898,
|
||
|
|
"learning_rate": 1.601368186643797e-05,
|
||
|
|
"epoch": 2.4066852367688023,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.101,
|
||
|
|
"grad_norm": 4.926475524902344,
|
||
|
|
"learning_rate": 1.600053434438014e-05,
|
||
|
|
"epoch": 2.4122562674094707,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3898,
|
||
|
|
"grad_norm": 5.6016740798950195,
|
||
|
|
"learning_rate": 1.598738682232231e-05,
|
||
|
|
"epoch": 2.417827298050139,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4717,
|
||
|
|
"grad_norm": 7.16878604888916,
|
||
|
|
"learning_rate": 1.597423930026448e-05,
|
||
|
|
"epoch": 2.4233983286908076,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6173,
|
||
|
|
"grad_norm": 6.310802459716797,
|
||
|
|
"learning_rate": 1.5961091778206646e-05,
|
||
|
|
"epoch": 2.4289693593314765,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6172,
|
||
|
|
"grad_norm": 8.035069465637207,
|
||
|
|
"learning_rate": 1.5947944256148815e-05,
|
||
|
|
"epoch": 2.434540389972145,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4479,
|
||
|
|
"grad_norm": 7.806406497955322,
|
||
|
|
"learning_rate": 1.5934796734090985e-05,
|
||
|
|
"epoch": 2.4401114206128134,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3459,
|
||
|
|
"grad_norm": 5.882315635681152,
|
||
|
|
"learning_rate": 1.5921649212033154e-05,
|
||
|
|
"epoch": 2.445682451253482,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2195,
|
||
|
|
"grad_norm": 5.817505359649658,
|
||
|
|
"learning_rate": 1.5908501689975324e-05,
|
||
|
|
"epoch": 2.4512534818941503,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3043,
|
||
|
|
"grad_norm": 7.497400283813477,
|
||
|
|
"learning_rate": 1.5895354167917493e-05,
|
||
|
|
"epoch": 2.456824512534819,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.42,
|
||
|
|
"grad_norm": 5.955392837524414,
|
||
|
|
"learning_rate": 1.5882206645859663e-05,
|
||
|
|
"epoch": 2.4623955431754876,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4764,
|
||
|
|
"grad_norm": 8.848158836364746,
|
||
|
|
"learning_rate": 1.5869059123801832e-05,
|
||
|
|
"epoch": 2.467966573816156,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4508,
|
||
|
|
"grad_norm": 6.384143829345703,
|
||
|
|
"learning_rate": 1.5855911601744002e-05,
|
||
|
|
"epoch": 2.4735376044568245,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3499,
|
||
|
|
"grad_norm": 7.251498699188232,
|
||
|
|
"learning_rate": 1.584276407968617e-05,
|
||
|
|
"epoch": 2.479108635097493,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.297,
|
||
|
|
"grad_norm": 8.700945854187012,
|
||
|
|
"learning_rate": 1.5829616557628337e-05,
|
||
|
|
"epoch": 2.4846796657381613,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1607,
|
||
|
|
"grad_norm": 8.17098617553711,
|
||
|
|
"learning_rate": 1.5816469035570507e-05,
|
||
|
|
"epoch": 2.4902506963788302,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5328,
|
||
|
|
"grad_norm": 6.918285846710205,
|
||
|
|
"learning_rate": 1.5803321513512676e-05,
|
||
|
|
"epoch": 2.4958217270194987,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6258,
|
||
|
|
"grad_norm": 6.7390851974487305,
|
||
|
|
"learning_rate": 1.5790173991454846e-05,
|
||
|
|
"epoch": 2.501392757660167,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.571645498275757,
|
||
|
|
"eval_runtime": 35.9556,
|
||
|
|
"eval_samples_per_second": 39.938,
|
||
|
|
"eval_steps_per_second": 2.002,
|
||
|
|
"epoch": 2.501392757660167,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5923,
|
||
|
|
"grad_norm": 6.522182941436768,
|
||
|
|
"learning_rate": 1.5777026469397015e-05,
|
||
|
|
"epoch": 2.5069637883008355,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2816,
|
||
|
|
"grad_norm": 5.984560489654541,
|
||
|
|
"learning_rate": 1.5763878947339185e-05,
|
||
|
|
"epoch": 2.5125348189415044,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2029,
|
||
|
|
"grad_norm": 8.060498237609863,
|
||
|
|
"learning_rate": 1.5750731425281354e-05,
|
||
|
|
"epoch": 2.518105849582173,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2117,
|
||
|
|
"grad_norm": 6.93899393081665,
|
||
|
|
"learning_rate": 1.573758390322352e-05,
|
||
|
|
"epoch": 2.5236768802228413,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4347,
|
||
|
|
"grad_norm": 6.21560525894165,
|
||
|
|
"learning_rate": 1.572443638116569e-05,
|
||
|
|
"epoch": 2.5292479108635098,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3394,
|
||
|
|
"grad_norm": 7.837366580963135,
|
||
|
|
"learning_rate": 1.5711288859107863e-05,
|
||
|
|
"epoch": 2.534818941504178,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4262,
|
||
|
|
"grad_norm": 7.609643936157227,
|
||
|
|
"learning_rate": 1.5698141337050033e-05,
|
||
|
|
"epoch": 2.5403899721448466,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3738,
|
||
|
|
"grad_norm": 6.487556457519531,
|
||
|
|
"learning_rate": 1.56849938149922e-05,
|
||
|
|
"epoch": 2.545961002785515,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4021,
|
||
|
|
"grad_norm": 6.344869136810303,
|
||
|
|
"learning_rate": 1.5671846292934368e-05,
|
||
|
|
"epoch": 2.551532033426184,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3887,
|
||
|
|
"grad_norm": 6.960203170776367,
|
||
|
|
"learning_rate": 1.5658698770876538e-05,
|
||
|
|
"epoch": 2.5571030640668524,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2997,
|
||
|
|
"grad_norm": 11.57795524597168,
|
||
|
|
"learning_rate": 1.5645551248818707e-05,
|
||
|
|
"epoch": 2.562674094707521,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5967,
|
||
|
|
"grad_norm": 6.889705181121826,
|
||
|
|
"learning_rate": 1.5632403726760877e-05,
|
||
|
|
"epoch": 2.5682451253481893,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2643,
|
||
|
|
"grad_norm": 8.502350807189941,
|
||
|
|
"learning_rate": 1.5619256204703046e-05,
|
||
|
|
"epoch": 2.573816155988858,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3686,
|
||
|
|
"grad_norm": 8.704366683959961,
|
||
|
|
"learning_rate": 1.5606108682645216e-05,
|
||
|
|
"epoch": 2.5793871866295266,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9961,
|
||
|
|
"grad_norm": 8.154948234558105,
|
||
|
|
"learning_rate": 1.5592961160587382e-05,
|
||
|
|
"epoch": 2.584958217270195,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0603,
|
||
|
|
"grad_norm": 5.729700088500977,
|
||
|
|
"learning_rate": 1.557981363852955e-05,
|
||
|
|
"epoch": 2.5905292479108635,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6641,
|
||
|
|
"grad_norm": 7.716269493103027,
|
||
|
|
"learning_rate": 1.556666611647172e-05,
|
||
|
|
"epoch": 2.596100278551532,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2886,
|
||
|
|
"grad_norm": 11.220166206359863,
|
||
|
|
"learning_rate": 1.555351859441389e-05,
|
||
|
|
"epoch": 2.6016713091922004,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2922,
|
||
|
|
"grad_norm": 7.163726329803467,
|
||
|
|
"learning_rate": 1.554037107235606e-05,
|
||
|
|
"epoch": 2.6072423398328692,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1046,
|
||
|
|
"grad_norm": 7.28581428527832,
|
||
|
|
"learning_rate": 1.552722355029823e-05,
|
||
|
|
"epoch": 2.6128133704735377,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6142,
|
||
|
|
"grad_norm": 9.65365219116211,
|
||
|
|
"learning_rate": 1.5514076028240396e-05,
|
||
|
|
"epoch": 2.618384401114206,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5575,
|
||
|
|
"grad_norm": 6.458492279052734,
|
||
|
|
"learning_rate": 1.550092850618257e-05,
|
||
|
|
"epoch": 2.6239554317548746,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3655,
|
||
|
|
"grad_norm": 7.325246810913086,
|
||
|
|
"learning_rate": 1.5487780984124738e-05,
|
||
|
|
"epoch": 2.6295264623955434,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3344,
|
||
|
|
"grad_norm": 7.81355619430542,
|
||
|
|
"learning_rate": 1.5474633462066908e-05,
|
||
|
|
"epoch": 2.635097493036212,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2505,
|
||
|
|
"grad_norm": 7.347303867340088,
|
||
|
|
"learning_rate": 1.5461485940009074e-05,
|
||
|
|
"epoch": 2.6406685236768803,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1988,
|
||
|
|
"grad_norm": 7.306774616241455,
|
||
|
|
"learning_rate": 1.5448338417951243e-05,
|
||
|
|
"epoch": 2.6462395543175488,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4075,
|
||
|
|
"grad_norm": 7.261951446533203,
|
||
|
|
"learning_rate": 1.5435190895893413e-05,
|
||
|
|
"epoch": 2.651810584958217,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3235,
|
||
|
|
"grad_norm": 8.138806343078613,
|
||
|
|
"learning_rate": 1.5422043373835582e-05,
|
||
|
|
"epoch": 2.6573816155988856,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4297,
|
||
|
|
"grad_norm": 7.515624046325684,
|
||
|
|
"learning_rate": 1.5408895851777752e-05,
|
||
|
|
"epoch": 2.662952646239554,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0187,
|
||
|
|
"grad_norm": 7.298752307891846,
|
||
|
|
"learning_rate": 1.539574832971992e-05,
|
||
|
|
"epoch": 2.668523676880223,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1512,
|
||
|
|
"grad_norm": 7.08530855178833,
|
||
|
|
"learning_rate": 1.538260080766209e-05,
|
||
|
|
"epoch": 2.6740947075208914,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9209,
|
||
|
|
"grad_norm": 8.528051376342773,
|
||
|
|
"learning_rate": 1.5369453285604257e-05,
|
||
|
|
"epoch": 2.67966573816156,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6726,
|
||
|
|
"grad_norm": 6.991207122802734,
|
||
|
|
"learning_rate": 1.535630576354643e-05,
|
||
|
|
"epoch": 2.6852367688022283,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6101,
|
||
|
|
"grad_norm": 6.910933971405029,
|
||
|
|
"learning_rate": 1.53431582414886e-05,
|
||
|
|
"epoch": 2.690807799442897,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0596,
|
||
|
|
"grad_norm": 6.858171463012695,
|
||
|
|
"learning_rate": 1.5330010719430766e-05,
|
||
|
|
"epoch": 2.6963788300835656,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3009,
|
||
|
|
"grad_norm": 7.1738409996032715,
|
||
|
|
"learning_rate": 1.5316863197372935e-05,
|
||
|
|
"epoch": 2.701949860724234,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1306,
|
||
|
|
"grad_norm": 6.751303672790527,
|
||
|
|
"learning_rate": 1.5303715675315105e-05,
|
||
|
|
"epoch": 2.7075208913649025,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6064,
|
||
|
|
"grad_norm": 7.458596706390381,
|
||
|
|
"learning_rate": 1.5290568153257274e-05,
|
||
|
|
"epoch": 2.713091922005571,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3423,
|
||
|
|
"grad_norm": 4.847519397735596,
|
||
|
|
"learning_rate": 1.5277420631199444e-05,
|
||
|
|
"epoch": 2.7186629526462394,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0908,
|
||
|
|
"grad_norm": 6.585028648376465,
|
||
|
|
"learning_rate": 1.5264273109141613e-05,
|
||
|
|
"epoch": 2.724233983286908,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6632,
|
||
|
|
"grad_norm": 5.222984790802002,
|
||
|
|
"learning_rate": 1.5251125587083783e-05,
|
||
|
|
"epoch": 2.7298050139275767,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3113,
|
||
|
|
"grad_norm": 6.947058200836182,
|
||
|
|
"learning_rate": 1.523797806502595e-05,
|
||
|
|
"epoch": 2.735376044568245,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0863,
|
||
|
|
"grad_norm": 5.885672569274902,
|
||
|
|
"learning_rate": 1.522483054296812e-05,
|
||
|
|
"epoch": 2.7409470752089136,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1982,
|
||
|
|
"grad_norm": 7.9502034187316895,
|
||
|
|
"learning_rate": 1.521168302091029e-05,
|
||
|
|
"epoch": 2.7465181058495824,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3941,
|
||
|
|
"grad_norm": 5.9523773193359375,
|
||
|
|
"learning_rate": 1.5198535498852457e-05,
|
||
|
|
"epoch": 2.752089136490251,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3251,
|
||
|
|
"grad_norm": 7.984345436096191,
|
||
|
|
"learning_rate": 1.5185387976794627e-05,
|
||
|
|
"epoch": 2.7576601671309193,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8109,
|
||
|
|
"grad_norm": 8.467183113098145,
|
||
|
|
"learning_rate": 1.5172240454736796e-05,
|
||
|
|
"epoch": 2.7632311977715878,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1339,
|
||
|
|
"grad_norm": 7.878790378570557,
|
||
|
|
"learning_rate": 1.5159092932678966e-05,
|
||
|
|
"epoch": 2.768802228412256,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1736,
|
||
|
|
"grad_norm": 5.638209819793701,
|
||
|
|
"learning_rate": 1.5145945410621134e-05,
|
||
|
|
"epoch": 2.7743732590529246,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3546,
|
||
|
|
"grad_norm": 7.818211078643799,
|
||
|
|
"learning_rate": 1.5132797888563303e-05,
|
||
|
|
"epoch": 2.779944289693593,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.6166257858276367,
|
||
|
|
"eval_runtime": 35.971,
|
||
|
|
"eval_samples_per_second": 39.921,
|
||
|
|
"eval_steps_per_second": 2.002,
|
||
|
|
"epoch": 2.779944289693593,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4636,
|
||
|
|
"grad_norm": 6.118830680847168,
|
||
|
|
"learning_rate": 1.5119650366505473e-05,
|
||
|
|
"epoch": 2.785515320334262,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5519,
|
||
|
|
"grad_norm": 7.9165778160095215,
|
||
|
|
"learning_rate": 1.510650284444764e-05,
|
||
|
|
"epoch": 2.7910863509749304,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5206,
|
||
|
|
"grad_norm": 6.975761413574219,
|
||
|
|
"learning_rate": 1.5093355322389812e-05,
|
||
|
|
"epoch": 2.796657381615599,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0665,
|
||
|
|
"grad_norm": 9.277933120727539,
|
||
|
|
"learning_rate": 1.5080207800331981e-05,
|
||
|
|
"epoch": 2.8022284122562673,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2801,
|
||
|
|
"grad_norm": 8.121682167053223,
|
||
|
|
"learning_rate": 1.5067060278274151e-05,
|
||
|
|
"epoch": 2.807799442896936,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.565,
|
||
|
|
"grad_norm": 8.76021957397461,
|
||
|
|
"learning_rate": 1.5053912756216319e-05,
|
||
|
|
"epoch": 2.8133704735376046,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3502,
|
||
|
|
"grad_norm": 8.618566513061523,
|
||
|
|
"learning_rate": 1.5040765234158488e-05,
|
||
|
|
"epoch": 2.818941504178273,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5859,
|
||
|
|
"grad_norm": 8.027894020080566,
|
||
|
|
"learning_rate": 1.5027617712100658e-05,
|
||
|
|
"epoch": 2.8245125348189415,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4159,
|
||
|
|
"grad_norm": 7.063473701477051,
|
||
|
|
"learning_rate": 1.5014470190042826e-05,
|
||
|
|
"epoch": 2.83008356545961,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5672,
|
||
|
|
"grad_norm": 6.095931053161621,
|
||
|
|
"learning_rate": 1.5001322667984995e-05,
|
||
|
|
"epoch": 2.8356545961002784,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2551,
|
||
|
|
"grad_norm": 6.445271968841553,
|
||
|
|
"learning_rate": 1.4988175145927165e-05,
|
||
|
|
"epoch": 2.841225626740947,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9671,
|
||
|
|
"grad_norm": 7.601891040802002,
|
||
|
|
"learning_rate": 1.4975027623869334e-05,
|
||
|
|
"epoch": 2.8467966573816157,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4462,
|
||
|
|
"grad_norm": 8.017728805541992,
|
||
|
|
"learning_rate": 1.4961880101811502e-05,
|
||
|
|
"epoch": 2.852367688022284,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2006,
|
||
|
|
"grad_norm": 6.753676891326904,
|
||
|
|
"learning_rate": 1.4948732579753672e-05,
|
||
|
|
"epoch": 2.8579387186629526,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9354,
|
||
|
|
"grad_norm": 6.220627784729004,
|
||
|
|
"learning_rate": 1.4935585057695843e-05,
|
||
|
|
"epoch": 2.863509749303621,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5554,
|
||
|
|
"grad_norm": 7.825878620147705,
|
||
|
|
"learning_rate": 1.4922437535638009e-05,
|
||
|
|
"epoch": 2.86908077994429,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9281,
|
||
|
|
"grad_norm": 7.7669548988342285,
|
||
|
|
"learning_rate": 1.490929001358018e-05,
|
||
|
|
"epoch": 2.8746518105849583,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1678,
|
||
|
|
"grad_norm": 6.18816614151001,
|
||
|
|
"learning_rate": 1.489614249152235e-05,
|
||
|
|
"epoch": 2.8802228412256268,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8378,
|
||
|
|
"grad_norm": 11.241938591003418,
|
||
|
|
"learning_rate": 1.4882994969464517e-05,
|
||
|
|
"epoch": 2.885793871866295,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6602,
|
||
|
|
"grad_norm": 6.708087921142578,
|
||
|
|
"learning_rate": 1.4869847447406687e-05,
|
||
|
|
"epoch": 2.8913649025069637,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5575,
|
||
|
|
"grad_norm": 8.96353530883789,
|
||
|
|
"learning_rate": 1.4856699925348856e-05,
|
||
|
|
"epoch": 2.896935933147632,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3553,
|
||
|
|
"grad_norm": 7.286456108093262,
|
||
|
|
"learning_rate": 1.4843552403291026e-05,
|
||
|
|
"epoch": 2.902506963788301,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3618,
|
||
|
|
"grad_norm": 6.448929309844971,
|
||
|
|
"learning_rate": 1.4830404881233194e-05,
|
||
|
|
"epoch": 2.9080779944289694,
|
||
|
|
"step": 523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0911,
|
||
|
|
"grad_norm": 6.1524739265441895,
|
||
|
|
"learning_rate": 1.4817257359175363e-05,
|
||
|
|
"epoch": 2.913649025069638,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2465,
|
||
|
|
"grad_norm": 6.833171367645264,
|
||
|
|
"learning_rate": 1.4804109837117533e-05,
|
||
|
|
"epoch": 2.9192200557103063,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9937,
|
||
|
|
"grad_norm": 8.745670318603516,
|
||
|
|
"learning_rate": 1.47909623150597e-05,
|
||
|
|
"epoch": 2.924791086350975,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3931,
|
||
|
|
"grad_norm": 6.3659186363220215,
|
||
|
|
"learning_rate": 1.477781479300187e-05,
|
||
|
|
"epoch": 2.9303621169916436,
|
||
|
|
"step": 527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.029,
|
||
|
|
"grad_norm": 8.309256553649902,
|
||
|
|
"learning_rate": 1.476466727094404e-05,
|
||
|
|
"epoch": 2.935933147632312,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.383,
|
||
|
|
"grad_norm": 7.611057758331299,
|
||
|
|
"learning_rate": 1.4751519748886211e-05,
|
||
|
|
"epoch": 2.9415041782729805,
|
||
|
|
"step": 529
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4833,
|
||
|
|
"grad_norm": 9.441068649291992,
|
||
|
|
"learning_rate": 1.4738372226828379e-05,
|
||
|
|
"epoch": 2.947075208913649,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2739,
|
||
|
|
"grad_norm": 7.198431968688965,
|
||
|
|
"learning_rate": 1.4725224704770548e-05,
|
||
|
|
"epoch": 2.9526462395543174,
|
||
|
|
"step": 531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4294,
|
||
|
|
"grad_norm": 8.88117790222168,
|
||
|
|
"learning_rate": 1.4712077182712718e-05,
|
||
|
|
"epoch": 2.958217270194986,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0204,
|
||
|
|
"grad_norm": 9.982294082641602,
|
||
|
|
"learning_rate": 1.4698929660654886e-05,
|
||
|
|
"epoch": 2.9637883008356547,
|
||
|
|
"step": 533
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1488,
|
||
|
|
"grad_norm": 8.535533905029297,
|
||
|
|
"learning_rate": 1.4685782138597055e-05,
|
||
|
|
"epoch": 2.969359331476323,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.49,
|
||
|
|
"grad_norm": 6.813885688781738,
|
||
|
|
"learning_rate": 1.4672634616539225e-05,
|
||
|
|
"epoch": 2.9749303621169916,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.113,
|
||
|
|
"grad_norm": 9.557439804077148,
|
||
|
|
"learning_rate": 1.4659487094481394e-05,
|
||
|
|
"epoch": 2.98050139275766,
|
||
|
|
"step": 536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5004,
|
||
|
|
"grad_norm": 6.406128883361816,
|
||
|
|
"learning_rate": 1.4646339572423562e-05,
|
||
|
|
"epoch": 2.986072423398329,
|
||
|
|
"step": 537
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1722,
|
||
|
|
"grad_norm": 7.9670915603637695,
|
||
|
|
"learning_rate": 1.4633192050365732e-05,
|
||
|
|
"epoch": 2.9916434540389973,
|
||
|
|
"step": 538
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5033,
|
||
|
|
"grad_norm": 9.402728080749512,
|
||
|
|
"learning_rate": 1.4620044528307901e-05,
|
||
|
|
"epoch": 2.997214484679666,
|
||
|
|
"step": 539
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.279,
|
||
|
|
"grad_norm": 7.38714075088501,
|
||
|
|
"learning_rate": 1.4606897006250069e-05,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6426,
|
||
|
|
"grad_norm": 7.639667510986328,
|
||
|
|
"learning_rate": 1.4593749484192238e-05,
|
||
|
|
"epoch": 3.0055710306406684,
|
||
|
|
"step": 541
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8426,
|
||
|
|
"grad_norm": 7.864633560180664,
|
||
|
|
"learning_rate": 1.458060196213441e-05,
|
||
|
|
"epoch": 3.011142061281337,
|
||
|
|
"step": 542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0651,
|
||
|
|
"grad_norm": 6.637276649475098,
|
||
|
|
"learning_rate": 1.4567454440076576e-05,
|
||
|
|
"epoch": 3.0167130919220058,
|
||
|
|
"step": 543
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0804,
|
||
|
|
"grad_norm": 7.148686408996582,
|
||
|
|
"learning_rate": 1.4554306918018747e-05,
|
||
|
|
"epoch": 3.022284122562674,
|
||
|
|
"step": 544
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0182,
|
||
|
|
"grad_norm": 6.767364501953125,
|
||
|
|
"learning_rate": 1.4541159395960917e-05,
|
||
|
|
"epoch": 3.0278551532033426,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8165,
|
||
|
|
"grad_norm": 6.77062463760376,
|
||
|
|
"learning_rate": 1.4528011873903086e-05,
|
||
|
|
"epoch": 3.033426183844011,
|
||
|
|
"step": 546
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9941,
|
||
|
|
"grad_norm": 8.067922592163086,
|
||
|
|
"learning_rate": 1.4514864351845254e-05,
|
||
|
|
"epoch": 3.0389972144846795,
|
||
|
|
"step": 547
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9579,
|
||
|
|
"grad_norm": 8.817468643188477,
|
||
|
|
"learning_rate": 1.4501716829787423e-05,
|
||
|
|
"epoch": 3.0445682451253484,
|
||
|
|
"step": 548
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6023,
|
||
|
|
"grad_norm": 8.70374870300293,
|
||
|
|
"learning_rate": 1.4488569307729593e-05,
|
||
|
|
"epoch": 3.050139275766017,
|
||
|
|
"step": 549
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1392,
|
||
|
|
"grad_norm": 9.344374656677246,
|
||
|
|
"learning_rate": 1.447542178567176e-05,
|
||
|
|
"epoch": 3.0557103064066853,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 3.0347440242767334,
|
||
|
|
"eval_runtime": 35.9429,
|
||
|
|
"eval_samples_per_second": 39.952,
|
||
|
|
"eval_steps_per_second": 2.003,
|
||
|
|
"epoch": 3.0557103064066853,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.663,
|
||
|
|
"grad_norm": 10.07166862487793,
|
||
|
|
"learning_rate": 1.446227426361393e-05,
|
||
|
|
"epoch": 3.0612813370473537,
|
||
|
|
"step": 551
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1801,
|
||
|
|
"grad_norm": 14.619653701782227,
|
||
|
|
"learning_rate": 1.44491267415561e-05,
|
||
|
|
"epoch": 3.066852367688022,
|
||
|
|
"step": 552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9107,
|
||
|
|
"grad_norm": 10.427509307861328,
|
||
|
|
"learning_rate": 1.443597921949827e-05,
|
||
|
|
"epoch": 3.0724233983286906,
|
||
|
|
"step": 553
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9474,
|
||
|
|
"grad_norm": 8.392213821411133,
|
||
|
|
"learning_rate": 1.4422831697440437e-05,
|
||
|
|
"epoch": 3.0779944289693595,
|
||
|
|
"step": 554
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7972,
|
||
|
|
"grad_norm": 13.848929405212402,
|
||
|
|
"learning_rate": 1.4409684175382607e-05,
|
||
|
|
"epoch": 3.083565459610028,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7298,
|
||
|
|
"grad_norm": 9.263422966003418,
|
||
|
|
"learning_rate": 1.4396536653324778e-05,
|
||
|
|
"epoch": 3.0891364902506964,
|
||
|
|
"step": 556
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.5749,
|
||
|
|
"grad_norm": 11.082460403442383,
|
||
|
|
"learning_rate": 1.4383389131266946e-05,
|
||
|
|
"epoch": 3.094707520891365,
|
||
|
|
"step": 557
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7301,
|
||
|
|
"grad_norm": 7.7812604904174805,
|
||
|
|
"learning_rate": 1.4370241609209115e-05,
|
||
|
|
"epoch": 3.1002785515320332,
|
||
|
|
"step": 558
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.5884,
|
||
|
|
"grad_norm": 12.2935791015625,
|
||
|
|
"learning_rate": 1.4357094087151285e-05,
|
||
|
|
"epoch": 3.105849582172702,
|
||
|
|
"step": 559
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8034,
|
||
|
|
"grad_norm": 8.129678726196289,
|
||
|
|
"learning_rate": 1.4343946565093454e-05,
|
||
|
|
"epoch": 3.1114206128133706,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6528,
|
||
|
|
"grad_norm": 8.628301620483398,
|
||
|
|
"learning_rate": 1.4330799043035622e-05,
|
||
|
|
"epoch": 3.116991643454039,
|
||
|
|
"step": 561
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8483,
|
||
|
|
"grad_norm": 10.514995574951172,
|
||
|
|
"learning_rate": 1.4317651520977792e-05,
|
||
|
|
"epoch": 3.1225626740947074,
|
||
|
|
"step": 562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7009,
|
||
|
|
"grad_norm": 8.187010765075684,
|
||
|
|
"learning_rate": 1.4304503998919961e-05,
|
||
|
|
"epoch": 3.128133704735376,
|
||
|
|
"step": 563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8732,
|
||
|
|
"grad_norm": 10.525712013244629,
|
||
|
|
"learning_rate": 1.4291356476862129e-05,
|
||
|
|
"epoch": 3.1337047353760448,
|
||
|
|
"step": 564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8319,
|
||
|
|
"grad_norm": 9.198347091674805,
|
||
|
|
"learning_rate": 1.4278208954804298e-05,
|
||
|
|
"epoch": 3.139275766016713,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.726,
|
||
|
|
"grad_norm": 8.486757278442383,
|
||
|
|
"learning_rate": 1.4265061432746468e-05,
|
||
|
|
"epoch": 3.1448467966573816,
|
||
|
|
"step": 566
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.4993,
|
||
|
|
"grad_norm": 8.220407485961914,
|
||
|
|
"learning_rate": 1.4251913910688636e-05,
|
||
|
|
"epoch": 3.15041782729805,
|
||
|
|
"step": 567
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0212,
|
||
|
|
"grad_norm": 7.644767761230469,
|
||
|
|
"learning_rate": 1.4238766388630805e-05,
|
||
|
|
"epoch": 3.1559888579387185,
|
||
|
|
"step": 568
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8306,
|
||
|
|
"grad_norm": 11.287712097167969,
|
||
|
|
"learning_rate": 1.4225618866572977e-05,
|
||
|
|
"epoch": 3.1615598885793874,
|
||
|
|
"step": 569
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.626,
|
||
|
|
"grad_norm": 7.9160637855529785,
|
||
|
|
"learning_rate": 1.4212471344515146e-05,
|
||
|
|
"epoch": 3.167130919220056,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.75,
|
||
|
|
"grad_norm": 11.988582611083984,
|
||
|
|
"learning_rate": 1.4199323822457314e-05,
|
||
|
|
"epoch": 3.1727019498607243,
|
||
|
|
"step": 571
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2685,
|
||
|
|
"grad_norm": 9.961721420288086,
|
||
|
|
"learning_rate": 1.4186176300399483e-05,
|
||
|
|
"epoch": 3.1782729805013927,
|
||
|
|
"step": 572
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7429,
|
||
|
|
"grad_norm": 12.098424911499023,
|
||
|
|
"learning_rate": 1.4173028778341653e-05,
|
||
|
|
"epoch": 3.183844011142061,
|
||
|
|
"step": 573
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.0086,
|
||
|
|
"grad_norm": 8.59049129486084,
|
||
|
|
"learning_rate": 1.415988125628382e-05,
|
||
|
|
"epoch": 3.1894150417827296,
|
||
|
|
"step": 574
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.1215,
|
||
|
|
"grad_norm": 10.50232219696045,
|
||
|
|
"learning_rate": 1.414673373422599e-05,
|
||
|
|
"epoch": 3.1949860724233985,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6729,
|
||
|
|
"grad_norm": 11.673900604248047,
|
||
|
|
"learning_rate": 1.413358621216816e-05,
|
||
|
|
"epoch": 3.200557103064067,
|
||
|
|
"step": 576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2036,
|
||
|
|
"grad_norm": 6.419600009918213,
|
||
|
|
"learning_rate": 1.412043869011033e-05,
|
||
|
|
"epoch": 3.2061281337047354,
|
||
|
|
"step": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6877,
|
||
|
|
"grad_norm": 10.218490600585938,
|
||
|
|
"learning_rate": 1.4107291168052497e-05,
|
||
|
|
"epoch": 3.211699164345404,
|
||
|
|
"step": 578
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.5637,
|
||
|
|
"grad_norm": 5.7183918952941895,
|
||
|
|
"learning_rate": 1.4094143645994667e-05,
|
||
|
|
"epoch": 3.2172701949860723,
|
||
|
|
"step": 579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7498,
|
||
|
|
"grad_norm": 11.460823059082031,
|
||
|
|
"learning_rate": 1.4080996123936836e-05,
|
||
|
|
"epoch": 3.222841225626741,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6792,
|
||
|
|
"grad_norm": 8.623233795166016,
|
||
|
|
"learning_rate": 1.4067848601879004e-05,
|
||
|
|
"epoch": 3.2284122562674096,
|
||
|
|
"step": 581
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6752,
|
||
|
|
"grad_norm": 11.339884757995605,
|
||
|
|
"learning_rate": 1.4054701079821174e-05,
|
||
|
|
"epoch": 3.233983286908078,
|
||
|
|
"step": 582
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8586,
|
||
|
|
"grad_norm": 12.452316284179688,
|
||
|
|
"learning_rate": 1.4041553557763345e-05,
|
||
|
|
"epoch": 3.2395543175487465,
|
||
|
|
"step": 583
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8345,
|
||
|
|
"grad_norm": 6.755831241607666,
|
||
|
|
"learning_rate": 1.4028406035705514e-05,
|
||
|
|
"epoch": 3.245125348189415,
|
||
|
|
"step": 584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6932,
|
||
|
|
"grad_norm": 9.68067741394043,
|
||
|
|
"learning_rate": 1.4015258513647682e-05,
|
||
|
|
"epoch": 3.2506963788300833,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2071,
|
||
|
|
"grad_norm": 11.948298454284668,
|
||
|
|
"learning_rate": 1.4002110991589852e-05,
|
||
|
|
"epoch": 3.256267409470752,
|
||
|
|
"step": 586
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7349,
|
||
|
|
"grad_norm": 11.49226188659668,
|
||
|
|
"learning_rate": 1.3988963469532021e-05,
|
||
|
|
"epoch": 3.2618384401114207,
|
||
|
|
"step": 587
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7923,
|
||
|
|
"grad_norm": 10.757736206054688,
|
||
|
|
"learning_rate": 1.3975815947474189e-05,
|
||
|
|
"epoch": 3.267409470752089,
|
||
|
|
"step": 588
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6857,
|
||
|
|
"grad_norm": 8.46744441986084,
|
||
|
|
"learning_rate": 1.3962668425416358e-05,
|
||
|
|
"epoch": 3.2729805013927575,
|
||
|
|
"step": 589
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9153,
|
||
|
|
"grad_norm": 6.472330093383789,
|
||
|
|
"learning_rate": 1.3949520903358528e-05,
|
||
|
|
"epoch": 3.2785515320334264,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7542,
|
||
|
|
"grad_norm": 12.151514053344727,
|
||
|
|
"learning_rate": 1.3936373381300696e-05,
|
||
|
|
"epoch": 3.284122562674095,
|
||
|
|
"step": 591
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9487,
|
||
|
|
"grad_norm": 11.680760383605957,
|
||
|
|
"learning_rate": 1.3923225859242865e-05,
|
||
|
|
"epoch": 3.2896935933147633,
|
||
|
|
"step": 592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6893,
|
||
|
|
"grad_norm": 9.367558479309082,
|
||
|
|
"learning_rate": 1.3910078337185035e-05,
|
||
|
|
"epoch": 3.2952646239554317,
|
||
|
|
"step": 593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7126,
|
||
|
|
"grad_norm": 10.658570289611816,
|
||
|
|
"learning_rate": 1.3896930815127206e-05,
|
||
|
|
"epoch": 3.3008356545961,
|
||
|
|
"step": 594
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8014,
|
||
|
|
"grad_norm": 8.675304412841797,
|
||
|
|
"learning_rate": 1.3883783293069372e-05,
|
||
|
|
"epoch": 3.3064066852367686,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.7078,
|
||
|
|
"grad_norm": 6.470170974731445,
|
||
|
|
"learning_rate": 1.3870635771011543e-05,
|
||
|
|
"epoch": 3.3119777158774375,
|
||
|
|
"step": 596
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.6612,
|
||
|
|
"grad_norm": 7.141599178314209,
|
||
|
|
"learning_rate": 1.3857488248953713e-05,
|
||
|
|
"epoch": 3.317548746518106,
|
||
|
|
"step": 597
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8968,
|
||
|
|
"grad_norm": 9.977639198303223,
|
||
|
|
"learning_rate": 1.384434072689588e-05,
|
||
|
|
"epoch": 3.3231197771587744,
|
||
|
|
"step": 598
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.8395,
|
||
|
|
"grad_norm": 10.208252906799316,
|
||
|
|
"learning_rate": 1.383119320483805e-05,
|
||
|
|
"epoch": 3.328690807799443,
|
||
|
|
"step": 599
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.9248,
|
||
|
|
"grad_norm": 9.933085441589355,
|
||
|
|
"learning_rate": 1.381804568278022e-05,
|
||
|
|
"epoch": 3.3342618384401113,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 2.9689695835113525,
|
||
|
|
"eval_runtime": 35.9526,
|
||
|
|
"eval_samples_per_second": 39.941,
|
||
|
|
"eval_steps_per_second": 2.003,
|
||
|
|
"epoch": 3.3342618384401113,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"train_runtime": 2220.807,
|
||
|
|
"train_samples_per_second": 5.944,
|
||
|
|
"train_steps_per_second": 0.743,
|
||
|
|
"total_flos": 4.35104765343744e+16,
|
||
|
|
"train_loss": 1.654274252106746,
|
||
|
|
"epoch": 3.3342618384401113,
|
||
|
|
"step": 600
|
||
|
|
}
|
||
|
|
]</pre></details>
|
||
|
|
|
||
|
|
<script type="application/json" id="run-payload">{"run_meta": {"model": "unsloth/Phi-4-unsloth-bnb-4bit", "dataset": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "examples_total": 2872, "examples_train": 1436, "examples_eval": 1436, "world_size": 1, "effective_batch_size": 8, "steps_per_epoch_approx": 179.5, "max_steps": 2000, "eval_steps": 50, "save_steps": 50, "learning_rate": 9.95267419777795e-06, "warmup_steps": 10, "lr_scheduler_type": "linear", "weight_decay": 0.009206070410847844, "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.0, "best_checkpoint": "outputs/continue_r1_from_350_20260112_073729/checkpoint-100", "LR_AUTO_ENABLED": true, "LR_AUTO_USE_N": "train", "LR_AUTO_N_REF": 1436, "LR_AUTO_BASE": 1e-05, "LR_AUTO_MULT": 0.5, "LR_AUTO_FINAL": 5e-06, "best_step": 100, "best_eval_loss": 2.2380564212799072, "best_blended": 1.3520409573791146, "best_blended_step": 600}, "config_snapshot": {"MODEL_NAME": "unsloth/Phi-4-unsloth-bnb-4bit", "CHAT_TEMPLATE": "phi-4", "MAX_SEQ_LENGTH": 2048, "LOAD_IN_4BIT": true, "DATASET_NAME": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "DATASET_SPLIT": "train", "PER_DEVICE_TRAIN_BATCH_SIZE": 2, "GRADIENT_ACCUMULATION_STEPS": 4, "WARMUP_STEPS": 10, "MAX_STEPS": 2000, "LEARNING_RATE": 9.95267419777795e-06, "WEIGHT_DECAY": 0.009206070410847844, "LR_SCHEDULER_TYPE": "linear", "SEED": 3407, "PLOTLY_DARK_MODE": true, "PLOTLY_BASE_COLOR": "#00CC96", "PLOTLY_EMA_SPAN": 25, "LR_AUTO_ENABLED": true, "LR_AUTO_USE_N": "train", "LR_AUTO_N_REF": 1436, "LR_AUTO_BASE": 1e-05, "LR_AUTO_MULT": 0.5, "LR_AUTO_FINAL": 5e-06}, "run_manifest": {"model_name": "unsloth/Phi-4-unsloth-bnb-4bit", "dataset": {"name": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "split": "train"}, "training": {"max_steps": 2000, "learning_rate": 9.95267419777795e-06, "per_device_train_batch_size": 2, "gradient_accumulation_steps": 4, "max_seq_length": 2048, "seed": 3407, "optimizer": 
"adamw_8bit", "lr_scheduler_type": "linear"}, "auto_lr": {"enabled": true, "use_n": "train", "n_ref": 1436, "base": 1e-05, "mult": 0.5, "final": 5e-06}, "best": {"checkpoint": "/content/outputs/continue_r1_from_350_20260112_073729/checkpoint-100", "metric": 2.2380564212799072, "metric_name": "eval_loss"}, "plotly": {"html": "training_loss_step.html"}}, "log_history": [{"loss": 2.2041, "grad_norm": 4.026190757751465, "learning_rate": 0.0, "epoch": 0.005571030640668524, "step": 1}, {"loss": 2.6901, "grad_norm": 1.616629719734192, "learning_rate": 1.4636285584967574e-07, "epoch": 0.011142061281337047, "step": 2}, {"loss": 2.6774, "grad_norm": 13.836981773376465, "learning_rate": 2.927257116993515e-07, "epoch": 0.016713091922005572, "step": 3}, {"loss": 2.4478, "grad_norm": 1.857710361480713, "learning_rate": 4.3908856754902726e-07, "epoch": 0.022284122562674095, "step": 4}, {"loss": 1.9988, "grad_norm": 1.4818029403686523, "learning_rate": 5.85451423398703e-07, "epoch": 0.027855153203342618, "step": 5}, {"loss": 2.0358, "grad_norm": 1.726440191268921, "learning_rate": 7.318142792483787e-07, "epoch": 0.033426183844011144, "step": 6}, {"loss": 2.5824, "grad_norm": 2.0604233741760254, "learning_rate": 8.781771350980545e-07, "epoch": 0.03899721448467967, "step": 7}, {"loss": 2.3479, "grad_norm": 1.7288694381713867, "learning_rate": 1.0245399909477302e-06, "epoch": 0.04456824512534819, "step": 8}, {"loss": 2.6387, "grad_norm": 1.9069620370864868, "learning_rate": 1.170902846797406e-06, "epoch": 0.05013927576601671, "step": 9}, {"loss": 2.6083, "grad_norm": 1.4719465970993042, "learning_rate": 1.3172657026470817e-06, "epoch": 0.055710306406685235, "step": 10}, {"loss": 2.3015, "grad_norm": 1.6306267976760864, "learning_rate": 1.4636285584967574e-06, "epoch": 0.06128133704735376, "step": 11}, {"loss": 2.7042, "grad_norm": 1.4724116325378418, "learning_rate": 1.6099914143464333e-06, "epoch": 0.06685236768802229, "step": 12}, {"loss": 2.5386, "grad_norm": 1.5470020771026611, 
"learning_rate": 1.756354270196109e-06, "epoch": 0.07242339832869081, "step": 13}, {"loss": 2.5886
|
||
|
|
</div>
|
||
|
|
</body>
|
||
|
|
</html>
|