3211 lines
82 KiB
JSON
3211 lines
82 KiB
JSON
{
|
|
"best_global_step": 264,
|
|
"best_metric": 1.308773159980774,
|
|
"best_model_checkpoint": "saves/qwen3-4B/medical-o1-sft-full-1e-5/checkpoint-264",
|
|
"epoch": 3.0,
|
|
"eval_steps": 44,
|
|
"global_step": 441,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006837606837606838,
|
|
"grad_norm": 24.729957580566406,
|
|
"learning_rate": 0.0,
|
|
"loss": 2.180166482925415,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.013675213675213675,
|
|
"grad_norm": 25.152711868286133,
|
|
"learning_rate": 4.347826086956522e-07,
|
|
"loss": 2.1789543628692627,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.020512820512820513,
|
|
"grad_norm": 24.6761417388916,
|
|
"learning_rate": 8.695652173913044e-07,
|
|
"loss": 2.204561233520508,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.02735042735042735,
|
|
"grad_norm": 24.276906967163086,
|
|
"learning_rate": 1.3043478260869566e-06,
|
|
"loss": 2.1825883388519287,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.03418803418803419,
|
|
"grad_norm": 23.327831268310547,
|
|
"learning_rate": 1.7391304347826088e-06,
|
|
"loss": 2.2022361755371094,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.041025641025641026,
|
|
"grad_norm": 20.180011749267578,
|
|
"learning_rate": 2.173913043478261e-06,
|
|
"loss": 2.0757670402526855,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.04786324786324787,
|
|
"grad_norm": 18.820642471313477,
|
|
"learning_rate": 2.6086956521739132e-06,
|
|
"loss": 2.024721145629883,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.0547008547008547,
|
|
"grad_norm": 13.223835945129395,
|
|
"learning_rate": 3.043478260869566e-06,
|
|
"loss": 1.9034565687179565,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.06153846153846154,
|
|
"grad_norm": 11.584263801574707,
|
|
"learning_rate": 3.4782608695652175e-06,
|
|
"loss": 1.8130236864089966,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.06837606837606838,
|
|
"grad_norm": 5.6841607093811035,
|
|
"learning_rate": 3.91304347826087e-06,
|
|
"loss": 1.6309248208999634,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.07521367521367521,
|
|
"grad_norm": 4.208008766174316,
|
|
"learning_rate": 4.347826086956522e-06,
|
|
"loss": 1.5361576080322266,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.08205128205128205,
|
|
"grad_norm": 3.528555154800415,
|
|
"learning_rate": 4.782608695652174e-06,
|
|
"loss": 1.6088225841522217,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.08888888888888889,
|
|
"grad_norm": 3.099165916442871,
|
|
"learning_rate": 5.2173913043478265e-06,
|
|
"loss": 1.5432047843933105,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.09572649572649573,
|
|
"grad_norm": 6.412608623504639,
|
|
"learning_rate": 5.652173913043479e-06,
|
|
"loss": 1.5963867902755737,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.10256410256410256,
|
|
"grad_norm": 5.609615802764893,
|
|
"learning_rate": 6.086956521739132e-06,
|
|
"loss": 1.5698325634002686,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.1094017094017094,
|
|
"grad_norm": 4.161319255828857,
|
|
"learning_rate": 6.521739130434783e-06,
|
|
"loss": 1.555444598197937,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.11623931623931624,
|
|
"grad_norm": 3.2057743072509766,
|
|
"learning_rate": 6.956521739130435e-06,
|
|
"loss": 1.475843906402588,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.12307692307692308,
|
|
"grad_norm": 2.5646772384643555,
|
|
"learning_rate": 7.391304347826087e-06,
|
|
"loss": 1.509574294090271,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.12991452991452992,
|
|
"grad_norm": 1.9250593185424805,
|
|
"learning_rate": 7.82608695652174e-06,
|
|
"loss": 1.4932482242584229,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.13675213675213677,
|
|
"grad_norm": 1.6663166284561157,
|
|
"learning_rate": 8.260869565217392e-06,
|
|
"loss": 1.4706228971481323,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.14358974358974358,
|
|
"grad_norm": 1.488690733909607,
|
|
"learning_rate": 8.695652173913044e-06,
|
|
"loss": 1.4192920923233032,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.15042735042735042,
|
|
"grad_norm": 1.3503153324127197,
|
|
"learning_rate": 9.130434782608697e-06,
|
|
"loss": 1.427452802658081,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.15726495726495726,
|
|
"grad_norm": 1.2214534282684326,
|
|
"learning_rate": 9.565217391304349e-06,
|
|
"loss": 1.4610393047332764,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.1641025641025641,
|
|
"grad_norm": 1.1983873844146729,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.4273948669433594,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.17094017094017094,
|
|
"grad_norm": 1.1930960416793823,
|
|
"learning_rate": 9.999858783596665e-06,
|
|
"loss": 1.4003199338912964,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.17777777777777778,
|
|
"grad_norm": 1.0275226831436157,
|
|
"learning_rate": 9.999435142363484e-06,
|
|
"loss": 1.4090672731399536,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.18461538461538463,
|
|
"grad_norm": 1.001726508140564,
|
|
"learning_rate": 9.998729100230497e-06,
|
|
"loss": 1.3982799053192139,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.19145299145299147,
|
|
"grad_norm": 0.9476358890533447,
|
|
"learning_rate": 9.997740697079595e-06,
|
|
"loss": 1.4250205755233765,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.19829059829059828,
|
|
"grad_norm": 0.9169353246688843,
|
|
"learning_rate": 9.99646998874227e-06,
|
|
"loss": 1.407841682434082,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.20512820512820512,
|
|
"grad_norm": 0.9049670696258545,
|
|
"learning_rate": 9.994917046996472e-06,
|
|
"loss": 1.4163107872009277,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.21196581196581196,
|
|
"grad_norm": 0.902590811252594,
|
|
"learning_rate": 9.993081959562539e-06,
|
|
"loss": 1.4395619630813599,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.2188034188034188,
|
|
"grad_norm": 0.9725260138511658,
|
|
"learning_rate": 9.990964830098246e-06,
|
|
"loss": 1.4067661762237549,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.22564102564102564,
|
|
"grad_norm": 0.8750798106193542,
|
|
"learning_rate": 9.98856577819296e-06,
|
|
"loss": 1.4079771041870117,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.23247863247863249,
|
|
"grad_norm": 0.8549812436103821,
|
|
"learning_rate": 9.985884939360873e-06,
|
|
"loss": 1.398482322692871,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.23931623931623933,
|
|
"grad_norm": 0.869503378868103,
|
|
"learning_rate": 9.98292246503335e-06,
|
|
"loss": 1.344150424003601,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.24615384615384617,
|
|
"grad_norm": 0.9242067337036133,
|
|
"learning_rate": 9.979678522550382e-06,
|
|
"loss": 1.37479567527771,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.252991452991453,
|
|
"grad_norm": 0.8416987657546997,
|
|
"learning_rate": 9.976153295151123e-06,
|
|
"loss": 1.3731480836868286,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.25982905982905985,
|
|
"grad_norm": 0.9907390475273132,
|
|
"learning_rate": 9.972346981963546e-06,
|
|
"loss": 1.3624351024627686,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.26666666666666666,
|
|
"grad_norm": 0.8205696940422058,
|
|
"learning_rate": 9.968259797993197e-06,
|
|
"loss": 1.3645293712615967,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.27350427350427353,
|
|
"grad_norm": 0.8257843852043152,
|
|
"learning_rate": 9.963891974111042e-06,
|
|
"loss": 1.3727067708969116,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.28034188034188035,
|
|
"grad_norm": 0.7986466288566589,
|
|
"learning_rate": 9.959243757040434e-06,
|
|
"loss": 1.3945657014846802,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.28717948717948716,
|
|
"grad_norm": 0.9684669971466064,
|
|
"learning_rate": 9.95431540934317e-06,
|
|
"loss": 1.3376381397247314,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.294017094017094,
|
|
"grad_norm": 0.7717859148979187,
|
|
"learning_rate": 9.949107209404664e-06,
|
|
"loss": 1.354946494102478,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.30085470085470084,
|
|
"grad_norm": 0.8021324276924133,
|
|
"learning_rate": 9.943619451418225e-06,
|
|
"loss": 1.3951725959777832,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.30085470085470084,
|
|
"eval_loss": 1.362805724143982,
|
|
"eval_runtime": 24.9887,
|
|
"eval_samples_per_second": 39.458,
|
|
"eval_steps_per_second": 4.962,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.3076923076923077,
|
|
"grad_norm": 0.829911470413208,
|
|
"learning_rate": 9.937852445368427e-06,
|
|
"loss": 1.3832783699035645,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.3145299145299145,
|
|
"grad_norm": 0.8109715580940247,
|
|
"learning_rate": 9.931806517013612e-06,
|
|
"loss": 1.3637301921844482,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.3213675213675214,
|
|
"grad_norm": 0.7627991437911987,
|
|
"learning_rate": 9.925482007867485e-06,
|
|
"loss": 1.3353031873703003,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.3282051282051282,
|
|
"grad_norm": 0.7720788717269897,
|
|
"learning_rate": 9.918879275179819e-06,
|
|
"loss": 1.367252230644226,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.335042735042735,
|
|
"grad_norm": 0.7520493865013123,
|
|
"learning_rate": 9.911998691916275e-06,
|
|
"loss": 1.386542797088623,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.3418803418803419,
|
|
"grad_norm": 0.7559177875518799,
|
|
"learning_rate": 9.904840646737346e-06,
|
|
"loss": 1.3789976835250854,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.3487179487179487,
|
|
"grad_norm": 0.770207405090332,
|
|
"learning_rate": 9.89740554397639e-06,
|
|
"loss": 1.356705904006958,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.7609772086143494,
|
|
"learning_rate": 9.889693803616793e-06,
|
|
"loss": 1.3461980819702148,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.3623931623931624,
|
|
"grad_norm": 0.7604424953460693,
|
|
"learning_rate": 9.881705861268252e-06,
|
|
"loss": 1.344923496246338,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.36923076923076925,
|
|
"grad_norm": 0.7701961398124695,
|
|
"learning_rate": 9.873442168142158e-06,
|
|
"loss": 1.364449143409729,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.37606837606837606,
|
|
"grad_norm": 0.7939377427101135,
|
|
"learning_rate": 9.864903191026125e-06,
|
|
"loss": 1.4013525247573853,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.38290598290598293,
|
|
"grad_norm": 0.7690542340278625,
|
|
"learning_rate": 9.856089412257605e-06,
|
|
"loss": 1.3586581945419312,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.38974358974358975,
|
|
"grad_norm": 0.798068106174469,
|
|
"learning_rate": 9.847001329696653e-06,
|
|
"loss": 1.3378022909164429,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.39658119658119656,
|
|
"grad_norm": 0.7824757695198059,
|
|
"learning_rate": 9.837639456697802e-06,
|
|
"loss": 1.3118129968643188,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.40341880341880343,
|
|
"grad_norm": 0.7629351019859314,
|
|
"learning_rate": 9.828004322081067e-06,
|
|
"loss": 1.3393217325210571,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.41025641025641024,
|
|
"grad_norm": 0.7708514332771301,
|
|
"learning_rate": 9.818096470102067e-06,
|
|
"loss": 1.3732938766479492,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4170940170940171,
|
|
"grad_norm": 0.8133201003074646,
|
|
"learning_rate": 9.807916460421294e-06,
|
|
"loss": 1.3423891067504883,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.4239316239316239,
|
|
"grad_norm": 0.7727287411689758,
|
|
"learning_rate": 9.797464868072489e-06,
|
|
"loss": 1.3378151655197144,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.4307692307692308,
|
|
"grad_norm": 0.7684638500213623,
|
|
"learning_rate": 9.78674228343016e-06,
|
|
"loss": 1.3335256576538086,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.4376068376068376,
|
|
"grad_norm": 0.7602411508560181,
|
|
"learning_rate": 9.775749312176249e-06,
|
|
"loss": 1.3320605754852295,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 0.8044481873512268,
|
|
"learning_rate": 9.764486575265893e-06,
|
|
"loss": 1.3325685262680054,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.4512820512820513,
|
|
"grad_norm": 0.7876479029655457,
|
|
"learning_rate": 9.752954708892379e-06,
|
|
"loss": 1.3242830038070679,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.4581196581196581,
|
|
"grad_norm": 0.7659040689468384,
|
|
"learning_rate": 9.741154364451179e-06,
|
|
"loss": 1.3692903518676758,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.46495726495726497,
|
|
"grad_norm": 0.8316842317581177,
|
|
"learning_rate": 9.729086208503174e-06,
|
|
"loss": 1.344923734664917,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.4717948717948718,
|
|
"grad_norm": 0.8216245174407959,
|
|
"learning_rate": 9.716750922736998e-06,
|
|
"loss": 1.3780957460403442,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.47863247863247865,
|
|
"grad_norm": 0.7839699387550354,
|
|
"learning_rate": 9.704149203930522e-06,
|
|
"loss": 1.3786989450454712,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.48547008547008547,
|
|
"grad_norm": 0.7707169055938721,
|
|
"learning_rate": 9.691281763911513e-06,
|
|
"loss": 1.3283625841140747,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.49230769230769234,
|
|
"grad_norm": 0.7598075270652771,
|
|
"learning_rate": 9.67814932951741e-06,
|
|
"loss": 1.3375245332717896,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.49914529914529915,
|
|
"grad_norm": 0.8022596836090088,
|
|
"learning_rate": 9.664752642554272e-06,
|
|
"loss": 1.3409022092819214,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.505982905982906,
|
|
"grad_norm": 0.7512302398681641,
|
|
"learning_rate": 9.651092459754879e-06,
|
|
"loss": 1.2996271848678589,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.5128205128205128,
|
|
"grad_norm": 0.7390022277832031,
|
|
"learning_rate": 9.637169552735985e-06,
|
|
"loss": 1.3141694068908691,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.5196581196581197,
|
|
"grad_norm": 0.7599424123764038,
|
|
"learning_rate": 9.622984707954732e-06,
|
|
"loss": 1.3220386505126953,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.5264957264957265,
|
|
"grad_norm": 0.7562436461448669,
|
|
"learning_rate": 9.608538726664224e-06,
|
|
"loss": 1.3605300188064575,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.5333333333333333,
|
|
"grad_norm": 0.7731190919876099,
|
|
"learning_rate": 9.593832424868271e-06,
|
|
"loss": 1.3461638689041138,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.5401709401709401,
|
|
"grad_norm": 0.7543560266494751,
|
|
"learning_rate": 9.578866633275289e-06,
|
|
"loss": 1.340885877609253,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.5470085470085471,
|
|
"grad_norm": 0.772647500038147,
|
|
"learning_rate": 9.563642197251382e-06,
|
|
"loss": 1.3663382530212402,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5538461538461539,
|
|
"grad_norm": 0.7314751148223877,
|
|
"learning_rate": 9.548159976772593e-06,
|
|
"loss": 1.3287297487258911,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.5606837606837607,
|
|
"grad_norm": 0.7391103506088257,
|
|
"learning_rate": 9.532420846376316e-06,
|
|
"loss": 1.3285285234451294,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.5675213675213675,
|
|
"grad_norm": 0.7641813158988953,
|
|
"learning_rate": 9.516425695111906e-06,
|
|
"loss": 1.3269128799438477,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.5743589743589743,
|
|
"grad_norm": 0.7769819498062134,
|
|
"learning_rate": 9.500175426490455e-06,
|
|
"loss": 1.3374706506729126,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.5811965811965812,
|
|
"grad_norm": 0.7199158668518066,
|
|
"learning_rate": 9.48367095843376e-06,
|
|
"loss": 1.3117002248764038,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.588034188034188,
|
|
"grad_norm": 0.7510148882865906,
|
|
"learning_rate": 9.466913223222467e-06,
|
|
"loss": 1.3387565612792969,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.5948717948717949,
|
|
"grad_norm": 0.7325724363327026,
|
|
"learning_rate": 9.449903167443415e-06,
|
|
"loss": 1.269672155380249,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.6017094017094017,
|
|
"grad_norm": 0.7675944566726685,
|
|
"learning_rate": 9.432641751936162e-06,
|
|
"loss": 1.3153454065322876,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.6017094017094017,
|
|
"eval_loss": 1.3318638801574707,
|
|
"eval_runtime": 24.6717,
|
|
"eval_samples_per_second": 39.965,
|
|
"eval_steps_per_second": 5.026,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.6085470085470085,
|
|
"grad_norm": 0.7539426684379578,
|
|
"learning_rate": 9.415129951738713e-06,
|
|
"loss": 1.378519058227539,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.6153846153846154,
|
|
"grad_norm": 0.7739952802658081,
|
|
"learning_rate": 9.397368756032445e-06,
|
|
"loss": 1.3163981437683105,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.6222222222222222,
|
|
"grad_norm": 0.7639786005020142,
|
|
"learning_rate": 9.379359168086231e-06,
|
|
"loss": 1.3244612216949463,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.629059829059829,
|
|
"grad_norm": 0.7307687997817993,
|
|
"learning_rate": 9.361102205199762e-06,
|
|
"loss": 1.3425580263137817,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.6358974358974359,
|
|
"grad_norm": 0.7326052188873291,
|
|
"learning_rate": 9.34259889864609e-06,
|
|
"loss": 1.349947452545166,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.6427350427350428,
|
|
"grad_norm": 0.7336087822914124,
|
|
"learning_rate": 9.32385029361338e-06,
|
|
"loss": 1.3235843181610107,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.6495726495726496,
|
|
"grad_norm": 0.7857178449630737,
|
|
"learning_rate": 9.304857449145858e-06,
|
|
"loss": 1.29775071144104,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.6564102564102564,
|
|
"grad_norm": 0.7694044709205627,
|
|
"learning_rate": 9.285621438083997e-06,
|
|
"loss": 1.3575528860092163,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.6632478632478632,
|
|
"grad_norm": 0.7426573634147644,
|
|
"learning_rate": 9.26614334700392e-06,
|
|
"loss": 1.334963083267212,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.67008547008547,
|
|
"grad_norm": 0.7567334175109863,
|
|
"learning_rate": 9.246424276156008e-06,
|
|
"loss": 1.335172176361084,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.676923076923077,
|
|
"grad_norm": 0.733529269695282,
|
|
"learning_rate": 9.226465339402768e-06,
|
|
"loss": 1.3033547401428223,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.6837606837606838,
|
|
"grad_norm": 0.7475197315216064,
|
|
"learning_rate": 9.206267664155906e-06,
|
|
"loss": 1.316215991973877,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6905982905982906,
|
|
"grad_norm": 0.7870779633522034,
|
|
"learning_rate": 9.185832391312644e-06,
|
|
"loss": 1.347679853439331,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.6974358974358974,
|
|
"grad_norm": 0.764722466468811,
|
|
"learning_rate": 9.165160675191272e-06,
|
|
"loss": 1.305860996246338,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.7042735042735043,
|
|
"grad_norm": 0.7680871486663818,
|
|
"learning_rate": 9.144253683465953e-06,
|
|
"loss": 1.3211126327514648,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.734742283821106,
|
|
"learning_rate": 9.123112597100759e-06,
|
|
"loss": 1.2861220836639404,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.717948717948718,
|
|
"grad_norm": 0.7347426414489746,
|
|
"learning_rate": 9.101738610282956e-06,
|
|
"loss": 1.315138578414917,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.7247863247863248,
|
|
"grad_norm": 0.7639749646186829,
|
|
"learning_rate": 9.080132930355567e-06,
|
|
"loss": 1.3426464796066284,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.7316239316239316,
|
|
"grad_norm": 0.7904943227767944,
|
|
"learning_rate": 9.058296777749154e-06,
|
|
"loss": 1.334005355834961,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.7384615384615385,
|
|
"grad_norm": 0.780296266078949,
|
|
"learning_rate": 9.03623138591289e-06,
|
|
"loss": 1.3893626928329468,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.7452991452991453,
|
|
"grad_norm": 0.7619044184684753,
|
|
"learning_rate": 9.013938001244885e-06,
|
|
"loss": 1.3112680912017822,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.7521367521367521,
|
|
"grad_norm": 0.7852951884269714,
|
|
"learning_rate": 8.99141788302178e-06,
|
|
"loss": 1.3263344764709473,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.7589743589743589,
|
|
"grad_norm": 0.746293306350708,
|
|
"learning_rate": 8.968672303327614e-06,
|
|
"loss": 1.3137162923812866,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.7658119658119659,
|
|
"grad_norm": 0.7697060704231262,
|
|
"learning_rate": 8.94570254698197e-06,
|
|
"loss": 1.305846095085144,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.7726495726495727,
|
|
"grad_norm": 0.7505799531936646,
|
|
"learning_rate": 8.922509911467395e-06,
|
|
"loss": 1.3263046741485596,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.7794871794871795,
|
|
"grad_norm": 0.7378644347190857,
|
|
"learning_rate": 8.899095706856122e-06,
|
|
"loss": 1.2952595949172974,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.7863247863247863,
|
|
"grad_norm": 0.7393775582313538,
|
|
"learning_rate": 8.875461255736055e-06,
|
|
"loss": 1.314041018486023,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.7931623931623931,
|
|
"grad_norm": 0.7198286056518555,
|
|
"learning_rate": 8.851607893136065e-06,
|
|
"loss": 1.301222801208496,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 0.7539902925491333,
|
|
"learning_rate": 8.827536966450584e-06,
|
|
"loss": 1.3459645509719849,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.8068376068376069,
|
|
"grad_norm": 0.728272020816803,
|
|
"learning_rate": 8.803249835363486e-06,
|
|
"loss": 1.3075345754623413,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.8136752136752137,
|
|
"grad_norm": 0.7353615164756775,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 1.2967561483383179,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.8205128205128205,
|
|
"grad_norm": 0.7358576655387878,
|
|
"learning_rate": 8.754032459705672e-06,
|
|
"loss": 1.3145124912261963,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8273504273504273,
|
|
"grad_norm": 0.7736720442771912,
|
|
"learning_rate": 8.729104995255265e-06,
|
|
"loss": 1.3146538734436035,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.8341880341880342,
|
|
"grad_norm": 0.7337418794631958,
|
|
"learning_rate": 8.703966886486819e-06,
|
|
"loss": 1.2823609113693237,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.841025641025641,
|
|
"grad_norm": 0.7514926195144653,
|
|
"learning_rate": 8.67861955336566e-06,
|
|
"loss": 1.3389618396759033,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.8478632478632478,
|
|
"grad_norm": 0.7190932035446167,
|
|
"learning_rate": 8.65306442767547e-06,
|
|
"loss": 1.3115108013153076,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.8547008547008547,
|
|
"grad_norm": 0.7332461476325989,
|
|
"learning_rate": 8.627302952937431e-06,
|
|
"loss": 1.333253264427185,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.8615384615384616,
|
|
"grad_norm": 0.7428878545761108,
|
|
"learning_rate": 8.601336584328659e-06,
|
|
"loss": 1.3187751770019531,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.8683760683760684,
|
|
"grad_norm": 0.7715012431144714,
|
|
"learning_rate": 8.575166788600031e-06,
|
|
"loss": 1.3300316333770752,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.8752136752136752,
|
|
"grad_norm": 0.7566640973091125,
|
|
"learning_rate": 8.548795043993316e-06,
|
|
"loss": 1.307992696762085,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.882051282051282,
|
|
"grad_norm": 0.7760566473007202,
|
|
"learning_rate": 8.522222840157687e-06,
|
|
"loss": 1.32774817943573,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 0.7682384848594666,
|
|
"learning_rate": 8.495451678065563e-06,
|
|
"loss": 1.3295447826385498,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.8957264957264958,
|
|
"grad_norm": 0.7397897839546204,
|
|
"learning_rate": 8.468483069927832e-06,
|
|
"loss": 1.3145328760147095,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.9025641025641026,
|
|
"grad_norm": 0.7603890299797058,
|
|
"learning_rate": 8.441318539108433e-06,
|
|
"loss": 1.3174394369125366,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.9025641025641026,
|
|
"eval_loss": 1.317511796951294,
|
|
"eval_runtime": 24.6804,
|
|
"eval_samples_per_second": 39.951,
|
|
"eval_steps_per_second": 5.024,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.9094017094017094,
|
|
"grad_norm": 0.7623502612113953,
|
|
"learning_rate": 8.413959620038306e-06,
|
|
"loss": 1.3393348455429077,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.9162393162393162,
|
|
"grad_norm": 0.7669332027435303,
|
|
"learning_rate": 8.386407858128707e-06,
|
|
"loss": 1.302769660949707,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.9230769230769231,
|
|
"grad_norm": 0.7234067320823669,
|
|
"learning_rate": 8.358664809683926e-06,
|
|
"loss": 1.3381096124649048,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.9299145299145299,
|
|
"grad_norm": 0.7574735283851624,
|
|
"learning_rate": 8.330732041813367e-06,
|
|
"loss": 1.335377812385559,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.9367521367521368,
|
|
"grad_norm": 0.7575842142105103,
|
|
"learning_rate": 8.302611132343042e-06,
|
|
"loss": 1.3330005407333374,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.9435897435897436,
|
|
"grad_norm": 0.7127556800842285,
|
|
"learning_rate": 8.274303669726427e-06,
|
|
"loss": 1.2971893548965454,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.9504273504273504,
|
|
"grad_norm": 0.8172794580459595,
|
|
"learning_rate": 8.245811252954741e-06,
|
|
"loss": 1.3225749731063843,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.9572649572649573,
|
|
"grad_norm": 0.7154548764228821,
|
|
"learning_rate": 8.217135491466636e-06,
|
|
"loss": 1.2955387830734253,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9641025641025641,
|
|
"grad_norm": 0.7610012888908386,
|
|
"learning_rate": 8.18827800505727e-06,
|
|
"loss": 1.3369195461273193,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.9709401709401709,
|
|
"grad_norm": 0.7487711906433105,
|
|
"learning_rate": 8.15924042378682e-06,
|
|
"loss": 1.2916451692581177,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.9777777777777777,
|
|
"grad_norm": 0.7546627521514893,
|
|
"learning_rate": 8.130024387888402e-06,
|
|
"loss": 1.310347318649292,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.9846153846153847,
|
|
"grad_norm": 0.7537707090377808,
|
|
"learning_rate": 8.100631547675417e-06,
|
|
"loss": 1.3267855644226074,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.9914529914529915,
|
|
"grad_norm": 0.7335416078567505,
|
|
"learning_rate": 8.071063563448341e-06,
|
|
"loss": 1.2958036661148071,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.9982905982905983,
|
|
"grad_norm": 0.773562490940094,
|
|
"learning_rate": 8.041322105400923e-06,
|
|
"loss": 1.2804107666015625,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.4411433935165405,
|
|
"learning_rate": 8.01140885352586e-06,
|
|
"loss": 1.3802165985107422,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 1.0068376068376068,
|
|
"grad_norm": 0.9124190211296082,
|
|
"learning_rate": 7.981325497519892e-06,
|
|
"loss": 1.2135487794876099,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 1.0136752136752136,
|
|
"grad_norm": 0.8284032344818115,
|
|
"learning_rate": 7.951073736688348e-06,
|
|
"loss": 1.1935949325561523,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 1.0205128205128204,
|
|
"grad_norm": 0.8174305558204651,
|
|
"learning_rate": 7.920655279849173e-06,
|
|
"loss": 1.2410966157913208,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 1.0273504273504273,
|
|
"grad_norm": 0.7865321040153503,
|
|
"learning_rate": 7.890071845236395e-06,
|
|
"loss": 1.2489113807678223,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 1.0341880341880343,
|
|
"grad_norm": 0.812463104724884,
|
|
"learning_rate": 7.859325160403073e-06,
|
|
"loss": 1.1999475955963135,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 1.041025641025641,
|
|
"grad_norm": 0.8780131936073303,
|
|
"learning_rate": 7.8284169621237e-06,
|
|
"loss": 1.2193069458007812,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 1.047863247863248,
|
|
"grad_norm": 0.8348581790924072,
|
|
"learning_rate": 7.797348996296116e-06,
|
|
"loss": 1.1925896406173706,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 1.0547008547008547,
|
|
"grad_norm": 0.8675538897514343,
|
|
"learning_rate": 7.766123017842877e-06,
|
|
"loss": 1.2143549919128418,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 1.0615384615384615,
|
|
"grad_norm": 0.8252431750297546,
|
|
"learning_rate": 7.734740790612137e-06,
|
|
"loss": 1.2455641031265259,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 1.0683760683760684,
|
|
"grad_norm": 0.8385781049728394,
|
|
"learning_rate": 7.703204087277989e-06,
|
|
"loss": 1.2102444171905518,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 1.0752136752136752,
|
|
"grad_norm": 0.827889084815979,
|
|
"learning_rate": 7.671514689240366e-06,
|
|
"loss": 1.2144052982330322,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 1.082051282051282,
|
|
"grad_norm": 0.7633846998214722,
|
|
"learning_rate": 7.639674386524395e-06,
|
|
"loss": 1.2118767499923706,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 1.0888888888888888,
|
|
"grad_norm": 0.8267090320587158,
|
|
"learning_rate": 7.607684977679284e-06,
|
|
"loss": 1.188737392425537,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0957264957264958,
|
|
"grad_norm": 0.8270633816719055,
|
|
"learning_rate": 7.575548269676741e-06,
|
|
"loss": 1.214994192123413,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 1.1025641025641026,
|
|
"grad_norm": 0.8160786628723145,
|
|
"learning_rate": 7.543266077808893e-06,
|
|
"loss": 1.221800446510315,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 1.1094017094017095,
|
|
"grad_norm": 0.829490065574646,
|
|
"learning_rate": 7.510840225585749e-06,
|
|
"loss": 1.1974472999572754,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 1.1162393162393163,
|
|
"grad_norm": 0.8170298933982849,
|
|
"learning_rate": 7.478272544632204e-06,
|
|
"loss": 1.2150561809539795,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 1.123076923076923,
|
|
"grad_norm": 0.7731851935386658,
|
|
"learning_rate": 7.44556487458456e-06,
|
|
"loss": 1.1988686323165894,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 1.12991452991453,
|
|
"grad_norm": 0.7923320531845093,
|
|
"learning_rate": 7.412719062986632e-06,
|
|
"loss": 1.2086683511734009,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 1.1367521367521367,
|
|
"grad_norm": 0.7592716217041016,
|
|
"learning_rate": 7.379736965185369e-06,
|
|
"loss": 1.215879201889038,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 1.1435897435897435,
|
|
"grad_norm": 0.7586809396743774,
|
|
"learning_rate": 7.3466204442260605e-06,
|
|
"loss": 1.2311599254608154,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 1.1504273504273503,
|
|
"grad_norm": 0.7838971614837646,
|
|
"learning_rate": 7.313371370747104e-06,
|
|
"loss": 1.2183728218078613,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 1.1572649572649572,
|
|
"grad_norm": 0.7780983448028564,
|
|
"learning_rate": 7.279991622874319e-06,
|
|
"loss": 1.1952356100082397,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 1.1641025641025642,
|
|
"grad_norm": 0.7715050578117371,
|
|
"learning_rate": 7.24648308611489e-06,
|
|
"loss": 1.2417360544204712,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 1.170940170940171,
|
|
"grad_norm": 0.7692239880561829,
|
|
"learning_rate": 7.212847653250828e-06,
|
|
"loss": 1.2170333862304688,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 1.1777777777777778,
|
|
"grad_norm": 0.7896147966384888,
|
|
"learning_rate": 7.1790872242320775e-06,
|
|
"loss": 1.2121965885162354,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 1.1846153846153846,
|
|
"grad_norm": 0.8173856139183044,
|
|
"learning_rate": 7.145203706069183e-06,
|
|
"loss": 1.1911547183990479,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 1.1914529914529914,
|
|
"grad_norm": 0.7522553205490112,
|
|
"learning_rate": 7.1111990127255684e-06,
|
|
"loss": 1.210161566734314,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 1.1982905982905983,
|
|
"grad_norm": 0.7353285551071167,
|
|
"learning_rate": 7.0770750650094335e-06,
|
|
"loss": 1.1757725477218628,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 1.1982905982905983,
|
|
"eval_loss": 1.3184372186660767,
|
|
"eval_runtime": 24.8388,
|
|
"eval_samples_per_second": 39.696,
|
|
"eval_steps_per_second": 4.992,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 1.205128205128205,
|
|
"grad_norm": 0.7701054811477661,
|
|
"learning_rate": 7.042833790465241e-06,
|
|
"loss": 1.2243812084197998,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 1.2119658119658119,
|
|
"grad_norm": 0.7278676629066467,
|
|
"learning_rate": 7.008477123264849e-06,
|
|
"loss": 1.198972463607788,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 1.218803418803419,
|
|
"grad_norm": 0.7595424056053162,
|
|
"learning_rate": 6.974007004098243e-06,
|
|
"loss": 1.2435779571533203,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 1.2256410256410257,
|
|
"grad_norm": 0.7661744952201843,
|
|
"learning_rate": 6.939425380063924e-06,
|
|
"loss": 1.2413814067840576,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.2324786324786325,
|
|
"grad_norm": 0.7790281176567078,
|
|
"learning_rate": 6.9047342045589224e-06,
|
|
"loss": 1.1771953105926514,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 1.2393162393162394,
|
|
"grad_norm": 0.7655471563339233,
|
|
"learning_rate": 6.869935437168449e-06,
|
|
"loss": 1.203190565109253,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 1.2461538461538462,
|
|
"grad_norm": 0.784903347492218,
|
|
"learning_rate": 6.835031043555211e-06,
|
|
"loss": 1.2171598672866821,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 1.252991452991453,
|
|
"grad_norm": 0.7539082765579224,
|
|
"learning_rate": 6.800022995348381e-06,
|
|
"loss": 1.2139626741409302,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 1.2598290598290598,
|
|
"grad_norm": 0.7623985409736633,
|
|
"learning_rate": 6.76491327003222e-06,
|
|
"loss": 1.2187587022781372,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 1.2666666666666666,
|
|
"grad_norm": 0.7418251037597656,
|
|
"learning_rate": 6.729703850834381e-06,
|
|
"loss": 1.2088682651519775,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 1.2735042735042734,
|
|
"grad_norm": 0.7652315497398376,
|
|
"learning_rate": 6.694396726613883e-06,
|
|
"loss": 1.2204537391662598,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 1.2803418803418802,
|
|
"grad_norm": 0.7618216872215271,
|
|
"learning_rate": 6.65899389174876e-06,
|
|
"loss": 1.220557451248169,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 1.287179487179487,
|
|
"grad_norm": 0.774918794631958,
|
|
"learning_rate": 6.6234973460234184e-06,
|
|
"loss": 1.238166093826294,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 1.294017094017094,
|
|
"grad_norm": 0.7822843790054321,
|
|
"learning_rate": 6.587909094515663e-06,
|
|
"loss": 1.2424533367156982,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 1.300854700854701,
|
|
"grad_norm": 0.7934525012969971,
|
|
"learning_rate": 6.552231147483448e-06,
|
|
"loss": 1.1982380151748657,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 1.3076923076923077,
|
|
"grad_norm": 0.7817178964614868,
|
|
"learning_rate": 6.5164655202513135e-06,
|
|
"loss": 1.205663800239563,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 1.3145299145299145,
|
|
"grad_norm": 0.8002380728721619,
|
|
"learning_rate": 6.480614233096558e-06,
|
|
"loss": 1.1866426467895508,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 1.3213675213675213,
|
|
"grad_norm": 0.7488191723823547,
|
|
"learning_rate": 6.444679311135112e-06,
|
|
"loss": 1.2407163381576538,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 1.3282051282051281,
|
|
"grad_norm": 0.8069729208946228,
|
|
"learning_rate": 6.408662784207149e-06,
|
|
"loss": 1.2296785116195679,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 1.335042735042735,
|
|
"grad_norm": 0.8026877641677856,
|
|
"learning_rate": 6.372566686762427e-06,
|
|
"loss": 1.228287696838379,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 1.341880341880342,
|
|
"grad_norm": 0.7794991731643677,
|
|
"learning_rate": 6.336393057745365e-06,
|
|
"loss": 1.2325451374053955,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 1.3487179487179488,
|
|
"grad_norm": 0.7851534485816956,
|
|
"learning_rate": 6.300143940479881e-06,
|
|
"loss": 1.2433525323867798,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.7642512321472168,
|
|
"learning_rate": 6.2638213825539595e-06,
|
|
"loss": 1.2330515384674072,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 1.3623931623931624,
|
|
"grad_norm": 0.8071786165237427,
|
|
"learning_rate": 6.227427435703997e-06,
|
|
"loss": 1.2169106006622314,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3692307692307693,
|
|
"grad_norm": 0.7421261668205261,
|
|
"learning_rate": 6.190964155698903e-06,
|
|
"loss": 1.1981184482574463,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 1.376068376068376,
|
|
"grad_norm": 0.7663130760192871,
|
|
"learning_rate": 6.154433602223979e-06,
|
|
"loss": 1.184199333190918,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 1.3829059829059829,
|
|
"grad_norm": 0.778105616569519,
|
|
"learning_rate": 6.117837838764579e-06,
|
|
"loss": 1.1941637992858887,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 1.3897435897435897,
|
|
"grad_norm": 0.7876622676849365,
|
|
"learning_rate": 6.0811789324895365e-06,
|
|
"loss": 1.1943039894104004,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 1.3965811965811965,
|
|
"grad_norm": 0.7890434861183167,
|
|
"learning_rate": 6.044458954134411e-06,
|
|
"loss": 1.1947365999221802,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 1.4034188034188033,
|
|
"grad_norm": 0.7558045387268066,
|
|
"learning_rate": 6.0076799778845105e-06,
|
|
"loss": 1.1994682550430298,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 1.4102564102564101,
|
|
"grad_norm": 0.7472313046455383,
|
|
"learning_rate": 5.970844081257734e-06,
|
|
"loss": 1.210819959640503,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 1.4170940170940172,
|
|
"grad_norm": 0.7487971782684326,
|
|
"learning_rate": 5.933953344987215e-06,
|
|
"loss": 1.1884093284606934,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 1.423931623931624,
|
|
"grad_norm": 0.7524631023406982,
|
|
"learning_rate": 5.897009852903792e-06,
|
|
"loss": 1.2101268768310547,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 1.4307692307692308,
|
|
"grad_norm": 0.7583618760108948,
|
|
"learning_rate": 5.860015691818292e-06,
|
|
"loss": 1.214969515800476,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 1.4376068376068376,
|
|
"grad_norm": 0.7619627118110657,
|
|
"learning_rate": 5.82297295140367e-06,
|
|
"loss": 1.1723865270614624,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 1.4444444444444444,
|
|
"grad_norm": 0.782787024974823,
|
|
"learning_rate": 5.78588372407695e-06,
|
|
"loss": 1.2125704288482666,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 1.4512820512820512,
|
|
"grad_norm": 0.7758169174194336,
|
|
"learning_rate": 5.748750104881051e-06,
|
|
"loss": 1.219278335571289,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 1.458119658119658,
|
|
"grad_norm": 0.7914722561836243,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 1.2299978733062744,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 1.464957264957265,
|
|
"grad_norm": 0.7562519907951355,
|
|
"learning_rate": 5.674358083472598e-06,
|
|
"loss": 1.1945183277130127,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 1.471794871794872,
|
|
"grad_norm": 0.7890987396240234,
|
|
"learning_rate": 5.637103883409525e-06,
|
|
"loss": 1.228225827217102,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 1.4786324786324787,
|
|
"grad_norm": 0.7438657879829407,
|
|
"learning_rate": 5.599813695538866e-06,
|
|
"loss": 1.1812902688980103,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 1.4854700854700855,
|
|
"grad_norm": 0.7696713805198669,
|
|
"learning_rate": 5.562489626255104e-06,
|
|
"loss": 1.2277076244354248,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 1.4923076923076923,
|
|
"grad_norm": 0.8019750714302063,
|
|
"learning_rate": 5.52513378386657e-06,
|
|
"loss": 1.2309683561325073,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 1.4991452991452991,
|
|
"grad_norm": 0.7668002247810364,
|
|
"learning_rate": 5.487748278476342e-06,
|
|
"loss": 1.2046821117401123,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4991452991452991,
|
|
"eval_loss": 1.3131194114685059,
|
|
"eval_runtime": 24.7008,
|
|
"eval_samples_per_second": 39.918,
|
|
"eval_steps_per_second": 5.02,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.505982905982906,
|
|
"grad_norm": 0.7732208967208862,
|
|
"learning_rate": 5.450335221863068e-06,
|
|
"loss": 1.2219358682632446,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 1.5128205128205128,
|
|
"grad_norm": 0.7456432580947876,
|
|
"learning_rate": 5.412896727361663e-06,
|
|
"loss": 1.2196807861328125,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 1.5196581196581196,
|
|
"grad_norm": 0.7411943674087524,
|
|
"learning_rate": 5.375434909743942e-06,
|
|
"loss": 1.2303682565689087,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 1.5264957264957264,
|
|
"grad_norm": 0.7763144373893738,
|
|
"learning_rate": 5.337951885099167e-06,
|
|
"loss": 1.188888669013977,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 1.5333333333333332,
|
|
"grad_norm": 0.8138889074325562,
|
|
"learning_rate": 5.300449770714502e-06,
|
|
"loss": 1.1965391635894775,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 1.54017094017094,
|
|
"grad_norm": 0.7770660519599915,
|
|
"learning_rate": 5.262930684955439e-06,
|
|
"loss": 1.233127474784851,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 1.547008547008547,
|
|
"grad_norm": 0.7718791961669922,
|
|
"learning_rate": 5.225396747146112e-06,
|
|
"loss": 1.240120768547058,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 1.5538461538461539,
|
|
"grad_norm": 0.7710370421409607,
|
|
"learning_rate": 5.187850077449604e-06,
|
|
"loss": 1.202008605003357,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 1.5606837606837607,
|
|
"grad_norm": 0.7775757908821106,
|
|
"learning_rate": 5.150292796748174e-06,
|
|
"loss": 1.2269346714019775,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 1.5675213675213675,
|
|
"grad_norm": 0.7479456067085266,
|
|
"learning_rate": 5.112727026523461e-06,
|
|
"loss": 1.1906824111938477,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 1.5743589743589743,
|
|
"grad_norm": 0.7567362189292908,
|
|
"learning_rate": 5.075154888736653e-06,
|
|
"loss": 1.1966190338134766,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 1.5811965811965814,
|
|
"grad_norm": 0.7536229491233826,
|
|
"learning_rate": 5.03757850570861e-06,
|
|
"loss": 1.1917792558670044,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 1.5880341880341882,
|
|
"grad_norm": 0.7776764035224915,
|
|
"learning_rate": 5e-06,
|
|
"loss": 1.1941741704940796,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 1.594871794871795,
|
|
"grad_norm": 0.7667071223258972,
|
|
"learning_rate": 4.9624214942913916e-06,
|
|
"loss": 1.1881437301635742,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 1.6017094017094018,
|
|
"grad_norm": 0.773404061794281,
|
|
"learning_rate": 4.924845111263349e-06,
|
|
"loss": 1.2190567255020142,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 1.6085470085470086,
|
|
"grad_norm": 0.7392263412475586,
|
|
"learning_rate": 4.88727297347654e-06,
|
|
"loss": 1.2026817798614502,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 1.6153846153846154,
|
|
"grad_norm": 0.7713451981544495,
|
|
"learning_rate": 4.8497072032518274e-06,
|
|
"loss": 1.2358677387237549,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 1.6222222222222222,
|
|
"grad_norm": 0.7625684142112732,
|
|
"learning_rate": 4.8121499225503974e-06,
|
|
"loss": 1.1716538667678833,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 1.629059829059829,
|
|
"grad_norm": 0.7581425309181213,
|
|
"learning_rate": 4.774603252853889e-06,
|
|
"loss": 1.1988354921340942,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 1.6358974358974359,
|
|
"grad_norm": 0.751584529876709,
|
|
"learning_rate": 4.737069315044562e-06,
|
|
"loss": 1.2101967334747314,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.6427350427350427,
|
|
"grad_norm": 0.7554129362106323,
|
|
"learning_rate": 4.699550229285499e-06,
|
|
"loss": 1.202675223350525,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 1.6495726495726495,
|
|
"grad_norm": 0.761131227016449,
|
|
"learning_rate": 4.662048114900837e-06,
|
|
"loss": 1.201820731163025,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 1.6564102564102563,
|
|
"grad_norm": 0.7265458703041077,
|
|
"learning_rate": 4.624565090256059e-06,
|
|
"loss": 1.2179176807403564,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 1.6632478632478631,
|
|
"grad_norm": 0.767880916595459,
|
|
"learning_rate": 4.587103272638339e-06,
|
|
"loss": 1.1769942045211792,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 1.67008547008547,
|
|
"grad_norm": 0.7633269429206848,
|
|
"learning_rate": 4.549664778136933e-06,
|
|
"loss": 1.2298530340194702,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 1.676923076923077,
|
|
"grad_norm": 0.7275070548057556,
|
|
"learning_rate": 4.512251721523659e-06,
|
|
"loss": 1.2158825397491455,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 1.6837606837606838,
|
|
"grad_norm": 0.7592760920524597,
|
|
"learning_rate": 4.4748662161334335e-06,
|
|
"loss": 1.207166314125061,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 1.6905982905982906,
|
|
"grad_norm": 0.7778440713882446,
|
|
"learning_rate": 4.437510373744897e-06,
|
|
"loss": 1.2096598148345947,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 1.6974358974358974,
|
|
"grad_norm": 0.7637122869491577,
|
|
"learning_rate": 4.400186304461136e-06,
|
|
"loss": 1.1851915121078491,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 1.7042735042735044,
|
|
"grad_norm": 0.7784591317176819,
|
|
"learning_rate": 4.362896116590475e-06,
|
|
"loss": 1.2293877601623535,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.8099437355995178,
|
|
"learning_rate": 4.325641916527405e-06,
|
|
"loss": 1.2101249694824219,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 1.717948717948718,
|
|
"grad_norm": 0.7552655339241028,
|
|
"learning_rate": 4.2884258086335755e-06,
|
|
"loss": 1.2240850925445557,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 1.7247863247863249,
|
|
"grad_norm": 0.7730560898780823,
|
|
"learning_rate": 4.25124989511895e-06,
|
|
"loss": 1.2249057292938232,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 1.7316239316239317,
|
|
"grad_norm": 0.7381757497787476,
|
|
"learning_rate": 4.214116275923051e-06,
|
|
"loss": 1.1832340955734253,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 1.7384615384615385,
|
|
"grad_norm": 0.739567756652832,
|
|
"learning_rate": 4.17702704859633e-06,
|
|
"loss": 1.200039267539978,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 1.7452991452991453,
|
|
"grad_norm": 0.774598240852356,
|
|
"learning_rate": 4.1399843081817085e-06,
|
|
"loss": 1.2123297452926636,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 1.7521367521367521,
|
|
"grad_norm": 0.8052539229393005,
|
|
"learning_rate": 4.1029901470962105e-06,
|
|
"loss": 1.2242088317871094,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 1.758974358974359,
|
|
"grad_norm": 0.7723326683044434,
|
|
"learning_rate": 4.066046655012786e-06,
|
|
"loss": 1.2281506061553955,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 1.7658119658119658,
|
|
"grad_norm": 0.7577686309814453,
|
|
"learning_rate": 4.029155918742268e-06,
|
|
"loss": 1.2183786630630493,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 1.7726495726495726,
|
|
"grad_norm": 0.7814478278160095,
|
|
"learning_rate": 3.992320022115492e-06,
|
|
"loss": 1.2138553857803345,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7794871794871794,
|
|
"grad_norm": 0.7868865132331848,
|
|
"learning_rate": 3.955541045865591e-06,
|
|
"loss": 1.1890326738357544,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 1.7863247863247862,
|
|
"grad_norm": 0.7574802041053772,
|
|
"learning_rate": 3.918821067510464e-06,
|
|
"loss": 1.1699459552764893,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 1.793162393162393,
|
|
"grad_norm": 0.7787984013557434,
|
|
"learning_rate": 3.882162161235421e-06,
|
|
"loss": 1.1902029514312744,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 1.8,
|
|
"grad_norm": 0.780857264995575,
|
|
"learning_rate": 3.845566397776022e-06,
|
|
"loss": 1.1960508823394775,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 1.8,
|
|
"eval_loss": 1.308773159980774,
|
|
"eval_runtime": 24.5858,
|
|
"eval_samples_per_second": 40.104,
|
|
"eval_steps_per_second": 5.044,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 1.8068376068376069,
|
|
"grad_norm": 0.7353282570838928,
|
|
"learning_rate": 3.8090358443010993e-06,
|
|
"loss": 1.2238385677337646,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 1.8136752136752137,
|
|
"grad_norm": 0.7844496369361877,
|
|
"learning_rate": 3.7725725642960047e-06,
|
|
"loss": 1.2065067291259766,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 1.8205128205128205,
|
|
"grad_norm": 0.7792806029319763,
|
|
"learning_rate": 3.7361786174460414e-06,
|
|
"loss": 1.1908563375473022,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 1.8273504273504273,
|
|
"grad_norm": 0.7404017448425293,
|
|
"learning_rate": 3.6998560595201188e-06,
|
|
"loss": 1.2162412405014038,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 1.8341880341880343,
|
|
"grad_norm": 0.7953075170516968,
|
|
"learning_rate": 3.6636069422546363e-06,
|
|
"loss": 1.2134095430374146,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 1.8410256410256411,
|
|
"grad_norm": 0.7584754824638367,
|
|
"learning_rate": 3.627433313237576e-06,
|
|
"loss": 1.2177472114562988,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 1.847863247863248,
|
|
"grad_norm": 0.7290381789207458,
|
|
"learning_rate": 3.5913372157928515e-06,
|
|
"loss": 1.189732551574707,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 1.8547008547008548,
|
|
"grad_norm": 0.7861201763153076,
|
|
"learning_rate": 3.555320688864889e-06,
|
|
"loss": 1.2073522806167603,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 1.8615384615384616,
|
|
"grad_norm": 0.7544710636138916,
|
|
"learning_rate": 3.519385766903442e-06,
|
|
"loss": 1.2041759490966797,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 1.8683760683760684,
|
|
"grad_norm": 0.7539916038513184,
|
|
"learning_rate": 3.483534479748688e-06,
|
|
"loss": 1.2057629823684692,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 1.8752136752136752,
|
|
"grad_norm": 0.7374740242958069,
|
|
"learning_rate": 3.447768852516554e-06,
|
|
"loss": 1.2203168869018555,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 1.882051282051282,
|
|
"grad_norm": 0.7594785690307617,
|
|
"learning_rate": 3.4120909054843375e-06,
|
|
"loss": 1.182802438735962,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 1.8888888888888888,
|
|
"grad_norm": 0.7542571425437927,
|
|
"learning_rate": 3.3765026539765832e-06,
|
|
"loss": 1.2168110609054565,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 1.8957264957264957,
|
|
"grad_norm": 0.7577287554740906,
|
|
"learning_rate": 3.3410061082512422e-06,
|
|
"loss": 1.2106308937072754,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 1.9025641025641025,
|
|
"grad_norm": 0.7561420798301697,
|
|
"learning_rate": 3.3056032733861188e-06,
|
|
"loss": 1.20242440700531,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 1.9094017094017093,
|
|
"grad_norm": 0.7456007599830627,
|
|
"learning_rate": 3.2702961491656197e-06,
|
|
"loss": 1.2251598834991455,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.916239316239316,
|
|
"grad_norm": 0.790366530418396,
|
|
"learning_rate": 3.2350867299677802e-06,
|
|
"loss": 1.2062650918960571,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 1.9230769230769231,
|
|
"grad_norm": 0.7317772507667542,
|
|
"learning_rate": 3.1999770046516198e-06,
|
|
"loss": 1.1729378700256348,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 1.92991452991453,
|
|
"grad_norm": 0.7773919105529785,
|
|
"learning_rate": 3.164968956444791e-06,
|
|
"loss": 1.1983883380889893,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 1.9367521367521368,
|
|
"grad_norm": 0.7585593461990356,
|
|
"learning_rate": 3.130064562831553e-06,
|
|
"loss": 1.2086600065231323,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 1.9435897435897436,
|
|
"grad_norm": 0.7703876495361328,
|
|
"learning_rate": 3.0952657954410792e-06,
|
|
"loss": 1.2189124822616577,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 1.9504273504273504,
|
|
"grad_norm": 0.7693601250648499,
|
|
"learning_rate": 3.0605746199360755e-06,
|
|
"loss": 1.210176706314087,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 1.9572649572649574,
|
|
"grad_norm": 0.7466776967048645,
|
|
"learning_rate": 3.0259929959017585e-06,
|
|
"loss": 1.2027801275253296,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 1.9641025641025642,
|
|
"grad_norm": 0.772388219833374,
|
|
"learning_rate": 2.991522876735154e-06,
|
|
"loss": 1.2112243175506592,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 1.970940170940171,
|
|
"grad_norm": 0.7715580463409424,
|
|
"learning_rate": 2.95716620953476e-06,
|
|
"loss": 1.1904889345169067,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 1.9777777777777779,
|
|
"grad_norm": 0.7397588491439819,
|
|
"learning_rate": 2.9229249349905686e-06,
|
|
"loss": 1.1913639307022095,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 1.9846153846153847,
|
|
"grad_norm": 0.7530134916305542,
|
|
"learning_rate": 2.8888009872744332e-06,
|
|
"loss": 1.2205219268798828,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 1.9914529914529915,
|
|
"grad_norm": 0.7689472436904907,
|
|
"learning_rate": 2.8547962939308187e-06,
|
|
"loss": 1.2000938653945923,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 1.9982905982905983,
|
|
"grad_norm": 0.7348621487617493,
|
|
"learning_rate": 2.8209127757679246e-06,
|
|
"loss": 1.1786831617355347,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.537250280380249,
|
|
"learning_rate": 2.787152346749173e-06,
|
|
"loss": 1.1778086423873901,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 2.006837606837607,
|
|
"grad_norm": 0.9093112945556641,
|
|
"learning_rate": 2.7535169138851124e-06,
|
|
"loss": 1.1308534145355225,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 2.0136752136752136,
|
|
"grad_norm": 0.895119845867157,
|
|
"learning_rate": 2.720008377125682e-06,
|
|
"loss": 1.1030248403549194,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 2.0205128205128204,
|
|
"grad_norm": 0.822189211845398,
|
|
"learning_rate": 2.686628629252899e-06,
|
|
"loss": 1.0862432718276978,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 2.0273504273504273,
|
|
"grad_norm": 0.839640200138092,
|
|
"learning_rate": 2.6533795557739407e-06,
|
|
"loss": 1.0923850536346436,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 2.034188034188034,
|
|
"grad_norm": 0.7948157787322998,
|
|
"learning_rate": 2.6202630348146323e-06,
|
|
"loss": 1.1080037355422974,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 2.041025641025641,
|
|
"grad_norm": 0.7708576321601868,
|
|
"learning_rate": 2.5872809370133704e-06,
|
|
"loss": 1.133652687072754,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 2.0478632478632477,
|
|
"grad_norm": 0.784568727016449,
|
|
"learning_rate": 2.5544351254154407e-06,
|
|
"loss": 1.1596778631210327,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 2.0547008547008545,
|
|
"grad_norm": 0.8119481205940247,
|
|
"learning_rate": 2.5217274553677975e-06,
|
|
"loss": 1.129364252090454,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 2.0615384615384613,
|
|
"grad_norm": 0.7969528436660767,
|
|
"learning_rate": 2.489159774414252e-06,
|
|
"loss": 1.0949797630310059,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 2.0683760683760686,
|
|
"grad_norm": 0.823360800743103,
|
|
"learning_rate": 2.4567339221911086e-06,
|
|
"loss": 1.1301119327545166,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 2.0752136752136754,
|
|
"grad_norm": 0.8292282223701477,
|
|
"learning_rate": 2.424451730323261e-06,
|
|
"loss": 1.1120922565460205,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 2.082051282051282,
|
|
"grad_norm": 0.8004986047744751,
|
|
"learning_rate": 2.3923150223207176e-06,
|
|
"loss": 1.1214550733566284,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 2.088888888888889,
|
|
"grad_norm": 0.8165397644042969,
|
|
"learning_rate": 2.3603256134756066e-06,
|
|
"loss": 1.1209532022476196,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 2.095726495726496,
|
|
"grad_norm": 0.8034455180168152,
|
|
"learning_rate": 2.328485310759635e-06,
|
|
"loss": 1.1401094198226929,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 2.095726495726496,
|
|
"eval_loss": 1.3253560066223145,
|
|
"eval_runtime": 24.6122,
|
|
"eval_samples_per_second": 40.061,
|
|
"eval_steps_per_second": 5.038,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 2.1025641025641026,
|
|
"grad_norm": 0.7844864130020142,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 1.144791603088379,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 2.1094017094017095,
|
|
"grad_norm": 0.7857894897460938,
|
|
"learning_rate": 2.265259209387867e-06,
|
|
"loss": 1.1488922834396362,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 2.1162393162393163,
|
|
"grad_norm": 0.7851693630218506,
|
|
"learning_rate": 2.2338769821571225e-06,
|
|
"loss": 1.1399354934692383,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 2.123076923076923,
|
|
"grad_norm": 0.8227202296257019,
|
|
"learning_rate": 2.202651003703885e-06,
|
|
"loss": 1.1063587665557861,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 2.12991452991453,
|
|
"grad_norm": 0.822938084602356,
|
|
"learning_rate": 2.1715830378763025e-06,
|
|
"loss": 1.1050540208816528,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 2.1367521367521367,
|
|
"grad_norm": 0.8058551549911499,
|
|
"learning_rate": 2.140674839596931e-06,
|
|
"loss": 1.0922585725784302,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 2.1435897435897435,
|
|
"grad_norm": 0.7917458415031433,
|
|
"learning_rate": 2.109928154763606e-06,
|
|
"loss": 1.1247828006744385,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 2.1504273504273503,
|
|
"grad_norm": 0.8290326595306396,
|
|
"learning_rate": 2.0793447201508288e-06,
|
|
"loss": 1.1369386911392212,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 2.157264957264957,
|
|
"grad_norm": 0.7832273840904236,
|
|
"learning_rate": 2.0489262633116536e-06,
|
|
"loss": 1.110697627067566,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 2.164102564102564,
|
|
"grad_norm": 0.7919285297393799,
|
|
"learning_rate": 2.01867450248011e-06,
|
|
"loss": 1.157274842262268,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 2.1709401709401708,
|
|
"grad_norm": 0.7776212096214294,
|
|
"learning_rate": 1.9885911464741413e-06,
|
|
"loss": 1.139618992805481,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 2.1777777777777776,
|
|
"grad_norm": 0.7800706624984741,
|
|
"learning_rate": 1.9586778945990785e-06,
|
|
"loss": 1.1110671758651733,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.184615384615385,
|
|
"grad_norm": 0.8117327094078064,
|
|
"learning_rate": 1.928936436551661e-06,
|
|
"loss": 1.1395684480667114,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 2.1914529914529917,
|
|
"grad_norm": 0.7962910532951355,
|
|
"learning_rate": 1.8993684523245842e-06,
|
|
"loss": 1.1162846088409424,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 2.1982905982905985,
|
|
"grad_norm": 0.7874794602394104,
|
|
"learning_rate": 1.8699756121115997e-06,
|
|
"loss": 1.1188956499099731,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 2.2051282051282053,
|
|
"grad_norm": 0.785068690776825,
|
|
"learning_rate": 1.8407595762131814e-06,
|
|
"loss": 1.1131058931350708,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 2.211965811965812,
|
|
"grad_norm": 0.8046601414680481,
|
|
"learning_rate": 1.811721994942731e-06,
|
|
"loss": 1.1231977939605713,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 2.218803418803419,
|
|
"grad_norm": 0.759477972984314,
|
|
"learning_rate": 1.7828645085333645e-06,
|
|
"loss": 1.1036738157272339,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 2.2256410256410257,
|
|
"grad_norm": 0.7955328226089478,
|
|
"learning_rate": 1.7541887470452606e-06,
|
|
"loss": 1.166395664215088,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 2.2324786324786325,
|
|
"grad_norm": 0.7807881236076355,
|
|
"learning_rate": 1.7256963302735752e-06,
|
|
"loss": 1.1385221481323242,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 2.2393162393162394,
|
|
"grad_norm": 0.7881447076797485,
|
|
"learning_rate": 1.6973888676569594e-06,
|
|
"loss": 1.145586609840393,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 2.246153846153846,
|
|
"grad_norm": 0.8092402815818787,
|
|
"learning_rate": 1.6692679581866334e-06,
|
|
"loss": 1.1422295570373535,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 2.252991452991453,
|
|
"grad_norm": 0.7870088219642639,
|
|
"learning_rate": 1.6413351903160763e-06,
|
|
"loss": 1.1302958726882935,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 2.25982905982906,
|
|
"grad_norm": 0.8018279075622559,
|
|
"learning_rate": 1.6135921418712959e-06,
|
|
"loss": 1.114201545715332,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 2.2666666666666666,
|
|
"grad_norm": 0.7955658435821533,
|
|
"learning_rate": 1.5860403799616951e-06,
|
|
"loss": 1.1686758995056152,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 2.2735042735042734,
|
|
"grad_norm": 0.8098942637443542,
|
|
"learning_rate": 1.5586814608915673e-06,
|
|
"loss": 1.1103954315185547,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 2.2803418803418802,
|
|
"grad_norm": 0.7653470039367676,
|
|
"learning_rate": 1.5315169300721694e-06,
|
|
"loss": 1.1263670921325684,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 2.287179487179487,
|
|
"grad_norm": 0.7954714894294739,
|
|
"learning_rate": 1.5045483219344387e-06,
|
|
"loss": 1.091448187828064,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 2.294017094017094,
|
|
"grad_norm": 0.7870411276817322,
|
|
"learning_rate": 1.4777771598423147e-06,
|
|
"loss": 1.127175211906433,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 2.3008547008547007,
|
|
"grad_norm": 0.8070060014724731,
|
|
"learning_rate": 1.4512049560066837e-06,
|
|
"loss": 1.1385235786437988,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 2.3076923076923075,
|
|
"grad_norm": 0.7654244303703308,
|
|
"learning_rate": 1.4248332113999708e-06,
|
|
"loss": 1.1272555589675903,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 2.3145299145299143,
|
|
"grad_norm": 0.7763322591781616,
|
|
"learning_rate": 1.3986634156713418e-06,
|
|
"loss": 1.1271766424179077,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.3213675213675216,
|
|
"grad_norm": 0.7544705867767334,
|
|
"learning_rate": 1.3726970470625705e-06,
|
|
"loss": 1.157515525817871,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 2.3282051282051284,
|
|
"grad_norm": 0.7676778435707092,
|
|
"learning_rate": 1.3469355723245303e-06,
|
|
"loss": 1.1277141571044922,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 2.335042735042735,
|
|
"grad_norm": 0.7713337540626526,
|
|
"learning_rate": 1.321380446634342e-06,
|
|
"loss": 1.1003583669662476,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 2.341880341880342,
|
|
"grad_norm": 0.7740820646286011,
|
|
"learning_rate": 1.2960331135131826e-06,
|
|
"loss": 1.1071029901504517,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 2.348717948717949,
|
|
"grad_norm": 0.758073091506958,
|
|
"learning_rate": 1.270895004744737e-06,
|
|
"loss": 1.110722303390503,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.7693141102790833,
|
|
"learning_rate": 1.245967540294329e-06,
|
|
"loss": 1.097144365310669,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 2.3623931623931624,
|
|
"grad_norm": 0.7613301873207092,
|
|
"learning_rate": 1.2212521282287093e-06,
|
|
"loss": 1.130142092704773,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 2.3692307692307693,
|
|
"grad_norm": 0.7610928416252136,
|
|
"learning_rate": 1.1967501646365147e-06,
|
|
"loss": 1.1337437629699707,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 2.376068376068376,
|
|
"grad_norm": 0.7692887187004089,
|
|
"learning_rate": 1.172463033549418e-06,
|
|
"loss": 1.1064190864562988,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 2.382905982905983,
|
|
"grad_norm": 0.7826989889144897,
|
|
"learning_rate": 1.1483921068639353e-06,
|
|
"loss": 1.1885005235671997,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 2.3897435897435897,
|
|
"grad_norm": 0.7613060474395752,
|
|
"learning_rate": 1.1245387442639456e-06,
|
|
"loss": 1.110337734222412,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 2.3965811965811965,
|
|
"grad_norm": 0.7910706400871277,
|
|
"learning_rate": 1.1009042931438784e-06,
|
|
"loss": 1.1144278049468994,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 2.3965811965811965,
|
|
"eval_loss": 1.323965311050415,
|
|
"eval_runtime": 24.7109,
|
|
"eval_samples_per_second": 39.901,
|
|
"eval_steps_per_second": 5.018,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 2.4034188034188033,
|
|
"grad_norm": 0.7570564150810242,
|
|
"learning_rate": 1.077490088532605e-06,
|
|
"loss": 1.114471435546875,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 2.41025641025641,
|
|
"grad_norm": 0.7983273863792419,
|
|
"learning_rate": 1.0542974530180327e-06,
|
|
"loss": 1.132286787033081,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 2.417094017094017,
|
|
"grad_norm": 0.7606459856033325,
|
|
"learning_rate": 1.0313276966723867e-06,
|
|
"loss": 1.0865505933761597,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 2.4239316239316238,
|
|
"grad_norm": 0.7879711389541626,
|
|
"learning_rate": 1.00858211697822e-06,
|
|
"loss": 1.1440324783325195,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 2.430769230769231,
|
|
"grad_norm": 0.762718915939331,
|
|
"learning_rate": 9.860619987551157e-07,
|
|
"loss": 1.1018445491790771,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 2.437606837606838,
|
|
"grad_norm": 0.7899941802024841,
|
|
"learning_rate": 9.637686140871121e-07,
|
|
"loss": 1.1469783782958984,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 2.4444444444444446,
|
|
"grad_norm": 0.7909042239189148,
|
|
"learning_rate": 9.417032222508476e-07,
|
|
"loss": 1.1333407163619995,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 2.4512820512820515,
|
|
"grad_norm": 0.7936816811561584,
|
|
"learning_rate": 9.198670696444339e-07,
|
|
"loss": 1.1438573598861694,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.4581196581196583,
|
|
"grad_norm": 0.7882561683654785,
|
|
"learning_rate": 8.982613897170439e-07,
|
|
"loss": 1.1176822185516357,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 2.464957264957265,
|
|
"grad_norm": 0.7810674905776978,
|
|
"learning_rate": 8.768874028992431e-07,
|
|
"loss": 1.135961651802063,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 2.471794871794872,
|
|
"grad_norm": 0.7794176340103149,
|
|
"learning_rate": 8.557463165340479e-07,
|
|
"loss": 1.1315698623657227,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 2.4786324786324787,
|
|
"grad_norm": 0.7674309611320496,
|
|
"learning_rate": 8.348393248087289e-07,
|
|
"loss": 1.1471264362335205,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 2.4854700854700855,
|
|
"grad_norm": 0.7684411406517029,
|
|
"learning_rate": 8.141676086873574e-07,
|
|
"loss": 1.1023811101913452,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 2.4923076923076923,
|
|
"grad_norm": 0.7729819416999817,
|
|
"learning_rate": 7.937323358440935e-07,
|
|
"loss": 1.1146825551986694,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 2.499145299145299,
|
|
"grad_norm": 0.7710589170455933,
|
|
"learning_rate": 7.735346605972322e-07,
|
|
"loss": 1.1076273918151855,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 2.505982905982906,
|
|
"grad_norm": 0.7700541019439697,
|
|
"learning_rate": 7.535757238439939e-07,
|
|
"loss": 1.1303023099899292,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 2.5128205128205128,
|
|
"grad_norm": 0.7796255946159363,
|
|
"learning_rate": 7.338566529960817e-07,
|
|
"loss": 1.1434168815612793,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 2.5196581196581196,
|
|
"grad_norm": 0.7890748977661133,
|
|
"learning_rate": 7.143785619160026e-07,
|
|
"loss": 1.137059211730957,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 2.5264957264957264,
|
|
"grad_norm": 0.7733116149902344,
|
|
"learning_rate": 6.951425508541432e-07,
|
|
"loss": 1.1050790548324585,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 2.533333333333333,
|
|
"grad_norm": 0.7718008160591125,
|
|
"learning_rate": 6.761497063866207e-07,
|
|
"loss": 1.1239290237426758,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 2.54017094017094,
|
|
"grad_norm": 0.7675129771232605,
|
|
"learning_rate": 6.574011013539111e-07,
|
|
"loss": 1.1362709999084473,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 2.547008547008547,
|
|
"grad_norm": 0.7831134796142578,
|
|
"learning_rate": 6.388977948002406e-07,
|
|
"loss": 1.1359511613845825,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 2.5538461538461537,
|
|
"grad_norm": 0.7688263654708862,
|
|
"learning_rate": 6.206408319137703e-07,
|
|
"loss": 1.1311153173446655,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 2.5606837606837605,
|
|
"grad_norm": 0.7608706951141357,
|
|
"learning_rate": 6.026312439675553e-07,
|
|
"loss": 1.1158239841461182,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 2.5675213675213673,
|
|
"grad_norm": 0.7655665278434753,
|
|
"learning_rate": 5.848700482612873e-07,
|
|
"loss": 1.1498501300811768,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 2.574358974358974,
|
|
"grad_norm": 0.7795934081077576,
|
|
"learning_rate": 5.673582480638395e-07,
|
|
"loss": 1.1341049671173096,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 2.5811965811965814,
|
|
"grad_norm": 0.7773811221122742,
|
|
"learning_rate": 5.500968325565859e-07,
|
|
"loss": 1.1404979228973389,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 2.588034188034188,
|
|
"grad_norm": 0.8611118793487549,
|
|
"learning_rate": 5.330867767775333e-07,
|
|
"loss": 1.0921636819839478,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.594871794871795,
|
|
"grad_norm": 0.745428204536438,
|
|
"learning_rate": 5.163290415662408e-07,
|
|
"loss": 1.1557259559631348,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 2.601709401709402,
|
|
"grad_norm": 0.7756429314613342,
|
|
"learning_rate": 4.998245735095459e-07,
|
|
"loss": 1.1447691917419434,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 2.6085470085470086,
|
|
"grad_norm": 0.7908133864402771,
|
|
"learning_rate": 4.835743048880959e-07,
|
|
"loss": 1.143109917640686,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 2.6153846153846154,
|
|
"grad_norm": 0.7732424736022949,
|
|
"learning_rate": 4.6757915362368567e-07,
|
|
"loss": 1.132035493850708,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 2.6222222222222222,
|
|
"grad_norm": 0.7889422178268433,
|
|
"learning_rate": 4.5184002322740784e-07,
|
|
"loss": 1.1180846691131592,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 2.629059829059829,
|
|
"grad_norm": 0.7938551902770996,
|
|
"learning_rate": 4.363578027486187e-07,
|
|
"loss": 1.1456289291381836,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 2.635897435897436,
|
|
"grad_norm": 0.8030667901039124,
|
|
"learning_rate": 4.211333667247125e-07,
|
|
"loss": 1.1397569179534912,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 2.6427350427350427,
|
|
"grad_norm": 0.7819530367851257,
|
|
"learning_rate": 4.0616757513173123e-07,
|
|
"loss": 1.1004501581192017,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 2.6495726495726495,
|
|
"grad_norm": 0.758314311504364,
|
|
"learning_rate": 3.9146127333577757e-07,
|
|
"loss": 1.1101858615875244,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 2.6564102564102563,
|
|
"grad_norm": 0.7801131010055542,
|
|
"learning_rate": 3.7701529204526856e-07,
|
|
"loss": 1.1453076601028442,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 2.663247863247863,
|
|
"grad_norm": 0.7489244937896729,
|
|
"learning_rate": 3.6283044726401594e-07,
|
|
"loss": 1.0911612510681152,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 2.67008547008547,
|
|
"grad_norm": 0.761225700378418,
|
|
"learning_rate": 3.4890754024512254e-07,
|
|
"loss": 1.130741000175476,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 2.676923076923077,
|
|
"grad_norm": 0.761887788772583,
|
|
"learning_rate": 3.352473574457304e-07,
|
|
"loss": 1.120837926864624,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 2.683760683760684,
|
|
"grad_norm": 0.7792303562164307,
|
|
"learning_rate": 3.2185067048259245e-07,
|
|
"loss": 1.1177864074707031,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 2.690598290598291,
|
|
"grad_norm": 0.7689954042434692,
|
|
"learning_rate": 3.087182360884872e-07,
|
|
"loss": 1.177292823791504,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 2.6974358974358976,
|
|
"grad_norm": 0.7710866332054138,
|
|
"learning_rate": 2.9585079606947843e-07,
|
|
"loss": 1.1195672750473022,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 2.6974358974358976,
|
|
"eval_loss": 1.3236175775527954,
|
|
"eval_runtime": 24.7082,
|
|
"eval_samples_per_second": 39.906,
|
|
"eval_steps_per_second": 5.019,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 2.7042735042735044,
|
|
"grad_norm": 0.7776737809181213,
|
|
"learning_rate": 2.8324907726300366e-07,
|
|
"loss": 1.113619327545166,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.7743112444877625,
|
|
"learning_rate": 2.7091379149682683e-07,
|
|
"loss": 1.0938081741333008,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 2.717948717948718,
|
|
"grad_norm": 0.7779694199562073,
|
|
"learning_rate": 2.5884563554882336e-07,
|
|
"loss": 1.1138122081756592,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 2.724786324786325,
|
|
"grad_norm": 0.7622742652893066,
|
|
"learning_rate": 2.470452911076227e-07,
|
|
"loss": 1.1006677150726318,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.7316239316239317,
|
|
"grad_norm": 0.7664272785186768,
|
|
"learning_rate": 2.355134247341073e-07,
|
|
"loss": 1.1065200567245483,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 2.7384615384615385,
|
|
"grad_norm": 0.7712447643280029,
|
|
"learning_rate": 2.242506878237538e-07,
|
|
"loss": 1.1020417213439941,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 2.7452991452991453,
|
|
"grad_norm": 0.7656382322311401,
|
|
"learning_rate": 2.1325771656984075e-07,
|
|
"loss": 1.1001569032669067,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 2.752136752136752,
|
|
"grad_norm": 0.7811654806137085,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 1.1310510635375977,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 2.758974358974359,
|
|
"grad_norm": 0.7687283158302307,
|
|
"learning_rate": 1.9208353957870684e-07,
|
|
"loss": 1.146543264389038,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 2.7658119658119658,
|
|
"grad_norm": 0.7670867443084717,
|
|
"learning_rate": 1.8190352989793325e-07,
|
|
"loss": 1.1161731481552124,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 2.7726495726495726,
|
|
"grad_norm": 0.7807978391647339,
|
|
"learning_rate": 1.7199567791893524e-07,
|
|
"loss": 1.1282137632369995,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 2.7794871794871794,
|
|
"grad_norm": 0.7957569360733032,
|
|
"learning_rate": 1.6236054330219853e-07,
|
|
"loss": 1.1041632890701294,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 2.786324786324786,
|
|
"grad_norm": 0.7832216024398804,
|
|
"learning_rate": 1.5299867030334815e-07,
|
|
"loss": 1.108730435371399,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 2.793162393162393,
|
|
"grad_norm": 0.753606915473938,
|
|
"learning_rate": 1.439105877423963e-07,
|
|
"loss": 1.131809115409851,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 2.8,
|
|
"grad_norm": 0.7802961468696594,
|
|
"learning_rate": 1.350968089738758e-07,
|
|
"loss": 1.1083602905273438,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 2.8068376068376066,
|
|
"grad_norm": 0.768670380115509,
|
|
"learning_rate": 1.2655783185784253e-07,
|
|
"loss": 1.1080389022827148,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 2.8136752136752134,
|
|
"grad_norm": 0.7562652230262756,
|
|
"learning_rate": 1.1829413873174988e-07,
|
|
"loss": 1.1086317300796509,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 2.8205128205128203,
|
|
"grad_norm": 0.763107180595398,
|
|
"learning_rate": 1.1030619638320805e-07,
|
|
"loss": 1.1433099508285522,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 2.827350427350427,
|
|
"grad_norm": 0.7749531865119934,
|
|
"learning_rate": 1.0259445602361084e-07,
|
|
"loss": 1.129563331604004,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 2.8341880341880343,
|
|
"grad_norm": 0.7604458928108215,
|
|
"learning_rate": 9.51593532626538e-08,
|
|
"loss": 1.120940089225769,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 2.841025641025641,
|
|
"grad_norm": 0.750518262386322,
|
|
"learning_rate": 8.800130808372553e-08,
|
|
"loss": 1.0916835069656372,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 2.847863247863248,
|
|
"grad_norm": 0.7595433592796326,
|
|
"learning_rate": 8.11207248201834e-08,
|
|
"loss": 1.1178152561187744,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 2.8547008547008548,
|
|
"grad_norm": 0.7640005350112915,
|
|
"learning_rate": 7.45179921325162e-08,
|
|
"loss": 1.1630092859268188,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 2.8615384615384616,
|
|
"grad_norm": 0.8447228074073792,
|
|
"learning_rate": 6.819348298638839e-08,
|
|
"loss": 1.1273298263549805,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.8683760683760684,
|
|
"grad_norm": 0.7577494978904724,
|
|
"learning_rate": 6.214755463157417e-08,
|
|
"loss": 1.0993590354919434,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 2.875213675213675,
|
|
"grad_norm": 0.7751004099845886,
|
|
"learning_rate": 5.638054858177644e-08,
|
|
"loss": 1.1498969793319702,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 2.882051282051282,
|
|
"grad_norm": 0.7662968039512634,
|
|
"learning_rate": 5.089279059533658e-08,
|
|
"loss": 1.1176806688308716,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 2.888888888888889,
|
|
"grad_norm": 0.7827076315879822,
|
|
"learning_rate": 4.568459065683206e-08,
|
|
"loss": 1.1449580192565918,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 2.8957264957264957,
|
|
"grad_norm": 0.7646909952163696,
|
|
"learning_rate": 4.0756242959567596e-08,
|
|
"loss": 1.1186950206756592,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 2.9025641025641025,
|
|
"grad_norm": 0.7541195154190063,
|
|
"learning_rate": 3.610802588895845e-08,
|
|
"loss": 1.131952166557312,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 2.9094017094017093,
|
|
"grad_norm": 0.7776208519935608,
|
|
"learning_rate": 3.1740202006804166e-08,
|
|
"loss": 1.1178792715072632,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 2.916239316239316,
|
|
"grad_norm": 0.7766209244728088,
|
|
"learning_rate": 2.765301803645426e-08,
|
|
"loss": 1.1331486701965332,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 2.9230769230769234,
|
|
"grad_norm": 0.7666369676589966,
|
|
"learning_rate": 2.3846704848878298e-08,
|
|
"loss": 1.1589261293411255,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 2.92991452991453,
|
|
"grad_norm": 0.7775545716285706,
|
|
"learning_rate": 2.0321477449619098e-08,
|
|
"loss": 1.1344677209854126,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 2.936752136752137,
|
|
"grad_norm": 0.7537861466407776,
|
|
"learning_rate": 1.7077534966650767e-08,
|
|
"loss": 1.1040513515472412,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 2.943589743589744,
|
|
"grad_norm": 0.7825785875320435,
|
|
"learning_rate": 1.411506063912882e-08,
|
|
"loss": 1.1581734418869019,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 2.9504273504273506,
|
|
"grad_norm": 0.7491230368614197,
|
|
"learning_rate": 1.1434221807041234e-08,
|
|
"loss": 1.1113041639328003,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 2.9572649572649574,
|
|
"grad_norm": 0.7601305842399597,
|
|
"learning_rate": 9.035169901754902e-09,
|
|
"loss": 1.0998278856277466,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 2.9641025641025642,
|
|
"grad_norm": 0.7869414687156677,
|
|
"learning_rate": 6.918040437463025e-09,
|
|
"loss": 1.1475398540496826,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 2.970940170940171,
|
|
"grad_norm": 0.760128915309906,
|
|
"learning_rate": 5.082953003528457e-09,
|
|
"loss": 1.1517993211746216,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 2.977777777777778,
|
|
"grad_norm": 0.7626367211341858,
|
|
"learning_rate": 3.530011257730226e-09,
|
|
"loss": 1.1134616136550903,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 2.9846153846153847,
|
|
"grad_norm": 0.765670657157898,
|
|
"learning_rate": 2.2593029204076578e-09,
|
|
"loss": 1.1342540979385376,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 2.9914529914529915,
|
|
"grad_norm": 0.7739811539649963,
|
|
"learning_rate": 1.2708997695043412e-09,
|
|
"loss": 1.1077520847320557,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 2.9982905982905983,
|
|
"grad_norm": 0.7707903385162354,
|
|
"learning_rate": 5.648576365169245e-10,
|
|
"loss": 1.0939933061599731,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.9982905982905983,
|
|
"eval_loss": 1.3233778476715088,
|
|
"eval_runtime": 24.6851,
|
|
"eval_samples_per_second": 39.943,
|
|
"eval_steps_per_second": 5.023,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 1.5993366241455078,
|
|
"learning_rate": 1.4121640333653042e-10,
|
|
"loss": 1.0642163753509521,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 441,
|
|
"total_flos": 9.743300044908134e+17,
|
|
"train_loss": 1.2459275746832088,
|
|
"train_runtime": 6646.3979,
|
|
"train_samples_per_second": 8.449,
|
|
"train_steps_per_second": 0.066
|
|
}
|
|
],
|
|
"logging_steps": 1.0,
|
|
"max_steps": 441,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 44,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 9.743300044908134e+17,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|