{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9785780813411984,
  "eval_steps": 30,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024837007140639553,
      "grad_norm": 8.359491348266602,
      "learning_rate": 1.487603305785124e-05,
      "loss": 2.6721,
      "step": 10
    },
    {
      "epoch": 0.04967401428127911,
      "grad_norm": 0.796862006187439,
      "learning_rate": 3.1404958677685955e-05,
      "loss": 1.0523,
      "step": 20
    },
    {
      "epoch": 0.07451102142191866,
      "grad_norm": 0.9021220803260803,
      "learning_rate": 4.793388429752066e-05,
      "loss": 0.6256,
      "step": 30
    },
    {
      "epoch": 0.07451102142191866,
      "eval_loss": 0.5602371096611023,
      "eval_runtime": 38.7856,
      "eval_samples_per_second": 4.383,
      "eval_steps_per_second": 4.383,
      "step": 30
    },
    {
      "epoch": 0.09934802856255821,
      "grad_norm": 0.5970816612243652,
      "learning_rate": 6.446280991735537e-05,
      "loss": 0.4598,
      "step": 40
    },
    {
      "epoch": 0.12418503570319776,
      "grad_norm": 0.6478993892669678,
      "learning_rate": 8.099173553719009e-05,
      "loss": 0.3539,
      "step": 50
    },
    {
      "epoch": 0.14902204284383733,
      "grad_norm": 0.4981272518634796,
      "learning_rate": 9.75206611570248e-05,
      "loss": 0.2828,
      "step": 60
    },
    {
      "epoch": 0.14902204284383733,
      "eval_loss": 0.28084230422973633,
      "eval_runtime": 38.8471,
      "eval_samples_per_second": 4.376,
      "eval_steps_per_second": 4.376,
      "step": 60
    },
    {
      "epoch": 0.17385904998447688,
      "grad_norm": 0.5028337836265564,
      "learning_rate": 0.0001140495867768595,
      "loss": 0.2523,
      "step": 70
    },
    {
      "epoch": 0.19869605712511643,
      "grad_norm": 0.7332940697669983,
      "learning_rate": 0.00013057851239669423,
      "loss": 0.2251,
      "step": 80
    },
    {
      "epoch": 0.22353306426575598,
      "grad_norm": 0.43022558093070984,
      "learning_rate": 0.00014710743801652894,
      "loss": 0.1988,
      "step": 90
    },
    {
      "epoch": 0.22353306426575598,
      "eval_loss": 0.19827328622341156,
      "eval_runtime": 38.898,
      "eval_samples_per_second": 4.37,
      "eval_steps_per_second": 4.37,
      "step": 90
    },
    {
      "epoch": 0.24837007140639553,
      "grad_norm": 0.33378636837005615,
      "learning_rate": 0.00016363636363636366,
      "loss": 0.1873,
      "step": 100
    },
    {
      "epoch": 0.2732070785470351,
      "grad_norm": 0.6605438590049744,
      "learning_rate": 0.00018016528925619835,
      "loss": 0.1663,
      "step": 110
    },
    {
      "epoch": 0.29804408568767465,
      "grad_norm": 0.26731953024864197,
      "learning_rate": 0.0001966942148760331,
      "loss": 0.1704,
      "step": 120
    },
    {
      "epoch": 0.29804408568767465,
      "eval_loss": 0.16433002054691315,
      "eval_runtime": 38.9658,
      "eval_samples_per_second": 4.363,
      "eval_steps_per_second": 4.363,
      "step": 120
    },
    {
      "epoch": 0.3228810928283142,
      "grad_norm": 0.33725589513778687,
      "learning_rate": 0.00019997332081116373,
      "loss": 0.166,
      "step": 130
    },
    {
      "epoch": 0.34771809996895375,
      "grad_norm": 0.41751629114151,
      "learning_rate": 0.00019986496100395275,
      "loss": 0.1693,
      "step": 140
    },
    {
      "epoch": 0.3725551071095933,
      "grad_norm": 0.34873735904693604,
      "learning_rate": 0.000199673343399533,
      "loss": 0.1571,
      "step": 150
    },
    {
      "epoch": 0.3725551071095933,
      "eval_loss": 0.15006589889526367,
      "eval_runtime": 38.8567,
      "eval_samples_per_second": 4.375,
      "eval_steps_per_second": 4.375,
      "step": 150
    },
    {
      "epoch": 0.39739211425023285,
      "grad_norm": 0.31080615520477295,
      "learning_rate": 0.00019939862775022893,
      "loss": 0.1608,
      "step": 160
    },
    {
      "epoch": 0.4222291213908724,
      "grad_norm": 0.18451546132564545,
      "learning_rate": 0.0001990410430875205,
      "loss": 0.1477,
      "step": 170
    },
    {
      "epoch": 0.44706612853151195,
      "grad_norm": 0.21936577558517456,
      "learning_rate": 0.00019860088753109896,
      "loss": 0.1582,
      "step": 180
    },
    {
      "epoch": 0.44706612853151195,
      "eval_loss": 0.14357255399227142,
      "eval_runtime": 38.8454,
      "eval_samples_per_second": 4.376,
      "eval_steps_per_second": 4.376,
      "step": 180
    },
    {
      "epoch": 0.47190313567215153,
      "grad_norm": 0.16537490487098694,
      "learning_rate": 0.00019807852804032305,
      "loss": 0.1474,
      "step": 190
    },
    {
      "epoch": 0.49674014281279105,
      "grad_norm": 0.21974419057369232,
      "learning_rate": 0.00019747440010828383,
      "loss": 0.1475,
      "step": 200
    },
    {
      "epoch": 0.5215771499534306,
      "grad_norm": 0.26473143696784973,
      "learning_rate": 0.00019678900739873226,
      "loss": 0.1443,
      "step": 210
    },
    {
      "epoch": 0.5215771499534306,
      "eval_loss": 0.13610073924064636,
      "eval_runtime": 38.8034,
      "eval_samples_per_second": 4.381,
      "eval_steps_per_second": 4.381,
      "step": 210
    },
    {
      "epoch": 0.5464141570940702,
      "grad_norm": 0.24532222747802734,
      "learning_rate": 0.000196022921326173,
      "loss": 0.1418,
      "step": 220
    },
    {
      "epoch": 0.5712511642347097,
      "grad_norm": 0.189656063914299,
      "learning_rate": 0.00019517678057947384,
      "loss": 0.1463,
      "step": 230
    },
    {
      "epoch": 0.5960881713753493,
      "grad_norm": 0.16695067286491394,
      "learning_rate": 0.00019425129058938832,
      "loss": 0.1518,
      "step": 240
    },
    {
      "epoch": 0.5960881713753493,
      "eval_loss": 0.12985987961292267,
      "eval_runtime": 38.9149,
      "eval_samples_per_second": 4.369,
      "eval_steps_per_second": 4.369,
      "step": 240
    },
    {
      "epoch": 0.6209251785159888,
      "grad_norm": 0.20477107167243958,
      "learning_rate": 0.00019324722294043558,
      "loss": 0.1365,
      "step": 250
    },
    {
      "epoch": 0.6457621856566284,
      "grad_norm": 0.14411672949790955,
      "learning_rate": 0.00019216541472762735,
      "loss": 0.1396,
      "step": 260
    },
    {
      "epoch": 0.670599192797268,
      "grad_norm": 0.12490475922822952,
      "learning_rate": 0.0001910067678585786,
      "loss": 0.1462,
      "step": 270
    },
    {
      "epoch": 0.670599192797268,
      "eval_loss": 0.13094556331634521,
      "eval_runtime": 38.8929,
      "eval_samples_per_second": 4.371,
      "eval_steps_per_second": 4.371,
      "step": 270
    },
    {
      "epoch": 0.6954361999379075,
      "grad_norm": 0.2038726955652237,
      "learning_rate": 0.0001897722483015838,
      "loss": 0.1331,
      "step": 280
    },
    {
      "epoch": 0.720273207078547,
      "grad_norm": 0.1514689326286316,
      "learning_rate": 0.00018846288528028555,
      "loss": 0.138,
      "step": 290
    },
    {
      "epoch": 0.7451102142191866,
      "grad_norm": 0.22410112619400024,
      "learning_rate": 0.0001870797704156067,
      "loss": 0.1478,
      "step": 300
    },
    {
      "epoch": 0.7451102142191866,
      "eval_loss": 0.1277894824743271,
      "eval_runtime": 38.922,
      "eval_samples_per_second": 4.368,
      "eval_steps_per_second": 4.368,
      "step": 300
    },
    {
      "epoch": 0.7699472213598262,
      "grad_norm": 0.1832033395767212,
      "learning_rate": 0.00018562405681566216,
      "loss": 0.1431,
      "step": 310
    },
    {
      "epoch": 0.7947842285004657,
      "grad_norm": 0.11910755932331085,
      "learning_rate": 0.00018409695811440796,
      "loss": 0.1427,
      "step": 320
    },
    {
      "epoch": 0.8196212356411052,
      "grad_norm": 0.13471344113349915,
      "learning_rate": 0.00018249974745983023,
      "loss": 0.1471,
      "step": 330
    },
    {
      "epoch": 0.8196212356411052,
      "eval_loss": 0.12796619534492493,
      "eval_runtime": 38.9144,
      "eval_samples_per_second": 4.369,
      "eval_steps_per_second": 4.369,
      "step": 330
    },
    {
      "epoch": 0.8444582427817448,
      "grad_norm": 0.1409328728914261,
      "learning_rate": 0.00018083375645251684,
      "loss": 0.1413,
      "step": 340
    },
    {
      "epoch": 0.8692952499223844,
      "grad_norm": 0.26645776629447937,
      "learning_rate": 0.00017910037403549693,
      "loss": 0.1383,
      "step": 350
    },
    {
      "epoch": 0.8941322570630239,
      "grad_norm": 0.21028681099414825,
      "learning_rate": 0.0001773010453362737,
      "loss": 0.138,
      "step": 360
    },
    {
      "epoch": 0.8941322570630239,
      "eval_loss": 0.12485021352767944,
      "eval_runtime": 39.0821,
      "eval_samples_per_second": 4.35,
      "eval_steps_per_second": 4.35,
      "step": 360
    },
    {
      "epoch": 0.9189692642036634,
      "grad_norm": 0.13296136260032654,
      "learning_rate": 0.0001754372704620164,
      "loss": 0.1494,
      "step": 370
    },
    {
      "epoch": 0.9438062713443031,
      "grad_norm": 0.16724663972854614,
      "learning_rate": 0.00017351060324891502,
      "loss": 0.1391,
      "step": 380
    },
    {
      "epoch": 0.9686432784849426,
      "grad_norm": 0.11043282598257065,
      "learning_rate": 0.00017152264996674136,
      "loss": 0.1425,
      "step": 390
    },
    {
      "epoch": 0.9686432784849426,
      "eval_loss": 0.127302885055542,
      "eval_runtime": 38.9,
      "eval_samples_per_second": 4.37,
      "eval_steps_per_second": 4.37,
      "step": 390
    },
    {
      "epoch": 0.9934802856255821,
      "grad_norm": 0.28038299083709717,
      "learning_rate": 0.00016947506797969562,
      "loss": 0.1323,
      "step": 400
    },
    {
      "epoch": 1.0173859049984477,
      "grad_norm": 0.13846513628959656,
      "learning_rate": 0.00016736956436465573,
      "loss": 0.1375,
      "step": 410
    },
    {
      "epoch": 1.0422229121390871,
      "grad_norm": 0.13630150258541107,
      "learning_rate": 0.00016520789448798087,
      "loss": 0.138,
      "step": 420
    },
    {
      "epoch": 1.0422229121390871,
      "eval_loss": 0.12523552775382996,
      "eval_runtime": 38.8684,
      "eval_samples_per_second": 4.374,
      "eval_steps_per_second": 4.374,
      "step": 420
    },
    {
      "epoch": 1.0670599192797268,
      "grad_norm": 0.6552147269248962,
      "learning_rate": 0.00016299186054205577,
      "loss": 0.1347,
      "step": 430
    },
    {
      "epoch": 1.0918969264203664,
      "grad_norm": 0.14625756442546844,
      "learning_rate": 0.00016072331004279614,
      "loss": 0.1388,
      "step": 440
    },
    {
      "epoch": 1.1167339335610058,
      "grad_norm": 0.29366788268089294,
      "learning_rate": 0.00015840413428936767,
      "loss": 0.1237,
      "step": 450
    },
    {
      "epoch": 1.1167339335610058,
      "eval_loss": 0.1281893253326416,
      "eval_runtime": 38.858,
      "eval_samples_per_second": 4.375,
      "eval_steps_per_second": 4.375,
      "step": 450
    },
    {
      "epoch": 1.1415709407016454,
      "grad_norm": 0.4872620105743408,
      "learning_rate": 0.00015603626678740263,
      "loss": 0.1364,
      "step": 460
    },
    {
      "epoch": 1.166407947842285,
      "grad_norm": 0.14464133977890015,
      "learning_rate": 0.000153621681637029,
      "loss": 0.1354,
      "step": 470
    },
    {
      "epoch": 1.1912449549829245,
      "grad_norm": 0.11566495150327682,
      "learning_rate": 0.00015116239188705556,
      "loss": 0.1337,
      "step": 480
    },
    {
      "epoch": 1.1912449549829245,
      "eval_loss": 0.1254083812236786,
      "eval_runtime": 39.142,
      "eval_samples_per_second": 4.343,
      "eval_steps_per_second": 4.343,
      "step": 480
    },
    {
      "epoch": 1.2160819621235641,
      "grad_norm": 0.14898011088371277,
      "learning_rate": 0.00014866044785668563,
      "loss": 0.1383,
      "step": 490
    },
    {
      "epoch": 1.2409189692642038,
      "grad_norm": 0.09731869399547577,
      "learning_rate": 0.00014611793542615803,
      "loss": 0.1353,
      "step": 500
    },
    {
      "epoch": 1.2657559764048432,
      "grad_norm": 0.088087297976017,
      "learning_rate": 0.00014353697429774084,
      "loss": 0.1277,
      "step": 510
    },
    {
      "epoch": 1.2657559764048432,
      "eval_loss": 0.1225721463561058,
      "eval_runtime": 38.987,
      "eval_samples_per_second": 4.36,
      "eval_steps_per_second": 4.36,
      "step": 510
    },
    {
      "epoch": 1.2905929835454828,
      "grad_norm": 0.0956592932343483,
      "learning_rate": 0.0001409197162285275,
      "loss": 0.1334,
      "step": 520
    },
    {
      "epoch": 1.3154299906861224,
      "grad_norm": 0.09264901280403137,
      "learning_rate": 0.000138268343236509,
      "loss": 0.1355,
      "step": 530
    },
    {
      "epoch": 1.3402669978267618,
      "grad_norm": 0.0848374217748642,
      "learning_rate": 0.00013558506578141682,
      "loss": 0.1331,
      "step": 540
    },
    {
      "epoch": 1.3402669978267618,
      "eval_loss": 0.12285340577363968,
      "eval_runtime": 39.0259,
      "eval_samples_per_second": 4.356,
      "eval_steps_per_second": 4.356,
      "step": 540
    },
    {
      "epoch": 1.3651040049674015,
      "grad_norm": 0.13393981754779816,
      "learning_rate": 0.00013287212092185464,
      "loss": 0.1293,
      "step": 550
    },
    {
      "epoch": 1.389941012108041,
      "grad_norm": 0.1015724316239357,
      "learning_rate": 0.00013013177045025374,
      "loss": 0.1336,
      "step": 560
    },
    {
      "epoch": 1.4147780192486805,
      "grad_norm": 0.09620890021324158,
      "learning_rate": 0.0001273662990072083,
      "loss": 0.1327,
      "step": 570
    },
    {
      "epoch": 1.4147780192486805,
      "eval_loss": 0.12293345481157303,
      "eval_runtime": 39.0058,
      "eval_samples_per_second": 4.358,
      "eval_steps_per_second": 4.358,
      "step": 570
    },
    {
      "epoch": 1.4396150263893202,
      "grad_norm": 0.10039085894823074,
      "learning_rate": 0.00012457801217676182,
      "loss": 0.1358,
      "step": 580
    },
    {
      "epoch": 1.4644520335299596,
      "grad_norm": 0.0853387713432312,
      "learning_rate": 0.00012176923456423284,
      "loss": 0.1294,
      "step": 590
    },
    {
      "epoch": 1.4892890406705992,
      "grad_norm": 0.10647860914468765,
      "learning_rate": 0.00011894230785818284,
      "loss": 0.1344,
      "step": 600
    },
    {
      "epoch": 1.4892890406705992,
      "eval_loss": 0.12142268568277359,
      "eval_runtime": 38.984,
      "eval_samples_per_second": 4.361,
      "eval_steps_per_second": 4.361,
      "step": 600
    },
    {
      "epoch": 1.5141260478112386,
      "grad_norm": 0.10750167816877365,
      "learning_rate": 0.00011609958887814129,
      "loss": 0.1328,
      "step": 610
    },
    {
      "epoch": 1.5389630549518785,
      "grad_norm": 0.10130150616168976,
      "learning_rate": 0.00011324344760971671,
      "loss": 0.1305,
      "step": 620
    },
    {
      "epoch": 1.5638000620925179,
      "grad_norm": 0.08659154921770096,
      "learning_rate": 0.00011037626522873019,
      "loss": 0.1329,
      "step": 630
    },
    {
      "epoch": 1.5638000620925179,
      "eval_loss": 0.12055275589227676,
      "eval_runtime": 38.9483,
      "eval_samples_per_second": 4.365,
      "eval_steps_per_second": 4.365,
      "step": 630
    },
    {
      "epoch": 1.5886370692331573,
      "grad_norm": 0.10068133473396301,
      "learning_rate": 0.00010750043211602045,
      "loss": 0.1332,
      "step": 640
    },
    {
      "epoch": 1.613474076373797,
      "grad_norm": 0.10626350343227386,
      "learning_rate": 0.00010461834586457398,
      "loss": 0.1265,
      "step": 650
    },
    {
      "epoch": 1.6383110835144366,
      "grad_norm": 0.08024556934833527,
      "learning_rate": 0.00010173240928064285,
      "loss": 0.1188,
      "step": 660
    },
    {
      "epoch": 1.6383110835144366,
      "eval_loss": 0.12125765532255173,
      "eval_runtime": 39.0777,
      "eval_samples_per_second": 4.35,
      "eval_steps_per_second": 4.35,
      "step": 660
    },
    {
      "epoch": 1.663148090655076,
      "grad_norm": 0.08063840121030807,
      "learning_rate": 9.884502838051595e-05,
      "loss": 0.1325,
      "step": 670
    },
    {
      "epoch": 1.6879850977957156,
      "grad_norm": 0.0849422737956047,
      "learning_rate": 9.595861038461398e-05,
      "loss": 0.1322,
      "step": 680
    },
    {
      "epoch": 1.7128221049363552,
      "grad_norm": 0.09707438945770264,
      "learning_rate": 9.307556171058085e-05,
      "loss": 0.1303,
      "step": 690
    },
    {
      "epoch": 1.7128221049363552,
      "eval_loss": 0.119967520236969,
      "eval_runtime": 38.9279,
      "eval_samples_per_second": 4.367,
      "eval_steps_per_second": 4.367,
      "step": 690
    },
    {
      "epoch": 1.7376591120769946,
      "grad_norm": 0.06874745339155197,
      "learning_rate": 9.019828596704394e-05,
      "loss": 0.1323,
      "step": 700
    },
    {
      "epoch": 1.7624961192176343,
      "grad_norm": 0.09782899171113968,
      "learning_rate": 8.732918194971664e-05,
      "loss": 0.1269,
      "step": 710
    },
    {
      "epoch": 1.787333126358274,
      "grad_norm": 0.08459767699241638,
      "learning_rate": 8.447064164151304e-05,
      "loss": 0.1383,
      "step": 720
    },
    {
      "epoch": 1.787333126358274,
      "eval_loss": 0.11947113275527954,
      "eval_runtime": 39.0046,
      "eval_samples_per_second": 4.358,
      "eval_steps_per_second": 4.358,
      "step": 720
    },
    {
      "epoch": 1.8121701334989133,
      "grad_norm": 0.10213370621204376,
      "learning_rate": 8.162504821834295e-05,
      "loss": 0.1306,
      "step": 730
    },
    {
      "epoch": 1.837007140639553,
      "grad_norm": 0.08585705608129501,
      "learning_rate": 7.879477406224894e-05,
      "loss": 0.1316,
      "step": 740
    },
    {
      "epoch": 1.8618441477801926,
      "grad_norm": 0.08754217624664307,
      "learning_rate": 7.598217878354237e-05,
      "loss": 0.1314,
      "step": 750
    },
    {
      "epoch": 1.8618441477801926,
      "eval_loss": 0.11994090676307678,
      "eval_runtime": 39.0279,
      "eval_samples_per_second": 4.356,
      "eval_steps_per_second": 4.356,
      "step": 750
    },
    {
      "epoch": 1.886681154920832,
      "grad_norm": 0.09176009893417358,
      "learning_rate": 7.318960725358741e-05,
      "loss": 0.1272,
      "step": 760
    },
    {
      "epoch": 1.9115181620614716,
      "grad_norm": 0.07954169064760208,
      "learning_rate": 7.041938764987297e-05,
      "loss": 0.1254,
      "step": 770
    },
    {
      "epoch": 1.9363551692021113,
      "grad_norm": 0.10337759554386139,
      "learning_rate": 6.767382951500204e-05,
      "loss": 0.1324,
      "step": 780
    },
    {
      "epoch": 1.9363551692021113,
      "eval_loss": 0.12009570002555847,
      "eval_runtime": 39.0135,
      "eval_samples_per_second": 4.357,
      "eval_steps_per_second": 4.357,
      "step": 780
    },
    {
      "epoch": 1.9611921763427507,
      "grad_norm": 0.0923035591840744,
      "learning_rate": 6.495522183121741e-05,
      "loss": 0.1312,
      "step": 790
    },
    {
      "epoch": 1.9860291834833903,
      "grad_norm": 0.08417502790689468,
      "learning_rate": 6.226583111206856e-05,
      "loss": 0.1308,
      "step": 800
    },
    {
      "epoch": 2.009934802856256,
      "grad_norm": 0.0744447335600853,
      "learning_rate": 5.960789951281052e-05,
      "loss": 0.1226,
      "step": 810
    },
    {
      "epoch": 2.009934802856256,
      "eval_loss": 0.11844488978385925,
      "eval_runtime": 39.1559,
      "eval_samples_per_second": 4.342,
      "eval_steps_per_second": 4.342,
      "step": 810
    },
    {
      "epoch": 2.0347718099968954,
      "grad_norm": 0.07120949774980545,
      "learning_rate": 5.698364296111056e-05,
      "loss": 0.1307,
      "step": 820
    },
    {
      "epoch": 2.059608817137535,
      "grad_norm": 0.10359616577625275,
      "learning_rate": 5.43952493096211e-05,
      "loss": 0.1235,
      "step": 830
    },
    {
      "epoch": 2.0844458242781743,
      "grad_norm": 0.07456395030021667,
      "learning_rate": 5.184487651195825e-05,
      "loss": 0.1251,
      "step": 840
    },
    {
      "epoch": 2.0844458242781743,
      "eval_loss": 0.11879772692918777,
      "eval_runtime": 39.037,
      "eval_samples_per_second": 4.355,
      "eval_steps_per_second": 4.355,
      "step": 840
    },
    {
      "epoch": 2.109282831418814,
      "grad_norm": 0.08009591698646545,
      "learning_rate": 4.933465082360807e-05,
      "loss": 0.1349,
      "step": 850
    },
    {
      "epoch": 2.1341198385594535,
      "grad_norm": 0.09120004624128342,
      "learning_rate": 4.686666502925908e-05,
      "loss": 0.1268,
      "step": 860
    },
    {
      "epoch": 2.158956845700093,
      "grad_norm": 0.06066734343767166,
      "learning_rate": 4.444297669803981e-05,
      "loss": 0.1259,
      "step": 870
    },
    {
      "epoch": 2.158956845700093,
      "eval_loss": 0.11877793818712234,
      "eval_runtime": 38.9985,
      "eval_samples_per_second": 4.359,
      "eval_steps_per_second": 4.359,
      "step": 870
    },
    {
      "epoch": 2.183793852840733,
      "grad_norm": 0.09893694519996643,
      "learning_rate": 4.206560646811545e-05,
      "loss": 0.1295,
      "step": 880
    },
    {
      "epoch": 2.208630859981372,
      "grad_norm": 0.0868426039814949,
      "learning_rate": 3.973653636207437e-05,
      "loss": 0.1193,
      "step": 890
    },
    {
      "epoch": 2.2334678671220116,
      "grad_norm": 0.0975928083062172,
      "learning_rate": 3.745770813450824e-05,
      "loss": 0.1282,
      "step": 900
    },
    {
      "epoch": 2.2334678671220116,
      "eval_loss": 0.1184425801038742,
      "eval_runtime": 38.8574,
      "eval_samples_per_second": 4.375,
      "eval_steps_per_second": 4.375,
      "step": 900
    },
    {
      "epoch": 2.2583048742626515,
      "grad_norm": 0.07092616707086563,
      "learning_rate": 3.523102165316381e-05,
      "loss": 0.1212,
      "step": 910
    },
    {
      "epoch": 2.283141881403291,
      "grad_norm": 0.0809750035405159,
      "learning_rate": 3.3058333315016065e-05,
      "loss": 0.1252,
      "step": 920
    },
    {
      "epoch": 2.3079788885439303,
      "grad_norm": 0.10307040810585022,
      "learning_rate": 3.094145449858285e-05,
      "loss": 0.1263,
      "step": 930
    },
    {
      "epoch": 2.3079788885439303,
      "eval_loss": 0.11812498420476913,
      "eval_runtime": 38.9098,
      "eval_samples_per_second": 4.369,
      "eval_steps_per_second": 4.369,
      "step": 930
    },
    {
      "epoch": 2.33281589568457,
      "grad_norm": 0.087245412170887,
      "learning_rate": 2.8882150053771995e-05,
      "loss": 0.1261,
      "step": 940
    },
    {
      "epoch": 2.3576529028252096,
      "grad_norm": 0.08954016864299774,
      "learning_rate": 2.688213683051892e-05,
      "loss": 0.13,
      "step": 950
    },
    {
      "epoch": 2.382489909965849,
      "grad_norm": 0.08771616220474243,
      "learning_rate": 2.4943082247442585e-05,
      "loss": 0.1232,
      "step": 960
    },
    {
      "epoch": 2.382489909965849,
      "eval_loss": 0.1186952143907547,
      "eval_runtime": 38.9954,
      "eval_samples_per_second": 4.359,
      "eval_steps_per_second": 4.359,
      "step": 960
    },
    {
      "epoch": 2.407326917106489,
      "grad_norm": 0.09044978022575378,
      "learning_rate": 2.3066602901712108e-05,
      "loss": 0.1269,
      "step": 970
    },
    {
      "epoch": 2.4321639242471282,
      "grad_norm": 0.07978509366512299,
      "learning_rate": 2.1254263221283654e-05,
      "loss": 0.1234,
      "step": 980
    },
    {
      "epoch": 2.4570009313877677,
      "grad_norm": 0.09153130650520325,
      "learning_rate": 1.950757416063077e-05,
      "loss": 0.1219,
      "step": 990
    },
    {
      "epoch": 2.4570009313877677,
      "eval_loss": 0.11810684204101562,
      "eval_runtime": 39.0556,
      "eval_samples_per_second": 4.353,
      "eval_steps_per_second": 4.353,
      "step": 990
    },
    {
      "epoch": 2.4818379385284075,
      "grad_norm": 0.09865284711122513,
      "learning_rate": 1.7827991941056177e-05,
      "loss": 0.1196,
      "step": 1000
    },
    {
      "epoch": 2.506674945669047,
      "grad_norm": 0.08960665762424469,
      "learning_rate": 1.621691683663418e-05,
      "loss": 0.1279,
      "step": 1010
    },
    {
      "epoch": 2.5315119528096863,
      "grad_norm": 0.09712927043437958,
      "learning_rate": 1.4675692006797137e-05,
      "loss": 0.125,
      "step": 1020
    },
    {
      "epoch": 2.5315119528096863,
      "eval_loss": 0.11813132464885712,
      "eval_runtime": 38.9216,
      "eval_samples_per_second": 4.368,
      "eval_steps_per_second": 4.368,
      "step": 1020
    },
    {
      "epoch": 2.556348959950326,
      "grad_norm": 0.10061313211917877,
      "learning_rate": 1.3205602376538163e-05,
      "loss": 0.128,
      "step": 1030
    },
    {
      "epoch": 2.5811859670909656,
      "grad_norm": 0.07715890556573868,
      "learning_rate": 1.1807873565164506e-05,
      "loss": 0.1231,
      "step": 1040
    },
    {
      "epoch": 2.606022974231605,
      "grad_norm": 0.09734167903661728,
      "learning_rate": 1.0483670864493778e-05,
      "loss": 0.133,
      "step": 1050
    },
    {
      "epoch": 2.606022974231605,
      "eval_loss": 0.11806084215641022,
      "eval_runtime": 39.1009,
      "eval_samples_per_second": 4.348,
      "eval_steps_per_second": 4.348,
      "step": 1050
    },
    {
      "epoch": 2.630859981372245,
      "grad_norm": 0.07243157923221588,
      "learning_rate": 9.234098267345958e-06,
      "loss": 0.1278,
      "step": 1060
    },
    {
      "epoch": 2.6556969885128843,
      "grad_norm": 0.12202966213226318,
      "learning_rate": 8.060197547140347e-06,
      "loss": 0.1228,
      "step": 1070
    },
    {
      "epoch": 2.6805339956535237,
      "grad_norm": 0.07589470595121384,
      "learning_rate": 6.962947389365071e-06,
      "loss": 0.1225,
      "step": 1080
    },
    {
      "epoch": 2.6805339956535237,
      "eval_loss": 0.11801353096961975,
      "eval_runtime": 39.1137,
      "eval_samples_per_second": 4.346,
      "eval_steps_per_second": 4.346,
      "step": 1080
    },
    {
      "epoch": 2.7053710027941635,
      "grad_norm": 0.11445856839418411,
      "learning_rate": 5.943262575643238e-06,
      "loss": 0.135,
      "step": 1090
    },
    {
      "epoch": 2.730208009934803,
      "grad_norm": 0.09918007999658585,
      "learning_rate": 5.001993221076162e-06,
      "loss": 0.1312,
      "step": 1100
    },
    {
      "epoch": 2.7550450170754424,
      "grad_norm": 0.08956551551818848,
      "learning_rate": 4.139924065499035e-06,
      "loss": 0.1188,
      "step": 1110
    },
    {
      "epoch": 2.7550450170754424,
      "eval_loss": 0.11794496327638626,
      "eval_runtime": 39.0409,
      "eval_samples_per_second": 4.354,
      "eval_steps_per_second": 4.354,
      "step": 1110
    },
    {
      "epoch": 2.779882024216082,
      "grad_norm": 0.09554298222064972,
      "learning_rate": 3.3577738192404395e-06,
      "loss": 0.124,
      "step": 1120
    },
    {
      "epoch": 2.8047190313567216,
      "grad_norm": 0.09960418939590454,
      "learning_rate": 2.656194563930714e-06,
      "loss": 0.1295,
      "step": 1130
    },
    {
      "epoch": 2.829556038497361,
      "grad_norm": 0.11168912798166275,
      "learning_rate": 2.035771208859194e-06,
      "loss": 0.1243,
      "step": 1140
    },
    {
      "epoch": 2.829556038497361,
      "eval_loss": 0.11804591119289398,
      "eval_runtime": 39.2098,
      "eval_samples_per_second": 4.336,
      "eval_steps_per_second": 4.336,
      "step": 1140
    },
    {
      "epoch": 2.854393045638001,
      "grad_norm": 0.08320163935422897,
      "learning_rate": 1.49702100333291e-06,
      "loss": 0.1238,
      "step": 1150
    },
    {
      "epoch": 2.8792300527786403,
      "grad_norm": 0.11576654016971588,
      "learning_rate": 1.0403931054440374e-06,
      "loss": 0.1206,
      "step": 1160
    },
    {
      "epoch": 2.9040670599192797,
      "grad_norm": 0.09662230312824249,
      "learning_rate": 6.662682076050031e-07,
      "loss": 0.1205,
      "step": 1170
    },
    {
      "epoch": 2.9040670599192797,
      "eval_loss": 0.11792467534542084,
      "eval_runtime": 39.0045,
      "eval_samples_per_second": 4.358,
      "eval_steps_per_second": 4.358,
      "step": 1170
    },
    {
      "epoch": 2.928904067059919,
      "grad_norm": 0.09648039191961288,
      "learning_rate": 3.7495821916382344e-07,
      "loss": 0.129,
      "step": 1180
    },
    {
      "epoch": 2.953741074200559,
      "grad_norm": 0.08923009783029556,
      "learning_rate": 1.6670600636403687e-07,
      "loss": 0.1237,
      "step": 1190
    },
    {
      "epoch": 2.9785780813411984,
      "grad_norm": 0.08443022519350052,
      "learning_rate": 4.168518986628067e-08,
      "loss": 0.1196,
      "step": 1200
    },
    {
      "epoch": 2.9785780813411984,
      "eval_loss": 0.11797202378511429,
      "eval_runtime": 39.051,
      "eval_samples_per_second": 4.353,
      "eval_steps_per_second": 4.353,
      "step": 1200
    }
  ],
  "logging_steps": 10,
  "max_steps": 1209,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8653749005815194e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}