{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100, "global_step": 2669, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00937207122774133, "grad_norm": 32.909061431884766, "learning_rate": 5.970149253731343e-07, "loss": 1.2647, "step": 5 }, { "epoch": 0.01874414245548266, "grad_norm": 25.104095458984375, "learning_rate": 1.3432835820895524e-06, "loss": 1.1231, "step": 10 }, { "epoch": 0.028116213683223992, "grad_norm": 9.685723304748535, "learning_rate": 2.08955223880597e-06, "loss": 0.7966, "step": 15 }, { "epoch": 0.03748828491096532, "grad_norm": 3.455462694168091, "learning_rate": 2.835820895522388e-06, "loss": 0.59, "step": 20 }, { "epoch": 0.046860356138706656, "grad_norm": 3.022719621658325, "learning_rate": 3.582089552238806e-06, "loss": 0.5539, "step": 25 }, { "epoch": 0.056232427366447985, "grad_norm": 2.2470591068267822, "learning_rate": 4.3283582089552236e-06, "loss": 0.4943, "step": 30 }, { "epoch": 0.06560449859418932, "grad_norm": 2.7969272136688232, "learning_rate": 5.074626865671642e-06, "loss": 0.5235, "step": 35 }, { "epoch": 0.07497656982193064, "grad_norm": 4.2170538902282715, "learning_rate": 5.820895522388061e-06, "loss": 0.5126, "step": 40 }, { "epoch": 0.08434864104967198, "grad_norm": 34.95182418823242, "learning_rate": 6.567164179104478e-06, "loss": 0.4857, "step": 45 }, { "epoch": 0.09372071227741331, "grad_norm": 2.0386860370635986, "learning_rate": 7.313432835820896e-06, "loss": 0.4885, "step": 50 }, { "epoch": 0.10309278350515463, "grad_norm": 2.5301098823547363, "learning_rate": 8.059701492537314e-06, "loss": 0.48, "step": 55 }, { "epoch": 0.11246485473289597, "grad_norm": 2.5187911987304688, "learning_rate": 8.805970149253732e-06, "loss": 0.4729, "step": 60 }, { "epoch": 0.1218369259606373, "grad_norm": 2.380256175994873, "learning_rate": 9.552238805970149e-06, "loss": 0.5112, "step": 65 }, { "epoch": 0.13120899718837864, "grad_norm": 2.1329147815704346, "learning_rate": 1.029850746268657e-05, "loss": 0.4578, "step": 70 }, { "epoch": 0.14058106841611998, "grad_norm": 2.7913565635681152, "learning_rate": 1.1044776119402986e-05, "loss": 0.4598, "step": 75 }, { "epoch": 0.14995313964386128, "grad_norm": 2.056675910949707, "learning_rate": 1.1791044776119405e-05, "loss": 0.4831, "step": 80 }, { "epoch": 0.15932521087160262, "grad_norm": 2.386592388153076, "learning_rate": 1.2537313432835823e-05, "loss": 0.473, "step": 85 }, { "epoch": 0.16869728209934395, "grad_norm": 2.63767409324646, "learning_rate": 1.328358208955224e-05, "loss": 0.4841, "step": 90 }, { "epoch": 0.1780693533270853, "grad_norm": 2.0835254192352295, "learning_rate": 1.4029850746268658e-05, "loss": 0.4657, "step": 95 }, { "epoch": 0.18744142455482662, "grad_norm": 2.168680429458618, "learning_rate": 1.4776119402985077e-05, "loss": 0.4937, "step": 100 }, { "epoch": 0.18744142455482662, "eval_loss": 0.6319828033447266, "eval_runtime": 111.5664, "eval_samples_per_second": 4.482, "eval_steps_per_second": 2.241, "step": 100 }, { "epoch": 0.19681349578256796, "grad_norm": 1.953171730041504, "learning_rate": 1.5522388059701494e-05, "loss": 0.4405, "step": 105 }, { "epoch": 0.20618556701030927, "grad_norm": 1.805274248123169, "learning_rate": 1.626865671641791e-05, "loss": 0.4922, "step": 110 }, { "epoch": 0.2155576382380506, "grad_norm": 2.0582938194274902, "learning_rate": 1.701492537313433e-05, "loss": 0.4722, "step": 115 }, { "epoch": 0.22492970946579194, "grad_norm": 2.067007064819336, "learning_rate": 1.7761194029850748e-05, "loss": 0.4876, "step": 120 }, { "epoch": 0.23430178069353327, "grad_norm": 2.0720055103302, "learning_rate": 1.8507462686567165e-05, "loss": 0.479, "step": 125 }, { "epoch": 0.2436738519212746, "grad_norm": 2.026207208633423, "learning_rate": 1.9253731343283585e-05, "loss": 0.4642, "step": 130 }, { "epoch": 0.2530459231490159, "grad_norm": 1.9669533967971802, "learning_rate": 2e-05, "loss": 0.479, "step": 135 }, { "epoch": 0.2624179943767573, "grad_norm": 1.7911089658737183, "learning_rate": 1.9999808172939662e-05, "loss": 0.484, "step": 140 }, { "epoch": 0.2717900656044986, "grad_norm": 1.6933950185775757, "learning_rate": 1.9999232699118173e-05, "loss": 0.4945, "step": 145 }, { "epoch": 0.28116213683223995, "grad_norm": 1.988083839416504, "learning_rate": 1.9998273600613825e-05, "loss": 0.5123, "step": 150 }, { "epoch": 0.29053420805998126, "grad_norm": 2.103421688079834, "learning_rate": 1.999693091422282e-05, "loss": 0.4682, "step": 155 }, { "epoch": 0.29990627928772257, "grad_norm": 2.0768039226531982, "learning_rate": 1.9995204691457883e-05, "loss": 0.4885, "step": 160 }, { "epoch": 0.30927835051546393, "grad_norm": 1.8248564004898071, "learning_rate": 1.9993094998546257e-05, "loss": 0.4735, "step": 165 }, { "epoch": 0.31865042174320524, "grad_norm": 1.6503818035125732, "learning_rate": 1.9990601916427183e-05, "loss": 0.4733, "step": 170 }, { "epoch": 0.3280224929709466, "grad_norm": 1.6220550537109375, "learning_rate": 1.998772554074878e-05, "loss": 0.4898, "step": 175 }, { "epoch": 0.3373945641986879, "grad_norm": 1.5691450834274292, "learning_rate": 1.9984465981864393e-05, "loss": 0.4697, "step": 180 }, { "epoch": 0.3467666354264292, "grad_norm": 1.856009602546692, "learning_rate": 1.998082336482833e-05, "loss": 0.46, "step": 185 }, { "epoch": 0.3561387066541706, "grad_norm": 1.7977851629257202, "learning_rate": 1.9976797829391104e-05, "loss": 0.5193, "step": 190 }, { "epoch": 0.3655107778819119, "grad_norm": 1.5994986295700073, "learning_rate": 1.9972389529994043e-05, "loss": 0.4666, "step": 195 }, { "epoch": 0.37488284910965325, "grad_norm": 1.8488245010375977, "learning_rate": 1.996759863576336e-05, "loss": 0.511, "step": 200 }, { "epoch": 0.37488284910965325, "eval_loss": 0.6320933699607849, "eval_runtime": 111.4483, "eval_samples_per_second": 4.486, "eval_steps_per_second": 2.243, "step": 200 }, { "epoch": 0.38425492033739456, "grad_norm": 2.441446542739868, "learning_rate": 1.9962425330503693e-05, "loss": 0.4696, "step": 205 }, { "epoch": 0.3936269915651359, "grad_norm": 1.8430964946746826, "learning_rate": 1.995686981269103e-05, "loss": 0.4649, "step": 210 }, { "epoch": 0.4029990627928772, "grad_norm": 1.7581799030303955, "learning_rate": 1.9950932295465102e-05, "loss": 0.4885, "step": 215 }, { "epoch": 0.41237113402061853, "grad_norm": 1.6407780647277832, "learning_rate": 1.9944613006621197e-05, "loss": 0.4754, "step": 220 }, { "epoch": 0.4217432052483599, "grad_norm": 1.6698272228240967, "learning_rate": 1.9937912188601444e-05, "loss": 0.4823, "step": 225 }, { "epoch": 0.4311152764761012, "grad_norm": 1.5131304264068604, "learning_rate": 1.9930830098485484e-05, "loss": 0.4692, "step": 230 }, { "epoch": 0.44048734770384257, "grad_norm": 1.6762291193008423, "learning_rate": 1.992336700798062e-05, "loss": 0.4901, "step": 235 }, { "epoch": 0.4498594189315839, "grad_norm": 1.7265088558197021, "learning_rate": 1.9915523203411397e-05, "loss": 0.4627, "step": 240 }, { "epoch": 0.4592314901593252, "grad_norm": 1.5664938688278198, "learning_rate": 1.990729898570861e-05, "loss": 0.4715, "step": 245 }, { "epoch": 0.46860356138706655, "grad_norm": 1.6908628940582275, "learning_rate": 1.989869467039776e-05, "loss": 0.4984, "step": 250 }, { "epoch": 0.47797563261480785, "grad_norm": 1.3762125968933105, "learning_rate": 1.9889710587586953e-05, "loss": 0.4663, "step": 255 }, { "epoch": 0.4873477038425492, "grad_norm": 1.605967402458191, "learning_rate": 1.9880347081954217e-05, "loss": 0.4711, "step": 260 }, { "epoch": 0.4967197750702905, "grad_norm": 1.5448018312454224, "learning_rate": 1.987060451273432e-05, "loss": 0.4637, "step": 265 }, { "epoch": 0.5060918462980318, "grad_norm": 1.4862360954284668, "learning_rate": 1.986048325370493e-05, "loss": 0.4614, "step": 270 }, { "epoch": 0.5154639175257731, "grad_norm": 1.587640643119812, "learning_rate": 1.9849983693172324e-05, "loss": 0.4819, "step": 275 }, { "epoch": 0.5248359887535146, "grad_norm": 1.5963493585586548, "learning_rate": 1.9839106233956474e-05, "loss": 0.4912, "step": 280 }, { "epoch": 0.5342080599812559, "grad_norm": 1.4431391954421997, "learning_rate": 1.982785129337558e-05, "loss": 0.4727, "step": 285 }, { "epoch": 0.5435801312089972, "grad_norm": 1.4035024642944336, "learning_rate": 1.9816219303230077e-05, "loss": 0.4642, "step": 290 }, { "epoch": 0.5529522024367385, "grad_norm": 1.5492186546325684, "learning_rate": 1.980421070978606e-05, "loss": 0.4881, "step": 295 }, { "epoch": 0.5623242736644799, "grad_norm": 1.4136260747909546, "learning_rate": 1.9791825973758167e-05, "loss": 0.4657, "step": 300 }, { "epoch": 0.5623242736644799, "eval_loss": 0.6458946466445923, "eval_runtime": 111.4442, "eval_samples_per_second": 4.487, "eval_steps_per_second": 2.243, "step": 300 }, { "epoch": 0.5716963448922212, "grad_norm": 1.6971220970153809, "learning_rate": 1.9779065570291894e-05, "loss": 0.4685, "step": 305 }, { "epoch": 0.5810684161199625, "grad_norm": 1.4709703922271729, "learning_rate": 1.9765929988945382e-05, "loss": 0.4948, "step": 310 }, { "epoch": 0.5904404873477038, "grad_norm": 1.7003625631332397, "learning_rate": 1.975241973367062e-05, "loss": 0.4963, "step": 315 }, { "epoch": 0.5998125585754451, "grad_norm": 1.4582606554031372, "learning_rate": 1.9738535322794122e-05, "loss": 0.4827, "step": 320 }, { "epoch": 0.6091846298031866, "grad_norm": 1.4838789701461792, "learning_rate": 1.972427728899703e-05, "loss": 0.4545, "step": 325 }, { "epoch": 0.6185567010309279, "grad_norm": 1.3947144746780396, "learning_rate": 1.9709646179294687e-05, "loss": 0.4712, "step": 330 }, { "epoch": 0.6279287722586692, "grad_norm": 1.554185390472412, "learning_rate": 1.9694642555015643e-05, "loss": 0.4702, "step": 335 }, { "epoch": 0.6373008434864105, "grad_norm": 1.6660090684890747, "learning_rate": 1.9679266991780128e-05, "loss": 0.5128, "step": 340 }, { "epoch": 0.6466729147141518, "grad_norm": 1.5578211545944214, "learning_rate": 1.966352007947796e-05, "loss": 0.4844, "step": 345 }, { "epoch": 0.6560449859418932, "grad_norm": 1.4711651802062988, "learning_rate": 1.964740242224592e-05, "loss": 0.4798, "step": 350 }, { "epoch": 0.6654170571696345, "grad_norm": 1.7752223014831543, "learning_rate": 1.9630914638444572e-05, "loss": 0.4922, "step": 355 }, { "epoch": 0.6747891283973758, "grad_norm": 1.8880064487457275, "learning_rate": 1.961405736063453e-05, "loss": 0.4928, "step": 360 }, { "epoch": 0.6841611996251171, "grad_norm": 1.5307488441467285, "learning_rate": 1.9596831235552205e-05, "loss": 0.4492, "step": 365 }, { "epoch": 0.6935332708528584, "grad_norm": 1.4398698806762695, "learning_rate": 1.957923692408499e-05, "loss": 0.45, "step": 370 }, { "epoch": 0.7029053420805998, "grad_norm": 1.5632487535476685, "learning_rate": 1.9561275101245886e-05, "loss": 0.4878, "step": 375 }, { "epoch": 0.7122774133083412, "grad_norm": 1.4883025884628296, "learning_rate": 1.954294645614763e-05, "loss": 0.4799, "step": 380 }, { "epoch": 0.7216494845360825, "grad_norm": 1.4776026010513306, "learning_rate": 1.9524251691976243e-05, "loss": 0.5043, "step": 385 }, { "epoch": 0.7310215557638238, "grad_norm": 1.5219204425811768, "learning_rate": 1.950519152596406e-05, "loss": 0.4737, "step": 390 }, { "epoch": 0.7403936269915652, "grad_norm": 1.6262823343276978, "learning_rate": 1.9485766689362205e-05, "loss": 0.4575, "step": 395 }, { "epoch": 0.7497656982193065, "grad_norm": 1.5888830423355103, "learning_rate": 1.9465977927412535e-05, "loss": 0.4577, "step": 400 }, { "epoch": 0.7497656982193065, "eval_loss": 0.6419793963432312, "eval_runtime": 111.6005, "eval_samples_per_second": 4.48, "eval_steps_per_second": 2.24, "step": 400 }, { "epoch": 0.7591377694470478, "grad_norm": 1.4423269033432007, "learning_rate": 1.9445825999319057e-05, "loss": 0.4451, "step": 405 }, { "epoch": 0.7685098406747891, "grad_norm": 1.5620957612991333, "learning_rate": 1.94253116782188e-05, "loss": 0.4578, "step": 410 }, { "epoch": 0.7778819119025304, "grad_norm": 1.326604962348938, "learning_rate": 1.9404435751152134e-05, "loss": 0.4772, "step": 415 }, { "epoch": 0.7872539831302718, "grad_norm": 1.3322705030441284, "learning_rate": 1.938319901903262e-05, "loss": 0.4829, "step": 420 }, { "epoch": 0.7966260543580131, "grad_norm": 1.4845640659332275, "learning_rate": 1.9361602296616223e-05, "loss": 0.4598, "step": 425 }, { "epoch": 0.8059981255857545, "grad_norm": 1.4712871313095093, "learning_rate": 1.9339646412470106e-05, "loss": 0.4695, "step": 430 }, { "epoch": 0.8153701968134958, "grad_norm": 1.3889317512512207, "learning_rate": 1.931733220894081e-05, "loss": 0.447, "step": 435 }, { "epoch": 0.8247422680412371, "grad_norm": 1.4443881511688232, "learning_rate": 1.9294660542121944e-05, "loss": 0.4662, "step": 440 }, { "epoch": 0.8341143392689785, "grad_norm": 1.5155857801437378, "learning_rate": 1.9271632281821354e-05, "loss": 0.4873, "step": 445 }, { "epoch": 0.8434864104967198, "grad_norm": 1.5591299533843994, "learning_rate": 1.9248248311527735e-05, "loss": 0.4942, "step": 450 }, { "epoch": 0.8528584817244611, "grad_norm": 1.5808844566345215, "learning_rate": 1.9224509528376737e-05, "loss": 0.472, "step": 455 }, { "epoch": 0.8622305529522024, "grad_norm": 1.8616470098495483, "learning_rate": 1.9200416843116562e-05, "loss": 0.4577, "step": 460 }, { "epoch": 0.8716026241799437, "grad_norm": 1.918115496635437, "learning_rate": 1.9175971180073012e-05, "loss": 0.4774, "step": 465 }, { "epoch": 0.8809746954076851, "grad_norm": 1.411353349685669, "learning_rate": 1.9151173477114015e-05, "loss": 0.4682, "step": 470 }, { "epoch": 0.8903467666354264, "grad_norm": 1.625918984413147, "learning_rate": 1.9126024685613664e-05, "loss": 0.4923, "step": 475 }, { "epoch": 0.8997188378631678, "grad_norm": 1.388818621635437, "learning_rate": 1.9100525770415713e-05, "loss": 0.4766, "step": 480 }, { "epoch": 0.9090909090909091, "grad_norm": 1.4252229928970337, "learning_rate": 1.907467770979655e-05, "loss": 0.4622, "step": 485 }, { "epoch": 0.9184629803186504, "grad_norm": 1.6216133832931519, "learning_rate": 1.9048481495427667e-05, "loss": 0.4824, "step": 490 }, { "epoch": 0.9278350515463918, "grad_norm": 1.6171802282333374, "learning_rate": 1.9021938132337628e-05, "loss": 0.4979, "step": 495 }, { "epoch": 0.9372071227741331, "grad_norm": 1.5567573308944702, "learning_rate": 1.8995048638873494e-05, "loss": 0.4634, "step": 500 }, { "epoch": 0.9372071227741331, "eval_loss": 0.6470092535018921, "eval_runtime": 111.4606, "eval_samples_per_second": 4.486, "eval_steps_per_second": 2.243, "step": 500 }, { "epoch": 0.9465791940018744, "grad_norm": 1.3252328634262085, "learning_rate": 1.896781404666176e-05, "loss": 0.4682, "step": 505 }, { "epoch": 0.9559512652296157, "grad_norm": 1.6408551931381226, "learning_rate": 1.8940235400568784e-05, "loss": 0.4762, "step": 510 }, { "epoch": 0.9653233364573571, "grad_norm": 1.6290283203125, "learning_rate": 1.891231375866068e-05, "loss": 0.4661, "step": 515 }, { "epoch": 0.9746954076850984, "grad_norm": 1.402817964553833, "learning_rate": 1.888405019216275e-05, "loss": 0.5037, "step": 520 }, { "epoch": 0.9840674789128397, "grad_norm": 1.366844892501831, "learning_rate": 1.885544578541837e-05, "loss": 0.4596, "step": 525 }, { "epoch": 0.993439550140581, "grad_norm": 1.4287879467010498, "learning_rate": 1.8826501635847392e-05, "loss": 0.4652, "step": 530 }, { "epoch": 1.0037488284910965, "grad_norm": 1.2563650608062744, "learning_rate": 1.8797218853904037e-05, "loss": 0.4833, "step": 535 }, { "epoch": 1.013120899718838, "grad_norm": 1.2508701086044312, "learning_rate": 1.8767598563034304e-05, "loss": 0.287, "step": 540 }, { "epoch": 1.022492970946579, "grad_norm": 1.3916987180709839, "learning_rate": 1.8737641899632857e-05, "loss": 0.2859, "step": 545 }, { "epoch": 1.0318650421743205, "grad_norm": 1.4374034404754639, "learning_rate": 1.870735001299943e-05, "loss": 0.2746, "step": 550 }, { "epoch": 1.041237113402062, "grad_norm": 1.3931026458740234, "learning_rate": 1.8676724065294744e-05, "loss": 0.255, "step": 555 }, { "epoch": 1.0506091846298031, "grad_norm": 1.4676737785339355, "learning_rate": 1.864576523149589e-05, "loss": 0.2609, "step": 560 }, { "epoch": 1.0599812558575445, "grad_norm": 1.3945457935333252, "learning_rate": 1.8614474699351294e-05, "loss": 0.2595, "step": 565 }, { "epoch": 1.069353327085286, "grad_norm": 1.413190245628357, "learning_rate": 1.8582853669335107e-05, "loss": 0.2704, "step": 570 }, { "epoch": 1.0787253983130272, "grad_norm": 1.2427492141723633, "learning_rate": 1.8550903354601182e-05, "loss": 0.2444, "step": 575 }, { "epoch": 1.0880974695407686, "grad_norm": 1.3134554624557495, "learning_rate": 1.851862498093651e-05, "loss": 0.2606, "step": 580 }, { "epoch": 1.0974695407685098, "grad_norm": 1.3855392932891846, "learning_rate": 1.8486019786714194e-05, "loss": 0.263, "step": 585 }, { "epoch": 1.1068416119962512, "grad_norm": 1.4354616403579712, "learning_rate": 1.8453089022845943e-05, "loss": 0.2488, "step": 590 }, { "epoch": 1.1162136832239926, "grad_norm": 1.1863958835601807, "learning_rate": 1.8419833952734094e-05, "loss": 0.2506, "step": 595 }, { "epoch": 1.1255857544517338, "grad_norm": 1.5044498443603516, "learning_rate": 1.83862558522231e-05, "loss": 0.2661, "step": 600 }, { "epoch": 1.1255857544517338, "eval_loss": 0.6920709013938904, "eval_runtime": 111.4907, "eval_samples_per_second": 4.485, "eval_steps_per_second": 2.242, "step": 600 }, { "epoch": 1.1349578256794752, "grad_norm": 1.4557913541793823, "learning_rate": 1.835235600955064e-05, "loss": 0.265, "step": 605 }, { "epoch": 1.1443298969072164, "grad_norm": 1.3041000366210938, "learning_rate": 1.8318135725298133e-05, "loss": 0.261, "step": 610 }, { "epoch": 1.1537019681349578, "grad_norm": 1.3250569105148315, "learning_rate": 1.8283596312340893e-05, "loss": 0.2638, "step": 615 }, { "epoch": 1.1630740393626993, "grad_norm": 1.3970952033996582, "learning_rate": 1.8248739095797726e-05, "loss": 0.2642, "step": 620 }, { "epoch": 1.1724461105904405, "grad_norm": 1.4045438766479492, "learning_rate": 1.8213565412980114e-05, "loss": 0.2909, "step": 625 }, { "epoch": 1.1818181818181819, "grad_norm": 1.3580117225646973, "learning_rate": 1.8178076613340886e-05, "loss": 0.2541, "step": 630 }, { "epoch": 1.191190253045923, "grad_norm": 1.3984880447387695, "learning_rate": 1.8142274058422467e-05, "loss": 0.253, "step": 635 }, { "epoch": 1.2005623242736645, "grad_norm": 1.275099754333496, "learning_rate": 1.8106159121804633e-05, "loss": 0.2679, "step": 640 }, { "epoch": 1.209934395501406, "grad_norm": 1.4693080186843872, "learning_rate": 1.8069733189051802e-05, "loss": 0.2586, "step": 645 }, { "epoch": 1.219306466729147, "grad_norm": 1.3677211999893188, "learning_rate": 1.80329976576599e-05, "loss": 0.2877, "step": 650 }, { "epoch": 1.2286785379568885, "grad_norm": 1.376230001449585, "learning_rate": 1.7995953937002723e-05, "loss": 0.2499, "step": 655 }, { "epoch": 1.2380506091846297, "grad_norm": 1.380204677581787, "learning_rate": 1.7958603448277882e-05, "loss": 0.2426, "step": 660 }, { "epoch": 1.2474226804123711, "grad_norm": 1.4259058237075806, "learning_rate": 1.7920947624452264e-05, "loss": 0.2806, "step": 665 }, { "epoch": 1.2567947516401126, "grad_norm": 1.4305455684661865, "learning_rate": 1.7882987910207066e-05, "loss": 0.2657, "step": 670 }, { "epoch": 1.2661668228678538, "grad_norm": 1.4844595193862915, "learning_rate": 1.784472576188237e-05, "loss": 0.2704, "step": 675 }, { "epoch": 1.2755388940955952, "grad_norm": 1.28706693649292, "learning_rate": 1.780616264742126e-05, "loss": 0.2534, "step": 680 }, { "epoch": 1.2849109653233364, "grad_norm": 1.3618587255477905, "learning_rate": 1.776730004631352e-05, "loss": 0.2715, "step": 685 }, { "epoch": 1.2942830365510778, "grad_norm": 1.399498701095581, "learning_rate": 1.7728139449538848e-05, "loss": 0.2748, "step": 690 }, { "epoch": 1.3036551077788192, "grad_norm": 1.3688334226608276, "learning_rate": 1.768868235950968e-05, "loss": 0.2625, "step": 695 }, { "epoch": 1.3130271790065604, "grad_norm": 1.327973484992981, "learning_rate": 1.7648930290013532e-05, "loss": 0.2427, "step": 700 }, { "epoch": 1.3130271790065604, "eval_loss": 0.6904003620147705, "eval_runtime": 111.5048, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 700 }, { "epoch": 1.3223992502343018, "grad_norm": 1.5537965297698975, "learning_rate": 1.760888476615493e-05, "loss": 0.2487, "step": 705 }, { "epoch": 1.331771321462043, "grad_norm": 1.382699728012085, "learning_rate": 1.75685473242969e-05, "loss": 0.2417, "step": 710 }, { "epoch": 1.3411433926897844, "grad_norm": 1.4410724639892578, "learning_rate": 1.7527919512002025e-05, "loss": 0.2467, "step": 715 }, { "epoch": 1.3505154639175259, "grad_norm": 1.448276400566101, "learning_rate": 1.7487002887973057e-05, "loss": 0.2525, "step": 720 }, { "epoch": 1.359887535145267, "grad_norm": 1.4892441034317017, "learning_rate": 1.7445799021993138e-05, "loss": 0.2336, "step": 725 }, { "epoch": 1.3692596063730085, "grad_norm": 1.2686562538146973, "learning_rate": 1.7404309494865572e-05, "loss": 0.2624, "step": 730 }, { "epoch": 1.3786316776007497, "grad_norm": 1.36681067943573, "learning_rate": 1.736253589835316e-05, "loss": 0.279, "step": 735 }, { "epoch": 1.388003748828491, "grad_norm": 1.4178364276885986, "learning_rate": 1.7320479835117142e-05, "loss": 0.2634, "step": 740 }, { "epoch": 1.3973758200562325, "grad_norm": 1.7909929752349854, "learning_rate": 1.7278142918655717e-05, "loss": 0.2568, "step": 745 }, { "epoch": 1.4067478912839737, "grad_norm": 1.4352169036865234, "learning_rate": 1.7235526773242136e-05, "loss": 0.2487, "step": 750 }, { "epoch": 1.4161199625117151, "grad_norm": 1.3589709997177124, "learning_rate": 1.719263303386237e-05, "loss": 0.2612, "step": 755 }, { "epoch": 1.4254920337394563, "grad_norm": 1.3523000478744507, "learning_rate": 1.7149463346152412e-05, "loss": 0.2644, "step": 760 }, { "epoch": 1.4348641049671977, "grad_norm": 1.396602988243103, "learning_rate": 1.7106019366335113e-05, "loss": 0.2704, "step": 765 }, { "epoch": 1.4442361761949392, "grad_norm": 1.379135012626648, "learning_rate": 1.7062302761156667e-05, "loss": 0.2593, "step": 770 }, { "epoch": 1.4536082474226804, "grad_norm": 1.301147699356079, "learning_rate": 1.701831520782264e-05, "loss": 0.2592, "step": 775 }, { "epoch": 1.4629803186504218, "grad_norm": 1.4539448022842407, "learning_rate": 1.6974058393933647e-05, "loss": 0.2909, "step": 780 }, { "epoch": 1.472352389878163, "grad_norm": 1.5490386486053467, "learning_rate": 1.692953401742059e-05, "loss": 0.2771, "step": 785 }, { "epoch": 1.4817244611059044, "grad_norm": 1.4883418083190918, "learning_rate": 1.6884743786479513e-05, "loss": 0.2529, "step": 790 }, { "epoch": 1.4910965323336458, "grad_norm": 1.5105490684509277, "learning_rate": 1.6839689419506092e-05, "loss": 0.265, "step": 795 }, { "epoch": 1.5004686035613872, "grad_norm": 1.461634635925293, "learning_rate": 1.6794372645029674e-05, "loss": 0.2608, "step": 800 }, { "epoch": 1.5004686035613872, "eval_loss": 0.6895884871482849, "eval_runtime": 111.5059, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 800 }, { "epoch": 1.5098406747891284, "grad_norm": 1.523145079612732, "learning_rate": 1.6748795201646992e-05, "loss": 0.2762, "step": 805 }, { "epoch": 1.5192127460168696, "grad_norm": 1.366004228591919, "learning_rate": 1.670295883795544e-05, "loss": 0.28, "step": 810 }, { "epoch": 1.528584817244611, "grad_norm": 1.6428511142730713, "learning_rate": 1.6656865312485996e-05, "loss": 0.2489, "step": 815 }, { "epoch": 1.5379568884723525, "grad_norm": 1.31986665725708, "learning_rate": 1.6610516393635757e-05, "loss": 0.2498, "step": 820 }, { "epoch": 1.5473289597000939, "grad_norm": 1.5260220766067505, "learning_rate": 1.6563913859600102e-05, "loss": 0.338, "step": 825 }, { "epoch": 1.556701030927835, "grad_norm": 1.3370164632797241, "learning_rate": 1.6517059498304444e-05, "loss": 0.2468, "step": 830 }, { "epoch": 1.5660731021555763, "grad_norm": 1.4251459836959839, "learning_rate": 1.6469955107335666e-05, "loss": 0.2764, "step": 835 }, { "epoch": 1.5754451733833177, "grad_norm": 1.2612155675888062, "learning_rate": 1.6422602493873137e-05, "loss": 0.2613, "step": 840 }, { "epoch": 1.584817244611059, "grad_norm": 1.3020036220550537, "learning_rate": 1.637500347461938e-05, "loss": 0.2618, "step": 845 }, { "epoch": 1.5941893158388005, "grad_norm": 1.3664627075195312, "learning_rate": 1.6327159875730393e-05, "loss": 0.2476, "step": 850 }, { "epoch": 1.6035613870665417, "grad_norm": 1.4827312231063843, "learning_rate": 1.627907353274555e-05, "loss": 0.2674, "step": 855 }, { "epoch": 1.612933458294283, "grad_norm": 1.2991149425506592, "learning_rate": 1.6230746290517227e-05, "loss": 0.2716, "step": 860 }, { "epoch": 1.6223055295220243, "grad_norm": 1.5782040357589722, "learning_rate": 1.618218000313998e-05, "loss": 0.2875, "step": 865 }, { "epoch": 1.6316776007497658, "grad_norm": 1.4465105533599854, "learning_rate": 1.613337653387943e-05, "loss": 0.2723, "step": 870 }, { "epoch": 1.6410496719775072, "grad_norm": 1.3791197538375854, "learning_rate": 1.6084337755100795e-05, "loss": 0.2572, "step": 875 }, { "epoch": 1.6504217432052484, "grad_norm": 1.3755207061767578, "learning_rate": 1.603506554819703e-05, "loss": 0.2562, "step": 880 }, { "epoch": 1.6597938144329896, "grad_norm": 1.4186309576034546, "learning_rate": 1.598556180351665e-05, "loss": 0.2679, "step": 885 }, { "epoch": 1.669165885660731, "grad_norm": 1.3663445711135864, "learning_rate": 1.5935828420291227e-05, "loss": 0.2505, "step": 890 }, { "epoch": 1.6785379568884724, "grad_norm": 1.4272841215133667, "learning_rate": 1.588586730656249e-05, "loss": 0.2861, "step": 895 }, { "epoch": 1.6879100281162138, "grad_norm": 1.3556526899337769, "learning_rate": 1.5835680379109166e-05, "loss": 0.2811, "step": 900 }, { "epoch": 1.6879100281162138, "eval_loss": 0.6763415336608887, "eval_runtime": 111.4991, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 900 }, { "epoch": 1.697282099343955, "grad_norm": 1.527638554573059, "learning_rate": 1.5785269563373402e-05, "loss": 0.2655, "step": 905 }, { "epoch": 1.7066541705716962, "grad_norm": 1.347113847732544, "learning_rate": 1.573463679338692e-05, "loss": 0.2783, "step": 910 }, { "epoch": 1.7160262417994376, "grad_norm": 1.346537470817566, "learning_rate": 1.56837840116968e-05, "loss": 0.2712, "step": 915 }, { "epoch": 1.725398313027179, "grad_norm": 1.3698228597640991, "learning_rate": 1.5632713169290962e-05, "loss": 0.2582, "step": 920 }, { "epoch": 1.7347703842549205, "grad_norm": 1.4085627794265747, "learning_rate": 1.5581426225523333e-05, "loss": 0.262, "step": 925 }, { "epoch": 1.7441424554826617, "grad_norm": 1.4400358200073242, "learning_rate": 1.5529925148038635e-05, "loss": 0.2636, "step": 930 }, { "epoch": 1.7535145267104029, "grad_norm": 1.2298705577850342, "learning_rate": 1.547821191269693e-05, "loss": 0.2542, "step": 935 }, { "epoch": 1.7628865979381443, "grad_norm": 1.4320347309112549, "learning_rate": 1.5426288503497802e-05, "loss": 0.2607, "step": 940 }, { "epoch": 1.7722586691658857, "grad_norm": 1.4086341857910156, "learning_rate": 1.5374156912504236e-05, "loss": 0.2464, "step": 945 }, { "epoch": 1.7816307403936271, "grad_norm": 1.3747973442077637, "learning_rate": 1.532181913976621e-05, "loss": 0.2781, "step": 950 }, { "epoch": 1.7910028116213683, "grad_norm": 1.4264485836029053, "learning_rate": 1.5269277193243936e-05, "loss": 0.2872, "step": 955 }, { "epoch": 1.8003748828491095, "grad_norm": 1.3113363981246948, "learning_rate": 1.5216533088730844e-05, "loss": 0.2693, "step": 960 }, { "epoch": 1.809746954076851, "grad_norm": 1.3197410106658936, "learning_rate": 1.516358884977624e-05, "loss": 0.2495, "step": 965 }, { "epoch": 1.8191190253045924, "grad_norm": 1.4005447626113892, "learning_rate": 1.5110446507607666e-05, "loss": 0.2792, "step": 970 }, { "epoch": 1.8284910965323338, "grad_norm": 1.3619177341461182, "learning_rate": 1.5057108101052978e-05, "loss": 0.2496, "step": 975 }, { "epoch": 1.837863167760075, "grad_norm": 1.3972722291946411, "learning_rate": 1.5003575676462126e-05, "loss": 0.2586, "step": 980 }, { "epoch": 1.8472352389878162, "grad_norm": 1.3040308952331543, "learning_rate": 1.4949851287628631e-05, "loss": 0.2593, "step": 985 }, { "epoch": 1.8566073102155576, "grad_norm": 1.4333730936050415, "learning_rate": 1.4895936995710815e-05, "loss": 0.2643, "step": 990 }, { "epoch": 1.865979381443299, "grad_norm": 1.304624319076538, "learning_rate": 1.4841834869152703e-05, "loss": 0.2478, "step": 995 }, { "epoch": 1.8753514526710404, "grad_norm": 1.3824489116668701, "learning_rate": 1.478754698360467e-05, "loss": 0.2506, "step": 1000 }, { "epoch": 1.8753514526710404, "eval_loss": 0.6781994104385376, "eval_runtime": 111.5183, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 1000 }, { "epoch": 1.8847235238987816, "grad_norm": 1.5689202547073364, "learning_rate": 1.473307542184382e-05, "loss": 0.2811, "step": 1005 }, { "epoch": 1.8940955951265228, "grad_norm": 1.357867956161499, "learning_rate": 1.4678422273694062e-05, "loss": 0.2637, "step": 1010 }, { "epoch": 1.9034676663542642, "grad_norm": 1.241373896598816, "learning_rate": 1.462358963594595e-05, "loss": 0.2636, "step": 1015 }, { "epoch": 1.9128397375820057, "grad_norm": 1.3964288234710693, "learning_rate": 1.4568579612276222e-05, "loss": 0.2741, "step": 1020 }, { "epoch": 1.922211808809747, "grad_norm": 1.3163318634033203, "learning_rate": 1.4513394313167104e-05, "loss": 0.2621, "step": 1025 }, { "epoch": 1.9315838800374883, "grad_norm": 1.3993713855743408, "learning_rate": 1.4458035855825341e-05, "loss": 0.2657, "step": 1030 }, { "epoch": 1.9409559512652295, "grad_norm": 1.3384408950805664, "learning_rate": 1.4402506364100957e-05, "loss": 0.2598, "step": 1035 }, { "epoch": 1.9503280224929709, "grad_norm": 1.4588673114776611, "learning_rate": 1.4346807968405783e-05, "loss": 0.2536, "step": 1040 }, { "epoch": 1.9597000937207123, "grad_norm": 1.326058268547058, "learning_rate": 1.4290942805631722e-05, "loss": 0.2563, "step": 1045 }, { "epoch": 1.9690721649484537, "grad_norm": 1.353257179260254, "learning_rate": 1.4234913019068769e-05, "loss": 0.2564, "step": 1050 }, { "epoch": 1.978444236176195, "grad_norm": 1.4586265087127686, "learning_rate": 1.4178720758322761e-05, "loss": 0.2769, "step": 1055 }, { "epoch": 1.9878163074039361, "grad_norm": 1.2936612367630005, "learning_rate": 1.412236817923295e-05, "loss": 0.2737, "step": 1060 }, { "epoch": 1.9971883786316775, "grad_norm": 1.4073734283447266, "learning_rate": 1.4065857443789246e-05, "loss": 0.2717, "step": 1065 }, { "epoch": 2.005623242736645, "grad_norm": 1.2421205043792725, "learning_rate": 1.4009190720049309e-05, "loss": 0.1902, "step": 1070 }, { "epoch": 2.014995313964386, "grad_norm": 1.3869972229003906, "learning_rate": 1.3952370182055332e-05, "loss": 0.1134, "step": 1075 }, { "epoch": 2.0243673851921273, "grad_norm": 1.3595290184020996, "learning_rate": 1.389539800975068e-05, "loss": 0.097, "step": 1080 }, { "epoch": 2.0337394564198688, "grad_norm": 1.2397971153259277, "learning_rate": 1.3838276388896216e-05, "loss": 0.1022, "step": 1085 }, { "epoch": 2.04311152764761, "grad_norm": 1.1282893419265747, "learning_rate": 1.3781007510986464e-05, "loss": 0.1003, "step": 1090 }, { "epoch": 2.0524835988753516, "grad_norm": 1.2011518478393555, "learning_rate": 1.3723593573165523e-05, "loss": 0.0993, "step": 1095 }, { "epoch": 2.0618556701030926, "grad_norm": 1.1846802234649658, "learning_rate": 1.3666036778142773e-05, "loss": 0.1031, "step": 1100 }, { "epoch": 2.0618556701030926, "eval_loss": 0.7819597125053406, "eval_runtime": 111.4871, "eval_samples_per_second": 4.485, "eval_steps_per_second": 2.242, "step": 1100 }, { "epoch": 2.071227741330834, "grad_norm": 1.1528737545013428, "learning_rate": 1.3608339334108378e-05, "loss": 0.0938, "step": 1105 }, { "epoch": 2.0805998125585754, "grad_norm": 1.2607845067977905, "learning_rate": 1.355050345464855e-05, "loss": 0.1048, "step": 1110 }, { "epoch": 2.089971883786317, "grad_norm": 1.0643517971038818, "learning_rate": 1.3492531358660634e-05, "loss": 0.1056, "step": 1115 }, { "epoch": 2.0993439550140582, "grad_norm": 1.2049908638000488, "learning_rate": 1.3434425270267983e-05, "loss": 0.1078, "step": 1120 }, { "epoch": 2.108716026241799, "grad_norm": 1.1504206657409668, "learning_rate": 1.3376187418734626e-05, "loss": 0.0987, "step": 1125 }, { "epoch": 2.1180880974695406, "grad_norm": 1.103416085243225, "learning_rate": 1.3317820038379731e-05, "loss": 0.1011, "step": 1130 }, { "epoch": 2.127460168697282, "grad_norm": 1.2639893293380737, "learning_rate": 1.3259325368491897e-05, "loss": 0.1065, "step": 1135 }, { "epoch": 2.1368322399250235, "grad_norm": 1.2981096506118774, "learning_rate": 1.320070565324324e-05, "loss": 0.1089, "step": 1140 }, { "epoch": 2.146204311152765, "grad_norm": 1.3471019268035889, "learning_rate": 1.314196314160329e-05, "loss": 0.1034, "step": 1145 }, { "epoch": 2.155576382380506, "grad_norm": 1.2037670612335205, "learning_rate": 1.308310008725271e-05, "loss": 0.0954, "step": 1150 }, { "epoch": 2.1649484536082473, "grad_norm": 1.124943733215332, "learning_rate": 1.3024118748496834e-05, "loss": 0.1086, "step": 1155 }, { "epoch": 2.1743205248359887, "grad_norm": 1.2061023712158203, "learning_rate": 1.2965021388179036e-05, "loss": 0.1032, "step": 1160 }, { "epoch": 2.18369259606373, "grad_norm": 1.2710933685302734, "learning_rate": 1.2905810273593887e-05, "loss": 0.1024, "step": 1165 }, { "epoch": 2.1930646672914715, "grad_norm": 1.1786785125732422, "learning_rate": 1.28464876764002e-05, "loss": 0.103, "step": 1170 }, { "epoch": 2.2024367385192125, "grad_norm": 1.5116946697235107, "learning_rate": 1.2787055872533867e-05, "loss": 0.1107, "step": 1175 }, { "epoch": 2.211808809746954, "grad_norm": 1.2890318632125854, "learning_rate": 1.2727517142120527e-05, "loss": 0.1019, "step": 1180 }, { "epoch": 2.2211808809746953, "grad_norm": 1.184844970703125, "learning_rate": 1.266787376938811e-05, "loss": 0.1067, "step": 1185 }, { "epoch": 2.2305529522024368, "grad_norm": 1.3428583145141602, "learning_rate": 1.2608128042579185e-05, "loss": 0.1066, "step": 1190 }, { "epoch": 2.239925023430178, "grad_norm": 1.2953709363937378, "learning_rate": 1.2548282253863181e-05, "loss": 0.1138, "step": 1195 }, { "epoch": 2.2492970946579196, "grad_norm": 1.1381481885910034, "learning_rate": 1.2488338699248443e-05, "loss": 0.1053, "step": 1200 }, { "epoch": 2.2492970946579196, "eval_loss": 0.7939261198043823, "eval_runtime": 111.5111, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 1200 }, { "epoch": 2.2586691658856606, "grad_norm": 1.5689799785614014, "learning_rate": 1.2428299678494146e-05, "loss": 0.098, "step": 1205 }, { "epoch": 2.268041237113402, "grad_norm": 1.3094913959503174, "learning_rate": 1.236816749502206e-05, "loss": 0.1111, "step": 1210 }, { "epoch": 2.2774133083411434, "grad_norm": 1.2114543914794922, "learning_rate": 1.2307944455828178e-05, "loss": 0.1051, "step": 1215 }, { "epoch": 2.286785379568885, "grad_norm": 1.1505310535430908, "learning_rate": 1.2247632871394223e-05, "loss": 0.0927, "step": 1220 }, { "epoch": 2.296157450796626, "grad_norm": 1.2007763385772705, "learning_rate": 1.218723505559898e-05, "loss": 0.1081, "step": 1225 }, { "epoch": 2.3055295220243672, "grad_norm": 1.1881816387176514, "learning_rate": 1.2126753325629543e-05, "loss": 0.0984, "step": 1230 }, { "epoch": 2.3149015932521086, "grad_norm": 1.2576075792312622, "learning_rate": 1.2066190001892398e-05, "loss": 0.112, "step": 1235 }, { "epoch": 2.32427366447985, "grad_norm": 1.2001255750656128, "learning_rate": 1.200554740792442e-05, "loss": 0.107, "step": 1240 }, { "epoch": 2.3336457357075915, "grad_norm": 1.2408965826034546, "learning_rate": 1.1944827870303719e-05, "loss": 0.1166, "step": 1245 }, { "epoch": 2.3430178069353325, "grad_norm": 1.1618740558624268, "learning_rate": 1.1884033718560372e-05, "loss": 0.0978, "step": 1250 }, { "epoch": 2.352389878163074, "grad_norm": 1.177768349647522, "learning_rate": 1.1823167285087064e-05, "loss": 0.1027, "step": 1255 }, { "epoch": 2.3617619493908153, "grad_norm": 1.1294364929199219, "learning_rate": 1.1762230905049593e-05, "loss": 0.1087, "step": 1260 }, { "epoch": 2.3711340206185567, "grad_norm": 1.4736202955245972, "learning_rate": 1.1701226916297295e-05, "loss": 0.1142, "step": 1265 }, { "epoch": 2.380506091846298, "grad_norm": 1.2007415294647217, "learning_rate": 1.164015765927333e-05, "loss": 0.1076, "step": 1270 }, { "epoch": 2.3898781630740396, "grad_norm": 1.274434208869934, "learning_rate": 1.1579025476924912e-05, "loss": 0.1116, "step": 1275 }, { "epoch": 2.3992502343017805, "grad_norm": 1.3655272722244263, "learning_rate": 1.1517832714613406e-05, "loss": 0.1079, "step": 1280 }, { "epoch": 2.408622305529522, "grad_norm": 1.2331844568252563, "learning_rate": 1.1456581720024356e-05, "loss": 0.1056, "step": 1285 }, { "epoch": 2.4179943767572634, "grad_norm": 1.1586816310882568, "learning_rate": 1.1395274843077405e-05, "loss": 0.1067, "step": 1290 }, { "epoch": 2.427366447985005, "grad_norm": 1.271945834159851, "learning_rate": 1.1333914435836153e-05, "loss": 0.1051, "step": 1295 }, { "epoch": 2.436738519212746, "grad_norm": 1.1621251106262207, "learning_rate": 1.1272502852417908e-05, "loss": 0.1009, "step": 1300 }, { "epoch": 2.436738519212746, "eval_loss": 0.777266263961792, "eval_runtime": 111.4978, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 1300 }, { "epoch": 2.446110590440487, "grad_norm": 1.1645290851593018, "learning_rate": 1.1211042448903374e-05, "loss": 0.1169, "step": 1305 }, { "epoch": 2.4554826616682286, "grad_norm": 1.163246989250183, "learning_rate": 1.1149535583246253e-05, "loss": 0.0952, "step": 1310 }, { "epoch": 2.46485473289597, "grad_norm": 1.3993792533874512, "learning_rate": 1.1087984615182797e-05, "loss": 0.1178, "step": 1315 }, { "epoch": 2.4742268041237114, "grad_norm": 1.1687663793563843, "learning_rate": 1.1026391906141255e-05, "loss": 0.0978, "step": 1320 }, { "epoch": 2.483598875351453, "grad_norm": 1.1476637125015259, "learning_rate": 1.0964759819151289e-05, "loss": 0.0946, "step": 1325 }, { "epoch": 2.492970946579194, "grad_norm": 1.0236659049987793, "learning_rate": 1.0903090718753317e-05, "loss": 0.1057, "step": 1330 }, { "epoch": 2.5023430178069352, "grad_norm": 1.4007511138916016, "learning_rate": 1.0841386970907786e-05, "loss": 0.1124, "step": 1335 }, { "epoch": 2.5117150890346767, "grad_norm": 1.2030051946640015, "learning_rate": 1.077965094290441e-05, "loss": 0.102, "step": 1340 }, { "epoch": 2.521087160262418, "grad_norm": 1.0863361358642578, "learning_rate": 1.0717885003271338e-05, "loss": 0.1501, "step": 1345 }, { "epoch": 2.530459231490159, "grad_norm": 1.441186547279358, "learning_rate": 1.0656091521684297e-05, "loss": 0.1111, "step": 1350 }, { "epoch": 2.539831302717901, "grad_norm": 1.1081117391586304, "learning_rate": 1.0594272868875677e-05, "loss": 0.0995, "step": 1355 }, { "epoch": 2.549203373945642, "grad_norm": 1.3063805103302002, "learning_rate": 1.0532431416543559e-05, "loss": 0.1026, "step": 1360 }, { "epoch": 2.5585754451733833, "grad_norm": 1.265457034111023, "learning_rate": 1.0470569537260746e-05, "loss": 0.1137, "step": 1365 }, { "epoch": 2.5679475164011247, "grad_norm": 1.1931920051574707, "learning_rate": 1.040868960438373e-05, "loss": 0.1056, "step": 1370 }, { "epoch": 2.5773195876288657, "grad_norm": 1.2705389261245728, "learning_rate": 1.0346793991961636e-05, "loss": 0.0992, "step": 1375 }, { "epoch": 2.5866916588566076, "grad_norm": 1.2234851121902466, "learning_rate": 1.0284885074645139e-05, "loss": 0.1067, "step": 1380 }, { "epoch": 2.5960637300843485, "grad_norm": 1.30626380443573, "learning_rate": 1.022296522759536e-05, "loss": 0.1071, "step": 1385 }, { "epoch": 2.60543580131209, "grad_norm": 1.1325551271438599, "learning_rate": 1.016103682639275e-05, "loss": 0.0946, "step": 1390 }, { "epoch": 2.6148078725398314, "grad_norm": 1.2140247821807861, "learning_rate": 1.009910224694593e-05, "loss": 0.1012, "step": 1395 }, { "epoch": 2.624179943767573, "grad_norm": 1.2330358028411865, "learning_rate": 1.0037163865400577e-05, "loss": 0.1022, "step": 1400 }, { "epoch": 2.624179943767573, "eval_loss": 0.7983193397521973, "eval_runtime": 111.5048, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 1400 }, { "epoch": 2.633552014995314, "grad_norm": 1.2977453470230103, "learning_rate": 9.97522405804821e-06, "loss": 0.1086, "step": 1405 }, { "epoch": 2.642924086223055, "grad_norm": 1.2647531032562256, "learning_rate": 9.913285201235065e-06, "loss": 0.1051, "step": 1410 }, { "epoch": 2.6522961574507966, "grad_norm": 1.3180173635482788, "learning_rate": 9.85134967127091e-06, "loss": 0.1142, "step": 1415 }, { "epoch": 2.661668228678538, "grad_norm": 1.2392545938491821, "learning_rate": 9.789419844337868e-06, "loss": 0.1047, "step": 1420 }, { "epoch": 2.6710402999062794, "grad_norm": 1.1911959648132324, "learning_rate": 9.727498096399272e-06, "loss": 0.0908, "step": 1425 }, { "epoch": 2.680412371134021, "grad_norm": 1.3625760078430176, "learning_rate": 9.665586803108495e-06, "loss": 0.0967, "step": 1430 }, { "epoch": 2.689784442361762, "grad_norm": 1.077038288116455, "learning_rate": 9.603688339717818e-06, "loss": 0.1055, "step": 1435 }, { "epoch": 2.6991565135895033, "grad_norm": 1.2724173069000244, "learning_rate": 9.541805080987298e-06, "loss": 0.1024, "step": 1440 }, { "epoch": 2.7085285848172447, "grad_norm": 1.246999979019165, "learning_rate": 9.47993940109365e-06, "loss": 0.1096, "step": 1445 }, { "epoch": 2.717900656044986, "grad_norm": 1.1447161436080933, "learning_rate": 9.418093673539181e-06, "loss": 0.0964, "step": 1450 }, { "epoch": 2.7272727272727275, "grad_norm": 1.3298566341400146, "learning_rate": 9.356270271060711e-06, "loss": 0.1036, "step": 1455 }, { "epoch": 2.7366447985004685, "grad_norm": 1.3487498760223389, "learning_rate": 9.294471565538552e-06, "loss": 0.1054, "step": 1460 }, { "epoch": 2.74601686972821, "grad_norm": 1.2166017293930054, "learning_rate": 9.232699927905508e-06, "loss": 0.1031, "step": 1465 }, { "epoch": 2.7553889409559513, "grad_norm": 1.1950914859771729, "learning_rate": 9.170957728055907e-06, "loss": 0.0988, "step": 1470 }, { "epoch": 2.7647610121836927, "grad_norm": 1.0390832424163818, "learning_rate": 9.10924733475469e-06, "loss": 0.1038, "step": 1475 }, { "epoch": 2.774133083411434, "grad_norm": 1.190873146057129, "learning_rate": 9.047571115546526e-06, "loss": 0.1036, "step": 1480 }, { "epoch": 2.783505154639175, "grad_norm": 1.1870976686477661, "learning_rate": 8.985931436664981e-06, "loss": 0.1032, "step": 1485 }, { "epoch": 2.7928772258669166, "grad_norm": 1.2104380130767822, "learning_rate": 8.924330662941731e-06, "loss": 0.1006, "step": 1490 }, { "epoch": 2.802249297094658, "grad_norm": 1.1908341646194458, "learning_rate": 8.862771157715847e-06, "loss": 0.0984, "step": 1495 }, { "epoch": 2.8116213683223994, "grad_norm": 1.3652592897415161, "learning_rate": 8.801255282743113e-06, "loss": 0.1087, "step": 1500 }, { "epoch": 2.8116213683223994, "eval_loss": 0.8067182898521423, "eval_runtime": 111.5238, "eval_samples_per_second": 4.483, "eval_steps_per_second": 2.242, "step": 1500 }, { "epoch": 2.820993439550141, "grad_norm": 1.3108559846878052, "learning_rate": 8.739785398105419e-06, "loss": 0.1096, "step": 1505 }, { "epoch": 2.830365510777882, "grad_norm": 1.1820882558822632, "learning_rate": 8.678363862120224e-06, "loss": 0.0961, "step": 1510 }, { "epoch": 2.839737582005623, "grad_norm": 1.0882302522659302, "learning_rate": 8.616993031250059e-06, "loss": 0.097, "step": 1515 }, { "epoch": 2.8491096532333646, "grad_norm": 1.3416924476623535, "learning_rate": 8.555675260012137e-06, "loss": 0.1011, "step": 1520 }, { "epoch": 2.858481724461106, "grad_norm": 1.3005818128585815, "learning_rate": 8.49441290088803e-06, "loss": 0.1064, "step": 1525 }, { "epoch": 2.8678537956888475, "grad_norm": 1.203696846961975, "learning_rate": 8.433208304233383e-06, "loss": 0.0907, "step": 1530 }, { "epoch": 2.8772258669165884, "grad_norm": 1.1533688306808472, "learning_rate": 8.372063818187768e-06, "loss": 0.0951, "step": 1535 }, { "epoch": 2.88659793814433, "grad_norm": 1.21674382686615, "learning_rate": 8.31098178858459e-06, "loss": 0.0924, "step": 1540 }, { "epoch": 2.8959700093720713, "grad_norm": 1.3103758096694946, "learning_rate": 8.249964558861084e-06, "loss": 0.1038, "step": 1545 }, { "epoch": 2.9053420805998127, "grad_norm": 1.1318589448928833, "learning_rate": 8.189014469968407e-06, "loss": 0.0991, "step": 1550 }, { "epoch": 2.914714151827554, "grad_norm": 1.3271617889404297, "learning_rate": 8.128133860281838e-06, "loss": 0.1061, "step": 1555 }, { "epoch": 2.924086223055295, "grad_norm": 1.2122989892959595, "learning_rate": 8.067325065511056e-06, "loss": 0.0995, "step": 1560 }, { "epoch": 2.9334582942830365, "grad_norm": 1.286104440689087, "learning_rate": 8.006590418610523e-06, "loss": 0.1069, "step": 1565 }, { "epoch": 2.942830365510778, "grad_norm": 1.3062405586242676, "learning_rate": 7.945932249690002e-06, "loss": 0.1025, "step": 1570 }, { "epoch": 2.9522024367385193, "grad_norm": 1.2752856016159058, "learning_rate": 7.885352885925139e-06, "loss": 0.1097, "step": 1575 }, { "epoch": 2.9615745079662608, "grad_norm": 1.1971313953399658, "learning_rate": 7.824854651468187e-06, "loss": 0.1002, "step": 1580 }, { "epoch": 2.9709465791940017, "grad_norm": 1.3056398630142212, "learning_rate": 7.764439867358836e-06, "loss": 0.1088, "step": 1585 }, { "epoch": 2.980318650421743, "grad_norm": 1.2253344058990479, "learning_rate": 7.704110851435174e-06, "loss": 0.1047, "step": 1590 }, { "epoch": 2.9896907216494846, "grad_norm": 1.1375926733016968, "learning_rate": 7.643869918244759e-06, "loss": 0.0937, "step": 1595 }, { "epoch": 2.999062792877226, "grad_norm": 1.2414946556091309, "learning_rate": 7.583719378955816e-06, "loss": 0.1046, "step": 1600 }, { "epoch": 2.999062792877226, "eval_loss": 0.8037455081939697, "eval_runtime": 111.5354, "eval_samples_per_second": 4.483, "eval_steps_per_second": 2.241, "step": 1600 }, { "epoch": 3.007497656982193, "grad_norm": 0.8191234469413757, "learning_rate": 7.523661541268571e-06, "loss": 0.054, "step": 1605 }, { "epoch": 3.0168697282099344, "grad_norm": 0.6123488545417786, "learning_rate": 7.463698709326708e-06, "loss": 0.0328, "step": 1610 }, { "epoch": 3.026241799437676, "grad_norm": 1.0028489828109741, "learning_rate": 7.403833183628995e-06, "loss": 0.0345, "step": 1615 }, { "epoch": 3.035613870665417, "grad_norm": 1.0307646989822388, "learning_rate": 7.344067260940989e-06, "loss": 0.0323, "step": 1620 }, { "epoch": 3.044985941893158, "grad_norm": 0.9559040069580078, "learning_rate": 7.284403234206939e-06, "loss": 0.035, "step": 1625 }, { "epoch": 3.0543580131208996, "grad_norm": 0.9424014687538147, "learning_rate": 7.224843392461818e-06, "loss": 0.033, "step": 1630 }, { "epoch": 3.063730084348641, "grad_norm": 0.845702588558197, "learning_rate": 7.165390020743498e-06, "loss": 0.0324, "step": 1635 }, { "epoch": 3.0731021555763824, "grad_norm": 0.8844259977340698, "learning_rate": 7.106045400005083e-06, "loss": 0.0284, "step": 1640 }, { "epoch": 3.082474226804124, "grad_norm": 0.7264754772186279, "learning_rate": 7.046811807027401e-06, "loss": 0.0344, "step": 1645 }, { "epoch": 3.091846298031865, "grad_norm": 0.8641548156738281, "learning_rate": 6.987691514331656e-06, "loss": 0.0366, "step": 1650 }, { "epoch": 3.1012183692596063, "grad_norm": 0.8383805155754089, "learning_rate": 6.928686790092235e-06, "loss": 0.0323, "step": 1655 }, { "epoch": 3.1105904404873477, "grad_norm": 1.0214649438858032, "learning_rate": 6.869799898049704e-06, "loss": 0.0333, "step": 1660 }, { "epoch": 3.119962511715089, "grad_norm": 1.09578537940979, "learning_rate": 6.811033097423938e-06, "loss": 0.0357, "step": 1665 }, { "epoch": 3.1293345829428305, "grad_norm": 0.9607039093971252, "learning_rate": 6.752388642827459e-06, "loss": 0.0356, "step": 1670 }, { "epoch": 3.138706654170572, "grad_norm": 0.9811620712280273, "learning_rate": 6.693868784178934e-06, "loss": 0.0325, "step": 1675 }, { "epoch": 3.148078725398313, "grad_norm": 1.125432014465332, "learning_rate": 6.635475766616852e-06, "loss": 0.0341, "step": 1680 }, { "epoch": 3.1574507966260543, "grad_norm": 0.8190117478370667, "learning_rate": 6.577211830413397e-06, "loss": 0.0318, "step": 1685 }, { "epoch": 3.1668228678537957, "grad_norm": 0.8427776098251343, "learning_rate": 6.519079210888486e-06, "loss": 0.0326, "step": 1690 }, { "epoch": 3.176194939081537, "grad_norm": 0.9349907636642456, "learning_rate": 6.461080138324025e-06, "loss": 0.0303, "step": 1695 }, { "epoch": 3.1855670103092786, "grad_norm": 0.7530879378318787, "learning_rate": 6.40321683787833e-06, "loss": 0.0311, "step": 1700 }, { "epoch": 3.1855670103092786, "eval_loss": 0.9447797536849976, "eval_runtime": 111.5066, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 1700 }, { "epoch": 3.1949390815370196, "grad_norm": 1.094067096710205, "learning_rate": 6.345491529500769e-06, "loss": 0.0362, "step": 1705 }, { "epoch": 3.204311152764761, "grad_norm": 0.9973980784416199, "learning_rate": 6.287906427846583e-06, "loss": 0.0311, "step": 1710 }, { "epoch": 3.2136832239925024, "grad_norm": 0.954328179359436, "learning_rate": 6.230463742191926e-06, "loss": 0.0316, "step": 1715 }, { "epoch": 3.223055295220244, "grad_norm": 0.8958219289779663, "learning_rate": 6.173165676349103e-06, "loss": 0.0319, "step": 1720 }, { "epoch": 3.2324273664479852, "grad_norm": 0.8772101402282715, "learning_rate": 6.116014428582022e-06, "loss": 0.033, "step": 1725 }, { "epoch": 3.241799437675726, "grad_norm": 0.8836532235145569, "learning_rate": 6.059012191521853e-06, "loss": 0.0345, "step": 1730 }, { "epoch": 3.2511715089034676, "grad_norm": 1.0338672399520874, "learning_rate": 6.002161152082909e-06, "loss": 0.0322, "step": 1735 }, { "epoch": 3.260543580131209, "grad_norm": 0.7626182436943054, "learning_rate": 5.945463491378746e-06, "loss": 0.034, "step": 1740 }, { "epoch": 3.2699156513589505, "grad_norm": 1.0167630910873413, "learning_rate": 5.888921384638477e-06, "loss": 0.0323, "step": 1745 }, { "epoch": 3.279287722586692, "grad_norm": 0.8768958449363708, "learning_rate": 5.832537001123328e-06, "loss": 0.0335, "step": 1750 }, { "epoch": 3.288659793814433, "grad_norm": 0.8373109698295593, "learning_rate": 5.7763125040434084e-06, "loss": 0.0306, "step": 1755 }, { "epoch": 3.2980318650421743, "grad_norm": 0.7997825741767883, "learning_rate": 5.720250050474723e-06, "loss": 0.0314, "step": 1760 }, { "epoch": 3.3074039362699157, "grad_norm": 0.9116000533103943, "learning_rate": 5.66435179127639e-06, "loss": 0.0342, "step": 1765 }, { "epoch": 3.316776007497657, "grad_norm": 0.7944602370262146, "learning_rate": 5.608619871008166e-06, "loss": 0.0314, "step": 1770 }, { "epoch": 3.3261480787253985, "grad_norm": 0.9112783074378967, "learning_rate": 5.553056427848136e-06, "loss": 0.0305, "step": 1775 }, { "epoch": 3.3355201499531395, "grad_norm": 0.9411343336105347, "learning_rate": 5.497663593510693e-06, "loss": 0.0362, "step": 1780 }, { "epoch": 3.344892221180881, "grad_norm": 0.9458235502243042, "learning_rate": 5.442443493164753e-06, "loss": 0.0311, "step": 1785 }, { "epoch": 3.3542642924086223, "grad_norm": 0.9986944794654846, "learning_rate": 5.387398245352213e-06, "loss": 0.0346, "step": 1790 }, { "epoch": 3.3636363636363638, "grad_norm": 0.8632819652557373, "learning_rate": 5.332529961906699e-06, "loss": 0.0322, "step": 1795 }, { "epoch": 3.373008434864105, "grad_norm": 0.8336763978004456, "learning_rate": 5.277840747872509e-06, "loss": 0.0343, "step": 1800 }, { "epoch": 3.373008434864105, "eval_loss": 0.9443374872207642, "eval_runtime": 111.5074, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 1800 }, { "epoch": 3.382380506091846, "grad_norm": 0.7421078085899353, "learning_rate": 5.223332701423875e-06, "loss": 0.0299, "step": 1805 }, { "epoch": 3.3917525773195876, "grad_norm": 0.7075040340423584, "learning_rate": 5.169007913784462e-06, "loss": 0.0333, "step": 1810 }, { "epoch": 3.401124648547329, "grad_norm": 0.8889288306236267, "learning_rate": 5.11486846914713e-06, "loss": 0.033, "step": 1815 }, { "epoch": 3.4104967197750704, "grad_norm": 1.1044409275054932, "learning_rate": 5.060916444593985e-06, "loss": 0.0353, "step": 1820 }, { "epoch": 3.419868791002812, "grad_norm": 0.9357883334159851, "learning_rate": 5.00715391001668e-06, "loss": 0.0304, "step": 1825 }, { "epoch": 3.429240862230553, "grad_norm": 0.9663400650024414, "learning_rate": 4.953582928037005e-06, "loss": 0.0332, "step": 1830 }, { "epoch": 3.438612933458294, "grad_norm": 1.0516884326934814, "learning_rate": 4.900205553927761e-06, "loss": 0.035, "step": 1835 }, { "epoch": 3.4479850046860356, "grad_norm": 1.041757345199585, "learning_rate": 4.847023835533903e-06, "loss": 0.0315, "step": 1840 }, { "epoch": 3.457357075913777, "grad_norm": 0.8891613483428955, "learning_rate": 4.794039813193967e-06, "loss": 0.0326, "step": 1845 }, { "epoch": 3.4667291471415185, "grad_norm": 0.9261044859886169, "learning_rate": 4.741255519661806e-06, "loss": 0.0304, "step": 1850 }, { "epoch": 3.4761012183692594, "grad_norm": 1.3144643306732178, "learning_rate": 4.68867298002859e-06, "loss": 0.0354, "step": 1855 }, { "epoch": 3.485473289597001, "grad_norm": 0.8868503570556641, "learning_rate": 4.6362942116451226e-06, "loss": 0.0304, "step": 1860 }, { "epoch": 3.4948453608247423, "grad_norm": 0.9837562441825867, "learning_rate": 4.5841212240444334e-06, "loss": 0.032, "step": 1865 }, { "epoch": 3.5042174320524837, "grad_norm": 0.8227118253707886, "learning_rate": 4.532156018864692e-06, "loss": 0.0307, "step": 1870 }, { "epoch": 3.513589503280225, "grad_norm": 0.7651123404502869, "learning_rate": 4.480400589772409e-06, "loss": 0.0264, "step": 1875 }, { "epoch": 3.522961574507966, "grad_norm": 0.9286547899246216, "learning_rate": 4.428856922385942e-06, "loss": 0.0285, "step": 1880 }, { "epoch": 3.5323336457357075, "grad_norm": 0.9905438423156738, "learning_rate": 4.37752699419934e-06, "loss": 0.0337, "step": 1885 }, { "epoch": 3.541705716963449, "grad_norm": 0.914618194103241, "learning_rate": 4.326412774506444e-06, "loss": 0.0287, "step": 1890 }, { "epoch": 3.5510777881911904, "grad_norm": 0.8570281863212585, "learning_rate": 4.275516224325356e-06, "loss": 0.0319, "step": 1895 }, { "epoch": 3.5604498594189318, "grad_norm": 0.8986263871192932, "learning_rate": 4.224839296323196e-06, "loss": 0.0322, "step": 1900 }, { "epoch": 3.5604498594189318, "eval_loss": 0.9526164531707764, "eval_runtime": 111.507, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 1900 }, { "epoch": 3.5698219306466727, "grad_norm": 1.0641896724700928, "learning_rate": 4.1743839347411875e-06, "loss": 0.0317, "step": 1905 }, { "epoch": 3.579194001874414, "grad_norm": 1.0256502628326416, "learning_rate": 4.124152075320071e-06, "loss": 0.0346, "step": 1910 }, { "epoch": 3.5885660731021556, "grad_norm": 0.8067216277122498, "learning_rate": 4.074145645225831e-06, "loss": 0.0302, "step": 1915 }, { "epoch": 3.597938144329897, "grad_norm": 0.9786953926086426, "learning_rate": 4.0243665629757654e-06, "loss": 0.0362, "step": 1920 }, { "epoch": 3.6073102155576384, "grad_norm": 0.8346753716468811, "learning_rate": 3.974816738364875e-06, "loss": 0.0309, "step": 1925 }, { "epoch": 3.6166822867853794, "grad_norm": 0.7229898571968079, "learning_rate": 3.9254980723926e-06, "loss": 0.03, "step": 1930 }, { "epoch": 3.626054358013121, "grad_norm": 0.9483016729354858, "learning_rate": 3.876412457189883e-06, "loss": 0.032, "step": 1935 }, { "epoch": 3.6354264292408622, "grad_norm": 0.9327901601791382, "learning_rate": 3.8275617759465775e-06, "loss": 0.0323, "step": 1940 }, { "epoch": 3.6447985004686037, "grad_norm": 0.8537086844444275, "learning_rate": 3.7789479028392007e-06, "loss": 0.029, "step": 1945 }, { "epoch": 3.654170571696345, "grad_norm": 0.891110360622406, "learning_rate": 3.7305727029590245e-06, "loss": 0.0342, "step": 1950 }, { "epoch": 3.663542642924086, "grad_norm": 0.8868283629417419, "learning_rate": 3.6824380322405273e-06, "loss": 0.0315, "step": 1955 }, { "epoch": 3.6729147141518275, "grad_norm": 0.9474219679832458, "learning_rate": 3.6345457373901848e-06, "loss": 0.0302, "step": 1960 }, { "epoch": 3.682286785379569, "grad_norm": 0.9067096710205078, "learning_rate": 3.5868976558156254e-06, "loss": 0.0291, "step": 1965 }, { "epoch": 3.6916588566073103, "grad_norm": 0.8193556070327759, "learning_rate": 3.5394956155551285e-06, "loss": 0.0309, "step": 1970 }, { "epoch": 3.7010309278350517, "grad_norm": 0.8624306321144104, "learning_rate": 3.492341435207509e-06, "loss": 0.0312, "step": 1975 }, { "epoch": 3.7104029990627927, "grad_norm": 0.7553118467330933, "learning_rate": 3.445436923862322e-06, "loss": 0.0298, "step": 1980 }, { "epoch": 3.719775070290534, "grad_norm": 0.8075463175773621, "learning_rate": 3.3987838810304752e-06, "loss": 0.0297, "step": 1985 }, { "epoch": 3.7291471415182755, "grad_norm": 1.0225906372070312, "learning_rate": 3.3523840965751788e-06, "loss": 0.032, "step": 1990 }, { "epoch": 3.738519212746017, "grad_norm": 0.8977119326591492, "learning_rate": 3.3062393506432843e-06, "loss": 0.0705, "step": 1995 }, { "epoch": 3.7478912839737584, "grad_norm": 0.8516520857810974, "learning_rate": 3.2603514135969837e-06, "loss": 0.0299, "step": 2000 }, { "epoch": 3.7478912839737584, "eval_loss": 0.967979371547699, "eval_runtime": 111.5017, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 2000 }, { "epoch": 3.7572633552014993, "grad_norm": 0.9143263697624207, "learning_rate": 3.214722045945895e-06, "loss": 0.0295, "step": 2005 }, { "epoch": 3.7666354264292408, "grad_norm": 0.8708062767982483, "learning_rate": 3.1693529982795036e-06, "loss": 0.0281, "step": 2010 }, { "epoch": 3.776007497656982, "grad_norm": 0.9132674932479858, "learning_rate": 3.124246011200018e-06, "loss": 0.0301, "step": 2015 }, { "epoch": 3.7853795688847236, "grad_norm": 0.9853923916816711, "learning_rate": 3.079402815255591e-06, "loss": 0.0313, "step": 2020 }, { "epoch": 3.794751640112465, "grad_norm": 1.0308923721313477, "learning_rate": 3.0348251308739106e-06, "loss": 0.032, "step": 2025 }, { "epoch": 3.804123711340206, "grad_norm": 0.7933114767074585, "learning_rate": 2.9905146682962073e-06, "loss": 0.0311, "step": 2030 }, { "epoch": 3.8134957825679474, "grad_norm": 0.8838526606559753, "learning_rate": 2.9464731275116355e-06, "loss": 0.0325, "step": 2035 }, { "epoch": 3.822867853795689, "grad_norm": 0.8747525811195374, "learning_rate": 2.9027021981920566e-06, "loss": 0.0314, "step": 2040 }, { "epoch": 3.8322399250234302, "grad_norm": 0.7285995483398438, "learning_rate": 2.8592035596272118e-06, "loss": 0.0294, "step": 2045 }, { "epoch": 3.8416119962511717, "grad_norm": 0.8272311091423035, "learning_rate": 2.8159788806602904e-06, "loss": 0.0318, "step": 2050 }, { "epoch": 3.8509840674789126, "grad_norm": 0.7552247047424316, "learning_rate": 2.773029819623916e-06, "loss": 0.03, "step": 2055 }, { "epoch": 3.860356138706654, "grad_norm": 0.9183073043823242, "learning_rate": 2.730358024276509e-06, "loss": 0.0314, "step": 2060 }, { "epoch": 3.8697282099343955, "grad_norm": 0.8467240333557129, "learning_rate": 2.6879651317390864e-06, "loss": 0.0256, "step": 2065 }, { "epoch": 3.879100281162137, "grad_norm": 0.850248396396637, "learning_rate": 2.6458527684324376e-06, "loss": 0.0299, "step": 2070 }, { "epoch": 3.8884723523898783, "grad_norm": 0.7223458290100098, "learning_rate": 2.6040225500147365e-06, "loss": 0.0305, "step": 2075 }, { "epoch": 3.8978444236176193, "grad_norm": 0.8155651092529297, "learning_rate": 2.5624760813195436e-06, "loss": 0.0298, "step": 2080 }, { "epoch": 3.9072164948453607, "grad_norm": 0.7251290082931519, "learning_rate": 2.5212149562942535e-06, "loss": 0.0276, "step": 2085 }, { "epoch": 3.916588566073102, "grad_norm": 1.1165629625320435, "learning_rate": 2.48024075793893e-06, "loss": 0.0309, "step": 2090 }, { "epoch": 3.9259606373008435, "grad_norm": 1.0103236436843872, "learning_rate": 2.4395550582455774e-06, "loss": 0.0277, "step": 2095 }, { "epoch": 3.935332708528585, "grad_norm": 0.912944495677948, "learning_rate": 2.3991594181378286e-06, "loss": 0.0335, "step": 2100 }, { "epoch": 3.935332708528585, "eval_loss": 0.9605706930160522, "eval_runtime": 111.5358, "eval_samples_per_second": 4.483, "eval_steps_per_second": 2.241, "step": 2100 }, { "epoch": 3.944704779756326, "grad_norm": 0.925261378288269, "learning_rate": 2.359055387411061e-06, "loss": 0.0311, "step": 2105 }, { "epoch": 3.9540768509840674, "grad_norm": 0.9867929220199585, "learning_rate": 2.319244504672943e-06, "loss": 0.0306, "step": 2110 }, { "epoch": 3.963448922211809, "grad_norm": 0.9533296227455139, "learning_rate": 2.279728297284394e-06, "loss": 0.0309, "step": 2115 }, { "epoch": 3.97282099343955, "grad_norm": 0.8042296171188354, "learning_rate": 2.2405082813009926e-06, "loss": 0.0257, "step": 2120 }, { "epoch": 3.9821930646672916, "grad_norm": 0.8513698577880859, "learning_rate": 2.201585961414815e-06, "loss": 0.0277, "step": 2125 }, { "epoch": 3.9915651358950326, "grad_norm": 0.8996440768241882, "learning_rate": 2.1629628308967e-06, "loss": 0.0309, "step": 2130 }, { "epoch": 4.0, "grad_norm": 1.3045603036880493, "learning_rate": 2.1246403715389675e-06, "loss": 0.0307, "step": 2135 }, { "epoch": 4.009372071227741, "grad_norm": 0.5760667324066162, "learning_rate": 2.0866200535985616e-06, "loss": 0.0104, "step": 2140 }, { "epoch": 4.018744142455483, "grad_norm": 0.32251426577568054, "learning_rate": 2.0489033357406464e-06, "loss": 0.0091, "step": 2145 }, { "epoch": 4.028116213683224, "grad_norm": 0.3890618681907654, "learning_rate": 2.011491664982644e-06, "loss": 0.0093, "step": 2150 }, { "epoch": 4.037488284910966, "grad_norm": 0.4246854782104492, "learning_rate": 1.9743864766387198e-06, "loss": 0.0094, "step": 2155 }, { "epoch": 4.046860356138707, "grad_norm": 0.37308433651924133, "learning_rate": 1.937589194264715e-06, "loss": 0.0083, "step": 2160 }, { "epoch": 4.056232427366448, "grad_norm": 0.29468032717704773, "learning_rate": 1.9011012296035303e-06, "loss": 0.0072, "step": 2165 }, { "epoch": 4.0656044985941895, "grad_norm": 0.49253249168395996, "learning_rate": 1.864923982530965e-06, "loss": 0.0078, "step": 2170 }, { "epoch": 4.0749765698219305, "grad_norm": 0.5254181623458862, "learning_rate": 1.8290588410020116e-06, "loss": 0.0078, "step": 2175 }, { "epoch": 4.084348641049672, "grad_norm": 0.3478500247001648, "learning_rate": 1.7935071809976035e-06, "loss": 0.0075, "step": 2180 }, { "epoch": 4.093720712277413, "grad_norm": 0.3770616352558136, "learning_rate": 1.7582703664718247e-06, "loss": 0.0082, "step": 2185 }, { "epoch": 4.103092783505154, "grad_norm": 0.349509596824646, "learning_rate": 1.7233497492995865e-06, "loss": 0.0069, "step": 2190 }, { "epoch": 4.112464854732896, "grad_norm": 0.43029990792274475, "learning_rate": 1.6887466692247556e-06, "loss": 0.0077, "step": 2195 }, { "epoch": 4.121836925960637, "grad_norm": 0.6748161911964417, "learning_rate": 1.654462453808755e-06, "loss": 0.0073, "step": 2200 }, { "epoch": 4.121836925960637, "eval_loss": 1.0975761413574219, "eval_runtime": 111.5036, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 2200 }, { "epoch": 4.131208997188379, "grad_norm": 0.6008536219596863, "learning_rate": 1.6204984183796425e-06, "loss": 0.0079, "step": 2205 }, { "epoch": 4.14058106841612, "grad_norm": 0.4357309937477112, "learning_rate": 1.5868558659816302e-06, "loss": 0.0082, "step": 2210 }, { "epoch": 4.149953139643861, "grad_norm": 0.4295269250869751, "learning_rate": 1.5535360873251026e-06, "loss": 0.008, "step": 2215 }, { "epoch": 4.159325210871603, "grad_norm": 0.3729182183742523, "learning_rate": 1.5205403607370984e-06, "loss": 0.0071, "step": 2220 }, { "epoch": 4.168697282099344, "grad_norm": 0.5101849436759949, "learning_rate": 1.4878699521122654e-06, "loss": 0.0081, "step": 2225 }, { "epoch": 4.178069353327086, "grad_norm": 0.5576186776161194, "learning_rate": 1.4555261148642929e-06, "loss": 0.0088, "step": 2230 }, { "epoch": 4.187441424554827, "grad_norm": 0.39585602283477783, "learning_rate": 1.423510089877823e-06, "loss": 0.0078, "step": 2235 }, { "epoch": 4.196813495782568, "grad_norm": 0.45328739285469055, "learning_rate": 1.3918231054608499e-06, "loss": 0.0077, "step": 2240 }, { "epoch": 4.206185567010309, "grad_norm": 0.45810526609420776, "learning_rate": 1.3604663772975856e-06, "loss": 0.0093, "step": 2245 }, { "epoch": 4.21555763823805, "grad_norm": 0.4543026089668274, "learning_rate": 1.3294411084018277e-06, "loss": 0.007, "step": 2250 }, { "epoch": 4.224929709465792, "grad_norm": 1.054495930671692, "learning_rate": 1.2987484890708024e-06, "loss": 0.0087, "step": 2255 }, { "epoch": 4.234301780693533, "grad_norm": 0.5703629851341248, "learning_rate": 1.268389696839497e-06, "loss": 0.008, "step": 2260 }, { "epoch": 4.243673851921274, "grad_norm": 0.41296708583831787, "learning_rate": 1.2383658964354861e-06, "loss": 0.006, "step": 2265 }, { "epoch": 4.253045923149016, "grad_norm": 0.6897146701812744, "learning_rate": 1.2086782397342445e-06, "loss": 0.0076, "step": 2270 }, { "epoch": 4.262417994376757, "grad_norm": 0.39745044708251953, "learning_rate": 1.1793278657149532e-06, "loss": 0.0084, "step": 2275 }, { "epoch": 4.271790065604499, "grad_norm": 0.6803708672523499, "learning_rate": 1.1503159004168074e-06, "loss": 0.0063, "step": 2280 }, { "epoch": 4.28116213683224, "grad_norm": 0.49779579043388367, "learning_rate": 1.12164345689581e-06, "loss": 0.0077, "step": 2285 }, { "epoch": 4.290534208059981, "grad_norm": 0.42171531915664673, "learning_rate": 1.0933116351820695e-06, "loss": 0.0074, "step": 2290 }, { "epoch": 4.299906279287723, "grad_norm": 0.41160067915916443, "learning_rate": 1.0653215222376045e-06, "loss": 0.0068, "step": 2295 }, { "epoch": 4.309278350515464, "grad_norm": 0.4333638548851013, "learning_rate": 1.0376741919146305e-06, "loss": 0.0069, "step": 2300 }, { "epoch": 4.309278350515464, "eval_loss": 1.1144713163375854, "eval_runtime": 111.5268, "eval_samples_per_second": 4.483, "eval_steps_per_second": 2.242, "step": 2300 }, { "epoch": 4.318650421743206, "grad_norm": 0.621540367603302, "learning_rate": 1.0103707049143673e-06, "loss": 0.008, "step": 2305 }, { "epoch": 4.3280224929709465, "grad_norm": 0.3928787112236023, "learning_rate": 9.834121087463445e-07, "loss": 0.0068, "step": 2310 }, { "epoch": 4.3373945641986875, "grad_norm": 0.4444401264190674, "learning_rate": 9.56799437688214e-07, "loss": 0.0076, "step": 2315 }, { "epoch": 4.346766635426429, "grad_norm": 0.4709712266921997, "learning_rate": 9.305337127460678e-07, "loss": 0.0064, "step": 2320 }, { "epoch": 4.35613870665417, "grad_norm": 0.6003327369689941, "learning_rate": 9.046159416152633e-07, "loss": 0.007, "step": 2325 }, { "epoch": 4.365510777881912, "grad_norm": 0.3838503360748291, "learning_rate": 8.790471186417715e-07, "loss": 0.0076, "step": 2330 }, { "epoch": 4.374882849109653, "grad_norm": 0.5418089032173157, "learning_rate": 8.538282247840201e-07, "loss": 0.0072, "step": 2335 }, { "epoch": 4.384254920337394, "grad_norm": 0.7511455416679382, "learning_rate": 8.289602275752673e-07, "loss": 0.009, "step": 2340 }, { "epoch": 4.393626991565136, "grad_norm": 0.5192817449569702, "learning_rate": 8.044440810864718e-07, "loss": 0.0081, "step": 2345 }, { "epoch": 4.402999062792877, "grad_norm": 0.6360767483711243, "learning_rate": 7.80280725889696e-07, "loss": 0.0079, "step": 2350 }, { "epoch": 4.412371134020619, "grad_norm": 0.5308467149734497, "learning_rate": 7.564710890220183e-07, "loss": 0.0083, "step": 2355 }, { "epoch": 4.42174320524836, "grad_norm": 0.4319888949394226, "learning_rate": 7.3301608394997e-07, "loss": 0.0079, "step": 2360 }, { "epoch": 4.431115276476101, "grad_norm": 0.46917620301246643, "learning_rate": 7.099166105344835e-07, "loss": 0.0064, "step": 2365 }, { "epoch": 4.440487347703843, "grad_norm": 0.455216646194458, "learning_rate": 6.871735549963765e-07, "loss": 0.007, "step": 2370 }, { "epoch": 4.449859418931584, "grad_norm": 0.40280669927597046, "learning_rate": 6.647877898823463e-07, "loss": 0.0068, "step": 2375 }, { "epoch": 4.4592314901593255, "grad_norm": 0.32350170612335205, "learning_rate": 6.427601740314926e-07, "loss": 0.0077, "step": 2380 }, { "epoch": 4.4686035613870665, "grad_norm": 0.30398938059806824, "learning_rate": 6.2109155254238e-07, "loss": 0.0068, "step": 2385 }, { "epoch": 4.4779756326148075, "grad_norm": 0.5104652047157288, "learning_rate": 5.997827567405978e-07, "loss": 0.0069, "step": 2390 }, { "epoch": 4.487347703842549, "grad_norm": 0.4495840072631836, "learning_rate": 5.788346041468796e-07, "loss": 0.0065, "step": 2395 }, { "epoch": 4.49671977507029, "grad_norm": 0.3475983440876007, "learning_rate": 5.582478984457284e-07, "loss": 0.0064, "step": 2400 }, { "epoch": 4.49671977507029, "eval_loss": 1.1217763423919678, "eval_runtime": 111.5486, "eval_samples_per_second": 4.482, "eval_steps_per_second": 2.241, "step": 2400 }, { "epoch": 4.506091846298032, "grad_norm": 0.46389687061309814, "learning_rate": 5.380234294545938e-07, "loss": 0.0071, "step": 2405 }, { "epoch": 4.515463917525773, "grad_norm": 0.3474023640155792, "learning_rate": 5.181619730935617e-07, "loss": 0.0067, "step": 2410 }, { "epoch": 4.524835988753514, "grad_norm": 0.3991861045360565, "learning_rate": 4.986642913555895e-07, "loss": 0.0068, "step": 2415 }, { "epoch": 4.534208059981256, "grad_norm": 0.4194345772266388, "learning_rate": 4.795311322772722e-07, "loss": 0.0077, "step": 2420 }, { "epoch": 4.543580131208997, "grad_norm": 0.34731411933898926, "learning_rate": 4.6076322991013946e-07, "loss": 0.0063, "step": 2425 }, { "epoch": 4.552952202436739, "grad_norm": 0.7513842582702637, "learning_rate": 4.4236130429250347e-07, "loss": 0.007, "step": 2430 }, { "epoch": 4.56232427366448, "grad_norm": 0.35471469163894653, "learning_rate": 4.2432606142182145e-07, "loss": 0.0071, "step": 2435 }, { "epoch": 4.571696344892221, "grad_norm": 0.3158963918685913, "learning_rate": 4.06658193227617e-07, "loss": 0.008, "step": 2440 }, { "epoch": 4.581068416119963, "grad_norm": 0.510502815246582, "learning_rate": 3.8935837754493497e-07, "loss": 0.0083, "step": 2445 }, { "epoch": 4.590440487347704, "grad_norm": 0.5745358467102051, "learning_rate": 3.72427278088332e-07, "loss": 0.0075, "step": 2450 }, { "epoch": 4.5998125585754455, "grad_norm": 0.48121458292007446, "learning_rate": 3.5586554442641587e-07, "loss": 0.0081, "step": 2455 }, { "epoch": 4.609184629803186, "grad_norm": 0.4651750922203064, "learning_rate": 3.3967381195692317e-07, "loss": 0.0069, "step": 2460 }, { "epoch": 4.618556701030927, "grad_norm": 0.4792514443397522, "learning_rate": 3.238527018823423e-07, "loss": 0.0081, "step": 2465 }, { "epoch": 4.627928772258669, "grad_norm": 0.4478175640106201, "learning_rate": 3.08402821186079e-07, "loss": 0.0063, "step": 2470 }, { "epoch": 4.63730084348641, "grad_norm": 0.3196679949760437, "learning_rate": 2.933247626091751e-07, "loss": 0.0068, "step": 2475 }, { "epoch": 4.646672914714152, "grad_norm": 0.5067555904388428, "learning_rate": 2.786191046275588e-07, "loss": 0.0076, "step": 2480 }, { "epoch": 4.656044985941893, "grad_norm": 0.5797865986824036, "learning_rate": 2.6428641142986043e-07, "loss": 0.009, "step": 2485 }, { "epoch": 4.665417057169634, "grad_norm": 0.5033183693885803, "learning_rate": 2.503272328957584e-07, "loss": 0.0078, "step": 2490 }, { "epoch": 4.674789128397376, "grad_norm": 0.30220600962638855, "learning_rate": 2.367421045748908e-07, "loss": 0.007, "step": 2495 }, { "epoch": 4.684161199625117, "grad_norm": 0.5532141923904419, "learning_rate": 2.2353154766630358e-07, "loss": 0.0086, "step": 2500 }, { "epoch": 4.684161199625117, "eval_loss": 1.1228344440460205, "eval_runtime": 111.5053, "eval_samples_per_second": 4.484, "eval_steps_per_second": 2.242, "step": 2500 }, { "epoch": 4.693533270852859, "grad_norm": 0.4479539692401886, "learning_rate": 2.1069606899845497e-07, "loss": 0.0077, "step": 2505 }, { "epoch": 4.7029053420806, "grad_norm": 0.4743359386920929, "learning_rate": 1.9823616100977495e-07, "loss": 0.0081, "step": 2510 }, { "epoch": 4.712277413308341, "grad_norm": 0.38026347756385803, "learning_rate": 1.8615230172976507e-07, "loss": 0.0065, "step": 2515 }, { "epoch": 4.721649484536083, "grad_norm": 0.5804769396781921, "learning_rate": 1.744449547606697e-07, "loss": 0.0092, "step": 2520 }, { "epoch": 4.7310215557638235, "grad_norm": 0.5354004502296448, "learning_rate": 1.6311456925967583e-07, "loss": 0.0074, "step": 2525 }, { "epoch": 4.740393626991565, "grad_norm": 0.6035090088844299, "learning_rate": 1.5216157992169577e-07, "loss": 0.0067, "step": 2530 }, { "epoch": 4.749765698219306, "grad_norm": 0.5137022733688354, "learning_rate": 1.41586406962676e-07, "loss": 0.0075, "step": 2535 }, { "epoch": 4.759137769447047, "grad_norm": 0.2721659541130066, "learning_rate": 1.3138945610348564e-07, "loss": 0.0072, "step": 2540 }, { "epoch": 4.768509840674789, "grad_norm": 0.4901478886604309, "learning_rate": 1.2157111855434667e-07, "loss": 0.0065, "step": 2545 }, { "epoch": 4.77788191190253, "grad_norm": 0.2981049716472626, "learning_rate": 1.1213177099982376e-07, "loss": 0.0069, "step": 2550 }, { "epoch": 4.787253983130272, "grad_norm": 0.49158474802970886, "learning_rate": 1.0307177558437686e-07, "loss": 0.0082, "step": 2555 }, { "epoch": 4.796626054358013, "grad_norm": 0.6860193610191345, "learning_rate": 9.439147989846354e-08, "loss": 0.0081, "step": 2560 }, { "epoch": 4.805998125585754, "grad_norm": 0.7087129354476929, "learning_rate": 8.609121696520283e-08, "loss": 0.0084, "step": 2565 }, { "epoch": 4.815370196813496, "grad_norm": 0.727730929851532, "learning_rate": 7.817130522760452e-08, "loss": 0.0334, "step": 2570 }, { "epoch": 4.824742268041237, "grad_norm": 0.4352070391178131, "learning_rate": 7.063204853634543e-08, "loss": 0.0076, "step": 2575 }, { "epoch": 4.834114339268979, "grad_norm": 0.3776610791683197, "learning_rate": 6.347373613811325e-08, "loss": 0.0059, "step": 2580 }, { "epoch": 4.84348641049672, "grad_norm": 0.5180082321166992, "learning_rate": 5.6696642664515465e-08, "loss": 0.0081, "step": 2585 }, { "epoch": 4.852858481724461, "grad_norm": 0.49723920226097107, "learning_rate": 5.030102812153548e-08, "loss": 0.0081, "step": 2590 }, { "epoch": 4.8622305529522025, "grad_norm": 0.2777559161186218, "learning_rate": 4.428713787955841e-08, "loss": 0.007, "step": 2595 }, { "epoch": 4.8716026241799435, "grad_norm": 0.44526979327201843, "learning_rate": 3.865520266396416e-08, "loss": 0.0072, "step": 2600 }, { "epoch": 4.8716026241799435, "eval_loss": 1.1233325004577637, "eval_runtime": 111.5373, "eval_samples_per_second": 4.483, "eval_steps_per_second": 2.241, "step": 2600 }, { "epoch": 4.880974695407685, "grad_norm": 0.49195200204849243, "learning_rate": 3.340543854626566e-08, "loss": 0.0081, "step": 2605 }, { "epoch": 4.890346766635426, "grad_norm": 0.33215323090553284, "learning_rate": 2.8538046935828733e-08, "loss": 0.0069, "step": 2610 }, { "epoch": 4.899718837863167, "grad_norm": 0.43431356549263, "learning_rate": 2.4053214572137274e-08, "loss": 0.0066, "step": 2615 }, { "epoch": 4.909090909090909, "grad_norm": 0.49866101145744324, "learning_rate": 1.9951113517633346e-08, "loss": 0.007, "step": 2620 }, { "epoch": 4.91846298031865, "grad_norm": 0.5170955657958984, "learning_rate": 1.6231901151113617e-08, "loss": 0.0083, "step": 2625 }, { "epoch": 4.927835051546392, "grad_norm": 0.3568389117717743, "learning_rate": 1.2895720161693048e-08, "loss": 0.0073, "step": 2630 }, { "epoch": 4.937207122774133, "grad_norm": 0.4371294379234314, "learning_rate": 9.942698543330409e-09, "loss": 0.008, "step": 2635 }, { "epoch": 4.946579194001874, "grad_norm": 0.6057606935501099, "learning_rate": 7.372949589916633e-09, "loss": 0.0078, "step": 2640 }, { "epoch": 4.955951265229616, "grad_norm": 0.44616734981536865, "learning_rate": 5.186571890929415e-09, "loss": 0.0079, "step": 2645 }, { "epoch": 4.965323336457357, "grad_norm": 0.5187500715255737, "learning_rate": 3.383649327650673e-09, "loss": 0.0082, "step": 2650 }, { "epoch": 4.974695407685099, "grad_norm": 0.3702596426010132, "learning_rate": 1.9642510699469096e-09, "loss": 0.0074, "step": 2655 }, { "epoch": 4.98406747891284, "grad_norm": 0.38890424370765686, "learning_rate": 9.284315736168837e-10, "loss": 0.0068, "step": 2660 }, { "epoch": 4.993439550140581, "grad_norm": 0.40768128633499146, "learning_rate": 2.762305783021724e-10, "loss": 0.0079, "step": 2665 }, { "epoch": 5.0, "step": 2669, "total_flos": 3.0080813400754176e+18, "train_loss": 0.12334943315905438, "train_runtime": 40705.595, "train_samples_per_second": 2.097, "train_steps_per_second": 0.066 } ], "logging_steps": 5, "max_steps": 2670, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.0080813400754176e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }