{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 2.691509962081909, "learning_rate": 0.00015998457923856519, "loss": 1.4456, "step": 1 }, { "epoch": 0.025, "grad_norm": 24.26286506652832, "learning_rate": 0.00015993832289925785, "loss": 4.3349, "step": 2 }, { "epoch": 0.0375, "grad_norm": 13.163036346435547, "learning_rate": 0.0001598612488147773, "loss": 2.6695, "step": 3 }, { "epoch": 0.05, "grad_norm": 9.818785667419434, "learning_rate": 0.00015975338669865026, "loss": 2.3799, "step": 4 }, { "epoch": 0.0625, "grad_norm": 6.200242519378662, "learning_rate": 0.00015961477813377576, "loss": 2.0935, "step": 5 }, { "epoch": 0.075, "grad_norm": 2.0556814670562744, "learning_rate": 0.00015944547655639412, "loss": 1.8465, "step": 6 }, { "epoch": 0.0875, "grad_norm": 2.5195746421813965, "learning_rate": 0.00015924554723548617, "loss": 1.7321, "step": 7 }, { "epoch": 0.1, "grad_norm": 4.300451278686523, "learning_rate": 0.00015901506724761103, "loss": 1.7284, "step": 8 }, { "epoch": 0.1125, "grad_norm": 1.5021892786026, "learning_rate": 0.00015875412544719134, "loss": 1.5971, "step": 9 }, { "epoch": 0.125, "grad_norm": 1.5246820449829102, "learning_rate": 0.00015846282243225845, "loss": 1.562, "step": 10 }, { "epoch": 0.1375, "grad_norm": 2.0095787048339844, "learning_rate": 0.0001581412705056698, "loss": 1.578, "step": 11 }, { "epoch": 0.15, "grad_norm": 0.9773982763290405, "learning_rate": 0.00015778959363181415, "loss": 1.4977, "step": 12 }, { "epoch": 0.1625, "grad_norm": 1.1493251323699951, "learning_rate": 0.0001574079273888208, "loss": 1.5075, "step": 13 }, { "epoch": 0.175, "grad_norm": 0.8909309506416321, "learning_rate": 0.00015699641891629178, "loss": 1.4158, "step": 14 }, { "epoch": 0.1875, "grad_norm": 0.9415439963340759, "learning_rate": 0.00015655522685857672, "loss": 1.4219, "step": 15 }, { "epoch": 0.2, "grad_norm": 1.1703603267669678, "learning_rate": 0.0001560845213036123, "loss": 1.4006, "step": 16 }, { "epoch": 0.2125, "grad_norm": 0.7575011849403381, "learning_rate": 0.00015558448371735025, "loss": 1.3675, "step": 17 }, { "epoch": 0.225, "grad_norm": 0.6772542595863342, "learning_rate": 0.00015505530687379875, "loss": 1.3369, "step": 18 }, { "epoch": 0.2375, "grad_norm": 0.5587411522865295, "learning_rate": 0.00015449719478070428, "loss": 1.3632, "step": 19 }, { "epoch": 0.25, "grad_norm": 0.5920618772506714, "learning_rate": 0.00015391036260090294, "loss": 1.3511, "step": 20 }, { "epoch": 0.2625, "grad_norm": 0.4218953847885132, "learning_rate": 0.0001532950365693709, "loss": 1.3641, "step": 21 }, { "epoch": 0.275, "grad_norm": 0.4676741361618042, "learning_rate": 0.00015265145390600652, "loss": 1.3441, "step": 22 }, { "epoch": 0.2875, "grad_norm": 0.38095250725746155, "learning_rate": 0.00015197986272417774, "loss": 1.3418, "step": 23 }, { "epoch": 0.3, "grad_norm": 0.42308753728866577, "learning_rate": 0.00015128052193506944, "loss": 1.3646, "step": 24 }, { "epoch": 0.3125, "grad_norm": 0.4307089149951935, "learning_rate": 0.0001505537011478684, "loss": 1.2992, "step": 25 }, { "epoch": 0.325, "grad_norm": 0.33103814721107483, "learning_rate": 0.0001497996805658238, "loss": 1.3435, "step": 26 }, { "epoch": 0.3375, "grad_norm": 0.3511773645877838, "learning_rate": 0.00014901875087822337, "loss": 1.3, "step": 27 }, { "epoch": 0.35, "grad_norm": 0.2914850115776062, "learning_rate": 0.0001482112131483274, "loss": 1.3103, "step": 28 }, { "epoch": 0.3625, "grad_norm": 0.37050625681877136, "learning_rate": 0.00014737737869730292, "loss": 1.2731, "step": 29 }, { "epoch": 0.375, "grad_norm": 0.3476356565952301, "learning_rate": 0.00014651756898420365, "loss": 1.3211, "step": 30 }, { "epoch": 0.3875, "grad_norm": 0.27799472212791443, "learning_rate": 0.0001456321154820411, "loss": 1.2657, "step": 31 }, { "epoch": 0.4, "grad_norm": 0.318327397108078, "learning_rate": 0.00014472135954999581, "loss": 1.3068, "step": 32 }, { "epoch": 0.4125, "grad_norm": 0.30465707182884216, "learning_rate": 0.00014378565230181657, "loss": 1.2839, "step": 33 }, { "epoch": 0.425, "grad_norm": 0.2618834376335144, "learning_rate": 0.0001428253544704596, "loss": 1.2868, "step": 34 }, { "epoch": 0.4375, "grad_norm": 0.2864656150341034, "learning_rate": 0.00014184083626901897, "loss": 1.2815, "step": 35 }, { "epoch": 0.45, "grad_norm": 0.2776831388473511, "learning_rate": 0.0001408324772480025, "loss": 1.2895, "step": 36 }, { "epoch": 0.4625, "grad_norm": 0.31238630414009094, "learning_rate": 0.00013980066614900776, "loss": 1.2718, "step": 37 }, { "epoch": 0.475, "grad_norm": 0.23365426063537598, "learning_rate": 0.00013874580075485485, "loss": 1.2596, "step": 38 }, { "epoch": 0.4875, "grad_norm": 0.23924365639686584, "learning_rate": 0.00013766828773623352, "loss": 1.2809, "step": 39 }, { "epoch": 0.5, "grad_norm": 0.24298632144927979, "learning_rate": 0.00013656854249492382, "loss": 1.2248, "step": 40 }, { "epoch": 0.5125, "grad_norm": 0.25117772817611694, "learning_rate": 0.0001354469890036509, "loss": 1.2653, "step": 41 }, { "epoch": 0.525, "grad_norm": 0.25377020239830017, "learning_rate": 0.00013430405964263536, "loss": 1.2687, "step": 42 }, { "epoch": 0.5375, "grad_norm": 0.24669994413852692, "learning_rate": 0.00013314019503290255, "loss": 1.269, "step": 43 }, { "epoch": 0.55, "grad_norm": 0.22006134688854218, "learning_rate": 0.00013195584386641469, "loss": 1.2559, "step": 44 }, { "epoch": 0.5625, "grad_norm": 0.2517986595630646, "learning_rate": 0.00013075146273309164, "loss": 1.2477, "step": 45 }, { "epoch": 0.575, "grad_norm": 0.21466796100139618, "learning_rate": 0.00012952751594478675, "loss": 1.2358, "step": 46 }, { "epoch": 0.5875, "grad_norm": 0.2188994437456131, "learning_rate": 0.0001282844753562857, "loss": 1.2444, "step": 47 }, { "epoch": 0.6, "grad_norm": 2.4198501110076904, "learning_rate": 0.00012702282018339786, "loss": 1.2535, "step": 48 }, { "epoch": 0.6125, "grad_norm": 0.3393913209438324, "learning_rate": 0.00012574303681820898, "loss": 1.2361, "step": 49 }, { "epoch": 0.625, "grad_norm": 0.32384437322616577, "learning_rate": 0.0001244456186415682, "loss": 1.2283, "step": 50 }, { "epoch": 0.6375, "grad_norm": 0.3469082713127136, "learning_rate": 0.00012313106583288004, "loss": 1.2401, "step": 51 }, { "epoch": 0.65, "grad_norm": 0.42606261372566223, "learning_rate": 0.00012179988517727591, "loss": 1.2399, "step": 52 }, { "epoch": 0.6625, "grad_norm": 0.4077642261981964, "learning_rate": 0.00012045258987023879, "loss": 1.2441, "step": 53 }, { "epoch": 0.675, "grad_norm": 0.3077225089073181, "learning_rate": 0.00011908969931975641, "loss": 1.253, "step": 54 }, { "epoch": 0.6875, "grad_norm": 0.9925752878189087, "learning_rate": 0.00011771173894607985, "loss": 1.2586, "step": 55 }, { "epoch": 0.7, "grad_norm": 1.9072725772857666, "learning_rate": 0.00011631923997916375, "loss": 1.2643, "step": 56 }, { "epoch": 0.7125, "grad_norm": 0.5788567662239075, "learning_rate": 0.00011491273925386736, "loss": 1.2657, "step": 57 }, { "epoch": 0.725, "grad_norm": 0.9417564868927002, "learning_rate": 0.00011349277900299426, "loss": 1.2526, "step": 58 }, { "epoch": 0.7375, "grad_norm": 0.9247767329216003, "learning_rate": 0.00011205990664825127, "loss": 1.2402, "step": 59 }, { "epoch": 0.75, "grad_norm": 0.5092797875404358, "learning_rate": 0.00011061467458920719, "loss": 1.2264, "step": 60 }, { "epoch": 0.7625, "grad_norm": 0.8128093481063843, "learning_rate": 0.00010915763999033201, "loss": 1.22, "step": 61 }, { "epoch": 0.775, "grad_norm": 0.4954143166542053, "learning_rate": 0.00010768936456619945, "loss": 1.203, "step": 62 }, { "epoch": 0.7875, "grad_norm": 0.7117099761962891, "learning_rate": 0.0001062104143649355, "loss": 1.2295, "step": 63 }, { "epoch": 0.8, "grad_norm": 0.5060359835624695, "learning_rate": 0.0001047213595499958, "loss": 1.1936, "step": 64 }, { "epoch": 0.8125, "grad_norm": 0.5212268829345703, "learning_rate": 0.000103222774180357, "loss": 1.1927, "step": 65 }, { "epoch": 0.825, "grad_norm": 0.47975900769233704, "learning_rate": 0.00010171523598920594, "loss": 1.2116, "step": 66 }, { "epoch": 0.8375, "grad_norm": 0.3655720055103302, "learning_rate": 0.00010019932616121264, "loss": 1.2002, "step": 67 }, { "epoch": 0.85, "grad_norm": 0.38993576169013977, "learning_rate": 9.867562910847246e-05, "loss": 1.2225, "step": 68 }, { "epoch": 0.8625, "grad_norm": 0.33190780878067017, "learning_rate": 9.714473224520406e-05, "loss": 1.1982, "step": 69 }, { "epoch": 0.875, "grad_norm": 0.3178853988647461, "learning_rate": 9.560722576129029e-05, "loss": 1.2015, "step": 70 }, { "epoch": 0.8875, "grad_norm": 0.28483396768569946, "learning_rate": 9.406370239474839e-05, "loss": 1.2013, "step": 71 }, { "epoch": 0.9, "grad_norm": 0.26456528902053833, "learning_rate": 9.251475720321848e-05, "loss": 1.2101, "step": 72 }, { "epoch": 0.9125, "grad_norm": 0.24198457598686218, "learning_rate": 9.096098733455746e-05, "loss": 1.1889, "step": 73 }, { "epoch": 0.925, "grad_norm": 0.2521977424621582, "learning_rate": 8.940299179662703e-05, "loss": 1.1915, "step": 74 }, { "epoch": 0.9375, "grad_norm": 0.22842273116111755, "learning_rate": 8.784137122636488e-05, "loss": 1.2018, "step": 75 }, { "epoch": 0.95, "grad_norm": 0.21817852556705475, "learning_rate": 8.627672765822762e-05, "loss": 1.188, "step": 76 }, { "epoch": 0.9625, "grad_norm": 0.1990320086479187, "learning_rate": 8.470966429209512e-05, "loss": 1.1821, "step": 77 }, { "epoch": 0.975, "grad_norm": 0.20685255527496338, "learning_rate": 8.31407852607255e-05, "loss": 1.1687, "step": 78 }, { "epoch": 0.9875, "grad_norm": 0.20527754724025726, "learning_rate": 8.157069539685026e-05, "loss": 1.2024, "step": 79 }, { "epoch": 1.0, "grad_norm": 0.20712077617645264, "learning_rate": 8e-05, "loss": 1.1721, "step": 80 }, { "epoch": 1.0125, "grad_norm": 0.2558927536010742, "learning_rate": 7.842930460314975e-05, "loss": 1.2239, "step": 81 }, { "epoch": 1.025, "grad_norm": 0.25813376903533936, "learning_rate": 7.685921473927454e-05, "loss": 1.2615, "step": 82 }, { "epoch": 1.0375, "grad_norm": 0.22628559172153473, "learning_rate": 7.529033570790488e-05, "loss": 1.2229, "step": 83 }, { "epoch": 1.05, "grad_norm": 0.19325922429561615, "learning_rate": 7.372327234177242e-05, "loss": 1.2115, "step": 84 }, { "epoch": 1.0625, "grad_norm": 0.21385957300662994, "learning_rate": 7.215862877363515e-05, "loss": 1.2484, "step": 85 }, { "epoch": 1.075, "grad_norm": 0.1882706731557846, "learning_rate": 7.0597008203373e-05, "loss": 1.2583, "step": 86 }, { "epoch": 1.0875, "grad_norm": 0.1963510811328888, "learning_rate": 6.903901266544258e-05, "loss": 1.2448, "step": 87 }, { "epoch": 1.1, "grad_norm": 0.18707427382469177, "learning_rate": 6.748524279678152e-05, "loss": 1.2694, "step": 88 }, { "epoch": 1.1125, "grad_norm": 0.17115652561187744, "learning_rate": 6.593629760525164e-05, "loss": 1.2213, "step": 89 }, { "epoch": 1.125, "grad_norm": 0.17526257038116455, "learning_rate": 6.439277423870975e-05, "loss": 1.2448, "step": 90 }, { "epoch": 1.1375, "grad_norm": 0.17824630439281464, "learning_rate": 6.285526775479596e-05, "loss": 1.2967, "step": 91 }, { "epoch": 1.15, "grad_norm": 0.17490257322788239, "learning_rate": 6.13243708915276e-05, "loss": 1.2426, "step": 92 }, { "epoch": 1.1625, "grad_norm": 0.17836087942123413, "learning_rate": 5.9800673838787364e-05, "loss": 1.2834, "step": 93 }, { "epoch": 1.175, "grad_norm": 0.165805846452713, "learning_rate": 5.828476401079407e-05, "loss": 1.2123, "step": 94 }, { "epoch": 1.1875, "grad_norm": 0.16540847718715668, "learning_rate": 5.677722581964303e-05, "loss": 1.2358, "step": 95 }, { "epoch": 1.2, "grad_norm": 0.15229956805706024, "learning_rate": 5.5278640450004216e-05, "loss": 1.2278, "step": 96 }, { "epoch": 1.2125, "grad_norm": 0.1695357859134674, "learning_rate": 5.3789585635064534e-05, "loss": 1.2044, "step": 97 }, { "epoch": 1.225, "grad_norm": 0.16060565412044525, "learning_rate": 5.231063543380055e-05, "loss": 1.1828, "step": 98 }, { "epoch": 1.2375, "grad_norm": 0.15779414772987366, "learning_rate": 5.084236000966803e-05, "loss": 1.2169, "step": 99 }, { "epoch": 1.25, "grad_norm": 0.14691977202892303, "learning_rate": 4.9385325410792824e-05, "loss": 1.2132, "step": 100 }, { "epoch": 1.2625, "grad_norm": 0.15118764340877533, "learning_rate": 4.794009335174874e-05, "loss": 1.2336, "step": 101 }, { "epoch": 1.275, "grad_norm": 0.13673175871372223, "learning_rate": 4.650722099700578e-05, "loss": 1.2196, "step": 102 }, { "epoch": 1.2875, "grad_norm": 0.14001807570457458, "learning_rate": 4.508726074613262e-05, "loss": 1.2204, "step": 103 }, { "epoch": 1.3, "grad_norm": 0.14354203641414642, "learning_rate": 4.3680760020836266e-05, "loss": 1.2468, "step": 104 }, { "epoch": 1.3125, "grad_norm": 0.1509067714214325, "learning_rate": 4.2288261053920186e-05, "loss": 1.1899, "step": 105 }, { "epoch": 1.325, "grad_norm": 0.14659079909324646, "learning_rate": 4.0910300680243636e-05, "loss": 1.2373, "step": 106 }, { "epoch": 1.3375, "grad_norm": 0.15252433717250824, "learning_rate": 3.954741012976125e-05, "loss": 1.1971, "step": 107 }, { "epoch": 1.35, "grad_norm": 0.1317344456911087, "learning_rate": 3.8200114822724096e-05, "loss": 1.2109, "step": 108 }, { "epoch": 1.3625, "grad_norm": 0.1382139027118683, "learning_rate": 3.686893416711998e-05, "loss": 1.1777, "step": 109 }, { "epoch": 1.375, "grad_norm": 0.13280591368675232, "learning_rate": 3.5554381358431845e-05, "loss": 1.2271, "step": 110 }, { "epoch": 1.3875, "grad_norm": 0.13518379628658295, "learning_rate": 3.425696318179103e-05, "loss": 1.1753, "step": 111 }, { "epoch": 1.4, "grad_norm": 0.11688841879367828, "learning_rate": 3.297717981660216e-05, "loss": 1.216, "step": 112 }, { "epoch": 1.4125, "grad_norm": 0.12823420763015747, "learning_rate": 3.1715524643714286e-05, "loss": 1.1956, "step": 113 }, { "epoch": 1.425, "grad_norm": 0.13634033501148224, "learning_rate": 3.0472484055213276e-05, "loss": 1.2017, "step": 114 }, { "epoch": 1.4375, "grad_norm": 0.11677446216344833, "learning_rate": 2.9248537266908373e-05, "loss": 1.1977, "step": 115 }, { "epoch": 1.45, "grad_norm": 0.12495961785316467, "learning_rate": 2.804415613358532e-05, "loss": 1.2068, "step": 116 }, { "epoch": 1.4625, "grad_norm": 0.11066638678312302, "learning_rate": 2.685980496709749e-05, "loss": 1.1919, "step": 117 }, { "epoch": 1.475, "grad_norm": 0.1103682890534401, "learning_rate": 2.569594035736466e-05, "loss": 1.1824, "step": 118 }, { "epoch": 1.4875, "grad_norm": 0.11392944306135178, "learning_rate": 2.4553010996349143e-05, "loss": 1.2038, "step": 119 }, { "epoch": 1.5, "grad_norm": 0.10885628312826157, "learning_rate": 2.3431457505076205e-05, "loss": 1.1529, "step": 120 }, { "epoch": 1.5125, "grad_norm": 0.11054757237434387, "learning_rate": 2.2331712263766495e-05, "loss": 1.192, "step": 121 }, { "epoch": 1.525, "grad_norm": 0.10627539455890656, "learning_rate": 2.1254199245145177e-05, "loss": 1.1969, "step": 122 }, { "epoch": 1.5375, "grad_norm": 0.10586383193731308, "learning_rate": 2.0199333850992245e-05, "loss": 1.1987, "step": 123 }, { "epoch": 1.55, "grad_norm": 0.10642778873443604, "learning_rate": 1.9167522751997527e-05, "loss": 1.1871, "step": 124 }, { "epoch": 1.5625, "grad_norm": 0.1041000634431839, "learning_rate": 1.815916373098104e-05, "loss": 1.1808, "step": 125 }, { "epoch": 1.575, "grad_norm": 0.09965813905000687, "learning_rate": 1.7174645529540424e-05, "loss": 1.1695, "step": 126 }, { "epoch": 1.5875, "grad_norm": 0.10275875777006149, "learning_rate": 1.621434769818344e-05, "loss": 1.1789, "step": 127 }, { "epoch": 1.6, "grad_norm": 0.12429191172122955, "learning_rate": 1.5278640450004213e-05, "loss": 1.2016, "step": 128 }, { "epoch": 1.6125, "grad_norm": 0.10674679279327393, "learning_rate": 1.4367884517958914e-05, "loss": 1.1712, "step": 129 }, { "epoch": 1.625, "grad_norm": 0.10529352724552155, "learning_rate": 1.3482431015796373e-05, "loss": 1.1647, "step": 130 }, { "epoch": 1.6375, "grad_norm": 0.09806757420301437, "learning_rate": 1.2622621302697087e-05, "loss": 1.1781, "step": 131 }, { "epoch": 1.65, "grad_norm": 0.10181386023759842, "learning_rate": 1.1788786851672628e-05, "loss": 1.1777, "step": 132 }, { "epoch": 1.6625, "grad_norm": 0.11010607331991196, "learning_rate": 1.0981249121776654e-05, "loss": 1.1838, "step": 133 }, { "epoch": 1.675, "grad_norm": 0.09728992730379105, "learning_rate": 1.0200319434176227e-05, "loss": 1.1926, "step": 134 }, { "epoch": 1.6875, "grad_norm": 0.1308615505695343, "learning_rate": 9.446298852131605e-06, "loss": 1.185, "step": 135 }, { "epoch": 1.7, "grad_norm": 0.13742145895957947, "learning_rate": 8.719478064930578e-06, "loss": 1.1891, "step": 136 }, { "epoch": 1.7125, "grad_norm": 0.11961862444877625, "learning_rate": 8.020137275822297e-06, "loss": 1.1973, "step": 137 }, { "epoch": 1.725, "grad_norm": 0.1296798288822174, "learning_rate": 7.348546093993492e-06, "loss": 1.1824, "step": 138 }, { "epoch": 1.7375, "grad_norm": 0.10642002522945404, "learning_rate": 6.704963430629132e-06, "loss": 1.1724, "step": 139 }, { "epoch": 1.75, "grad_norm": 0.10178523510694504, "learning_rate": 6.0896373990970614e-06, "loss": 1.1647, "step": 140 }, { "epoch": 1.7625, "grad_norm": 0.11064235866069794, "learning_rate": 5.502805219295715e-06, "loss": 1.1553, "step": 141 }, { "epoch": 1.775, "grad_norm": 0.1006138026714325, "learning_rate": 4.944693126201276e-06, "loss": 1.1432, "step": 142 }, { "epoch": 1.7875, "grad_norm": 0.09788831323385239, "learning_rate": 4.415516282649756e-06, "loss": 1.1704, "step": 143 }, { "epoch": 1.8, "grad_norm": 0.09438074380159378, "learning_rate": 3.915478696387718e-06, "loss": 1.1378, "step": 144 }, { "epoch": 1.8125, "grad_norm": 0.08973080664873123, "learning_rate": 3.4447731414232945e-06, "loss": 1.1372, "step": 145 }, { "epoch": 1.825, "grad_norm": 0.09268128126859665, "learning_rate": 3.0035810837082267e-06, "loss": 1.1577, "step": 146 }, { "epoch": 1.8375, "grad_norm": 0.08991479873657227, "learning_rate": 2.5920726111792195e-06, "loss": 1.1482, "step": 147 }, { "epoch": 1.85, "grad_norm": 0.09131817519664764, "learning_rate": 2.2104063681858757e-06, "loss": 1.1699, "step": 148 }, { "epoch": 1.8625, "grad_norm": 0.08887693285942078, "learning_rate": 1.8587294943302092e-06, "loss": 1.1475, "step": 149 }, { "epoch": 1.875, "grad_norm": 0.08875437080860138, "learning_rate": 1.5371775677415656e-06, "loss": 1.1529, "step": 150 }, { "epoch": 1.8875, "grad_norm": 0.09550578147172928, "learning_rate": 1.2458745528086723e-06, "loss": 1.1544, "step": 151 }, { "epoch": 1.9, "grad_norm": 0.08528783172369003, "learning_rate": 9.849327523889873e-07, "loss": 1.1637, "step": 152 }, { "epoch": 1.9125, "grad_norm": 0.09047097712755203, "learning_rate": 7.5445276451382e-07, "loss": 1.1429, "step": 153 }, { "epoch": 1.925, "grad_norm": 0.08597776293754578, "learning_rate": 5.545234436058966e-07, "loss": 1.1471, "step": 154 }, { "epoch": 1.9375, "grad_norm": 0.08300716429948807, "learning_rate": 3.852218662242546e-07, "loss": 1.1584, "step": 155 }, { "epoch": 1.95, "grad_norm": 0.08327652513980865, "learning_rate": 2.466133013497629e-07, "loss": 1.1464, "step": 156 }, { "epoch": 1.9625, "grad_norm": 0.08351754397153854, "learning_rate": 1.3875118522273412e-07, "loss": 1.1413, "step": 157 }, { "epoch": 1.975, "grad_norm": 0.08628886193037033, "learning_rate": 6.167710074216792e-08, "loss": 1.1291, "step": 158 }, { "epoch": 1.9875, "grad_norm": 0.08761341124773026, "learning_rate": 1.5420761434814523e-08, "loss": 1.1628, "step": 159 }, { "epoch": 2.0, "grad_norm": 0.083193838596344, "learning_rate": 0.0, "loss": 1.1336, "step": 160 }, { "epoch": 2.0, "step": 160, "total_flos": 5.276128179356959e+18, "train_loss": 1.2880202122032642, "train_runtime": 15850.0535, "train_samples_per_second": 2.584, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.276128179356959e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }