{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 249, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012048192771084338, "grad_norm": 40.75222396850586, "learning_rate": 9.99960204377842e-06, "loss": 0.8283, "step": 1 }, { "epoch": 0.024096385542168676, "grad_norm": 7.6217546463012695, "learning_rate": 9.99840823846134e-06, "loss": 0.7463, "step": 2 }, { "epoch": 0.03614457831325301, "grad_norm": 4.7737321853637695, "learning_rate": 9.996418774081658e-06, "loss": 0.6773, "step": 3 }, { "epoch": 0.04819277108433735, "grad_norm": 7.380457401275635, "learning_rate": 9.99363396732727e-06, "loss": 0.7069, "step": 4 }, { "epoch": 0.060240963855421686, "grad_norm": 6.07143497467041, "learning_rate": 9.990054261490643e-06, "loss": 0.8157, "step": 5 }, { "epoch": 0.07228915662650602, "grad_norm": 32.628204345703125, "learning_rate": 9.985680226398261e-06, "loss": 0.7604, "step": 6 }, { "epoch": 0.08433734939759036, "grad_norm": 7.0896759033203125, "learning_rate": 9.980512558319915e-06, "loss": 0.6947, "step": 7 }, { "epoch": 0.0963855421686747, "grad_norm": 4.166346549987793, "learning_rate": 9.974552079857873e-06, "loss": 0.5901, "step": 8 }, { "epoch": 0.10843373493975904, "grad_norm": 5.307025909423828, "learning_rate": 9.967799739815925e-06, "loss": 0.6768, "step": 9 }, { "epoch": 0.12048192771084337, "grad_norm": 5.205200672149658, "learning_rate": 9.960256613048367e-06, "loss": 0.7401, "step": 10 }, { "epoch": 0.13253012048192772, "grad_norm": 4.746809482574463, "learning_rate": 9.951923900288888e-06, "loss": 0.603, "step": 11 }, { "epoch": 0.14457831325301204, "grad_norm": 4.554243087768555, "learning_rate": 9.942802927959444e-06, "loss": 0.5951, "step": 12 }, { "epoch": 0.1566265060240964, "grad_norm": 4.13496732711792, "learning_rate": 9.932895147959106e-06, "loss": 0.6246, "step": 13 }, { "epoch": 0.1686746987951807, "grad_norm": 5.653600692749023, "learning_rate": 9.922202137432954e-06, "loss": 0.8116, "step": 14 }, { "epoch": 0.18072289156626506, "grad_norm": 4.084902286529541, "learning_rate": 9.910725598521014e-06, "loss": 0.5243, "step": 15 }, { "epoch": 0.1927710843373494, "grad_norm": 4.84393835067749, "learning_rate": 9.89846735808731e-06, "loss": 0.5911, "step": 16 }, { "epoch": 0.20481927710843373, "grad_norm": 3.5985801219940186, "learning_rate": 9.885429367429062e-06, "loss": 0.5873, "step": 17 }, { "epoch": 0.21686746987951808, "grad_norm": 4.133760452270508, "learning_rate": 9.871613701966067e-06, "loss": 0.58, "step": 18 }, { "epoch": 0.2289156626506024, "grad_norm": 5.736385345458984, "learning_rate": 9.857022560910338e-06, "loss": 0.6884, "step": 19 }, { "epoch": 0.24096385542168675, "grad_norm": 5.400482177734375, "learning_rate": 9.84165826691602e-06, "loss": 0.7507, "step": 20 }, { "epoch": 0.25301204819277107, "grad_norm": 3.2082321643829346, "learning_rate": 9.825523265709667e-06, "loss": 0.4539, "step": 21 }, { "epoch": 0.26506024096385544, "grad_norm": 3.9605965614318848, "learning_rate": 9.808620125700925e-06, "loss": 0.5744, "step": 22 }, { "epoch": 0.27710843373493976, "grad_norm": 3.652902603149414, "learning_rate": 9.790951537573686e-06, "loss": 0.4361, "step": 23 }, { "epoch": 0.2891566265060241, "grad_norm": 3.5659713745117188, "learning_rate": 9.772520313857777e-06, "loss": 0.4565, "step": 24 }, { "epoch": 0.30120481927710846, "grad_norm": 5.866443157196045, "learning_rate": 9.753329388481261e-06, "loss": 0.6564, "step": 25 }, { "epoch": 0.3132530120481928, "grad_norm": 5.043295383453369, "learning_rate": 9.733381816303395e-06, "loss": 0.5905, "step": 26 }, { "epoch": 0.3253012048192771, "grad_norm": 4.576389789581299, "learning_rate": 9.712680772628365e-06, "loss": 0.5458, "step": 27 }, { "epoch": 0.3373493975903614, "grad_norm": 2.964594602584839, "learning_rate": 9.691229552699817e-06, "loss": 0.4196, "step": 28 }, { "epoch": 0.3493975903614458, "grad_norm": 3.668825387954712, "learning_rate": 9.669031571176322e-06, "loss": 0.5939, "step": 29 }, { "epoch": 0.3614457831325301, "grad_norm": 3.4135804176330566, "learning_rate": 9.646090361587828e-06, "loss": 0.4942, "step": 30 }, { "epoch": 0.37349397590361444, "grad_norm": 3.3271186351776123, "learning_rate": 9.622409575773162e-06, "loss": 0.4447, "step": 31 }, { "epoch": 0.3855421686746988, "grad_norm": 3.8561484813690186, "learning_rate": 9.597992983298748e-06, "loss": 0.5443, "step": 32 }, { "epoch": 0.39759036144578314, "grad_norm": 3.4959912300109863, "learning_rate": 9.572844470858537e-06, "loss": 0.5228, "step": 33 }, { "epoch": 0.40963855421686746, "grad_norm": 4.416797637939453, "learning_rate": 9.546968041655326e-06, "loss": 0.5745, "step": 34 }, { "epoch": 0.42168674698795183, "grad_norm": 3.1685950756073, "learning_rate": 9.520367814763514e-06, "loss": 0.5249, "step": 35 }, { "epoch": 0.43373493975903615, "grad_norm": 3.5792479515075684, "learning_rate": 9.493048024473413e-06, "loss": 0.5533, "step": 36 }, { "epoch": 0.4457831325301205, "grad_norm": 3.136587619781494, "learning_rate": 9.46501301961723e-06, "loss": 0.5707, "step": 37 }, { "epoch": 0.4578313253012048, "grad_norm": 6.66333532333374, "learning_rate": 9.436267262876808e-06, "loss": 0.5537, "step": 38 }, { "epoch": 0.46987951807228917, "grad_norm": 3.710054397583008, "learning_rate": 9.406815330073244e-06, "loss": 0.5072, "step": 39 }, { "epoch": 0.4819277108433735, "grad_norm": 2.439741611480713, "learning_rate": 9.376661909438496e-06, "loss": 0.4088, "step": 40 }, { "epoch": 0.4939759036144578, "grad_norm": 2.6984646320343018, "learning_rate": 9.3458118008691e-06, "loss": 0.548, "step": 41 }, { "epoch": 0.5060240963855421, "grad_norm": 2.626049757003784, "learning_rate": 9.314269915162115e-06, "loss": 0.541, "step": 42 }, { "epoch": 0.5180722891566265, "grad_norm": 31.189899444580078, "learning_rate": 9.282041273233402e-06, "loss": 0.5461, "step": 43 }, { "epoch": 0.5301204819277109, "grad_norm": 4.356227397918701, "learning_rate": 9.249131005318388e-06, "loss": 0.6082, "step": 44 }, { "epoch": 0.5421686746987951, "grad_norm": 10.281394958496094, "learning_rate": 9.215544350155423e-06, "loss": 0.6193, "step": 45 }, { "epoch": 0.5542168674698795, "grad_norm": 81.10453796386719, "learning_rate": 9.18128665415186e-06, "loss": 0.4795, "step": 46 }, { "epoch": 0.5662650602409639, "grad_norm": 136.2274932861328, "learning_rate": 9.146363370533004e-06, "loss": 0.5669, "step": 47 }, { "epoch": 0.5783132530120482, "grad_norm": 24.73008155822754, "learning_rate": 9.110780058474052e-06, "loss": 0.4712, "step": 48 }, { "epoch": 0.5903614457831325, "grad_norm": 3.0569868087768555, "learning_rate": 9.07454238221517e-06, "loss": 0.4934, "step": 49 }, { "epoch": 0.6024096385542169, "grad_norm": 3.192237615585327, "learning_rate": 9.03765611015985e-06, "loss": 0.5427, "step": 50 }, { "epoch": 0.6144578313253012, "grad_norm": 1.920320749282837, "learning_rate": 9.000127113956673e-06, "loss": 0.4182, "step": 51 }, { "epoch": 0.6265060240963856, "grad_norm": 3.1197104454040527, "learning_rate": 8.961961367564652e-06, "loss": 0.5577, "step": 52 }, { "epoch": 0.6385542168674698, "grad_norm": 2.1309397220611572, "learning_rate": 8.923164946302274e-06, "loss": 0.5111, "step": 53 }, { "epoch": 0.6506024096385542, "grad_norm": 2.3042995929718018, "learning_rate": 8.883744025880429e-06, "loss": 0.5015, "step": 54 }, { "epoch": 0.6626506024096386, "grad_norm": 2.4492433071136475, "learning_rate": 8.843704881419333e-06, "loss": 0.3826, "step": 55 }, { "epoch": 0.6746987951807228, "grad_norm": 2.3031723499298096, "learning_rate": 8.803053886449644e-06, "loss": 0.4694, "step": 56 }, { "epoch": 0.6867469879518072, "grad_norm": 3.1464896202087402, "learning_rate": 8.761797511897907e-06, "loss": 0.5708, "step": 57 }, { "epoch": 0.6987951807228916, "grad_norm": 2.5254249572753906, "learning_rate": 8.719942325056496e-06, "loss": 0.5605, "step": 58 }, { "epoch": 0.7108433734939759, "grad_norm": 2.614318370819092, "learning_rate": 8.67749498853821e-06, "loss": 0.5702, "step": 59 }, { "epoch": 0.7228915662650602, "grad_norm": 2.1782386302948, "learning_rate": 8.634462259215719e-06, "loss": 0.5409, "step": 60 }, { "epoch": 0.7349397590361446, "grad_norm": 2.084237813949585, "learning_rate": 8.590850987145964e-06, "loss": 0.4923, "step": 61 }, { "epoch": 0.7469879518072289, "grad_norm": 2.4142396450042725, "learning_rate": 8.546668114479769e-06, "loss": 0.6142, "step": 62 }, { "epoch": 0.7590361445783133, "grad_norm": 1.6900039911270142, "learning_rate": 8.501920674356755e-06, "loss": 0.4445, "step": 63 }, { "epoch": 0.7710843373493976, "grad_norm": 1.9757111072540283, "learning_rate": 8.456615789785804e-06, "loss": 0.491, "step": 64 }, { "epoch": 0.7831325301204819, "grad_norm": 2.328930139541626, "learning_rate": 8.410760672511188e-06, "loss": 0.5563, "step": 65 }, { "epoch": 0.7951807228915663, "grad_norm": 2.8067822456359863, "learning_rate": 8.364362621864595e-06, "loss": 0.6574, "step": 66 }, { "epoch": 0.8072289156626506, "grad_norm": 2.0766549110412598, "learning_rate": 8.31742902360319e-06, "loss": 0.5063, "step": 67 }, { "epoch": 0.8192771084337349, "grad_norm": 2.085911989212036, "learning_rate": 8.269967348733947e-06, "loss": 0.5504, "step": 68 }, { "epoch": 0.8313253012048193, "grad_norm": 1.8254350423812866, "learning_rate": 8.221985152324385e-06, "loss": 0.4678, "step": 69 }, { "epoch": 0.8433734939759037, "grad_norm": 2.208496332168579, "learning_rate": 8.17349007229994e-06, "loss": 0.5589, "step": 70 }, { "epoch": 0.8554216867469879, "grad_norm": 2.833843469619751, "learning_rate": 8.124489828228136e-06, "loss": 0.6464, "step": 71 }, { "epoch": 0.8674698795180723, "grad_norm": 2.181140661239624, "learning_rate": 8.07499222008977e-06, "loss": 0.6037, "step": 72 }, { "epoch": 0.8795180722891566, "grad_norm": 1.5879639387130737, "learning_rate": 8.025005127037282e-06, "loss": 0.4077, "step": 73 }, { "epoch": 0.891566265060241, "grad_norm": 1.94895601272583, "learning_rate": 7.974536506140546e-06, "loss": 0.4523, "step": 74 }, { "epoch": 0.9036144578313253, "grad_norm": 2.282900810241699, "learning_rate": 7.923594391120237e-06, "loss": 0.4889, "step": 75 }, { "epoch": 0.9156626506024096, "grad_norm": 1.8225998878479004, "learning_rate": 7.872186891068997e-06, "loss": 0.4483, "step": 76 }, { "epoch": 0.927710843373494, "grad_norm": 2.1921205520629883, "learning_rate": 7.820322189160618e-06, "loss": 0.4848, "step": 77 }, { "epoch": 0.9397590361445783, "grad_norm": 1.9695558547973633, "learning_rate": 7.768008541347423e-06, "loss": 0.4577, "step": 78 }, { "epoch": 0.9518072289156626, "grad_norm": 2.367926836013794, "learning_rate": 7.715254275046062e-06, "loss": 0.6004, "step": 79 }, { "epoch": 0.963855421686747, "grad_norm": 1.95900297164917, "learning_rate": 7.66206778781193e-06, "loss": 0.5161, "step": 80 }, { "epoch": 0.9759036144578314, "grad_norm": 4.2675557136535645, "learning_rate": 7.608457546002423e-06, "loss": 0.4645, "step": 81 }, { "epoch": 0.9879518072289156, "grad_norm": 2.129870891571045, "learning_rate": 7.554432083429253e-06, "loss": 0.5267, "step": 82 }, { "epoch": 1.0, "grad_norm": 1.7695404291152954, "learning_rate": 7.500000000000001e-06, "loss": 0.3909, "step": 83 }, { "epoch": 1.0120481927710843, "grad_norm": 2.0876364707946777, "learning_rate": 7.445169960349167e-06, "loss": 0.3333, "step": 84 }, { "epoch": 1.0240963855421688, "grad_norm": 1.5992554426193237, "learning_rate": 7.389950692458916e-06, "loss": 0.3103, "step": 85 }, { "epoch": 1.036144578313253, "grad_norm": 2.081721544265747, "learning_rate": 7.3343509862697295e-06, "loss": 0.286, "step": 86 }, { "epoch": 1.0481927710843373, "grad_norm": 1.5453327894210815, "learning_rate": 7.278379692281209e-06, "loss": 0.2851, "step": 87 }, { "epoch": 1.0602409638554218, "grad_norm": 1.6960233449935913, "learning_rate": 7.22204572014322e-06, "loss": 0.3118, "step": 88 }, { "epoch": 1.072289156626506, "grad_norm": 1.6961935758590698, "learning_rate": 7.165358037237644e-06, "loss": 0.3024, "step": 89 }, { "epoch": 1.0843373493975903, "grad_norm": 1.9473631381988525, "learning_rate": 7.10832566725092e-06, "loss": 0.3262, "step": 90 }, { "epoch": 1.0963855421686748, "grad_norm": 1.5019605159759521, "learning_rate": 7.0509576887376375e-06, "loss": 0.23, "step": 91 }, { "epoch": 1.108433734939759, "grad_norm": 1.7088998556137085, "learning_rate": 6.99326323367538e-06, "loss": 0.2511, "step": 92 }, { "epoch": 1.1204819277108433, "grad_norm": 2.8957417011260986, "learning_rate": 6.9352514860110876e-06, "loss": 0.3191, "step": 93 }, { "epoch": 1.1325301204819278, "grad_norm": 1.71742844581604, "learning_rate": 6.876931680199121e-06, "loss": 0.2792, "step": 94 }, { "epoch": 1.144578313253012, "grad_norm": 1.615378975868225, "learning_rate": 6.818313099731308e-06, "loss": 0.2653, "step": 95 }, { "epoch": 1.1566265060240963, "grad_norm": 1.4427539110183716, "learning_rate": 6.759405075659165e-06, "loss": 0.2909, "step": 96 }, { "epoch": 1.1686746987951806, "grad_norm": 1.1839165687561035, "learning_rate": 6.700216985108568e-06, "loss": 0.1959, "step": 97 }, { "epoch": 1.180722891566265, "grad_norm": 1.7143460512161255, "learning_rate": 6.640758249787067e-06, "loss": 0.2841, "step": 98 }, { "epoch": 1.1927710843373494, "grad_norm": 1.3873624801635742, "learning_rate": 6.58103833448412e-06, "loss": 0.2838, "step": 99 }, { "epoch": 1.2048192771084336, "grad_norm": 1.8592312335968018, "learning_rate": 6.521066745564467e-06, "loss": 0.2963, "step": 100 }, { "epoch": 1.216867469879518, "grad_norm": 1.608494758605957, "learning_rate": 6.460853029454879e-06, "loss": 0.2877, "step": 101 }, { "epoch": 1.2289156626506024, "grad_norm": 1.8831335306167603, "learning_rate": 6.4004067711245366e-06, "loss": 0.3066, "step": 102 }, { "epoch": 1.2409638554216866, "grad_norm": 1.743905782699585, "learning_rate": 6.3397375925592675e-06, "loss": 0.3099, "step": 103 }, { "epoch": 1.2530120481927711, "grad_norm": 1.8759677410125732, "learning_rate": 6.2788551512299014e-06, "loss": 0.2914, "step": 104 }, { "epoch": 1.2650602409638554, "grad_norm": 1.7082366943359375, "learning_rate": 6.2177691385549595e-06, "loss": 0.2931, "step": 105 }, { "epoch": 1.2771084337349397, "grad_norm": 1.519975185394287, "learning_rate": 6.156489278357967e-06, "loss": 0.2499, "step": 106 }, { "epoch": 1.2891566265060241, "grad_norm": 1.8293309211730957, "learning_rate": 6.0950253253195656e-06, "loss": 0.3611, "step": 107 }, { "epoch": 1.3012048192771084, "grad_norm": 1.728571891784668, "learning_rate": 6.033387063424765e-06, "loss": 0.3017, "step": 108 }, { "epoch": 1.3132530120481927, "grad_norm": 1.6766902208328247, "learning_rate": 5.971584304405489e-06, "loss": 0.2823, "step": 109 }, { "epoch": 1.3253012048192772, "grad_norm": 1.7143419981002808, "learning_rate": 5.909626886178721e-06, "loss": 0.2307, "step": 110 }, { "epoch": 1.3373493975903614, "grad_norm": 1.5373152494430542, "learning_rate": 5.8475246712804845e-06, "loss": 0.2963, "step": 111 }, { "epoch": 1.3493975903614457, "grad_norm": 1.8781455755233765, "learning_rate": 5.785287545295895e-06, "loss": 0.2874, "step": 112 }, { "epoch": 1.3614457831325302, "grad_norm": 1.824504017829895, "learning_rate": 5.722925415285555e-06, "loss": 0.2454, "step": 113 }, { "epoch": 1.3734939759036144, "grad_norm": 1.7806376218795776, "learning_rate": 5.660448208208513e-06, "loss": 0.3654, "step": 114 }, { "epoch": 1.3855421686746987, "grad_norm": 1.5633933544158936, "learning_rate": 5.597865869342075e-06, "loss": 0.2931, "step": 115 }, { "epoch": 1.3975903614457832, "grad_norm": 1.8875840902328491, "learning_rate": 5.535188360698687e-06, "loss": 0.331, "step": 116 }, { "epoch": 1.4096385542168675, "grad_norm": 1.404435634613037, "learning_rate": 5.472425659440157e-06, "loss": 0.246, "step": 117 }, { "epoch": 1.4216867469879517, "grad_norm": 1.4050829410552979, "learning_rate": 5.409587756289462e-06, "loss": 0.2689, "step": 118 }, { "epoch": 1.4337349397590362, "grad_norm": 1.5876859426498413, "learning_rate": 5.346684653940408e-06, "loss": 0.2645, "step": 119 }, { "epoch": 1.4457831325301205, "grad_norm": 1.6692218780517578, "learning_rate": 5.2837263654653715e-06, "loss": 0.3155, "step": 120 }, { "epoch": 1.4578313253012047, "grad_norm": 1.2533305883407593, "learning_rate": 5.2207229127213866e-06, "loss": 0.2112, "step": 121 }, { "epoch": 1.4698795180722892, "grad_norm": 1.5980626344680786, "learning_rate": 5.157684324754858e-06, "loss": 0.2441, "step": 122 }, { "epoch": 1.4819277108433735, "grad_norm": 1.6085745096206665, "learning_rate": 5.094620636205096e-06, "loss": 0.3087, "step": 123 }, { "epoch": 1.4939759036144578, "grad_norm": 1.7097792625427246, "learning_rate": 5.031541885706987e-06, "loss": 0.2499, "step": 124 }, { "epoch": 1.5060240963855422, "grad_norm": 1.4703900814056396, "learning_rate": 4.9684581142930135e-06, "loss": 0.2413, "step": 125 }, { "epoch": 1.5180722891566265, "grad_norm": 2.3154144287109375, "learning_rate": 4.905379363794907e-06, "loss": 0.3701, "step": 126 }, { "epoch": 1.5301204819277108, "grad_norm": 1.665852427482605, "learning_rate": 4.842315675245144e-06, "loss": 0.2791, "step": 127 }, { "epoch": 1.5421686746987953, "grad_norm": 1.7872849702835083, "learning_rate": 4.779277087278615e-06, "loss": 0.3303, "step": 128 }, { "epoch": 1.5542168674698795, "grad_norm": 1.4255069494247437, "learning_rate": 4.71627363453463e-06, "loss": 0.2462, "step": 129 }, { "epoch": 1.5662650602409638, "grad_norm": 1.8723397254943848, "learning_rate": 4.653315346059592e-06, "loss": 0.3083, "step": 130 }, { "epoch": 1.5783132530120483, "grad_norm": 1.6238393783569336, "learning_rate": 4.5904122437105384e-06, "loss": 0.2947, "step": 131 }, { "epoch": 1.5903614457831325, "grad_norm": 1.5982369184494019, "learning_rate": 4.527574340559844e-06, "loss": 0.3114, "step": 132 }, { "epoch": 1.6024096385542168, "grad_norm": 1.7584006786346436, "learning_rate": 4.464811639301314e-06, "loss": 0.3335, "step": 133 }, { "epoch": 1.6144578313253013, "grad_norm": 1.7169082164764404, "learning_rate": 4.402134130657925e-06, "loss": 0.2783, "step": 134 }, { "epoch": 1.6265060240963856, "grad_norm": 1.6119632720947266, "learning_rate": 4.33955179179149e-06, "loss": 0.252, "step": 135 }, { "epoch": 1.6385542168674698, "grad_norm": 1.5756961107254028, "learning_rate": 4.277074584714447e-06, "loss": 0.2825, "step": 136 }, { "epoch": 1.6506024096385543, "grad_norm": 1.511651873588562, "learning_rate": 4.214712454704107e-06, "loss": 0.2479, "step": 137 }, { "epoch": 1.6626506024096386, "grad_norm": 1.354615330696106, "learning_rate": 4.152475328719517e-06, "loss": 0.2192, "step": 138 }, { "epoch": 1.6746987951807228, "grad_norm": 1.821956753730774, "learning_rate": 4.090373113821281e-06, "loss": 0.2735, "step": 139 }, { "epoch": 1.6867469879518073, "grad_norm": 1.4524273872375488, "learning_rate": 4.028415695594512e-06, "loss": 0.2222, "step": 140 }, { "epoch": 1.6987951807228916, "grad_norm": 1.6997952461242676, "learning_rate": 3.966612936575235e-06, "loss": 0.2841, "step": 141 }, { "epoch": 1.7108433734939759, "grad_norm": 1.5502634048461914, "learning_rate": 3.904974674680436e-06, "loss": 0.281, "step": 142 }, { "epoch": 1.7228915662650603, "grad_norm": 1.6944836378097534, "learning_rate": 3.843510721642036e-06, "loss": 0.19, "step": 143 }, { "epoch": 1.7349397590361446, "grad_norm": 1.958292007446289, "learning_rate": 3.782230861445041e-06, "loss": 0.3143, "step": 144 }, { "epoch": 1.7469879518072289, "grad_norm": 1.9379884004592896, "learning_rate": 3.7211448487701002e-06, "loss": 0.2964, "step": 145 }, { "epoch": 1.7590361445783134, "grad_norm": 1.6362128257751465, "learning_rate": 3.6602624074407354e-06, "loss": 0.2749, "step": 146 }, { "epoch": 1.7710843373493976, "grad_norm": 1.740090250968933, "learning_rate": 3.5995932288754655e-06, "loss": 0.2572, "step": 147 }, { "epoch": 1.783132530120482, "grad_norm": 1.3941646814346313, "learning_rate": 3.539146970545124e-06, "loss": 0.2476, "step": 148 }, { "epoch": 1.7951807228915664, "grad_norm": 1.6419267654418945, "learning_rate": 3.478933254435534e-06, "loss": 0.2902, "step": 149 }, { "epoch": 1.8072289156626506, "grad_norm": 1.825861930847168, "learning_rate": 3.4189616655158803e-06, "loss": 0.3345, "step": 150 }, { "epoch": 1.819277108433735, "grad_norm": 1.749080777168274, "learning_rate": 3.359241750212934e-06, "loss": 0.314, "step": 151 }, { "epoch": 1.8313253012048194, "grad_norm": 1.2390449047088623, "learning_rate": 3.2997830148914316e-06, "loss": 0.214, "step": 152 }, { "epoch": 1.8433734939759037, "grad_norm": 1.6753946542739868, "learning_rate": 3.240594924340835e-06, "loss": 0.2988, "step": 153 }, { "epoch": 1.855421686746988, "grad_norm": 1.6091383695602417, "learning_rate": 3.181686900268694e-06, "loss": 0.2481, "step": 154 }, { "epoch": 1.8674698795180724, "grad_norm": 1.438892126083374, "learning_rate": 3.1230683198008817e-06, "loss": 0.2702, "step": 155 }, { "epoch": 1.8795180722891565, "grad_norm": 1.931443691253662, "learning_rate": 3.0647485139889145e-06, "loss": 0.2957, "step": 156 }, { "epoch": 1.891566265060241, "grad_norm": 1.5713204145431519, "learning_rate": 3.006736766324623e-06, "loss": 0.2815, "step": 157 }, { "epoch": 1.9036144578313254, "grad_norm": 1.5962169170379639, "learning_rate": 2.9490423112623646e-06, "loss": 0.2791, "step": 158 }, { "epoch": 1.9156626506024095, "grad_norm": 2.0020360946655273, "learning_rate": 2.89167433274908e-06, "loss": 0.3897, "step": 159 }, { "epoch": 1.927710843373494, "grad_norm": 1.6599327325820923, "learning_rate": 2.834641962762358e-06, "loss": 0.2742, "step": 160 }, { "epoch": 1.9397590361445785, "grad_norm": 1.6006088256835938, "learning_rate": 2.7779542798567804e-06, "loss": 0.2678, "step": 161 }, { "epoch": 1.9518072289156625, "grad_norm": 1.5215158462524414, "learning_rate": 2.721620307718793e-06, "loss": 0.3035, "step": 162 }, { "epoch": 1.963855421686747, "grad_norm": 1.8756093978881836, "learning_rate": 2.66564901373027e-06, "loss": 0.3407, "step": 163 }, { "epoch": 1.9759036144578315, "grad_norm": 1.5014938116073608, "learning_rate": 2.610049307541085e-06, "loss": 0.2533, "step": 164 }, { "epoch": 1.9879518072289155, "grad_norm": 1.6140003204345703, "learning_rate": 2.554830039650834e-06, "loss": 0.2369, "step": 165 }, { "epoch": 2.0, "grad_norm": 1.5059895515441895, "learning_rate": 2.5000000000000015e-06, "loss": 0.1612, "step": 166 }, { "epoch": 2.0120481927710845, "grad_norm": 1.2760642766952515, "learning_rate": 2.4455679165707473e-06, "loss": 0.1247, "step": 167 }, { "epoch": 2.0240963855421685, "grad_norm": 1.3720568418502808, "learning_rate": 2.391542453997578e-06, "loss": 0.1618, "step": 168 }, { "epoch": 2.036144578313253, "grad_norm": 1.4044466018676758, "learning_rate": 2.337932212188073e-06, "loss": 0.1427, "step": 169 }, { "epoch": 2.0481927710843375, "grad_norm": 1.2212741374969482, "learning_rate": 2.284745724953939e-06, "loss": 0.1587, "step": 170 }, { "epoch": 2.0602409638554215, "grad_norm": 1.1166741847991943, "learning_rate": 2.2319914586525776e-06, "loss": 0.1169, "step": 171 }, { "epoch": 2.072289156626506, "grad_norm": 1.2007352113723755, "learning_rate": 2.1796778108393824e-06, "loss": 0.1232, "step": 172 }, { "epoch": 2.0843373493975905, "grad_norm": 1.4228880405426025, "learning_rate": 2.127813108931007e-06, "loss": 0.1646, "step": 173 }, { "epoch": 2.0963855421686746, "grad_norm": 1.2214866876602173, "learning_rate": 2.0764056088797646e-06, "loss": 0.1058, "step": 174 }, { "epoch": 2.108433734939759, "grad_norm": 1.8072195053100586, "learning_rate": 2.0254634938594555e-06, "loss": 0.1579, "step": 175 }, { "epoch": 2.1204819277108435, "grad_norm": 1.872309684753418, "learning_rate": 1.9749948729627188e-06, "loss": 0.138, "step": 176 }, { "epoch": 2.1325301204819276, "grad_norm": 1.8318668603897095, "learning_rate": 1.9250077799102323e-06, "loss": 0.1331, "step": 177 }, { "epoch": 2.144578313253012, "grad_norm": 2.1385316848754883, "learning_rate": 1.875510171771865e-06, "loss": 0.1635, "step": 178 }, { "epoch": 2.1566265060240966, "grad_norm": 1.719831943511963, "learning_rate": 1.8265099277000614e-06, "loss": 0.1561, "step": 179 }, { "epoch": 2.1686746987951806, "grad_norm": 1.7940328121185303, "learning_rate": 1.7780148476756148e-06, "loss": 0.14, "step": 180 }, { "epoch": 2.180722891566265, "grad_norm": 1.3721911907196045, "learning_rate": 1.7300326512660542e-06, "loss": 0.1233, "step": 181 }, { "epoch": 2.1927710843373496, "grad_norm": 1.2797173261642456, "learning_rate": 1.6825709763968112e-06, "loss": 0.0936, "step": 182 }, { "epoch": 2.2048192771084336, "grad_norm": 1.5839323997497559, "learning_rate": 1.6356373781354058e-06, "loss": 0.1648, "step": 183 }, { "epoch": 2.216867469879518, "grad_norm": 1.3700120449066162, "learning_rate": 1.589239327488812e-06, "loss": 0.126, "step": 184 }, { "epoch": 2.2289156626506026, "grad_norm": 1.5171151161193848, "learning_rate": 1.543384210214196e-06, "loss": 0.1212, "step": 185 }, { "epoch": 2.2409638554216866, "grad_norm": 1.6373289823532104, "learning_rate": 1.4980793256432474e-06, "loss": 0.1509, "step": 186 }, { "epoch": 2.253012048192771, "grad_norm": 1.30360746383667, "learning_rate": 1.453331885520234e-06, "loss": 0.12, "step": 187 }, { "epoch": 2.2650602409638556, "grad_norm": 1.395431399345398, "learning_rate": 1.4091490128540374e-06, "loss": 0.1406, "step": 188 }, { "epoch": 2.2771084337349397, "grad_norm": 1.3656375408172607, "learning_rate": 1.3655377407842813e-06, "loss": 0.1706, "step": 189 }, { "epoch": 2.289156626506024, "grad_norm": 1.189477562904358, "learning_rate": 1.32250501146179e-06, "loss": 0.1243, "step": 190 }, { "epoch": 2.3012048192771086, "grad_norm": 1.273803949356079, "learning_rate": 1.2800576749435068e-06, "loss": 0.1132, "step": 191 }, { "epoch": 2.3132530120481927, "grad_norm": 1.249987244606018, "learning_rate": 1.2382024881020937e-06, "loss": 0.133, "step": 192 }, { "epoch": 2.325301204819277, "grad_norm": 1.2117363214492798, "learning_rate": 1.1969461135503573e-06, "loss": 0.1153, "step": 193 }, { "epoch": 2.337349397590361, "grad_norm": 1.115524172782898, "learning_rate": 1.1562951185806675e-06, "loss": 0.1068, "step": 194 }, { "epoch": 2.3493975903614457, "grad_norm": 1.2410939931869507, "learning_rate": 1.1162559741195733e-06, "loss": 0.0926, "step": 195 }, { "epoch": 2.36144578313253, "grad_norm": 1.0989357233047485, "learning_rate": 1.076835053697728e-06, "loss": 0.1147, "step": 196 }, { "epoch": 2.3734939759036147, "grad_norm": 1.2773900032043457, "learning_rate": 1.0380386324353508e-06, "loss": 0.131, "step": 197 }, { "epoch": 2.3855421686746987, "grad_norm": 1.2643158435821533, "learning_rate": 9.998728860433277e-07, "loss": 0.1377, "step": 198 }, { "epoch": 2.397590361445783, "grad_norm": 1.3157423734664917, "learning_rate": 9.62343889840151e-07, "loss": 0.127, "step": 199 }, { "epoch": 2.4096385542168672, "grad_norm": 1.1823986768722534, "learning_rate": 9.254576177848313e-07, "loss": 0.1039, "step": 200 }, { "epoch": 2.4216867469879517, "grad_norm": 1.2062366008758545, "learning_rate": 8.892199415259501e-07, "loss": 0.1137, "step": 201 }, { "epoch": 2.433734939759036, "grad_norm": 1.358426570892334, "learning_rate": 8.536366294669979e-07, "loss": 0.1188, "step": 202 }, { "epoch": 2.4457831325301207, "grad_norm": 1.4414290189743042, "learning_rate": 8.187133458481416e-07, "loss": 0.1393, "step": 203 }, { "epoch": 2.4578313253012047, "grad_norm": 1.2111995220184326, "learning_rate": 7.844556498445788e-07, "loss": 0.1088, "step": 204 }, { "epoch": 2.4698795180722892, "grad_norm": 1.0797122716903687, "learning_rate": 7.508689946816128e-07, "loss": 0.1012, "step": 205 }, { "epoch": 2.4819277108433733, "grad_norm": 1.2910206317901611, "learning_rate": 7.179587267665999e-07, "loss": 0.1283, "step": 206 }, { "epoch": 2.4939759036144578, "grad_norm": 1.76227867603302, "learning_rate": 6.857300848378857e-07, "loss": 0.1773, "step": 207 }, { "epoch": 2.5060240963855422, "grad_norm": 1.2892178297042847, "learning_rate": 6.541881991309013e-07, "loss": 0.1003, "step": 208 }, { "epoch": 2.5180722891566267, "grad_norm": 1.2142372131347656, "learning_rate": 6.233380905615049e-07, "loss": 0.1059, "step": 209 }, { "epoch": 2.5301204819277108, "grad_norm": 1.3028932809829712, "learning_rate": 5.931846699267558e-07, "loss": 0.0997, "step": 210 }, { "epoch": 2.5421686746987953, "grad_norm": 1.2703701257705688, "learning_rate": 5.637327371231921e-07, "loss": 0.1074, "step": 211 }, { "epoch": 2.5542168674698793, "grad_norm": 1.6055101156234741, "learning_rate": 5.349869803827717e-07, "loss": 0.1635, "step": 212 }, { "epoch": 2.566265060240964, "grad_norm": 1.2764710187911987, "learning_rate": 5.0695197552659e-07, "loss": 0.1394, "step": 213 }, { "epoch": 2.5783132530120483, "grad_norm": 1.3518632650375366, "learning_rate": 4.796321852364877e-07, "loss": 0.1363, "step": 214 }, { "epoch": 2.5903614457831328, "grad_norm": 1.3571412563323975, "learning_rate": 4.5303195834467463e-07, "loss": 0.1326, "step": 215 }, { "epoch": 2.602409638554217, "grad_norm": 1.4019449949264526, "learning_rate": 4.271555291414636e-07, "loss": 0.1222, "step": 216 }, { "epoch": 2.6144578313253013, "grad_norm": 1.184061050415039, "learning_rate": 4.020070167012541e-07, "loss": 0.0845, "step": 217 }, { "epoch": 2.6265060240963853, "grad_norm": 1.5907222032546997, "learning_rate": 3.775904242268391e-07, "loss": 0.1353, "step": 218 }, { "epoch": 2.63855421686747, "grad_norm": 1.3479151725769043, "learning_rate": 3.539096384121743e-07, "loss": 0.1445, "step": 219 }, { "epoch": 2.6506024096385543, "grad_norm": 1.36601722240448, "learning_rate": 3.309684288236775e-07, "loss": 0.1386, "step": 220 }, { "epoch": 2.662650602409639, "grad_norm": 1.6582006216049194, "learning_rate": 3.0877044730018515e-07, "loss": 0.1237, "step": 221 }, { "epoch": 2.674698795180723, "grad_norm": 1.505927324295044, "learning_rate": 2.873192273716369e-07, "loss": 0.153, "step": 222 }, { "epoch": 2.6867469879518073, "grad_norm": 1.2739795446395874, "learning_rate": 2.666181836966053e-07, "loss": 0.1038, "step": 223 }, { "epoch": 2.6987951807228914, "grad_norm": 1.3373569250106812, "learning_rate": 2.466706115187406e-07, "loss": 0.1208, "step": 224 }, { "epoch": 2.710843373493976, "grad_norm": 1.3513188362121582, "learning_rate": 2.274796861422246e-07, "loss": 0.1209, "step": 225 }, { "epoch": 2.7228915662650603, "grad_norm": 1.4020378589630127, "learning_rate": 2.090484624263167e-07, "loss": 0.1323, "step": 226 }, { "epoch": 2.734939759036145, "grad_norm": 1.4146372079849243, "learning_rate": 1.9137987429907635e-07, "loss": 0.1304, "step": 227 }, { "epoch": 2.746987951807229, "grad_norm": 1.3225347995758057, "learning_rate": 1.7447673429033361e-07, "loss": 0.1149, "step": 228 }, { "epoch": 2.7590361445783134, "grad_norm": 1.3890403509140015, "learning_rate": 1.583417330839798e-07, "loss": 0.1557, "step": 229 }, { "epoch": 2.7710843373493974, "grad_norm": 1.466339349746704, "learning_rate": 1.4297743908966212e-07, "loss": 0.1489, "step": 230 }, { "epoch": 2.783132530120482, "grad_norm": 1.2367849349975586, "learning_rate": 1.2838629803393343e-07, "loss": 0.0997, "step": 231 }, { "epoch": 2.7951807228915664, "grad_norm": 1.390717625617981, "learning_rate": 1.1457063257093892e-07, "loss": 0.1218, "step": 232 }, { "epoch": 2.807228915662651, "grad_norm": 1.2239187955856323, "learning_rate": 1.0153264191269052e-07, "loss": 0.1135, "step": 233 }, { "epoch": 2.819277108433735, "grad_norm": 1.0472311973571777, "learning_rate": 8.927440147898703e-08, "loss": 0.1065, "step": 234 }, { "epoch": 2.8313253012048194, "grad_norm": 1.2322300672531128, "learning_rate": 7.779786256704669e-08, "loss": 0.1016, "step": 235 }, { "epoch": 2.8433734939759034, "grad_norm": 1.3203635215759277, "learning_rate": 6.710485204089456e-08, "loss": 0.1239, "step": 236 }, { "epoch": 2.855421686746988, "grad_norm": 1.2621276378631592, "learning_rate": 5.7197072040557356e-08, "loss": 0.1358, "step": 237 }, { "epoch": 2.8674698795180724, "grad_norm": 1.3743759393692017, "learning_rate": 4.807609971111238e-08, "loss": 0.1337, "step": 238 }, { "epoch": 2.8795180722891565, "grad_norm": 1.0450865030288696, "learning_rate": 3.974338695163393e-08, "loss": 0.0945, "step": 239 }, { "epoch": 2.891566265060241, "grad_norm": 1.5270490646362305, "learning_rate": 3.220026018407541e-08, "loss": 0.0994, "step": 240 }, { "epoch": 2.9036144578313254, "grad_norm": 1.694990873336792, "learning_rate": 2.5447920142128712e-08, "loss": 0.1689, "step": 241 }, { "epoch": 2.9156626506024095, "grad_norm": 1.4968199729919434, "learning_rate": 1.9487441680084983e-08, "loss": 0.1219, "step": 242 }, { "epoch": 2.927710843373494, "grad_norm": 1.332356572151184, "learning_rate": 1.431977360173975e-08, "loss": 0.1137, "step": 243 }, { "epoch": 2.9397590361445785, "grad_norm": 1.4295134544372559, "learning_rate": 9.945738509358205e-09, "loss": 0.1498, "step": 244 }, { "epoch": 2.9518072289156625, "grad_norm": 1.1830252408981323, "learning_rate": 6.366032672731059e-09, "loss": 0.1002, "step": 245 }, { "epoch": 2.963855421686747, "grad_norm": 1.3982295989990234, "learning_rate": 3.5812259183426457e-09, "loss": 0.1247, "step": 246 }, { "epoch": 2.9759036144578315, "grad_norm": 1.467788577079773, "learning_rate": 1.591761538662362e-09, "loss": 0.1098, "step": 247 }, { "epoch": 2.9879518072289155, "grad_norm": 1.2710858583450317, "learning_rate": 3.9795622158111945e-10, "loss": 0.1335, "step": 248 }, { "epoch": 3.0, "grad_norm": 0.9258020520210266, "learning_rate": 0.0, "loss": 0.0612, "step": 249 }, { "epoch": 3.0, "step": 249, "total_flos": 3.4614492313052774e+17, "train_loss": 0.32225324711706266, "train_runtime": 433.6826, "train_samples_per_second": 4.545, "train_steps_per_second": 0.574 } ], "logging_steps": 1, "max_steps": 249, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4614492313052774e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }