{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 633, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004739336492890996, "grad_norm": 33.21675090695969, "learning_rate": 0.0, "loss": 1.6607, "step": 1 }, { "epoch": 0.009478672985781991, "grad_norm": 145.79448143612095, "learning_rate": 4.739336492890996e-08, "loss": 6.9341, "step": 2 }, { "epoch": 0.014218009478672985, "grad_norm": 76.12710929983453, "learning_rate": 9.478672985781992e-08, "loss": 4.0894, "step": 3 }, { "epoch": 0.018957345971563982, "grad_norm": 120.34557987524876, "learning_rate": 1.4218009478672986e-07, "loss": 4.8714, "step": 4 }, { "epoch": 0.023696682464454975, "grad_norm": 45.93915597012246, "learning_rate": 1.8957345971563984e-07, "loss": 3.0785, "step": 5 }, { "epoch": 0.02843601895734597, "grad_norm": 7.588158469972687, "learning_rate": 2.3696682464454978e-07, "loss": 1.1787, "step": 6 }, { "epoch": 0.03317535545023697, "grad_norm": 123.2126299473189, "learning_rate": 2.843601895734597e-07, "loss": 4.9773, "step": 7 }, { "epoch": 0.037914691943127965, "grad_norm": 34.85405544555481, "learning_rate": 3.317535545023697e-07, "loss": 1.9134, "step": 8 }, { "epoch": 0.04265402843601896, "grad_norm": 6.387158096213077, "learning_rate": 3.791469194312797e-07, "loss": 1.078, "step": 9 }, { "epoch": 0.04739336492890995, "grad_norm": 90.5056475587616, "learning_rate": 4.265402843601896e-07, "loss": 3.3833, "step": 10 }, { "epoch": 0.052132701421800945, "grad_norm": 19.847643429642027, "learning_rate": 4.7393364928909956e-07, "loss": 1.6906, "step": 11 }, { "epoch": 0.05687203791469194, "grad_norm": 69.85258553912271, "learning_rate": 5.213270142180095e-07, "loss": 3.443, "step": 12 }, { "epoch": 0.061611374407582936, "grad_norm": 66.07803912705128, "learning_rate": 5.687203791469194e-07, "loss": 2.998, "step": 13 }, { "epoch": 0.06635071090047394, "grad_norm": 60.825322694407426, "learning_rate": 6.161137440758294e-07, "loss": 2.9808, "step": 14 }, { "epoch": 0.07109004739336493, "grad_norm": 12.100377024176268, "learning_rate": 6.635071090047394e-07, "loss": 1.1544, "step": 15 }, { "epoch": 0.07582938388625593, "grad_norm": 67.6920658849621, "learning_rate": 7.109004739336493e-07, "loss": 2.9571, "step": 16 }, { "epoch": 0.08056872037914692, "grad_norm": 18.392508410355724, "learning_rate": 7.582938388625594e-07, "loss": 1.3217, "step": 17 }, { "epoch": 0.08530805687203792, "grad_norm": 70.96881552186444, "learning_rate": 8.056872037914692e-07, "loss": 3.4988, "step": 18 }, { "epoch": 0.09004739336492891, "grad_norm": 11.624035689102646, "learning_rate": 8.530805687203792e-07, "loss": 1.3954, "step": 19 }, { "epoch": 0.0947867298578199, "grad_norm": 63.05933859985993, "learning_rate": 9.004739336492892e-07, "loss": 2.6776, "step": 20 }, { "epoch": 0.0995260663507109, "grad_norm": 8.32831094452385, "learning_rate": 9.478672985781991e-07, "loss": 1.0306, "step": 21 }, { "epoch": 0.10426540284360189, "grad_norm": 45.10580181897829, "learning_rate": 9.95260663507109e-07, "loss": 2.3985, "step": 22 }, { "epoch": 0.10900473933649289, "grad_norm": 15.893169103733232, "learning_rate": 1.042654028436019e-06, "loss": 1.2971, "step": 23 }, { "epoch": 0.11374407582938388, "grad_norm": 29.8234296502016, "learning_rate": 1.090047393364929e-06, "loss": 1.919, "step": 24 }, { "epoch": 0.11848341232227488, "grad_norm": 82.09962631491388, "learning_rate": 1.1374407582938388e-06, "loss": 3.6626, "step": 25 }, { "epoch": 0.12322274881516587, "grad_norm": 40.04271756686343, "learning_rate": 1.184834123222749e-06, "loss": 2.308, "step": 26 }, { "epoch": 0.12796208530805686, "grad_norm": 27.449517033776978, "learning_rate": 1.2322274881516587e-06, "loss": 1.8155, "step": 27 }, { "epoch": 0.13270142180094788, "grad_norm": 10.639809456341752, "learning_rate": 1.2796208530805687e-06, "loss": 1.0393, "step": 28 }, { "epoch": 0.13744075829383887, "grad_norm": 54.85981944259095, "learning_rate": 1.3270142180094788e-06, "loss": 2.7517, "step": 29 }, { "epoch": 0.14218009478672985, "grad_norm": 10.614676945303964, "learning_rate": 1.3744075829383887e-06, "loss": 1.1101, "step": 30 }, { "epoch": 0.14691943127962084, "grad_norm": 24.53181043088714, "learning_rate": 1.4218009478672987e-06, "loss": 1.7597, "step": 31 }, { "epoch": 0.15165876777251186, "grad_norm": 21.276538744853784, "learning_rate": 1.4691943127962086e-06, "loss": 1.8453, "step": 32 }, { "epoch": 0.15639810426540285, "grad_norm": 37.367412651323995, "learning_rate": 1.5165876777251187e-06, "loss": 1.9205, "step": 33 }, { "epoch": 0.16113744075829384, "grad_norm": 4.908149316461357, "learning_rate": 1.5639810426540287e-06, "loss": 1.0521, "step": 34 }, { "epoch": 0.16587677725118483, "grad_norm": 18.58884607239678, "learning_rate": 1.6113744075829384e-06, "loss": 1.2307, "step": 35 }, { "epoch": 0.17061611374407584, "grad_norm": 4.500814297819521, "learning_rate": 1.6587677725118483e-06, "loss": 0.9168, "step": 36 }, { "epoch": 0.17535545023696683, "grad_norm": 21.66083636003178, "learning_rate": 1.7061611374407585e-06, "loss": 1.5528, "step": 37 }, { "epoch": 0.18009478672985782, "grad_norm": 18.173576160275513, "learning_rate": 1.7535545023696684e-06, "loss": 1.4227, "step": 38 }, { "epoch": 0.1848341232227488, "grad_norm": 3.6059374841290643, "learning_rate": 1.8009478672985784e-06, "loss": 0.9369, "step": 39 }, { "epoch": 0.1895734597156398, "grad_norm": 13.61011186508229, "learning_rate": 1.8483412322274883e-06, "loss": 1.3158, "step": 40 }, { "epoch": 0.1943127962085308, "grad_norm": 13.70693894260049, "learning_rate": 1.8957345971563982e-06, "loss": 1.3908, "step": 41 }, { "epoch": 0.1990521327014218, "grad_norm": 6.763803508410375, "learning_rate": 1.943127962085308e-06, "loss": 1.1349, "step": 42 }, { "epoch": 0.2037914691943128, "grad_norm": 5.206847724219033, "learning_rate": 1.990521327014218e-06, "loss": 0.8841, "step": 43 }, { "epoch": 0.20853080568720378, "grad_norm": 4.113335406268759, "learning_rate": 2.037914691943128e-06, "loss": 0.8342, "step": 44 }, { "epoch": 0.2132701421800948, "grad_norm": 13.973887785697313, "learning_rate": 2.085308056872038e-06, "loss": 1.5232, "step": 45 }, { "epoch": 0.21800947867298578, "grad_norm": 24.178882765868533, "learning_rate": 2.1327014218009483e-06, "loss": 1.7558, "step": 46 }, { "epoch": 0.22274881516587677, "grad_norm": 15.258943952112597, "learning_rate": 2.180094786729858e-06, "loss": 1.4047, "step": 47 }, { "epoch": 0.22748815165876776, "grad_norm": 6.028846809467111, "learning_rate": 2.2274881516587678e-06, "loss": 0.9313, "step": 48 }, { "epoch": 0.23222748815165878, "grad_norm": 13.30832695052435, "learning_rate": 2.2748815165876777e-06, "loss": 1.3048, "step": 49 }, { "epoch": 0.23696682464454977, "grad_norm": 32.03283861833234, "learning_rate": 2.322274881516588e-06, "loss": 1.9909, "step": 50 }, { "epoch": 0.23696682464454977, "eval_loss": 1.5357390642166138, "eval_runtime": 7.7918, "eval_samples_per_second": 24.128, "eval_steps_per_second": 6.032, "step": 50 }, { "epoch": 0.24170616113744076, "grad_norm": 3.288468627282892, "learning_rate": 2.369668246445498e-06, "loss": 0.7391, "step": 51 }, { "epoch": 0.24644549763033174, "grad_norm": 43.315163503233876, "learning_rate": 2.417061611374408e-06, "loss": 2.074, "step": 52 }, { "epoch": 0.25118483412322273, "grad_norm": 18.160736628539848, "learning_rate": 2.4644549763033174e-06, "loss": 1.2847, "step": 53 }, { "epoch": 0.2559241706161137, "grad_norm": 12.634993975676384, "learning_rate": 2.5118483412322274e-06, "loss": 1.18, "step": 54 }, { "epoch": 0.26066350710900477, "grad_norm": 15.065309167670087, "learning_rate": 2.5592417061611373e-06, "loss": 1.3315, "step": 55 }, { "epoch": 0.26540284360189575, "grad_norm": 3.1570174369908455, "learning_rate": 2.606635071090048e-06, "loss": 0.7762, "step": 56 }, { "epoch": 0.27014218009478674, "grad_norm": 29.459791307849596, "learning_rate": 2.6540284360189576e-06, "loss": 1.6226, "step": 57 }, { "epoch": 0.27488151658767773, "grad_norm": 21.723079688552776, "learning_rate": 2.7014218009478675e-06, "loss": 1.3176, "step": 58 }, { "epoch": 0.2796208530805687, "grad_norm": 5.836364892449694, "learning_rate": 2.7488151658767775e-06, "loss": 0.7024, "step": 59 }, { "epoch": 0.2843601895734597, "grad_norm": 18.693320830865055, "learning_rate": 2.7962085308056874e-06, "loss": 1.2722, "step": 60 }, { "epoch": 0.2890995260663507, "grad_norm": 15.436571327706798, "learning_rate": 2.8436018957345973e-06, "loss": 1.3138, "step": 61 }, { "epoch": 0.2938388625592417, "grad_norm": 5.930903722307386, "learning_rate": 2.8909952606635073e-06, "loss": 0.8624, "step": 62 }, { "epoch": 0.2985781990521327, "grad_norm": 28.660964672923605, "learning_rate": 2.938388625592417e-06, "loss": 1.4304, "step": 63 }, { "epoch": 0.3033175355450237, "grad_norm": 7.170512177516791, "learning_rate": 2.985781990521327e-06, "loss": 0.7581, "step": 64 }, { "epoch": 0.3080568720379147, "grad_norm": 24.92537688193459, "learning_rate": 3.0331753554502375e-06, "loss": 1.5542, "step": 65 }, { "epoch": 0.3127962085308057, "grad_norm": 6.670020974801537, "learning_rate": 3.0805687203791474e-06, "loss": 0.7192, "step": 66 }, { "epoch": 0.3175355450236967, "grad_norm": 18.058205723760572, "learning_rate": 3.1279620853080574e-06, "loss": 1.2775, "step": 67 }, { "epoch": 0.3222748815165877, "grad_norm": 2.7688353603669373, "learning_rate": 3.1753554502369673e-06, "loss": 0.7382, "step": 68 }, { "epoch": 0.32701421800947866, "grad_norm": 25.79053830589144, "learning_rate": 3.222748815165877e-06, "loss": 1.4285, "step": 69 }, { "epoch": 0.33175355450236965, "grad_norm": 27.17162166449944, "learning_rate": 3.2701421800947867e-06, "loss": 1.3835, "step": 70 }, { "epoch": 0.33649289099526064, "grad_norm": 2.7709621071894515, "learning_rate": 3.3175355450236967e-06, "loss": 0.8456, "step": 71 }, { "epoch": 0.3412322274881517, "grad_norm": 16.104863041856177, "learning_rate": 3.3649289099526066e-06, "loss": 1.0264, "step": 72 }, { "epoch": 0.3459715639810427, "grad_norm": 7.87118824845877, "learning_rate": 3.412322274881517e-06, "loss": 0.9177, "step": 73 }, { "epoch": 0.35071090047393366, "grad_norm": 14.424951696237573, "learning_rate": 3.459715639810427e-06, "loss": 1.3698, "step": 74 }, { "epoch": 0.35545023696682465, "grad_norm": 26.356301557715284, "learning_rate": 3.507109004739337e-06, "loss": 1.4675, "step": 75 }, { "epoch": 0.36018957345971564, "grad_norm": 30.488132509935415, "learning_rate": 3.5545023696682468e-06, "loss": 1.4732, "step": 76 }, { "epoch": 0.36492890995260663, "grad_norm": 7.144103257314455, "learning_rate": 3.6018957345971567e-06, "loss": 0.9201, "step": 77 }, { "epoch": 0.3696682464454976, "grad_norm": 12.335182641269517, "learning_rate": 3.6492890995260666e-06, "loss": 1.0759, "step": 78 }, { "epoch": 0.3744075829383886, "grad_norm": 9.761423382810872, "learning_rate": 3.6966824644549766e-06, "loss": 1.1193, "step": 79 }, { "epoch": 0.3791469194312796, "grad_norm": 6.228851618657622, "learning_rate": 3.7440758293838865e-06, "loss": 0.8188, "step": 80 }, { "epoch": 0.38388625592417064, "grad_norm": 7.476211349226989, "learning_rate": 3.7914691943127964e-06, "loss": 1.0185, "step": 81 }, { "epoch": 0.3886255924170616, "grad_norm": 6.008754086378737, "learning_rate": 3.838862559241707e-06, "loss": 0.9504, "step": 82 }, { "epoch": 0.3933649289099526, "grad_norm": 39.784238511336135, "learning_rate": 3.886255924170616e-06, "loss": 1.508, "step": 83 }, { "epoch": 0.3981042654028436, "grad_norm": 16.093115365998983, "learning_rate": 3.933649289099527e-06, "loss": 0.9674, "step": 84 }, { "epoch": 0.4028436018957346, "grad_norm": 2.831228098225237, "learning_rate": 3.981042654028436e-06, "loss": 0.8576, "step": 85 }, { "epoch": 0.4075829383886256, "grad_norm": 2.6654563752530755, "learning_rate": 4.0284360189573465e-06, "loss": 0.881, "step": 86 }, { "epoch": 0.41232227488151657, "grad_norm": 24.963850396340955, "learning_rate": 4.075829383886256e-06, "loss": 1.381, "step": 87 }, { "epoch": 0.41706161137440756, "grad_norm": 18.32888630488476, "learning_rate": 4.123222748815166e-06, "loss": 0.8679, "step": 88 }, { "epoch": 0.4218009478672986, "grad_norm": 28.159604020225608, "learning_rate": 4.170616113744076e-06, "loss": 1.4164, "step": 89 }, { "epoch": 0.4265402843601896, "grad_norm": 18.38643187927111, "learning_rate": 4.218009478672986e-06, "loss": 0.8316, "step": 90 }, { "epoch": 0.4312796208530806, "grad_norm": 18.0776549889313, "learning_rate": 4.265402843601897e-06, "loss": 1.1651, "step": 91 }, { "epoch": 0.43601895734597157, "grad_norm": 8.31511353691506, "learning_rate": 4.312796208530806e-06, "loss": 1.027, "step": 92 }, { "epoch": 0.44075829383886256, "grad_norm": 2.984359322587433, "learning_rate": 4.360189573459716e-06, "loss": 0.7204, "step": 93 }, { "epoch": 0.44549763033175355, "grad_norm": 6.1438951745676516, "learning_rate": 4.407582938388626e-06, "loss": 0.6249, "step": 94 }, { "epoch": 0.45023696682464454, "grad_norm": 6.253599995679127, "learning_rate": 4.4549763033175355e-06, "loss": 0.8433, "step": 95 }, { "epoch": 0.4549763033175355, "grad_norm": 16.121167837366702, "learning_rate": 4.502369668246446e-06, "loss": 1.2633, "step": 96 }, { "epoch": 0.4597156398104265, "grad_norm": 23.92401887282444, "learning_rate": 4.549763033175355e-06, "loss": 1.1481, "step": 97 }, { "epoch": 0.46445497630331756, "grad_norm": 7.54919968485265, "learning_rate": 4.597156398104266e-06, "loss": 0.8537, "step": 98 }, { "epoch": 0.46919431279620855, "grad_norm": 16.4663797881457, "learning_rate": 4.644549763033176e-06, "loss": 1.0375, "step": 99 }, { "epoch": 0.47393364928909953, "grad_norm": 3.2371645854636832, "learning_rate": 4.691943127962086e-06, "loss": 0.6856, "step": 100 }, { "epoch": 0.47393364928909953, "eval_loss": 1.0985443592071533, "eval_runtime": 7.9946, "eval_samples_per_second": 23.516, "eval_steps_per_second": 5.879, "step": 100 }, { "epoch": 0.4786729857819905, "grad_norm": 3.8478671547588474, "learning_rate": 4.739336492890996e-06, "loss": 0.7252, "step": 101 }, { "epoch": 0.4834123222748815, "grad_norm": 17.790211396697263, "learning_rate": 4.7867298578199055e-06, "loss": 1.2033, "step": 102 }, { "epoch": 0.4881516587677725, "grad_norm": 6.598774872646845, "learning_rate": 4.834123222748816e-06, "loss": 0.7996, "step": 103 }, { "epoch": 0.4928909952606635, "grad_norm": 17.90748957259168, "learning_rate": 4.881516587677725e-06, "loss": 0.6942, "step": 104 }, { "epoch": 0.4976303317535545, "grad_norm": 6.276702585855472, "learning_rate": 4.928909952606635e-06, "loss": 0.881, "step": 105 }, { "epoch": 0.5023696682464455, "grad_norm": 2.641182574106109, "learning_rate": 4.976303317535545e-06, "loss": 0.8275, "step": 106 }, { "epoch": 0.5071090047393365, "grad_norm": 3.1740261509578676, "learning_rate": 5.023696682464455e-06, "loss": 0.64, "step": 107 }, { "epoch": 0.5118483412322274, "grad_norm": 15.982354340368344, "learning_rate": 5.071090047393366e-06, "loss": 0.8089, "step": 108 }, { "epoch": 0.5165876777251185, "grad_norm": 13.62346496564989, "learning_rate": 5.118483412322275e-06, "loss": 1.0223, "step": 109 }, { "epoch": 0.5213270142180095, "grad_norm": 18.85115778380913, "learning_rate": 5.165876777251185e-06, "loss": 0.9443, "step": 110 }, { "epoch": 0.5260663507109005, "grad_norm": 17.357644955205703, "learning_rate": 5.213270142180096e-06, "loss": 0.7135, "step": 111 }, { "epoch": 0.5308056872037915, "grad_norm": 18.435671376617172, "learning_rate": 5.260663507109005e-06, "loss": 0.9026, "step": 112 }, { "epoch": 0.5355450236966824, "grad_norm": 16.296720003324083, "learning_rate": 5.308056872037915e-06, "loss": 0.813, "step": 113 }, { "epoch": 0.5402843601895735, "grad_norm": 17.387322006549645, "learning_rate": 5.355450236966825e-06, "loss": 0.8703, "step": 114 }, { "epoch": 0.5450236966824644, "grad_norm": 17.84888433467405, "learning_rate": 5.402843601895735e-06, "loss": 0.9083, "step": 115 }, { "epoch": 0.5497630331753555, "grad_norm": 15.567515178037798, "learning_rate": 5.4502369668246446e-06, "loss": 0.8269, "step": 116 }, { "epoch": 0.5545023696682464, "grad_norm": 32.4120434440015, "learning_rate": 5.497630331753555e-06, "loss": 1.0408, "step": 117 }, { "epoch": 0.5592417061611374, "grad_norm": 34.75683609822539, "learning_rate": 5.5450236966824644e-06, "loss": 1.169, "step": 118 }, { "epoch": 0.5639810426540285, "grad_norm": 16.267510034467378, "learning_rate": 5.592417061611375e-06, "loss": 0.5434, "step": 119 }, { "epoch": 0.5687203791469194, "grad_norm": 10.050816033748056, "learning_rate": 5.639810426540285e-06, "loss": 0.9619, "step": 120 }, { "epoch": 0.5734597156398105, "grad_norm": 18.015087265001927, "learning_rate": 5.687203791469195e-06, "loss": 0.9081, "step": 121 }, { "epoch": 0.5781990521327014, "grad_norm": 3.0788046399571103, "learning_rate": 5.734597156398105e-06, "loss": 0.8138, "step": 122 }, { "epoch": 0.5829383886255924, "grad_norm": 40.81995319269455, "learning_rate": 5.7819905213270145e-06, "loss": 1.0204, "step": 123 }, { "epoch": 0.5876777251184834, "grad_norm": 3.382154801748216, "learning_rate": 5.829383886255925e-06, "loss": 0.6919, "step": 124 }, { "epoch": 0.5924170616113744, "grad_norm": 3.7193644674392594, "learning_rate": 5.876777251184834e-06, "loss": 0.7168, "step": 125 }, { "epoch": 0.5971563981042654, "grad_norm": 2.952313965201417, "learning_rate": 5.924170616113745e-06, "loss": 0.771, "step": 126 }, { "epoch": 0.6018957345971564, "grad_norm": 13.003492397640734, "learning_rate": 5.971563981042654e-06, "loss": 0.8174, "step": 127 }, { "epoch": 0.6066350710900474, "grad_norm": 14.589344292080268, "learning_rate": 6.018957345971565e-06, "loss": 0.7877, "step": 128 }, { "epoch": 0.6113744075829384, "grad_norm": 11.638671492972737, "learning_rate": 6.066350710900475e-06, "loss": 0.7807, "step": 129 }, { "epoch": 0.6161137440758294, "grad_norm": 5.48286947202299, "learning_rate": 6.1137440758293845e-06, "loss": 0.8134, "step": 130 }, { "epoch": 0.6208530805687204, "grad_norm": 16.323797258539084, "learning_rate": 6.161137440758295e-06, "loss": 0.5628, "step": 131 }, { "epoch": 0.6255924170616114, "grad_norm": 14.23866523935885, "learning_rate": 6.208530805687204e-06, "loss": 1.0402, "step": 132 }, { "epoch": 0.6303317535545023, "grad_norm": 17.516776965037845, "learning_rate": 6.255924170616115e-06, "loss": 0.5668, "step": 133 }, { "epoch": 0.6350710900473934, "grad_norm": 3.203974060674095, "learning_rate": 6.303317535545023e-06, "loss": 0.7752, "step": 134 }, { "epoch": 0.6398104265402843, "grad_norm": 15.925776501328883, "learning_rate": 6.350710900473935e-06, "loss": 0.4179, "step": 135 }, { "epoch": 0.6445497630331753, "grad_norm": 13.118953573289614, "learning_rate": 6.398104265402843e-06, "loss": 0.818, "step": 136 }, { "epoch": 0.6492890995260664, "grad_norm": 17.67906655380422, "learning_rate": 6.445497630331754e-06, "loss": 1.0207, "step": 137 }, { "epoch": 0.6540284360189573, "grad_norm": 4.771268317889247, "learning_rate": 6.492890995260665e-06, "loss": 0.7833, "step": 138 }, { "epoch": 0.6587677725118484, "grad_norm": 13.46170978768313, "learning_rate": 6.5402843601895735e-06, "loss": 0.8421, "step": 139 }, { "epoch": 0.6635071090047393, "grad_norm": 13.374555494546588, "learning_rate": 6.587677725118484e-06, "loss": 0.7757, "step": 140 }, { "epoch": 0.6682464454976303, "grad_norm": 5.6967734677721635, "learning_rate": 6.635071090047393e-06, "loss": 0.7167, "step": 141 }, { "epoch": 0.6729857819905213, "grad_norm": 19.051040456172952, "learning_rate": 6.682464454976304e-06, "loss": 0.6418, "step": 142 }, { "epoch": 0.6777251184834123, "grad_norm": 4.437733313814945, "learning_rate": 6.729857819905213e-06, "loss": 0.8421, "step": 143 }, { "epoch": 0.6824644549763034, "grad_norm": 3.3824942666879303, "learning_rate": 6.777251184834124e-06, "loss": 0.6347, "step": 144 }, { "epoch": 0.6872037914691943, "grad_norm": 16.134029693349987, "learning_rate": 6.824644549763034e-06, "loss": 0.7729, "step": 145 }, { "epoch": 0.6919431279620853, "grad_norm": 9.522896679230966, "learning_rate": 6.8720379146919435e-06, "loss": 0.5691, "step": 146 }, { "epoch": 0.6966824644549763, "grad_norm": 11.26439032421294, "learning_rate": 6.919431279620854e-06, "loss": 0.6833, "step": 147 }, { "epoch": 0.7014218009478673, "grad_norm": 2.8946533391937144, "learning_rate": 6.966824644549763e-06, "loss": 0.8339, "step": 148 }, { "epoch": 0.7061611374407583, "grad_norm": 12.567294143726862, "learning_rate": 7.014218009478674e-06, "loss": 0.7937, "step": 149 }, { "epoch": 0.7109004739336493, "grad_norm": 4.118378272456221, "learning_rate": 7.061611374407583e-06, "loss": 0.9209, "step": 150 }, { "epoch": 0.7109004739336493, "eval_loss": 0.5114782452583313, "eval_runtime": 7.6835, "eval_samples_per_second": 24.468, "eval_steps_per_second": 6.117, "step": 150 }, { "epoch": 0.7156398104265402, "grad_norm": 13.081643447337786, "learning_rate": 7.1090047393364935e-06, "loss": 0.5085, "step": 151 }, { "epoch": 0.7203791469194313, "grad_norm": 7.131592313344539, "learning_rate": 7.156398104265403e-06, "loss": 0.5927, "step": 152 }, { "epoch": 0.7251184834123223, "grad_norm": 11.868469385411386, "learning_rate": 7.203791469194313e-06, "loss": 0.4432, "step": 153 }, { "epoch": 0.7298578199052133, "grad_norm": 13.89380031996673, "learning_rate": 7.251184834123224e-06, "loss": 0.526, "step": 154 }, { "epoch": 0.7345971563981043, "grad_norm": 3.0245649047418084, "learning_rate": 7.298578199052133e-06, "loss": 0.6342, "step": 155 }, { "epoch": 0.7393364928909952, "grad_norm": 8.99445909152358, "learning_rate": 7.345971563981044e-06, "loss": 0.48, "step": 156 }, { "epoch": 0.7440758293838863, "grad_norm": 4.835328254993896, "learning_rate": 7.393364928909953e-06, "loss": 0.7006, "step": 157 }, { "epoch": 0.7488151658767772, "grad_norm": 2.56401277703409, "learning_rate": 7.4407582938388635e-06, "loss": 0.565, "step": 158 }, { "epoch": 0.7535545023696683, "grad_norm": 8.408290523647263, "learning_rate": 7.488151658767773e-06, "loss": 0.5788, "step": 159 }, { "epoch": 0.7582938388625592, "grad_norm": 2.448536630140397, "learning_rate": 7.535545023696683e-06, "loss": 0.7817, "step": 160 }, { "epoch": 0.7630331753554502, "grad_norm": 3.2418181238906127, "learning_rate": 7.582938388625593e-06, "loss": 0.2056, "step": 161 }, { "epoch": 0.7677725118483413, "grad_norm": 2.216383090131846, "learning_rate": 7.630331753554503e-06, "loss": 0.43, "step": 162 }, { "epoch": 0.7725118483412322, "grad_norm": 2.7269338690903986, "learning_rate": 7.677725118483414e-06, "loss": 0.5986, "step": 163 }, { "epoch": 0.7772511848341233, "grad_norm": 3.703015103620324, "learning_rate": 7.725118483412322e-06, "loss": 0.2139, "step": 164 }, { "epoch": 0.7819905213270142, "grad_norm": 11.509890344708763, "learning_rate": 7.772511848341233e-06, "loss": 0.3832, "step": 165 }, { "epoch": 0.7867298578199052, "grad_norm": 13.221389377721215, "learning_rate": 7.819905213270143e-06, "loss": 0.6961, "step": 166 }, { "epoch": 0.7914691943127962, "grad_norm": 8.601955890360907, "learning_rate": 7.867298578199053e-06, "loss": 0.7069, "step": 167 }, { "epoch": 0.7962085308056872, "grad_norm": 6.464688976078771, "learning_rate": 7.914691943127962e-06, "loss": 0.2874, "step": 168 }, { "epoch": 0.8009478672985783, "grad_norm": 4.818205368328611, "learning_rate": 7.962085308056872e-06, "loss": 0.2536, "step": 169 }, { "epoch": 0.8056872037914692, "grad_norm": 6.818061320004181, "learning_rate": 8.009478672985783e-06, "loss": 0.8167, "step": 170 }, { "epoch": 0.8104265402843602, "grad_norm": 6.814053715737355, "learning_rate": 8.056872037914693e-06, "loss": 0.4523, "step": 171 }, { "epoch": 0.8151658767772512, "grad_norm": 3.7622792940282554, "learning_rate": 8.104265402843603e-06, "loss": 0.437, "step": 172 }, { "epoch": 0.8199052132701422, "grad_norm": 17.887658231522614, "learning_rate": 8.151658767772512e-06, "loss": 0.5533, "step": 173 }, { "epoch": 0.8246445497630331, "grad_norm": 4.154002223531854, "learning_rate": 8.199052132701422e-06, "loss": 0.767, "step": 174 }, { "epoch": 0.8293838862559242, "grad_norm": 2.6690075960806445, "learning_rate": 8.246445497630333e-06, "loss": 0.4886, "step": 175 }, { "epoch": 0.8341232227488151, "grad_norm": 11.187655316079512, "learning_rate": 8.293838862559243e-06, "loss": 0.5197, "step": 176 }, { "epoch": 0.8388625592417062, "grad_norm": 2.3362587429831687, "learning_rate": 8.341232227488152e-06, "loss": 0.8314, "step": 177 }, { "epoch": 0.8436018957345972, "grad_norm": 2.8252680205720817, "learning_rate": 8.388625592417062e-06, "loss": 0.4624, "step": 178 }, { "epoch": 0.8483412322274881, "grad_norm": 4.528305551354439, "learning_rate": 8.436018957345973e-06, "loss": 0.2562, "step": 179 }, { "epoch": 0.8530805687203792, "grad_norm": 4.546641068403436, "learning_rate": 8.483412322274883e-06, "loss": 0.2464, "step": 180 }, { "epoch": 0.8578199052132701, "grad_norm": 3.6038044992663334, "learning_rate": 8.530805687203793e-06, "loss": 0.4069, "step": 181 }, { "epoch": 0.8625592417061612, "grad_norm": 1.9811719048702106, "learning_rate": 8.578199052132702e-06, "loss": 0.5778, "step": 182 }, { "epoch": 0.8672985781990521, "grad_norm": 4.384545934120455, "learning_rate": 8.625592417061612e-06, "loss": 0.7424, "step": 183 }, { "epoch": 0.8720379146919431, "grad_norm": 2.224370506740259, "learning_rate": 8.672985781990521e-06, "loss": 0.5563, "step": 184 }, { "epoch": 0.8767772511848341, "grad_norm": 2.2438244553088804, "learning_rate": 8.720379146919431e-06, "loss": 0.5606, "step": 185 }, { "epoch": 0.8815165876777251, "grad_norm": 2.4153286166112657, "learning_rate": 8.767772511848342e-06, "loss": 0.7797, "step": 186 }, { "epoch": 0.8862559241706162, "grad_norm": 4.313982187372051, "learning_rate": 8.815165876777252e-06, "loss": 0.83, "step": 187 }, { "epoch": 0.8909952606635071, "grad_norm": 5.2744040798784635, "learning_rate": 8.862559241706162e-06, "loss": 0.5133, "step": 188 }, { "epoch": 0.8957345971563981, "grad_norm": 4.35905212043424, "learning_rate": 8.909952606635071e-06, "loss": 0.4265, "step": 189 }, { "epoch": 0.9004739336492891, "grad_norm": 6.916403849349734, "learning_rate": 8.957345971563981e-06, "loss": 0.5007, "step": 190 }, { "epoch": 0.9052132701421801, "grad_norm": 4.0093681115073325, "learning_rate": 9.004739336492892e-06, "loss": 0.9012, "step": 191 }, { "epoch": 0.909952606635071, "grad_norm": 3.3859156359807496, "learning_rate": 9.052132701421802e-06, "loss": 0.4262, "step": 192 }, { "epoch": 0.9146919431279621, "grad_norm": 4.488130094949602, "learning_rate": 9.09952606635071e-06, "loss": 0.5656, "step": 193 }, { "epoch": 0.919431279620853, "grad_norm": 7.17629211137066, "learning_rate": 9.146919431279621e-06, "loss": 0.4695, "step": 194 }, { "epoch": 0.9241706161137441, "grad_norm": 2.7309294256882928, "learning_rate": 9.194312796208532e-06, "loss": 0.8346, "step": 195 }, { "epoch": 0.9289099526066351, "grad_norm": 5.085916739731668, "learning_rate": 9.241706161137442e-06, "loss": 0.5857, "step": 196 }, { "epoch": 0.933649289099526, "grad_norm": 23.811646540122965, "learning_rate": 9.289099526066352e-06, "loss": 0.7435, "step": 197 }, { "epoch": 0.9383886255924171, "grad_norm": 2.4009497142615572, "learning_rate": 9.336492890995261e-06, "loss": 0.5708, "step": 198 }, { "epoch": 0.943127962085308, "grad_norm": 6.581250829580497, "learning_rate": 9.383886255924171e-06, "loss": 0.4402, "step": 199 }, { "epoch": 0.9478672985781991, "grad_norm": 2.4308077426776142, "learning_rate": 9.431279620853082e-06, "loss": 0.4197, "step": 200 }, { "epoch": 0.9478672985781991, "eval_loss": 0.4012674391269684, "eval_runtime": 7.7289, "eval_samples_per_second": 24.324, "eval_steps_per_second": 6.081, "step": 200 }, { "epoch": 0.95260663507109, "grad_norm": 3.5198078059324027, "learning_rate": 9.478672985781992e-06, "loss": 0.5321, "step": 201 }, { "epoch": 0.957345971563981, "grad_norm": 6.593784858432653, "learning_rate": 9.5260663507109e-06, "loss": 0.5953, "step": 202 }, { "epoch": 0.9620853080568721, "grad_norm": 4.212951711248403, "learning_rate": 9.573459715639811e-06, "loss": 0.6857, "step": 203 }, { "epoch": 0.966824644549763, "grad_norm": 4.910386484070207, "learning_rate": 9.620853080568721e-06, "loss": 0.6271, "step": 204 }, { "epoch": 0.9715639810426541, "grad_norm": 6.155738225633911, "learning_rate": 9.668246445497632e-06, "loss": 0.601, "step": 205 }, { "epoch": 0.976303317535545, "grad_norm": 3.268371754216749, "learning_rate": 9.715639810426542e-06, "loss": 0.5909, "step": 206 }, { "epoch": 0.981042654028436, "grad_norm": 2.2280729563438784, "learning_rate": 9.76303317535545e-06, "loss": 0.4416, "step": 207 }, { "epoch": 0.985781990521327, "grad_norm": 10.634221783829398, "learning_rate": 9.810426540284361e-06, "loss": 0.446, "step": 208 }, { "epoch": 0.990521327014218, "grad_norm": 1.9752733492895744, "learning_rate": 9.85781990521327e-06, "loss": 0.5698, "step": 209 }, { "epoch": 0.995260663507109, "grad_norm": 4.058938063363919, "learning_rate": 9.905213270142182e-06, "loss": 0.8303, "step": 210 }, { "epoch": 1.0, "grad_norm": 2.3966848282479023, "learning_rate": 9.95260663507109e-06, "loss": 0.582, "step": 211 }, { "epoch": 1.004739336492891, "grad_norm": 3.218631141884875, "learning_rate": 1e-05, "loss": 0.2923, "step": 212 }, { "epoch": 1.009478672985782, "grad_norm": 2.5824515879316516, "learning_rate": 9.999993157895144e-06, "loss": 0.4631, "step": 213 }, { "epoch": 1.014218009478673, "grad_norm": 3.6831706895158742, "learning_rate": 9.9999726315993e-06, "loss": 0.1689, "step": 214 }, { "epoch": 1.018957345971564, "grad_norm": 3.3891043448121563, "learning_rate": 9.999938421168647e-06, "loss": 0.5278, "step": 215 }, { "epoch": 1.0236966824644549, "grad_norm": 2.356450269389873, "learning_rate": 9.999890526696813e-06, "loss": 0.4907, "step": 216 }, { "epoch": 1.028436018957346, "grad_norm": 2.693113370180676, "learning_rate": 9.999828948314876e-06, "loss": 0.6182, "step": 217 }, { "epoch": 1.033175355450237, "grad_norm": 10.485075200420136, "learning_rate": 9.999753686191369e-06, "loss": 0.1554, "step": 218 }, { "epoch": 1.037914691943128, "grad_norm": 13.319934547783834, "learning_rate": 9.99966474053227e-06, "loss": 0.4698, "step": 219 }, { "epoch": 1.042654028436019, "grad_norm": 2.8867025126626813, "learning_rate": 9.999562111581011e-06, "loss": 0.3821, "step": 220 }, { "epoch": 1.04739336492891, "grad_norm": 10.927946668852911, "learning_rate": 9.99944579961847e-06, "loss": 0.7159, "step": 221 }, { "epoch": 1.052132701421801, "grad_norm": 4.262645462720551, "learning_rate": 9.999315804962974e-06, "loss": 0.7124, "step": 222 }, { "epoch": 1.0568720379146919, "grad_norm": 26.424366500171537, "learning_rate": 9.999172127970301e-06, "loss": 0.5217, "step": 223 }, { "epoch": 1.061611374407583, "grad_norm": 3.1119784365221377, "learning_rate": 9.99901476903367e-06, "loss": 0.5714, "step": 224 }, { "epoch": 1.066350710900474, "grad_norm": 5.335575516099637, "learning_rate": 9.998843728583747e-06, "loss": 0.6435, "step": 225 }, { "epoch": 1.0710900473933649, "grad_norm": 2.6631526273812023, "learning_rate": 9.998659007088642e-06, "loss": 0.7229, "step": 226 }, { "epoch": 1.0758293838862558, "grad_norm": 3.82136179301926, "learning_rate": 9.998460605053911e-06, "loss": 0.7317, "step": 227 }, { "epoch": 1.080568720379147, "grad_norm": 4.481488279741978, "learning_rate": 9.998248523022548e-06, "loss": 0.7125, "step": 228 }, { "epoch": 1.085308056872038, "grad_norm": 38.77713323075712, "learning_rate": 9.998022761574989e-06, "loss": 0.4422, "step": 229 }, { "epoch": 1.0900473933649288, "grad_norm": 14.05824520952655, "learning_rate": 9.997783321329104e-06, "loss": 0.3723, "step": 230 }, { "epoch": 1.09478672985782, "grad_norm": 5.808006574074876, "learning_rate": 9.997530202940206e-06, "loss": 0.503, "step": 231 }, { "epoch": 1.099526066350711, "grad_norm": 3.7946635632876635, "learning_rate": 9.997263407101038e-06, "loss": 0.4076, "step": 232 }, { "epoch": 1.1042654028436019, "grad_norm": 3.804597829921628, "learning_rate": 9.996982934541781e-06, "loss": 0.6137, "step": 233 }, { "epoch": 1.1090047393364928, "grad_norm": 3.086388488116426, "learning_rate": 9.996688786030042e-06, "loss": 0.523, "step": 234 }, { "epoch": 1.113744075829384, "grad_norm": 2.593227456525801, "learning_rate": 9.996380962370859e-06, "loss": 0.7126, "step": 235 }, { "epoch": 1.1184834123222749, "grad_norm": 3.0824248661799794, "learning_rate": 9.9960594644067e-06, "loss": 0.5933, "step": 236 }, { "epoch": 1.1232227488151658, "grad_norm": 2.401542304407477, "learning_rate": 9.995724293017449e-06, "loss": 0.5244, "step": 237 }, { "epoch": 1.1279620853080567, "grad_norm": 17.032209335154683, "learning_rate": 9.995375449120419e-06, "loss": 0.3041, "step": 238 }, { "epoch": 1.132701421800948, "grad_norm": 14.316641226011127, "learning_rate": 9.995012933670341e-06, "loss": 0.3489, "step": 239 }, { "epoch": 1.1374407582938388, "grad_norm": 2.4244669733868593, "learning_rate": 9.994636747659363e-06, "loss": 0.5447, "step": 240 }, { "epoch": 1.1421800947867298, "grad_norm": 2.8284944731557715, "learning_rate": 9.994246892117046e-06, "loss": 0.359, "step": 241 }, { "epoch": 1.146919431279621, "grad_norm": 4.092835540014769, "learning_rate": 9.993843368110363e-06, "loss": 0.5189, "step": 242 }, { "epoch": 1.1516587677725119, "grad_norm": 7.344430749456483, "learning_rate": 9.993426176743695e-06, "loss": 0.6276, "step": 243 }, { "epoch": 1.1563981042654028, "grad_norm": 2.4913875975185813, "learning_rate": 9.992995319158832e-06, "loss": 0.4981, "step": 244 }, { "epoch": 1.161137440758294, "grad_norm": 3.8542842696245603, "learning_rate": 9.992550796534957e-06, "loss": 0.5826, "step": 245 }, { "epoch": 1.1658767772511849, "grad_norm": 1.5510273419539844, "learning_rate": 9.992092610088664e-06, "loss": 0.1907, "step": 246 }, { "epoch": 1.1706161137440758, "grad_norm": 3.266454326874095, "learning_rate": 9.991620761073932e-06, "loss": 0.5856, "step": 247 }, { "epoch": 1.1753554502369667, "grad_norm": 2.634621688542773, "learning_rate": 9.991135250782143e-06, "loss": 0.6905, "step": 248 }, { "epoch": 1.180094786729858, "grad_norm": 2.598153013578805, "learning_rate": 9.990636080542056e-06, "loss": 0.619, "step": 249 }, { "epoch": 1.1848341232227488, "grad_norm": 2.1773544624317007, "learning_rate": 9.990123251719826e-06, "loss": 0.3558, "step": 250 }, { "epoch": 1.1848341232227488, "eval_loss": 0.33629781007766724, "eval_runtime": 7.6017, "eval_samples_per_second": 24.731, "eval_steps_per_second": 6.183, "step": 250 }, { "epoch": 1.1895734597156398, "grad_norm": 3.988497582009886, "learning_rate": 9.989596765718981e-06, "loss": 0.4084, "step": 251 }, { "epoch": 1.1943127962085307, "grad_norm": 2.4657243130513744, "learning_rate": 9.989056623980431e-06, "loss": 0.5131, "step": 252 }, { "epoch": 1.1990521327014219, "grad_norm": 3.535641712422993, "learning_rate": 9.988502827982458e-06, "loss": 0.4954, "step": 253 }, { "epoch": 1.2037914691943128, "grad_norm": 1.85755118873854, "learning_rate": 9.987935379240715e-06, "loss": 0.2961, "step": 254 }, { "epoch": 1.2085308056872037, "grad_norm": 2.5139993461113845, "learning_rate": 9.98735427930822e-06, "loss": 0.4925, "step": 255 }, { "epoch": 1.2132701421800949, "grad_norm": 1.924588366089442, "learning_rate": 9.98675952977535e-06, "loss": 0.5136, "step": 256 }, { "epoch": 1.2180094786729858, "grad_norm": 1.7640940730996135, "learning_rate": 9.986151132269843e-06, "loss": 0.3154, "step": 257 }, { "epoch": 1.2227488151658767, "grad_norm": 3.2916885300512413, "learning_rate": 9.985529088456783e-06, "loss": 0.5185, "step": 258 }, { "epoch": 1.2274881516587677, "grad_norm": 2.9386914924460767, "learning_rate": 9.984893400038608e-06, "loss": 0.6502, "step": 259 }, { "epoch": 1.2322274881516588, "grad_norm": 1.8843342232826004, "learning_rate": 9.9842440687551e-06, "loss": 0.4734, "step": 260 }, { "epoch": 1.2369668246445498, "grad_norm": 2.7122087340956003, "learning_rate": 9.98358109638337e-06, "loss": 0.6829, "step": 261 }, { "epoch": 1.2417061611374407, "grad_norm": 2.2550679323996023, "learning_rate": 9.98290448473787e-06, "loss": 0.4525, "step": 262 }, { "epoch": 1.2464454976303316, "grad_norm": 1.9647842800496953, "learning_rate": 9.982214235670383e-06, "loss": 0.2775, "step": 263 }, { "epoch": 1.2511848341232228, "grad_norm": 2.9680154031551345, "learning_rate": 9.981510351070008e-06, "loss": 0.3757, "step": 264 }, { "epoch": 1.2559241706161137, "grad_norm": 2.957357955761695, "learning_rate": 9.980792832863166e-06, "loss": 0.4655, "step": 265 }, { "epoch": 1.2606635071090047, "grad_norm": 2.6409737245596316, "learning_rate": 9.980061683013594e-06, "loss": 0.7199, "step": 266 }, { "epoch": 1.2654028436018958, "grad_norm": 2.872009184076046, "learning_rate": 9.979316903522328e-06, "loss": 0.3885, "step": 267 }, { "epoch": 1.2701421800947867, "grad_norm": 12.283797018397255, "learning_rate": 9.978558496427718e-06, "loss": 0.448, "step": 268 }, { "epoch": 1.2748815165876777, "grad_norm": 5.246080493210068, "learning_rate": 9.977786463805399e-06, "loss": 0.6272, "step": 269 }, { "epoch": 1.2796208530805688, "grad_norm": 6.854106097690291, "learning_rate": 9.977000807768306e-06, "loss": 0.2441, "step": 270 }, { "epoch": 1.2843601895734598, "grad_norm": 2.4002121073215914, "learning_rate": 9.976201530466656e-06, "loss": 0.5504, "step": 271 }, { "epoch": 1.2890995260663507, "grad_norm": 2.250753376506315, "learning_rate": 9.97538863408794e-06, "loss": 0.5053, "step": 272 }, { "epoch": 1.2938388625592416, "grad_norm": 2.4676051862347754, "learning_rate": 9.97456212085693e-06, "loss": 0.6538, "step": 273 }, { "epoch": 1.2985781990521326, "grad_norm": 1.7931457643079958, "learning_rate": 9.973721993035664e-06, "loss": 0.3308, "step": 274 }, { "epoch": 1.3033175355450237, "grad_norm": 8.793350718644787, "learning_rate": 9.972868252923433e-06, "loss": 0.4205, "step": 275 }, { "epoch": 1.3080568720379147, "grad_norm": 2.3111808159253497, "learning_rate": 9.972000902856795e-06, "loss": 0.35, "step": 276 }, { "epoch": 1.3127962085308056, "grad_norm": 1.9770114541725161, "learning_rate": 9.971119945209548e-06, "loss": 0.181, "step": 277 }, { "epoch": 1.3175355450236967, "grad_norm": 3.676033223865303, "learning_rate": 9.970225382392733e-06, "loss": 0.4626, "step": 278 }, { "epoch": 1.3222748815165877, "grad_norm": 3.0913472681704914, "learning_rate": 9.969317216854627e-06, "loss": 0.5468, "step": 279 }, { "epoch": 1.3270142180094786, "grad_norm": 1.6758883133404263, "learning_rate": 9.968395451080736e-06, "loss": 0.3027, "step": 280 }, { "epoch": 1.3317535545023698, "grad_norm": 1.6906328567970788, "learning_rate": 9.967460087593786e-06, "loss": 0.2599, "step": 281 }, { "epoch": 1.3364928909952607, "grad_norm": 2.519747919443685, "learning_rate": 9.966511128953723e-06, "loss": 0.3654, "step": 282 }, { "epoch": 1.3412322274881516, "grad_norm": 3.0994348743265645, "learning_rate": 9.965548577757691e-06, "loss": 0.5109, "step": 283 }, { "epoch": 1.3459715639810428, "grad_norm": 5.19769356707834, "learning_rate": 9.964572436640046e-06, "loss": 0.6598, "step": 284 }, { "epoch": 1.3507109004739337, "grad_norm": 2.2151807054914103, "learning_rate": 9.963582708272328e-06, "loss": 0.3225, "step": 285 }, { "epoch": 1.3554502369668247, "grad_norm": 4.321591266645481, "learning_rate": 9.96257939536327e-06, "loss": 0.2298, "step": 286 }, { "epoch": 1.3601895734597156, "grad_norm": 2.5426417173798423, "learning_rate": 9.961562500658779e-06, "loss": 0.209, "step": 287 }, { "epoch": 1.3649289099526065, "grad_norm": 2.641414755534017, "learning_rate": 9.960532026941934e-06, "loss": 0.6695, "step": 288 }, { "epoch": 1.3696682464454977, "grad_norm": 2.350576800978289, "learning_rate": 9.959487977032982e-06, "loss": 0.1766, "step": 289 }, { "epoch": 1.3744075829383886, "grad_norm": 1.5070760436712913, "learning_rate": 9.958430353789321e-06, "loss": 0.2852, "step": 290 }, { "epoch": 1.3791469194312795, "grad_norm": 2.693828234820816, "learning_rate": 9.957359160105497e-06, "loss": 0.7203, "step": 291 }, { "epoch": 1.3838862559241707, "grad_norm": 6.638947415556458, "learning_rate": 9.956274398913201e-06, "loss": 0.5427, "step": 292 }, { "epoch": 1.3886255924170616, "grad_norm": 2.234860448425406, "learning_rate": 9.95517607318125e-06, "loss": 0.6673, "step": 293 }, { "epoch": 1.3933649289099526, "grad_norm": 2.127742473833712, "learning_rate": 9.954064185915589e-06, "loss": 0.3178, "step": 294 }, { "epoch": 1.3981042654028437, "grad_norm": 2.2598415378345327, "learning_rate": 9.952938740159278e-06, "loss": 0.6143, "step": 295 }, { "epoch": 1.4028436018957346, "grad_norm": 3.5494883982443195, "learning_rate": 9.951799738992484e-06, "loss": 0.6594, "step": 296 }, { "epoch": 1.4075829383886256, "grad_norm": 3.9241929941872313, "learning_rate": 9.950647185532473e-06, "loss": 0.5619, "step": 297 }, { "epoch": 1.4123222748815165, "grad_norm": 8.034389257470865, "learning_rate": 9.949481082933602e-06, "loss": 0.4057, "step": 298 }, { "epoch": 1.4170616113744074, "grad_norm": 5.55376279395039, "learning_rate": 9.948301434387308e-06, "loss": 0.6668, "step": 299 }, { "epoch": 1.4218009478672986, "grad_norm": 8.382791791140692, "learning_rate": 9.947108243122107e-06, "loss": 0.6512, "step": 300 }, { "epoch": 1.4218009478672986, "eval_loss": 0.3448152244091034, "eval_runtime": 7.5747, "eval_samples_per_second": 24.819, "eval_steps_per_second": 6.205, "step": 300 }, { "epoch": 1.4265402843601895, "grad_norm": 2.3895201954800327, "learning_rate": 9.94590151240357e-06, "loss": 0.3551, "step": 301 }, { "epoch": 1.4312796208530805, "grad_norm": 21.81484589554373, "learning_rate": 9.944681245534329e-06, "loss": 0.2085, "step": 302 }, { "epoch": 1.4360189573459716, "grad_norm": 2.4289020232068603, "learning_rate": 9.943447445854065e-06, "loss": 0.6601, "step": 303 }, { "epoch": 1.4407582938388626, "grad_norm": 6.537329090959136, "learning_rate": 9.942200116739488e-06, "loss": 0.4185, "step": 304 }, { "epoch": 1.4454976303317535, "grad_norm": 1.9272545240932075, "learning_rate": 9.940939261604344e-06, "loss": 0.3802, "step": 305 }, { "epoch": 1.4502369668246446, "grad_norm": 2.0804031032401604, "learning_rate": 9.939664883899394e-06, "loss": 0.479, "step": 306 }, { "epoch": 1.4549763033175356, "grad_norm": 1.5623893795033323, "learning_rate": 9.938376987112406e-06, "loss": 0.3465, "step": 307 }, { "epoch": 1.4597156398104265, "grad_norm": 2.4971685926961196, "learning_rate": 9.937075574768152e-06, "loss": 0.5371, "step": 308 }, { "epoch": 1.4644549763033177, "grad_norm": 1.9465256216683717, "learning_rate": 9.93576065042839e-06, "loss": 0.492, "step": 309 }, { "epoch": 1.4691943127962086, "grad_norm": 2.05113471327495, "learning_rate": 9.934432217691862e-06, "loss": 0.5045, "step": 310 }, { "epoch": 1.4739336492890995, "grad_norm": 1.9196555117253529, "learning_rate": 9.93309028019428e-06, "loss": 0.3704, "step": 311 }, { "epoch": 1.4786729857819905, "grad_norm": 3.682379874191357, "learning_rate": 9.931734841608311e-06, "loss": 0.4535, "step": 312 }, { "epoch": 1.4834123222748814, "grad_norm": 3.3519197093172863, "learning_rate": 9.930365905643578e-06, "loss": 0.5185, "step": 313 }, { "epoch": 1.4881516587677726, "grad_norm": 2.881253847151378, "learning_rate": 9.928983476046643e-06, "loss": 0.5396, "step": 314 }, { "epoch": 1.4928909952606635, "grad_norm": 1.8237235275534953, "learning_rate": 9.927587556600997e-06, "loss": 0.1842, "step": 315 }, { "epoch": 1.4976303317535544, "grad_norm": 4.184421241513608, "learning_rate": 9.926178151127049e-06, "loss": 0.5325, "step": 316 }, { "epoch": 1.5023696682464456, "grad_norm": 3.9338291238727985, "learning_rate": 9.924755263482121e-06, "loss": 0.2008, "step": 317 }, { "epoch": 1.5071090047393365, "grad_norm": 5.51176109511911, "learning_rate": 9.92331889756043e-06, "loss": 0.4532, "step": 318 }, { "epoch": 1.5118483412322274, "grad_norm": 2.748622530097229, "learning_rate": 9.921869057293086e-06, "loss": 0.6899, "step": 319 }, { "epoch": 1.5165876777251186, "grad_norm": 2.336940895127292, "learning_rate": 9.920405746648067e-06, "loss": 0.4099, "step": 320 }, { "epoch": 1.5213270142180095, "grad_norm": 4.063807492448013, "learning_rate": 9.918928969630228e-06, "loss": 0.4569, "step": 321 }, { "epoch": 1.5260663507109005, "grad_norm": 2.438274381368647, "learning_rate": 9.917438730281273e-06, "loss": 0.1749, "step": 322 }, { "epoch": 1.5308056872037916, "grad_norm": 2.5255975967536015, "learning_rate": 9.91593503267975e-06, "loss": 0.6794, "step": 323 }, { "epoch": 1.5355450236966823, "grad_norm": 2.435653867656926, "learning_rate": 9.914417880941043e-06, "loss": 0.6476, "step": 324 }, { "epoch": 1.5402843601895735, "grad_norm": 2.3714317752012914, "learning_rate": 9.912887279217356e-06, "loss": 0.4351, "step": 325 }, { "epoch": 1.5450236966824644, "grad_norm": 2.3675026644803006, "learning_rate": 9.911343231697703e-06, "loss": 0.2025, "step": 326 }, { "epoch": 1.5497630331753554, "grad_norm": 2.6934469311195643, "learning_rate": 9.9097857426079e-06, "loss": 0.4875, "step": 327 }, { "epoch": 1.5545023696682465, "grad_norm": 2.382299005054798, "learning_rate": 9.908214816210548e-06, "loss": 0.6983, "step": 328 }, { "epoch": 1.5592417061611374, "grad_norm": 2.5391906448996613, "learning_rate": 9.906630456805024e-06, "loss": 0.4902, "step": 329 }, { "epoch": 1.5639810426540284, "grad_norm": 5.074284156439468, "learning_rate": 9.905032668727467e-06, "loss": 0.3692, "step": 330 }, { "epoch": 1.5687203791469195, "grad_norm": 2.2664917617556255, "learning_rate": 9.903421456350776e-06, "loss": 0.5135, "step": 331 }, { "epoch": 1.5734597156398105, "grad_norm": 2.19259108575062, "learning_rate": 9.90179682408458e-06, "loss": 0.4938, "step": 332 }, { "epoch": 1.5781990521327014, "grad_norm": 2.5361885097234933, "learning_rate": 9.90015877637524e-06, "loss": 0.5438, "step": 333 }, { "epoch": 1.5829383886255926, "grad_norm": 2.178138486962567, "learning_rate": 9.898507317705837e-06, "loss": 0.6188, "step": 334 }, { "epoch": 1.5876777251184833, "grad_norm": 1.812913729132945, "learning_rate": 9.896842452596151e-06, "loss": 0.3508, "step": 335 }, { "epoch": 1.5924170616113744, "grad_norm": 1.7087086271485783, "learning_rate": 9.895164185602655e-06, "loss": 0.3084, "step": 336 }, { "epoch": 1.5971563981042654, "grad_norm": 2.7388427108508053, "learning_rate": 9.893472521318499e-06, "loss": 0.1332, "step": 337 }, { "epoch": 1.6018957345971563, "grad_norm": 1.8791204965113866, "learning_rate": 9.891767464373503e-06, "loss": 0.4661, "step": 338 }, { "epoch": 1.6066350710900474, "grad_norm": 2.068312034507751, "learning_rate": 9.890049019434135e-06, "loss": 0.5085, "step": 339 }, { "epoch": 1.6113744075829384, "grad_norm": 1.799668228890366, "learning_rate": 9.888317191203513e-06, "loss": 0.3711, "step": 340 }, { "epoch": 1.6161137440758293, "grad_norm": 1.9724625694968003, "learning_rate": 9.886571984421371e-06, "loss": 0.3308, "step": 341 }, { "epoch": 1.6208530805687205, "grad_norm": 0.5548688306542162, "learning_rate": 9.884813403864067e-06, "loss": 0.0027, "step": 342 }, { "epoch": 1.6255924170616114, "grad_norm": 1.1667921040825704, "learning_rate": 9.883041454344558e-06, "loss": 0.1846, "step": 343 }, { "epoch": 1.6303317535545023, "grad_norm": 2.504222470295781, "learning_rate": 9.881256140712389e-06, "loss": 0.4055, "step": 344 }, { "epoch": 1.6350710900473935, "grad_norm": 3.2053743663342202, "learning_rate": 9.879457467853683e-06, "loss": 0.3911, "step": 345 }, { "epoch": 1.6398104265402842, "grad_norm": 4.070213617774694, "learning_rate": 9.877645440691122e-06, "loss": 0.3496, "step": 346 }, { "epoch": 1.6445497630331753, "grad_norm": 4.238271512703958, "learning_rate": 9.875820064183936e-06, "loss": 0.7347, "step": 347 }, { "epoch": 1.6492890995260665, "grad_norm": 5.243679722478133, "learning_rate": 9.873981343327895e-06, "loss": 0.4416, "step": 348 }, { "epoch": 1.6540284360189572, "grad_norm": 2.057271614968161, "learning_rate": 9.872129283155287e-06, "loss": 0.4228, "step": 349 }, { "epoch": 1.6587677725118484, "grad_norm": 1.9694558162236104, "learning_rate": 9.870263888734905e-06, "loss": 0.4931, "step": 350 }, { "epoch": 1.6587677725118484, "eval_loss": 0.2978443503379822, "eval_runtime": 7.6174, "eval_samples_per_second": 24.68, "eval_steps_per_second": 6.17, "step": 350 }, { "epoch": 1.6635071090047393, "grad_norm": 3.1554779761803013, "learning_rate": 9.868385165172042e-06, "loss": 0.6458, "step": 351 }, { "epoch": 1.6682464454976302, "grad_norm": 6.676783870329319, "learning_rate": 9.866493117608468e-06, "loss": 0.1516, "step": 352 }, { "epoch": 1.6729857819905214, "grad_norm": 2.7462701878820575, "learning_rate": 9.864587751222416e-06, "loss": 0.2574, "step": 353 }, { "epoch": 1.6777251184834123, "grad_norm": 3.8157454645498454, "learning_rate": 9.862669071228572e-06, "loss": 0.4856, "step": 354 }, { "epoch": 1.6824644549763033, "grad_norm": 1.3090823665483136, "learning_rate": 9.860737082878062e-06, "loss": 0.323, "step": 355 }, { "epoch": 1.6872037914691944, "grad_norm": 10.494439088857515, "learning_rate": 9.858791791458431e-06, "loss": 0.1988, "step": 356 }, { "epoch": 1.6919431279620853, "grad_norm": 2.0191995262815783, "learning_rate": 9.856833202293637e-06, "loss": 0.3119, "step": 357 }, { "epoch": 1.6966824644549763, "grad_norm": 3.5323445127912927, "learning_rate": 9.854861320744024e-06, "loss": 0.1471, "step": 358 }, { "epoch": 1.7014218009478674, "grad_norm": 4.056087735458044, "learning_rate": 9.852876152206325e-06, "loss": 0.4143, "step": 359 }, { "epoch": 1.7061611374407581, "grad_norm": 1.1699046556227357, "learning_rate": 9.85087770211363e-06, "loss": 0.1744, "step": 360 }, { "epoch": 1.7109004739336493, "grad_norm": 7.948034395540595, "learning_rate": 9.84886597593538e-06, "loss": 0.583, "step": 361 }, { "epoch": 1.7156398104265402, "grad_norm": 1.8387572219605024, "learning_rate": 9.846840979177354e-06, "loss": 0.3403, "step": 362 }, { "epoch": 1.7203791469194312, "grad_norm": 2.0300481281041978, "learning_rate": 9.844802717381649e-06, "loss": 0.5911, "step": 363 }, { "epoch": 1.7251184834123223, "grad_norm": 2.442593688401152, "learning_rate": 9.842751196126663e-06, "loss": 0.3407, "step": 364 }, { "epoch": 1.7298578199052133, "grad_norm": 2.357867457832997, "learning_rate": 9.840686421027085e-06, "loss": 0.6408, "step": 365 }, { "epoch": 1.7345971563981042, "grad_norm": 1.4156381529281636, "learning_rate": 9.83860839773388e-06, "loss": 0.3078, "step": 366 }, { "epoch": 1.7393364928909953, "grad_norm": 1.4761961207478902, "learning_rate": 9.836517131934267e-06, "loss": 0.3368, "step": 367 }, { "epoch": 1.7440758293838863, "grad_norm": 2.233809665949993, "learning_rate": 9.834412629351712e-06, "loss": 0.59, "step": 368 }, { "epoch": 1.7488151658767772, "grad_norm": 4.153971174320609, "learning_rate": 9.832294895745906e-06, "loss": 0.6378, "step": 369 }, { "epoch": 1.7535545023696684, "grad_norm": 1.8346871985935636, "learning_rate": 9.830163936912752e-06, "loss": 0.2077, "step": 370 }, { "epoch": 1.758293838862559, "grad_norm": 2.185943824706028, "learning_rate": 9.828019758684343e-06, "loss": 0.632, "step": 371 }, { "epoch": 1.7630331753554502, "grad_norm": 3.585282208139966, "learning_rate": 9.82586236692896e-06, "loss": 0.414, "step": 372 }, { "epoch": 1.7677725118483414, "grad_norm": 2.9683580597033417, "learning_rate": 9.823691767551042e-06, "loss": 0.5511, "step": 373 }, { "epoch": 1.772511848341232, "grad_norm": 1.8760918872517949, "learning_rate": 9.821507966491178e-06, "loss": 0.3139, "step": 374 }, { "epoch": 1.7772511848341233, "grad_norm": 2.521809140652447, "learning_rate": 9.819310969726083e-06, "loss": 0.3167, "step": 375 }, { "epoch": 1.7819905213270142, "grad_norm": 1.6335080451938315, "learning_rate": 9.817100783268591e-06, "loss": 0.3058, "step": 376 }, { "epoch": 1.7867298578199051, "grad_norm": 1.8046828342953851, "learning_rate": 9.814877413167635e-06, "loss": 0.3768, "step": 377 }, { "epoch": 1.7914691943127963, "grad_norm": 1.8830999035554479, "learning_rate": 9.812640865508228e-06, "loss": 0.4693, "step": 378 }, { "epoch": 1.7962085308056872, "grad_norm": 2.320257885040441, "learning_rate": 9.810391146411445e-06, "loss": 0.6267, "step": 379 }, { "epoch": 1.8009478672985781, "grad_norm": 10.80073855446241, "learning_rate": 9.808128262034411e-06, "loss": 0.1824, "step": 380 }, { "epoch": 1.8056872037914693, "grad_norm": 8.414676884797586, "learning_rate": 9.805852218570285e-06, "loss": 0.31, "step": 381 }, { "epoch": 1.8104265402843602, "grad_norm": 4.325341013078249, "learning_rate": 9.803563022248238e-06, "loss": 0.5953, "step": 382 }, { "epoch": 1.8151658767772512, "grad_norm": 2.0110115883060855, "learning_rate": 9.801260679333435e-06, "loss": 0.5959, "step": 383 }, { "epoch": 1.8199052132701423, "grad_norm": 6.122877463440686, "learning_rate": 9.79894519612703e-06, "loss": 0.3405, "step": 384 }, { "epoch": 1.824644549763033, "grad_norm": 3.650442506848263, "learning_rate": 9.796616578966133e-06, "loss": 0.3861, "step": 385 }, { "epoch": 1.8293838862559242, "grad_norm": 1.5157626892732818, "learning_rate": 9.794274834223797e-06, "loss": 0.2846, "step": 386 }, { "epoch": 1.8341232227488151, "grad_norm": 3.1507961365544745, "learning_rate": 9.791919968309014e-06, "loss": 0.3294, "step": 387 }, { "epoch": 1.838862559241706, "grad_norm": 1.7306556573405594, "learning_rate": 9.789551987666676e-06, "loss": 0.3562, "step": 388 }, { "epoch": 1.8436018957345972, "grad_norm": 5.941824507077249, "learning_rate": 9.787170898777571e-06, "loss": 0.3274, "step": 389 }, { "epoch": 1.8483412322274881, "grad_norm": 3.030732093438433, "learning_rate": 9.784776708158363e-06, "loss": 0.4133, "step": 390 }, { "epoch": 1.853080568720379, "grad_norm": 1.3937150172782848, "learning_rate": 9.782369422361576e-06, "loss": 0.1756, "step": 391 }, { "epoch": 1.8578199052132702, "grad_norm": 12.927627074325668, "learning_rate": 9.779949047975568e-06, "loss": 0.361, "step": 392 }, { "epoch": 1.8625592417061612, "grad_norm": 4.493223615263428, "learning_rate": 9.777515591624523e-06, "loss": 0.4096, "step": 393 }, { "epoch": 1.867298578199052, "grad_norm": 3.148554545451083, "learning_rate": 9.775069059968426e-06, "loss": 0.3309, "step": 394 }, { "epoch": 1.8720379146919433, "grad_norm": 2.3641866669229046, "learning_rate": 9.772609459703046e-06, "loss": 0.3819, "step": 395 }, { "epoch": 1.876777251184834, "grad_norm": 2.2830547145033, "learning_rate": 9.770136797559921e-06, "loss": 0.4664, "step": 396 }, { "epoch": 1.8815165876777251, "grad_norm": 1.1790434142717119, "learning_rate": 9.767651080306337e-06, "loss": 0.1274, "step": 397 }, { "epoch": 1.8862559241706163, "grad_norm": 3.845230310093734, "learning_rate": 9.76515231474531e-06, "loss": 0.1594, "step": 398 }, { "epoch": 1.890995260663507, "grad_norm": 1.8509410525795489, "learning_rate": 9.762640507715563e-06, "loss": 0.4798, "step": 399 }, { "epoch": 1.8957345971563981, "grad_norm": 2.781434794248597, "learning_rate": 9.760115666091518e-06, "loss": 0.1246, "step": 400 }, { "epoch": 1.8957345971563981, "eval_loss": 0.2955733835697174, "eval_runtime": 7.4936, "eval_samples_per_second": 25.088, "eval_steps_per_second": 6.272, "step": 400 }, { "epoch": 1.900473933649289, "grad_norm": 2.0822001446365808, "learning_rate": 9.757577796783268e-06, "loss": 0.1796, "step": 401 }, { "epoch": 1.90521327014218, "grad_norm": 1.7683889575687497, "learning_rate": 9.755026906736558e-06, "loss": 0.3831, "step": 402 }, { "epoch": 1.9099526066350712, "grad_norm": 2.0251576776654665, "learning_rate": 9.752463002932771e-06, "loss": 0.3582, "step": 403 }, { "epoch": 1.914691943127962, "grad_norm": 1.550976752082876, "learning_rate": 9.749886092388907e-06, "loss": 0.2987, "step": 404 }, { "epoch": 1.919431279620853, "grad_norm": 2.2015043868744852, "learning_rate": 9.747296182157562e-06, "loss": 0.3003, "step": 405 }, { "epoch": 1.9241706161137442, "grad_norm": 1.3089765916841185, "learning_rate": 9.744693279326915e-06, "loss": 0.2871, "step": 406 }, { "epoch": 1.9289099526066351, "grad_norm": 3.3397852609601415, "learning_rate": 9.742077391020695e-06, "loss": 0.5265, "step": 407 }, { "epoch": 1.933649289099526, "grad_norm": 1.946719052936926, "learning_rate": 9.739448524398176e-06, "loss": 0.5086, "step": 408 }, { "epoch": 1.9383886255924172, "grad_norm": 1.5684813908714574, "learning_rate": 9.73680668665415e-06, "loss": 0.2019, "step": 409 }, { "epoch": 1.943127962085308, "grad_norm": 1.7688479660064684, "learning_rate": 9.73415188501891e-06, "loss": 0.4819, "step": 410 }, { "epoch": 1.947867298578199, "grad_norm": 1.4921551637020574, "learning_rate": 9.731484126758231e-06, "loss": 0.3432, "step": 411 }, { "epoch": 1.95260663507109, "grad_norm": 1.6904871673634219, "learning_rate": 9.72880341917334e-06, "loss": 0.3338, "step": 412 }, { "epoch": 1.957345971563981, "grad_norm": 2.231311267323003, "learning_rate": 9.726109769600915e-06, "loss": 0.3727, "step": 413 }, { "epoch": 1.962085308056872, "grad_norm": 2.064721764431227, "learning_rate": 9.72340318541305e-06, "loss": 0.5217, "step": 414 }, { "epoch": 1.966824644549763, "grad_norm": 1.8824517839226762, "learning_rate": 9.720683674017232e-06, "loss": 0.5255, "step": 415 }, { "epoch": 1.971563981042654, "grad_norm": 2.741168518277324, "learning_rate": 9.717951242856338e-06, "loss": 0.574, "step": 416 }, { "epoch": 1.9763033175355451, "grad_norm": 0.36867385458926955, "learning_rate": 9.7152058994086e-06, "loss": 0.0081, "step": 417 }, { "epoch": 1.981042654028436, "grad_norm": 3.7413445968502455, "learning_rate": 9.712447651187589e-06, "loss": 0.6644, "step": 418 }, { "epoch": 1.985781990521327, "grad_norm": 2.036352574421493, "learning_rate": 9.709676505742194e-06, "loss": 0.508, "step": 419 }, { "epoch": 1.9905213270142181, "grad_norm": 2.332710921849896, "learning_rate": 9.706892470656601e-06, "loss": 0.672, "step": 420 }, { "epoch": 1.9952606635071088, "grad_norm": 1.1977387854956147, "learning_rate": 9.704095553550277e-06, "loss": 0.3395, "step": 421 }, { "epoch": 2.0, "grad_norm": 1.306644738004841, "learning_rate": 9.701285762077938e-06, "loss": 0.2778, "step": 422 }, { "epoch": 2.004739336492891, "grad_norm": 2.6627265449160573, "learning_rate": 9.698463103929542e-06, "loss": 0.4528, "step": 423 }, { "epoch": 2.009478672985782, "grad_norm": 2.099358840952704, "learning_rate": 9.695627586830258e-06, "loss": 0.3879, "step": 424 }, { "epoch": 2.014218009478673, "grad_norm": 5.2879020211730055, "learning_rate": 9.692779218540449e-06, "loss": 0.14, "step": 425 }, { "epoch": 2.018957345971564, "grad_norm": 1.5018740536470672, "learning_rate": 9.689918006855645e-06, "loss": 0.1269, "step": 426 }, { "epoch": 2.023696682464455, "grad_norm": 1.4094211658721043, "learning_rate": 9.687043959606535e-06, "loss": 0.2423, "step": 427 }, { "epoch": 2.028436018957346, "grad_norm": 2.159506476596885, "learning_rate": 9.684157084658929e-06, "loss": 0.2815, "step": 428 }, { "epoch": 2.0331753554502368, "grad_norm": 0.8885269405665877, "learning_rate": 9.681257389913747e-06, "loss": 0.1421, "step": 429 }, { "epoch": 2.037914691943128, "grad_norm": 2.0385950670605197, "learning_rate": 9.678344883306997e-06, "loss": 0.3958, "step": 430 }, { "epoch": 2.042654028436019, "grad_norm": 6.205987948434508, "learning_rate": 9.675419572809748e-06, "loss": 0.271, "step": 431 }, { "epoch": 2.0473933649289098, "grad_norm": 2.7807782405533352, "learning_rate": 9.672481466428114e-06, "loss": 0.4354, "step": 432 }, { "epoch": 2.052132701421801, "grad_norm": 1.5112315738532895, "learning_rate": 9.669530572203228e-06, "loss": 0.2338, "step": 433 }, { "epoch": 2.056872037914692, "grad_norm": 2.403007718370798, "learning_rate": 9.666566898211219e-06, "loss": 0.1161, "step": 434 }, { "epoch": 2.061611374407583, "grad_norm": 1.342375032653582, "learning_rate": 9.663590452563193e-06, "loss": 0.1454, "step": 435 }, { "epoch": 2.066350710900474, "grad_norm": 3.4913666949654543, "learning_rate": 9.660601243405214e-06, "loss": 0.2455, "step": 436 }, { "epoch": 2.071090047393365, "grad_norm": 2.7479972548361586, "learning_rate": 9.657599278918278e-06, "loss": 0.2492, "step": 437 }, { "epoch": 2.075829383886256, "grad_norm": 1.3732106527900387, "learning_rate": 9.654584567318279e-06, "loss": 0.1943, "step": 438 }, { "epoch": 2.080568720379147, "grad_norm": 1.814725144260375, "learning_rate": 9.651557116856015e-06, "loss": 0.256, "step": 439 }, { "epoch": 2.085308056872038, "grad_norm": 2.2069597546240107, "learning_rate": 9.648516935817133e-06, "loss": 0.3961, "step": 440 }, { "epoch": 2.090047393364929, "grad_norm": 1.5420571195505541, "learning_rate": 9.64546403252213e-06, "loss": 0.251, "step": 441 }, { "epoch": 2.09478672985782, "grad_norm": 1.5476167518379802, "learning_rate": 9.642398415326321e-06, "loss": 0.2603, "step": 442 }, { "epoch": 2.0995260663507107, "grad_norm": 2.034199340028726, "learning_rate": 9.639320092619814e-06, "loss": 0.1695, "step": 443 }, { "epoch": 2.104265402843602, "grad_norm": 3.7349542663033612, "learning_rate": 9.636229072827495e-06, "loss": 0.128, "step": 444 }, { "epoch": 2.109004739336493, "grad_norm": 2.821904163178345, "learning_rate": 9.633125364408993e-06, "loss": 0.2114, "step": 445 }, { "epoch": 2.1137440758293837, "grad_norm": 2.7924773405486736, "learning_rate": 9.630008975858667e-06, "loss": 0.4738, "step": 446 }, { "epoch": 2.118483412322275, "grad_norm": 3.9989024730276475, "learning_rate": 9.626879915705583e-06, "loss": 0.1967, "step": 447 }, { "epoch": 2.123222748815166, "grad_norm": 2.8673915004345267, "learning_rate": 9.62373819251348e-06, "loss": 0.4892, "step": 448 }, { "epoch": 2.1279620853080567, "grad_norm": 3.946682223252352, "learning_rate": 9.620583814880763e-06, "loss": 0.4804, "step": 449 }, { "epoch": 2.132701421800948, "grad_norm": 1.5786772938702343, "learning_rate": 9.617416791440461e-06, "loss": 0.1804, "step": 450 }, { "epoch": 2.132701421800948, "eval_loss": 0.2992797791957855, "eval_runtime": 7.7669, "eval_samples_per_second": 24.205, "eval_steps_per_second": 6.051, "step": 450 }, { "epoch": 2.137440758293839, "grad_norm": 2.165145174031428, "learning_rate": 9.61423713086022e-06, "loss": 0.4228, "step": 451 }, { "epoch": 2.1421800947867298, "grad_norm": 1.866067205365788, "learning_rate": 9.611044841842264e-06, "loss": 0.1926, "step": 452 }, { "epoch": 2.146919431279621, "grad_norm": 2.346074490279633, "learning_rate": 9.607839933123387e-06, "loss": 0.4009, "step": 453 }, { "epoch": 2.1516587677725116, "grad_norm": 4.0592215780138945, "learning_rate": 9.604622413474916e-06, "loss": 0.3062, "step": 454 }, { "epoch": 2.156398104265403, "grad_norm": 1.5094788846683966, "learning_rate": 9.601392291702693e-06, "loss": 0.253, "step": 455 }, { "epoch": 2.161137440758294, "grad_norm": 2.214809533583907, "learning_rate": 9.598149576647053e-06, "loss": 0.3733, "step": 456 }, { "epoch": 2.1658767772511847, "grad_norm": 1.8852412240790106, "learning_rate": 9.594894277182793e-06, "loss": 0.2866, "step": 457 }, { "epoch": 2.170616113744076, "grad_norm": 1.7526684425452752, "learning_rate": 9.591626402219154e-06, "loss": 0.3004, "step": 458 }, { "epoch": 2.175355450236967, "grad_norm": 2.9894508978330783, "learning_rate": 9.588345960699792e-06, "loss": 0.2337, "step": 459 }, { "epoch": 2.1800947867298577, "grad_norm": 1.9990589843914348, "learning_rate": 9.585052961602759e-06, "loss": 0.3459, "step": 460 }, { "epoch": 2.184834123222749, "grad_norm": 2.35306488074333, "learning_rate": 9.581747413940472e-06, "loss": 0.1304, "step": 461 }, { "epoch": 2.18957345971564, "grad_norm": 1.777896718380068, "learning_rate": 9.57842932675969e-06, "loss": 0.3115, "step": 462 }, { "epoch": 2.1943127962085307, "grad_norm": 1.7487858526836815, "learning_rate": 9.575098709141496e-06, "loss": 0.2286, "step": 463 }, { "epoch": 2.199052132701422, "grad_norm": 2.3258340527853703, "learning_rate": 9.571755570201266e-06, "loss": 0.3659, "step": 464 }, { "epoch": 2.2037914691943126, "grad_norm": 1.465155899360022, "learning_rate": 9.56839991908864e-06, "loss": 0.1962, "step": 465 }, { "epoch": 2.2085308056872037, "grad_norm": 1.4928089369177426, "learning_rate": 9.565031764987502e-06, "loss": 0.1699, "step": 466 }, { "epoch": 2.213270142180095, "grad_norm": 1.785576743274789, "learning_rate": 9.561651117115962e-06, "loss": 0.2761, "step": 467 }, { "epoch": 2.2180094786729856, "grad_norm": 2.2531836622374337, "learning_rate": 9.558257984726319e-06, "loss": 0.3987, "step": 468 }, { "epoch": 2.2227488151658767, "grad_norm": 2.315364186256386, "learning_rate": 9.554852377105036e-06, "loss": 0.3683, "step": 469 }, { "epoch": 2.227488151658768, "grad_norm": 1.6764194598679394, "learning_rate": 9.551434303572725e-06, "loss": 0.1712, "step": 470 }, { "epoch": 2.2322274881516586, "grad_norm": 1.7843409934441365, "learning_rate": 9.548003773484115e-06, "loss": 0.3081, "step": 471 }, { "epoch": 2.2369668246445498, "grad_norm": 2.3510258117985505, "learning_rate": 9.544560796228022e-06, "loss": 0.175, "step": 472 }, { "epoch": 2.241706161137441, "grad_norm": 2.014710977925563, "learning_rate": 9.54110538122733e-06, "loss": 0.332, "step": 473 }, { "epoch": 2.2464454976303316, "grad_norm": 1.500227249636288, "learning_rate": 9.537637537938966e-06, "loss": 0.1706, "step": 474 }, { "epoch": 2.251184834123223, "grad_norm": 5.326923875102078, "learning_rate": 9.534157275853869e-06, "loss": 0.3117, "step": 475 }, { "epoch": 2.2559241706161135, "grad_norm": 2.395469195860342, "learning_rate": 9.530664604496964e-06, "loss": 0.4078, "step": 476 }, { "epoch": 2.2606635071090047, "grad_norm": 1.783834474018257, "learning_rate": 9.527159533427142e-06, "loss": 0.3021, "step": 477 }, { "epoch": 2.265402843601896, "grad_norm": 2.424152235039787, "learning_rate": 9.52364207223723e-06, "loss": 0.2725, "step": 478 }, { "epoch": 2.270142180094787, "grad_norm": 1.8235299600725425, "learning_rate": 9.520112230553959e-06, "loss": 0.2643, "step": 479 }, { "epoch": 2.2748815165876777, "grad_norm": 2.051524555671329, "learning_rate": 9.51657001803795e-06, "loss": 0.0735, "step": 480 }, { "epoch": 2.279620853080569, "grad_norm": 2.3488000246022755, "learning_rate": 9.513015444383682e-06, "loss": 0.4467, "step": 481 }, { "epoch": 2.2843601895734595, "grad_norm": 2.1217146626056556, "learning_rate": 9.509448519319455e-06, "loss": 0.364, "step": 482 }, { "epoch": 2.2890995260663507, "grad_norm": 1.6891403485666334, "learning_rate": 9.505869252607385e-06, "loss": 0.2422, "step": 483 }, { "epoch": 2.293838862559242, "grad_norm": 1.8691421689797754, "learning_rate": 9.502277654043355e-06, "loss": 0.1678, "step": 484 }, { "epoch": 2.2985781990521326, "grad_norm": 2.844637407365249, "learning_rate": 9.498673733457007e-06, "loss": 0.2028, "step": 485 }, { "epoch": 2.3033175355450237, "grad_norm": 4.691622791467041, "learning_rate": 9.495057500711698e-06, "loss": 0.3995, "step": 486 }, { "epoch": 2.308056872037915, "grad_norm": 4.110449766395664, "learning_rate": 9.491428965704486e-06, "loss": 0.2655, "step": 487 }, { "epoch": 2.3127962085308056, "grad_norm": 3.181980318070533, "learning_rate": 9.487788138366098e-06, "loss": 0.2057, "step": 488 }, { "epoch": 2.3175355450236967, "grad_norm": 1.8751228151087935, "learning_rate": 9.484135028660905e-06, "loss": 0.3028, "step": 489 }, { "epoch": 2.322274881516588, "grad_norm": 1.5661514970100012, "learning_rate": 9.480469646586888e-06, "loss": 0.1934, "step": 490 }, { "epoch": 2.3270142180094786, "grad_norm": 2.1943914786986927, "learning_rate": 9.476792002175621e-06, "loss": 0.2269, "step": 491 }, { "epoch": 2.3317535545023698, "grad_norm": 2.264533107109475, "learning_rate": 9.473102105492234e-06, "loss": 0.4183, "step": 492 }, { "epoch": 2.3364928909952605, "grad_norm": 2.7234498956935007, "learning_rate": 9.469399966635392e-06, "loss": 0.2831, "step": 493 }, { "epoch": 2.3412322274881516, "grad_norm": 1.8580160075814238, "learning_rate": 9.465685595737263e-06, "loss": 0.2907, "step": 494 }, { "epoch": 2.345971563981043, "grad_norm": 1.7716495700178916, "learning_rate": 9.461959002963492e-06, "loss": 0.3222, "step": 495 }, { "epoch": 2.3507109004739335, "grad_norm": 2.1649991541967655, "learning_rate": 9.458220198513178e-06, "loss": 0.3767, "step": 496 }, { "epoch": 2.3554502369668247, "grad_norm": 2.2141707421354204, "learning_rate": 9.454469192618834e-06, "loss": 0.4129, "step": 497 }, { "epoch": 2.360189573459716, "grad_norm": 1.1394313965651783, "learning_rate": 9.45070599554637e-06, "loss": 0.0941, "step": 498 }, { "epoch": 2.3649289099526065, "grad_norm": 1.5645808841620317, "learning_rate": 9.446930617595066e-06, "loss": 0.2175, "step": 499 }, { "epoch": 2.3696682464454977, "grad_norm": 0.9848357485987885, "learning_rate": 9.443143069097531e-06, "loss": 0.0178, "step": 500 }, { "epoch": 2.3696682464454977, "eval_loss": 0.27932682633399963, "eval_runtime": 7.8489, "eval_samples_per_second": 23.952, "eval_steps_per_second": 5.988, "step": 500 }, { "epoch": 2.374407582938389, "grad_norm": 1.463106965895241, "learning_rate": 9.439343360419689e-06, "loss": 0.1835, "step": 501 }, { "epoch": 2.3791469194312795, "grad_norm": 0.9453677984452025, "learning_rate": 9.43553150196074e-06, "loss": 0.0948, "step": 502 }, { "epoch": 2.3838862559241707, "grad_norm": 2.137434931225527, "learning_rate": 9.431707504153138e-06, "loss": 0.3164, "step": 503 }, { "epoch": 2.3886255924170614, "grad_norm": 2.641718579279774, "learning_rate": 9.427871377462561e-06, "loss": 0.3065, "step": 504 }, { "epoch": 2.3933649289099526, "grad_norm": 3.1991815401861565, "learning_rate": 9.424023132387883e-06, "loss": 0.2931, "step": 505 }, { "epoch": 2.3981042654028437, "grad_norm": 2.0386672451223515, "learning_rate": 9.420162779461142e-06, "loss": 0.2671, "step": 506 }, { "epoch": 2.4028436018957344, "grad_norm": 2.5967954379583396, "learning_rate": 9.416290329247513e-06, "loss": 0.0696, "step": 507 }, { "epoch": 2.4075829383886256, "grad_norm": 1.6357351836128409, "learning_rate": 9.412405792345278e-06, "loss": 0.2182, "step": 508 }, { "epoch": 2.4123222748815167, "grad_norm": 1.0931361872228085, "learning_rate": 9.408509179385806e-06, "loss": 0.0861, "step": 509 }, { "epoch": 2.4170616113744074, "grad_norm": 1.8664597671910188, "learning_rate": 9.404600501033505e-06, "loss": 0.1282, "step": 510 }, { "epoch": 2.4218009478672986, "grad_norm": 1.4441234779633076, "learning_rate": 9.400679767985814e-06, "loss": 0.2087, "step": 511 }, { "epoch": 2.4265402843601898, "grad_norm": 1.435779267833689, "learning_rate": 9.39674699097316e-06, "loss": 0.1959, "step": 512 }, { "epoch": 2.4312796208530805, "grad_norm": 1.843464890515756, "learning_rate": 9.392802180758926e-06, "loss": 0.2495, "step": 513 }, { "epoch": 2.4360189573459716, "grad_norm": 1.5674475560557937, "learning_rate": 9.38884534813944e-06, "loss": 0.1966, "step": 514 }, { "epoch": 2.4407582938388623, "grad_norm": 1.787304736128972, "learning_rate": 9.384876503943929e-06, "loss": 0.2913, "step": 515 }, { "epoch": 2.4454976303317535, "grad_norm": 2.297410213230877, "learning_rate": 9.380895659034486e-06, "loss": 0.2654, "step": 516 }, { "epoch": 2.4502369668246446, "grad_norm": 2.282985726906157, "learning_rate": 9.376902824306058e-06, "loss": 0.3368, "step": 517 }, { "epoch": 2.4549763033175354, "grad_norm": 1.8687545300392534, "learning_rate": 9.3728980106864e-06, "loss": 0.244, "step": 518 }, { "epoch": 2.4597156398104265, "grad_norm": 1.3880182815684456, "learning_rate": 9.368881229136057e-06, "loss": 0.1899, "step": 519 }, { "epoch": 2.4644549763033177, "grad_norm": 1.696370706635656, "learning_rate": 9.364852490648327e-06, "loss": 0.2128, "step": 520 }, { "epoch": 2.4691943127962084, "grad_norm": 2.292613729117954, "learning_rate": 9.360811806249224e-06, "loss": 0.1888, "step": 521 }, { "epoch": 2.4739336492890995, "grad_norm": 9.78708000296614, "learning_rate": 9.356759186997466e-06, "loss": 0.3178, "step": 522 }, { "epoch": 2.4786729857819907, "grad_norm": 2.0045275397991293, "learning_rate": 9.352694643984433e-06, "loss": 0.3639, "step": 523 }, { "epoch": 2.4834123222748814, "grad_norm": 3.742887030355059, "learning_rate": 9.348618188334135e-06, "loss": 0.2554, "step": 524 }, { "epoch": 2.4881516587677726, "grad_norm": 1.94272441768136, "learning_rate": 9.344529831203187e-06, "loss": 0.3038, "step": 525 }, { "epoch": 2.4928909952606633, "grad_norm": 5.088624688688455, "learning_rate": 9.340429583780774e-06, "loss": 0.2156, "step": 526 }, { "epoch": 2.4976303317535544, "grad_norm": 2.6529254956977706, "learning_rate": 9.33631745728863e-06, "loss": 0.2925, "step": 527 }, { "epoch": 2.5023696682464456, "grad_norm": 0.8557408245301991, "learning_rate": 9.33219346298099e-06, "loss": 0.1333, "step": 528 }, { "epoch": 2.5071090047393367, "grad_norm": 2.068141298388673, "learning_rate": 9.32805761214458e-06, "loss": 0.4115, "step": 529 }, { "epoch": 2.5118483412322274, "grad_norm": 2.144402530171497, "learning_rate": 9.323909916098566e-06, "loss": 0.3903, "step": 530 }, { "epoch": 2.5165876777251186, "grad_norm": 2.0004594631335935, "learning_rate": 9.319750386194537e-06, "loss": 0.1363, "step": 531 }, { "epoch": 2.5213270142180093, "grad_norm": 4.035239471522499, "learning_rate": 9.315579033816471e-06, "loss": 0.2661, "step": 532 }, { "epoch": 2.5260663507109005, "grad_norm": 1.534201372487289, "learning_rate": 9.311395870380699e-06, "loss": 0.254, "step": 533 }, { "epoch": 2.5308056872037916, "grad_norm": 2.080407165105763, "learning_rate": 9.307200907335875e-06, "loss": 0.3705, "step": 534 }, { "epoch": 2.5355450236966823, "grad_norm": 1.502966463119228, "learning_rate": 9.302994156162957e-06, "loss": 0.1576, "step": 535 }, { "epoch": 2.5402843601895735, "grad_norm": 2.210241514895105, "learning_rate": 9.29877562837515e-06, "loss": 0.2451, "step": 536 }, { "epoch": 2.545023696682464, "grad_norm": 8.781880069783684, "learning_rate": 9.294545335517904e-06, "loss": 0.2259, "step": 537 }, { "epoch": 2.5497630331753554, "grad_norm": 1.2217030042276933, "learning_rate": 9.290303289168859e-06, "loss": 0.1483, "step": 538 }, { "epoch": 2.5545023696682465, "grad_norm": 2.3742534672201043, "learning_rate": 9.286049500937826e-06, "loss": 0.3232, "step": 539 }, { "epoch": 2.5592417061611377, "grad_norm": 2.4723750071994988, "learning_rate": 9.28178398246675e-06, "loss": 0.2007, "step": 540 }, { "epoch": 2.5639810426540284, "grad_norm": 2.0869954207453425, "learning_rate": 9.277506745429684e-06, "loss": 0.3059, "step": 541 }, { "epoch": 2.5687203791469195, "grad_norm": 0.9781642431980271, "learning_rate": 9.273217801532744e-06, "loss": 0.0759, "step": 542 }, { "epoch": 2.5734597156398102, "grad_norm": 1.7701299344624615, "learning_rate": 9.268917162514098e-06, "loss": 0.1652, "step": 543 }, { "epoch": 2.5781990521327014, "grad_norm": 1.3356330447131743, "learning_rate": 9.26460484014391e-06, "loss": 0.1213, "step": 544 }, { "epoch": 2.5829383886255926, "grad_norm": 2.4644062333165313, "learning_rate": 9.260280846224328e-06, "loss": 0.4874, "step": 545 }, { "epoch": 2.5876777251184833, "grad_norm": 2.062997126984779, "learning_rate": 9.25594519258944e-06, "loss": 0.3558, "step": 546 }, { "epoch": 2.5924170616113744, "grad_norm": 1.5375755186462028, "learning_rate": 9.251597891105242e-06, "loss": 0.2059, "step": 547 }, { "epoch": 2.597156398104265, "grad_norm": 1.1624701008662408, "learning_rate": 9.247238953669612e-06, "loss": 0.0804, "step": 548 }, { "epoch": 2.6018957345971563, "grad_norm": 0.09001542056443658, "learning_rate": 9.242868392212277e-06, "loss": 0.0009, "step": 549 }, { "epoch": 2.6066350710900474, "grad_norm": 1.1436516887675665, "learning_rate": 9.238486218694767e-06, "loss": 0.1028, "step": 550 }, { "epoch": 2.6066350710900474, "eval_loss": 0.26912641525268555, "eval_runtime": 7.6787, "eval_samples_per_second": 24.483, "eval_steps_per_second": 6.121, "step": 550 }, { "epoch": 2.6113744075829386, "grad_norm": 2.133961184910742, "learning_rate": 9.234092445110401e-06, "loss": 0.3103, "step": 551 }, { "epoch": 2.6161137440758293, "grad_norm": 2.072404781682465, "learning_rate": 9.229687083484242e-06, "loss": 0.4379, "step": 552 }, { "epoch": 2.6208530805687205, "grad_norm": 2.2020800312518833, "learning_rate": 9.225270145873069e-06, "loss": 0.5363, "step": 553 }, { "epoch": 2.625592417061611, "grad_norm": 2.0457818451797647, "learning_rate": 9.220841644365343e-06, "loss": 0.361, "step": 554 }, { "epoch": 2.6303317535545023, "grad_norm": 11.521273969858171, "learning_rate": 9.216401591081173e-06, "loss": 0.2696, "step": 555 }, { "epoch": 2.6350710900473935, "grad_norm": 1.5840576742667678, "learning_rate": 9.21194999817228e-06, "loss": 0.2055, "step": 556 }, { "epoch": 2.639810426540284, "grad_norm": 2.169655448274581, "learning_rate": 9.207486877821971e-06, "loss": 0.404, "step": 557 }, { "epoch": 2.6445497630331753, "grad_norm": 2.0410487060700704, "learning_rate": 9.203012242245103e-06, "loss": 0.3766, "step": 558 }, { "epoch": 2.6492890995260665, "grad_norm": 2.726533008767557, "learning_rate": 9.198526103688045e-06, "loss": 0.2197, "step": 559 }, { "epoch": 2.654028436018957, "grad_norm": 2.163218749938081, "learning_rate": 9.194028474428651e-06, "loss": 0.3141, "step": 560 }, { "epoch": 2.6587677725118484, "grad_norm": 7.123794048200211, "learning_rate": 9.189519366776218e-06, "loss": 0.2282, "step": 561 }, { "epoch": 2.6635071090047395, "grad_norm": 4.431231299759084, "learning_rate": 9.184998793071465e-06, "loss": 0.2181, "step": 562 }, { "epoch": 2.6682464454976302, "grad_norm": 1.749031335601768, "learning_rate": 9.180466765686485e-06, "loss": 0.328, "step": 563 }, { "epoch": 2.6729857819905214, "grad_norm": 3.7503874156697377, "learning_rate": 9.17592329702472e-06, "loss": 0.201, "step": 564 }, { "epoch": 2.677725118483412, "grad_norm": 2.1106719929114606, "learning_rate": 9.171368399520925e-06, "loss": 0.3745, "step": 565 }, { "epoch": 2.6824644549763033, "grad_norm": 2.2680399854307867, "learning_rate": 9.16680208564114e-06, "loss": 0.4675, "step": 566 }, { "epoch": 2.6872037914691944, "grad_norm": 3.0991891546864716, "learning_rate": 9.162224367882639e-06, "loss": 0.2777, "step": 567 }, { "epoch": 2.6919431279620856, "grad_norm": 3.4744764688143057, "learning_rate": 9.157635258773915e-06, "loss": 0.3598, "step": 568 }, { "epoch": 2.6966824644549763, "grad_norm": 3.322846604693465, "learning_rate": 9.15303477087463e-06, "loss": 0.34, "step": 569 }, { "epoch": 2.7014218009478674, "grad_norm": 1.4990976721598193, "learning_rate": 9.148422916775596e-06, "loss": 0.1349, "step": 570 }, { "epoch": 2.706161137440758, "grad_norm": 1.457005855011899, "learning_rate": 9.143799709098729e-06, "loss": 0.1827, "step": 571 }, { "epoch": 2.7109004739336493, "grad_norm": 3.0061726721844053, "learning_rate": 9.139165160497017e-06, "loss": 0.3803, "step": 572 }, { "epoch": 2.7156398104265405, "grad_norm": 1.2933432816063761, "learning_rate": 9.134519283654484e-06, "loss": 0.163, "step": 573 }, { "epoch": 2.720379146919431, "grad_norm": 2.04160773743751, "learning_rate": 9.129862091286165e-06, "loss": 0.3507, "step": 574 }, { "epoch": 2.7251184834123223, "grad_norm": 1.7075125314751585, "learning_rate": 9.125193596138057e-06, "loss": 0.2775, "step": 575 }, { "epoch": 2.729857819905213, "grad_norm": 2.0702195565525376, "learning_rate": 9.120513810987095e-06, "loss": 0.4498, "step": 576 }, { "epoch": 2.734597156398104, "grad_norm": 2.1490772482386538, "learning_rate": 9.115822748641109e-06, "loss": 0.4318, "step": 577 }, { "epoch": 2.7393364928909953, "grad_norm": 1.6911398901399057, "learning_rate": 9.111120421938796e-06, "loss": 0.2106, "step": 578 }, { "epoch": 2.7440758293838865, "grad_norm": 1.4174417301169415, "learning_rate": 9.106406843749683e-06, "loss": 0.203, "step": 579 }, { "epoch": 2.748815165876777, "grad_norm": 1.6631284624013194, "learning_rate": 9.101682026974086e-06, "loss": 0.2728, "step": 580 }, { "epoch": 2.7535545023696684, "grad_norm": 3.139731671637309, "learning_rate": 9.096945984543082e-06, "loss": 0.3951, "step": 581 }, { "epoch": 2.758293838862559, "grad_norm": 1.8737860217419084, "learning_rate": 9.09219872941847e-06, "loss": 0.2615, "step": 582 }, { "epoch": 2.7630331753554502, "grad_norm": 2.714671958141015, "learning_rate": 9.08744027459274e-06, "loss": 0.2639, "step": 583 }, { "epoch": 2.7677725118483414, "grad_norm": 2.1847837467103957, "learning_rate": 9.082670633089028e-06, "loss": 0.4095, "step": 584 }, { "epoch": 2.772511848341232, "grad_norm": 1.3923844550111264, "learning_rate": 9.077889817961089e-06, "loss": 0.191, "step": 585 }, { "epoch": 2.7772511848341233, "grad_norm": 1.5983034252351458, "learning_rate": 9.07309784229326e-06, "loss": 0.2239, "step": 586 }, { "epoch": 2.781990521327014, "grad_norm": 1.44827964004181, "learning_rate": 9.068294719200422e-06, "loss": 0.1246, "step": 587 }, { "epoch": 2.786729857819905, "grad_norm": 1.6251016973740264, "learning_rate": 9.063480461827958e-06, "loss": 0.3035, "step": 588 }, { "epoch": 2.7914691943127963, "grad_norm": 2.52281647071451, "learning_rate": 9.058655083351736e-06, "loss": 0.2279, "step": 589 }, { "epoch": 2.7962085308056874, "grad_norm": 2.0494578495916147, "learning_rate": 9.053818596978051e-06, "loss": 0.1287, "step": 590 }, { "epoch": 2.800947867298578, "grad_norm": 1.3704845634766936, "learning_rate": 9.0489710159436e-06, "loss": 0.1626, "step": 591 }, { "epoch": 2.8056872037914693, "grad_norm": 2.793908843701991, "learning_rate": 9.044112353515451e-06, "loss": 0.1489, "step": 592 }, { "epoch": 2.81042654028436, "grad_norm": 1.2485798376541588, "learning_rate": 9.039242622990991e-06, "loss": 0.0234, "step": 593 }, { "epoch": 2.815165876777251, "grad_norm": 1.450764852010275, "learning_rate": 9.034361837697905e-06, "loss": 0.1147, "step": 594 }, { "epoch": 2.8199052132701423, "grad_norm": 2.0316327906965688, "learning_rate": 9.029470010994129e-06, "loss": 0.3353, "step": 595 }, { "epoch": 2.824644549763033, "grad_norm": 1.9981459231385812, "learning_rate": 9.02456715626782e-06, "loss": 0.2966, "step": 596 }, { "epoch": 2.829383886255924, "grad_norm": 3.7732602269541897, "learning_rate": 9.01965328693732e-06, "loss": 0.1657, "step": 597 }, { "epoch": 2.834123222748815, "grad_norm": 1.3140375816715553, "learning_rate": 9.014728416451108e-06, "loss": 0.1341, "step": 598 }, { "epoch": 2.838862559241706, "grad_norm": 4.055146109244563, "learning_rate": 9.009792558287777e-06, "loss": 0.0918, "step": 599 }, { "epoch": 2.843601895734597, "grad_norm": 1.5987581638887849, "learning_rate": 9.004845725955993e-06, "loss": 0.1849, "step": 600 }, { "epoch": 2.843601895734597, "eval_loss": 0.2646949291229248, "eval_runtime": 7.7562, "eval_samples_per_second": 24.239, "eval_steps_per_second": 6.06, "step": 600 }, { "epoch": 2.8483412322274884, "grad_norm": 1.4311870136077258, "learning_rate": 8.999887932994451e-06, "loss": 0.2196, "step": 601 }, { "epoch": 2.853080568720379, "grad_norm": 1.5536357323703605, "learning_rate": 8.994919192971849e-06, "loss": 0.1919, "step": 602 }, { "epoch": 2.8578199052132702, "grad_norm": 2.701719571128564, "learning_rate": 8.989939519486843e-06, "loss": 0.26, "step": 603 }, { "epoch": 2.862559241706161, "grad_norm": 1.636624610366843, "learning_rate": 8.984948926168014e-06, "loss": 0.2494, "step": 604 }, { "epoch": 2.867298578199052, "grad_norm": 2.1995481470676457, "learning_rate": 8.97994742667382e-06, "loss": 0.3229, "step": 605 }, { "epoch": 2.8720379146919433, "grad_norm": 3.077386203425302, "learning_rate": 8.974935034692584e-06, "loss": 0.2151, "step": 606 }, { "epoch": 2.876777251184834, "grad_norm": 2.5094488229151795, "learning_rate": 8.969911763942422e-06, "loss": 0.0976, "step": 607 }, { "epoch": 2.881516587677725, "grad_norm": 2.0153082763397583, "learning_rate": 8.96487762817124e-06, "loss": 0.3267, "step": 608 }, { "epoch": 2.8862559241706163, "grad_norm": 2.504820362595467, "learning_rate": 8.959832641156668e-06, "loss": 0.4973, "step": 609 }, { "epoch": 2.890995260663507, "grad_norm": 1.3840947253341358, "learning_rate": 8.954776816706034e-06, "loss": 0.2252, "step": 610 }, { "epoch": 2.895734597156398, "grad_norm": 1.1475318182666634, "learning_rate": 8.949710168656338e-06, "loss": 0.1105, "step": 611 }, { "epoch": 2.9004739336492893, "grad_norm": 1.9729878568274046, "learning_rate": 8.94463271087419e-06, "loss": 0.2582, "step": 612 }, { "epoch": 2.90521327014218, "grad_norm": 1.8025160811381695, "learning_rate": 8.939544457255792e-06, "loss": 0.3355, "step": 613 }, { "epoch": 2.909952606635071, "grad_norm": 1.9099688440590183, "learning_rate": 8.934445421726888e-06, "loss": 0.3046, "step": 614 }, { "epoch": 2.914691943127962, "grad_norm": 1.6522115356989961, "learning_rate": 8.929335618242733e-06, "loss": 0.2582, "step": 615 }, { "epoch": 2.919431279620853, "grad_norm": 2.298386865763964, "learning_rate": 8.924215060788052e-06, "loss": 0.4088, "step": 616 }, { "epoch": 2.924170616113744, "grad_norm": 2.147731530462001, "learning_rate": 8.919083763377001e-06, "loss": 0.2494, "step": 617 }, { "epoch": 2.9289099526066353, "grad_norm": 1.5422635054909362, "learning_rate": 8.91394174005313e-06, "loss": 0.2203, "step": 618 }, { "epoch": 2.933649289099526, "grad_norm": 1.700833802485036, "learning_rate": 8.908789004889344e-06, "loss": 0.3243, "step": 619 }, { "epoch": 2.938388625592417, "grad_norm": 2.2151123515867686, "learning_rate": 8.903625571987863e-06, "loss": 0.2661, "step": 620 }, { "epoch": 2.943127962085308, "grad_norm": 1.595637385058635, "learning_rate": 8.89845145548019e-06, "loss": 0.2488, "step": 621 }, { "epoch": 2.947867298578199, "grad_norm": 1.479935313492178, "learning_rate": 8.893266669527063e-06, "loss": 0.2238, "step": 622 }, { "epoch": 2.9526066350710902, "grad_norm": 2.2751814656976803, "learning_rate": 8.888071228318422e-06, "loss": 0.5095, "step": 623 }, { "epoch": 2.957345971563981, "grad_norm": 2.4969401553056625, "learning_rate": 8.882865146073365e-06, "loss": 0.1634, "step": 624 }, { "epoch": 2.962085308056872, "grad_norm": 1.532477373318212, "learning_rate": 8.877648437040121e-06, "loss": 0.2584, "step": 625 }, { "epoch": 2.966824644549763, "grad_norm": 2.5398952366709056, "learning_rate": 8.872421115495996e-06, "loss": 0.5054, "step": 626 }, { "epoch": 2.971563981042654, "grad_norm": 2.7892159953371167, "learning_rate": 8.867183195747343e-06, "loss": 0.3134, "step": 627 }, { "epoch": 2.976303317535545, "grad_norm": 1.4358314980311842, "learning_rate": 8.861934692129519e-06, "loss": 0.1268, "step": 628 }, { "epoch": 2.9810426540284363, "grad_norm": 3.110877657855391, "learning_rate": 8.85667561900685e-06, "loss": 0.2085, "step": 629 }, { "epoch": 2.985781990521327, "grad_norm": 1.4709345915042766, "learning_rate": 8.851405990772588e-06, "loss": 0.104, "step": 630 }, { "epoch": 2.990521327014218, "grad_norm": 1.608173580874095, "learning_rate": 8.846125821848874e-06, "loss": 0.2228, "step": 631 }, { "epoch": 2.995260663507109, "grad_norm": 1.7819696416704576, "learning_rate": 8.840835126686694e-06, "loss": 0.1161, "step": 632 }, { "epoch": 3.0, "grad_norm": 2.3936724791423845, "learning_rate": 8.835533919765844e-06, "loss": 0.2689, "step": 633 } ], "logging_steps": 1.0, "max_steps": 2110, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 18303531614208.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }