{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017543859649122807, "grad_norm": 6.018052101135254, "learning_rate": 0.0, "loss": 1.789, "mean_token_accuracy": 0.5678549408912659, "num_tokens": 429478.0, "step": 1 }, { "epoch": 0.0035087719298245615, "grad_norm": 6.373941898345947, "learning_rate": 1.7543859649122807e-06, "loss": 1.7812, "mean_token_accuracy": 0.5701305270195007, "num_tokens": 824562.0, "step": 2 }, { "epoch": 0.005263157894736842, "grad_norm": 6.300591468811035, "learning_rate": 3.5087719298245615e-06, "loss": 1.787, "mean_token_accuracy": 0.5685350894927979, "num_tokens": 1228422.0, "step": 3 }, { "epoch": 0.007017543859649123, "grad_norm": 4.870020389556885, "learning_rate": 5.263157894736842e-06, "loss": 1.7577, "mean_token_accuracy": 0.570589542388916, "num_tokens": 1633223.0, "step": 4 }, { "epoch": 0.008771929824561403, "grad_norm": 3.564033269882202, "learning_rate": 7.017543859649123e-06, "loss": 1.6964, "mean_token_accuracy": 0.5784072279930115, "num_tokens": 2060143.0, "step": 5 }, { "epoch": 0.010526315789473684, "grad_norm": 2.9137721061706543, "learning_rate": 8.771929824561403e-06, "loss": 1.646, "mean_token_accuracy": 0.5835782289505005, "num_tokens": 2503690.0, "step": 6 }, { "epoch": 0.012280701754385965, "grad_norm": 2.481250047683716, "learning_rate": 1.0526315789473684e-05, "loss": 1.5991, "mean_token_accuracy": 0.5927026271820068, "num_tokens": 2904426.0, "step": 7 }, { "epoch": 0.014035087719298246, "grad_norm": 3.379573345184326, "learning_rate": 1.2280701754385964e-05, "loss": 1.5736, "mean_token_accuracy": 0.5973439812660217, "num_tokens": 3299360.0, "step": 8 }, { "epoch": 0.015789473684210527, "grad_norm": 2.4704713821411133, "learning_rate": 1.4035087719298246e-05, "loss": 1.5427, "mean_token_accuracy": 0.6033717393875122, "num_tokens": 3709604.0, "step": 9 }, { "epoch": 0.017543859649122806, "grad_norm": 1.8616167306900024, "learning_rate": 1.5789473684210526e-05, "loss": 1.5179, "mean_token_accuracy": 0.606142520904541, "num_tokens": 4122427.0, "step": 10 }, { "epoch": 0.01929824561403509, "grad_norm": 1.90486478805542, "learning_rate": 1.7543859649122806e-05, "loss": 1.5291, "mean_token_accuracy": 0.6004294157028198, "num_tokens": 4549800.0, "step": 11 }, { "epoch": 0.021052631578947368, "grad_norm": 1.6314690113067627, "learning_rate": 1.929824561403509e-05, "loss": 1.5063, "mean_token_accuracy": 0.6060437560081482, "num_tokens": 4972367.0, "step": 12 }, { "epoch": 0.02280701754385965, "grad_norm": 1.4108632802963257, "learning_rate": 2.105263157894737e-05, "loss": 1.4803, "mean_token_accuracy": 0.6093297004699707, "num_tokens": 5381784.0, "step": 13 }, { "epoch": 0.02456140350877193, "grad_norm": 1.3794684410095215, "learning_rate": 2.280701754385965e-05, "loss": 1.4411, "mean_token_accuracy": 0.6182389259338379, "num_tokens": 5783941.0, "step": 14 }, { "epoch": 0.02631578947368421, "grad_norm": 1.2287817001342773, "learning_rate": 2.456140350877193e-05, "loss": 1.423, "mean_token_accuracy": 0.621440052986145, "num_tokens": 6173946.0, "step": 15 }, { "epoch": 0.028070175438596492, "grad_norm": 1.2201406955718994, "learning_rate": 2.6315789473684212e-05, "loss": 1.4455, "mean_token_accuracy": 0.6170344352722168, "num_tokens": 6588292.0, "step": 16 }, { "epoch": 0.02982456140350877, "grad_norm": 1.1274272203445435, "learning_rate": 2.8070175438596492e-05, "loss": 1.4051, "mean_token_accuracy": 0.6251938343048096, "num_tokens": 7005165.0, "step": 17 }, { "epoch": 0.031578947368421054, "grad_norm": 1.040313482284546, "learning_rate": 2.9824561403508772e-05, "loss": 1.4229, "mean_token_accuracy": 0.6200644373893738, "num_tokens": 7422596.0, "step": 18 }, { "epoch": 0.03333333333333333, "grad_norm": 1.0897908210754395, "learning_rate": 3.157894736842105e-05, "loss": 1.3782, "mean_token_accuracy": 0.6302369236946106, "num_tokens": 7812857.0, "step": 19 }, { "epoch": 0.03508771929824561, "grad_norm": 1.08786940574646, "learning_rate": 3.3333333333333335e-05, "loss": 1.4219, "mean_token_accuracy": 0.6207268834114075, "num_tokens": 8244992.0, "step": 20 }, { "epoch": 0.03684210526315789, "grad_norm": 0.991358757019043, "learning_rate": 3.508771929824561e-05, "loss": 1.3912, "mean_token_accuracy": 0.6278927326202393, "num_tokens": 8664134.0, "step": 21 }, { "epoch": 0.03859649122807018, "grad_norm": 1.1284328699111938, "learning_rate": 3.6842105263157895e-05, "loss": 1.4004, "mean_token_accuracy": 0.6226587295532227, "num_tokens": 9082923.0, "step": 22 }, { "epoch": 0.04035087719298246, "grad_norm": 1.1181979179382324, "learning_rate": 3.859649122807018e-05, "loss": 1.3971, "mean_token_accuracy": 0.6253641247749329, "num_tokens": 9497934.0, "step": 23 }, { "epoch": 0.042105263157894736, "grad_norm": 1.2639045715332031, "learning_rate": 4.0350877192982455e-05, "loss": 1.4259, "mean_token_accuracy": 0.6176168322563171, "num_tokens": 9923716.0, "step": 24 }, { "epoch": 0.043859649122807015, "grad_norm": 1.0910519361495972, "learning_rate": 4.210526315789474e-05, "loss": 1.3733, "mean_token_accuracy": 0.6294311285018921, "num_tokens": 10353257.0, "step": 25 }, { "epoch": 0.0456140350877193, "grad_norm": 1.0363545417785645, "learning_rate": 4.3859649122807014e-05, "loss": 1.3809, "mean_token_accuracy": 0.6275283098220825, "num_tokens": 10767965.0, "step": 26 }, { "epoch": 0.04736842105263158, "grad_norm": 1.048935055732727, "learning_rate": 4.56140350877193e-05, "loss": 1.3815, "mean_token_accuracy": 0.6274988651275635, "num_tokens": 11188411.0, "step": 27 }, { "epoch": 0.04912280701754386, "grad_norm": 1.3705120086669922, "learning_rate": 4.736842105263158e-05, "loss": 1.357, "mean_token_accuracy": 0.6328938007354736, "num_tokens": 11597031.0, "step": 28 }, { "epoch": 0.05087719298245614, "grad_norm": 1.2502895593643188, "learning_rate": 4.912280701754386e-05, "loss": 1.3848, "mean_token_accuracy": 0.626395583152771, "num_tokens": 11996650.0, "step": 29 }, { "epoch": 0.05263157894736842, "grad_norm": 0.9786079525947571, "learning_rate": 5.087719298245615e-05, "loss": 1.3669, "mean_token_accuracy": 0.6301195621490479, "num_tokens": 12414351.0, "step": 30 }, { "epoch": 0.054385964912280704, "grad_norm": 1.322411298751831, "learning_rate": 5.2631578947368424e-05, "loss": 1.3703, "mean_token_accuracy": 0.6289023756980896, "num_tokens": 12853897.0, "step": 31 }, { "epoch": 0.056140350877192984, "grad_norm": 1.1092149019241333, "learning_rate": 5.438596491228071e-05, "loss": 1.343, "mean_token_accuracy": 0.6342843770980835, "num_tokens": 13246819.0, "step": 32 }, { "epoch": 0.05789473684210526, "grad_norm": 1.39752995967865, "learning_rate": 5.6140350877192984e-05, "loss": 1.3795, "mean_token_accuracy": 0.6259216666221619, "num_tokens": 13668134.0, "step": 33 }, { "epoch": 0.05964912280701754, "grad_norm": 1.2338861227035522, "learning_rate": 5.789473684210527e-05, "loss": 1.3461, "mean_token_accuracy": 0.6343478560447693, "num_tokens": 14077677.0, "step": 34 }, { "epoch": 0.06140350877192982, "grad_norm": 1.1885336637496948, "learning_rate": 5.9649122807017544e-05, "loss": 1.3637, "mean_token_accuracy": 0.6306077241897583, "num_tokens": 14502863.0, "step": 35 }, { "epoch": 0.06315789473684211, "grad_norm": 1.471887469291687, "learning_rate": 6.140350877192983e-05, "loss": 1.3589, "mean_token_accuracy": 0.6298720836639404, "num_tokens": 14940331.0, "step": 36 }, { "epoch": 0.06491228070175438, "grad_norm": 1.0302767753601074, "learning_rate": 6.31578947368421e-05, "loss": 1.3397, "mean_token_accuracy": 0.6345314979553223, "num_tokens": 15343369.0, "step": 37 }, { "epoch": 0.06666666666666667, "grad_norm": 1.42778480052948, "learning_rate": 6.49122807017544e-05, "loss": 1.3546, "mean_token_accuracy": 0.6308321356773376, "num_tokens": 15752951.0, "step": 38 }, { "epoch": 0.06842105263157895, "grad_norm": 1.52997624874115, "learning_rate": 6.666666666666667e-05, "loss": 1.3207, "mean_token_accuracy": 0.6400465965270996, "num_tokens": 16147599.0, "step": 39 }, { "epoch": 0.07017543859649122, "grad_norm": 1.1962817907333374, "learning_rate": 6.842105263157895e-05, "loss": 1.3581, "mean_token_accuracy": 0.6308744549751282, "num_tokens": 16557268.0, "step": 40 }, { "epoch": 0.07192982456140351, "grad_norm": 1.1255979537963867, "learning_rate": 7.017543859649122e-05, "loss": 1.3227, "mean_token_accuracy": 0.639037013053894, "num_tokens": 16950170.0, "step": 41 }, { "epoch": 0.07368421052631578, "grad_norm": 1.3424605131149292, "learning_rate": 7.192982456140351e-05, "loss": 1.3326, "mean_token_accuracy": 0.6364561319351196, "num_tokens": 17369691.0, "step": 42 }, { "epoch": 0.07543859649122807, "grad_norm": 1.4676284790039062, "learning_rate": 7.368421052631579e-05, "loss": 1.3922, "mean_token_accuracy": 0.6229093074798584, "num_tokens": 17777844.0, "step": 43 }, { "epoch": 0.07719298245614035, "grad_norm": 1.339996099472046, "learning_rate": 7.543859649122808e-05, "loss": 1.3405, "mean_token_accuracy": 0.6354778409004211, "num_tokens": 18191802.0, "step": 44 }, { "epoch": 0.07894736842105263, "grad_norm": 1.7620866298675537, "learning_rate": 7.719298245614036e-05, "loss": 1.3225, "mean_token_accuracy": 0.6384661197662354, "num_tokens": 18608551.0, "step": 45 }, { "epoch": 0.08070175438596491, "grad_norm": 1.4890868663787842, "learning_rate": 7.894736842105263e-05, "loss": 1.3829, "mean_token_accuracy": 0.6231105327606201, "num_tokens": 19020275.0, "step": 46 }, { "epoch": 0.0824561403508772, "grad_norm": 1.3470134735107422, "learning_rate": 8.070175438596491e-05, "loss": 1.3068, "mean_token_accuracy": 0.6428367495536804, "num_tokens": 19416792.0, "step": 47 }, { "epoch": 0.08421052631578947, "grad_norm": 1.2967629432678223, "learning_rate": 8.24561403508772e-05, "loss": 1.3352, "mean_token_accuracy": 0.6349055171012878, "num_tokens": 19842380.0, "step": 48 }, { "epoch": 0.08596491228070176, "grad_norm": 1.5379173755645752, "learning_rate": 8.421052631578948e-05, "loss": 1.3475, "mean_token_accuracy": 0.6317988634109497, "num_tokens": 20257340.0, "step": 49 }, { "epoch": 0.08771929824561403, "grad_norm": 1.2810230255126953, "learning_rate": 8.596491228070177e-05, "loss": 1.3337, "mean_token_accuracy": 0.6345160007476807, "num_tokens": 20693853.0, "step": 50 }, { "epoch": 0.08947368421052632, "grad_norm": 1.5687311887741089, "learning_rate": 8.771929824561403e-05, "loss": 1.3213, "mean_token_accuracy": 0.637017011642456, "num_tokens": 21145069.0, "step": 51 }, { "epoch": 0.0912280701754386, "grad_norm": 1.3021150827407837, "learning_rate": 8.947368421052632e-05, "loss": 1.3366, "mean_token_accuracy": 0.6351794600486755, "num_tokens": 21548907.0, "step": 52 }, { "epoch": 0.09298245614035087, "grad_norm": 1.6907377243041992, "learning_rate": 9.12280701754386e-05, "loss": 1.3326, "mean_token_accuracy": 0.6350468397140503, "num_tokens": 21947011.0, "step": 53 }, { "epoch": 0.09473684210526316, "grad_norm": 1.4103087186813354, "learning_rate": 9.298245614035089e-05, "loss": 1.3593, "mean_token_accuracy": 0.6285296082496643, "num_tokens": 22384995.0, "step": 54 }, { "epoch": 0.09649122807017543, "grad_norm": 1.3662679195404053, "learning_rate": 9.473684210526316e-05, "loss": 1.2921, "mean_token_accuracy": 0.644428014755249, "num_tokens": 22774976.0, "step": 55 }, { "epoch": 0.09824561403508772, "grad_norm": 1.4143177270889282, "learning_rate": 9.649122807017544e-05, "loss": 1.3537, "mean_token_accuracy": 0.6295624375343323, "num_tokens": 23192796.0, "step": 56 }, { "epoch": 0.1, "grad_norm": 1.085375189781189, "learning_rate": 9.824561403508771e-05, "loss": 1.3102, "mean_token_accuracy": 0.6399141550064087, "num_tokens": 23610316.0, "step": 57 }, { "epoch": 0.10175438596491228, "grad_norm": 1.3309866189956665, "learning_rate": 0.0001, "loss": 1.3685, "mean_token_accuracy": 0.6267704367637634, "num_tokens": 24043550.0, "step": 58 }, { "epoch": 0.10350877192982456, "grad_norm": 1.4298138618469238, "learning_rate": 0.0001, "loss": 1.3263, "mean_token_accuracy": 0.6358023285865784, "num_tokens": 24482051.0, "step": 59 }, { "epoch": 0.10526315789473684, "grad_norm": 1.3495875597000122, "learning_rate": 0.0001, "loss": 1.3231, "mean_token_accuracy": 0.6365145444869995, "num_tokens": 24889567.0, "step": 60 }, { "epoch": 0.10701754385964912, "grad_norm": 1.3433363437652588, "learning_rate": 0.0001, "loss": 1.3265, "mean_token_accuracy": 0.6364420652389526, "num_tokens": 25296636.0, "step": 61 }, { "epoch": 0.10877192982456141, "grad_norm": 1.4023200273513794, "learning_rate": 0.0001, "loss": 1.2956, "mean_token_accuracy": 0.6436514854431152, "num_tokens": 25694614.0, "step": 62 }, { "epoch": 0.11052631578947368, "grad_norm": 1.4170814752578735, "learning_rate": 0.0001, "loss": 1.308, "mean_token_accuracy": 0.6407404541969299, "num_tokens": 26107689.0, "step": 63 }, { "epoch": 0.11228070175438597, "grad_norm": 1.198994755744934, "learning_rate": 0.0001, "loss": 1.3057, "mean_token_accuracy": 0.6392735838890076, "num_tokens": 26522427.0, "step": 64 }, { "epoch": 0.11403508771929824, "grad_norm": 1.422518014907837, "learning_rate": 0.0001, "loss": 1.3237, "mean_token_accuracy": 0.6369431018829346, "num_tokens": 26934123.0, "step": 65 }, { "epoch": 0.11578947368421053, "grad_norm": 1.3225864171981812, "learning_rate": 0.0001, "loss": 1.3213, "mean_token_accuracy": 0.637446403503418, "num_tokens": 27358915.0, "step": 66 }, { "epoch": 0.11754385964912281, "grad_norm": 1.1103287935256958, "learning_rate": 0.0001, "loss": 1.3107, "mean_token_accuracy": 0.6421770453453064, "num_tokens": 27775556.0, "step": 67 }, { "epoch": 0.11929824561403508, "grad_norm": 1.1607317924499512, "learning_rate": 0.0001, "loss": 1.3219, "mean_token_accuracy": 0.6369372010231018, "num_tokens": 28174439.0, "step": 68 }, { "epoch": 0.12105263157894737, "grad_norm": 1.121587872505188, "learning_rate": 0.0001, "loss": 1.2874, "mean_token_accuracy": 0.6452154517173767, "num_tokens": 28591718.0, "step": 69 }, { "epoch": 0.12280701754385964, "grad_norm": 1.347907304763794, "learning_rate": 0.0001, "loss": 1.3031, "mean_token_accuracy": 0.6407017707824707, "num_tokens": 29010174.0, "step": 70 }, { "epoch": 0.12456140350877193, "grad_norm": 0.9920047521591187, "learning_rate": 0.0001, "loss": 1.3081, "mean_token_accuracy": 0.6407448053359985, "num_tokens": 29430243.0, "step": 71 }, { "epoch": 0.12631578947368421, "grad_norm": 1.4440033435821533, "learning_rate": 0.0001, "loss": 1.317, "mean_token_accuracy": 0.6371080279350281, "num_tokens": 29852311.0, "step": 72 }, { "epoch": 0.1280701754385965, "grad_norm": 1.172947645187378, "learning_rate": 0.0001, "loss": 1.2955, "mean_token_accuracy": 0.6423896551132202, "num_tokens": 30267287.0, "step": 73 }, { "epoch": 0.12982456140350876, "grad_norm": 1.2112936973571777, "learning_rate": 0.0001, "loss": 1.3155, "mean_token_accuracy": 0.6372196674346924, "num_tokens": 30715583.0, "step": 74 }, { "epoch": 0.13157894736842105, "grad_norm": 1.1959091424942017, "learning_rate": 0.0001, "loss": 1.326, "mean_token_accuracy": 0.6362699866294861, "num_tokens": 31143599.0, "step": 75 }, { "epoch": 0.13333333333333333, "grad_norm": 1.3436111211776733, "learning_rate": 0.0001, "loss": 1.3162, "mean_token_accuracy": 0.6371738910675049, "num_tokens": 31573952.0, "step": 76 }, { "epoch": 0.13508771929824562, "grad_norm": 1.101008653640747, "learning_rate": 0.0001, "loss": 1.334, "mean_token_accuracy": 0.6339811086654663, "num_tokens": 31985309.0, "step": 77 }, { "epoch": 0.1368421052631579, "grad_norm": 1.2296723127365112, "learning_rate": 0.0001, "loss": 1.3346, "mean_token_accuracy": 0.6317209005355835, "num_tokens": 32422743.0, "step": 78 }, { "epoch": 0.13859649122807016, "grad_norm": 1.0157369375228882, "learning_rate": 0.0001, "loss": 1.3197, "mean_token_accuracy": 0.6366320848464966, "num_tokens": 32830001.0, "step": 79 }, { "epoch": 0.14035087719298245, "grad_norm": 1.1848087310791016, "learning_rate": 0.0001, "loss": 1.2867, "mean_token_accuracy": 0.6441957950592041, "num_tokens": 33242795.0, "step": 80 }, { "epoch": 0.14210526315789473, "grad_norm": 1.035370945930481, "learning_rate": 0.0001, "loss": 1.291, "mean_token_accuracy": 0.6427246332168579, "num_tokens": 33631163.0, "step": 81 }, { "epoch": 0.14385964912280702, "grad_norm": 1.2173899412155151, "learning_rate": 0.0001, "loss": 1.331, "mean_token_accuracy": 0.6329866647720337, "num_tokens": 34030802.0, "step": 82 }, { "epoch": 0.1456140350877193, "grad_norm": 1.2178702354431152, "learning_rate": 0.0001, "loss": 1.3159, "mean_token_accuracy": 0.6359639167785645, "num_tokens": 34443782.0, "step": 83 }, { "epoch": 0.14736842105263157, "grad_norm": 1.045278787612915, "learning_rate": 0.0001, "loss": 1.3248, "mean_token_accuracy": 0.6354624629020691, "num_tokens": 34854553.0, "step": 84 }, { "epoch": 0.14912280701754385, "grad_norm": 1.0509958267211914, "learning_rate": 0.0001, "loss": 1.3038, "mean_token_accuracy": 0.6399705410003662, "num_tokens": 35265577.0, "step": 85 }, { "epoch": 0.15087719298245614, "grad_norm": 1.1449450254440308, "learning_rate": 0.0001, "loss": 1.3004, "mean_token_accuracy": 0.6418460011482239, "num_tokens": 35705670.0, "step": 86 }, { "epoch": 0.15263157894736842, "grad_norm": 1.254193902015686, "learning_rate": 0.0001, "loss": 1.3088, "mean_token_accuracy": 0.6380743980407715, "num_tokens": 36117458.0, "step": 87 }, { "epoch": 0.1543859649122807, "grad_norm": 1.107653021812439, "learning_rate": 0.0001, "loss": 1.3009, "mean_token_accuracy": 0.6418921947479248, "num_tokens": 36528322.0, "step": 88 }, { "epoch": 0.156140350877193, "grad_norm": 0.9854401350021362, "learning_rate": 0.0001, "loss": 1.3022, "mean_token_accuracy": 0.6411072015762329, "num_tokens": 36937089.0, "step": 89 }, { "epoch": 0.15789473684210525, "grad_norm": 0.9852709174156189, "learning_rate": 0.0001, "loss": 1.2818, "mean_token_accuracy": 0.6445783972740173, "num_tokens": 37347449.0, "step": 90 }, { "epoch": 0.15964912280701754, "grad_norm": 1.0607930421829224, "learning_rate": 0.0001, "loss": 1.3042, "mean_token_accuracy": 0.6411298513412476, "num_tokens": 37768132.0, "step": 91 }, { "epoch": 0.16140350877192983, "grad_norm": 0.8618792295455933, "learning_rate": 0.0001, "loss": 1.2899, "mean_token_accuracy": 0.6437462568283081, "num_tokens": 38173420.0, "step": 92 }, { "epoch": 0.1631578947368421, "grad_norm": 0.9967447519302368, "learning_rate": 0.0001, "loss": 1.2854, "mean_token_accuracy": 0.6431381106376648, "num_tokens": 38564492.0, "step": 93 }, { "epoch": 0.1649122807017544, "grad_norm": 0.984609842300415, "learning_rate": 0.0001, "loss": 1.3059, "mean_token_accuracy": 0.6391075849533081, "num_tokens": 38973529.0, "step": 94 }, { "epoch": 0.16666666666666666, "grad_norm": 1.2071311473846436, "learning_rate": 0.0001, "loss": 1.3399, "mean_token_accuracy": 0.6329282522201538, "num_tokens": 39394470.0, "step": 95 }, { "epoch": 0.16842105263157894, "grad_norm": 0.976823627948761, "learning_rate": 0.0001, "loss": 1.3189, "mean_token_accuracy": 0.6357463598251343, "num_tokens": 39817247.0, "step": 96 }, { "epoch": 0.17017543859649123, "grad_norm": 1.1396266222000122, "learning_rate": 0.0001, "loss": 1.3212, "mean_token_accuracy": 0.6357501745223999, "num_tokens": 40219578.0, "step": 97 }, { "epoch": 0.17192982456140352, "grad_norm": 1.375174880027771, "learning_rate": 0.0001, "loss": 1.3187, "mean_token_accuracy": 0.6361098289489746, "num_tokens": 40631135.0, "step": 98 }, { "epoch": 0.1736842105263158, "grad_norm": 1.1790404319763184, "learning_rate": 0.0001, "loss": 1.2962, "mean_token_accuracy": 0.6430338621139526, "num_tokens": 41060591.0, "step": 99 }, { "epoch": 0.17543859649122806, "grad_norm": 1.1208826303482056, "learning_rate": 0.0001, "loss": 1.3243, "mean_token_accuracy": 0.6350250244140625, "num_tokens": 41491310.0, "step": 100 }, { "epoch": 0.17719298245614035, "grad_norm": 0.9812876582145691, "learning_rate": 0.0001, "loss": 1.2989, "mean_token_accuracy": 0.6403151154518127, "num_tokens": 41928251.0, "step": 101 }, { "epoch": 0.17894736842105263, "grad_norm": 1.118895173072815, "learning_rate": 0.0001, "loss": 1.2881, "mean_token_accuracy": 0.643653392791748, "num_tokens": 42328746.0, "step": 102 }, { "epoch": 0.18070175438596492, "grad_norm": 1.0872011184692383, "learning_rate": 0.0001, "loss": 1.3338, "mean_token_accuracy": 0.6348128318786621, "num_tokens": 42770651.0, "step": 103 }, { "epoch": 0.1824561403508772, "grad_norm": 1.0117576122283936, "learning_rate": 0.0001, "loss": 1.305, "mean_token_accuracy": 0.6394751071929932, "num_tokens": 43173759.0, "step": 104 }, { "epoch": 0.18421052631578946, "grad_norm": 0.9142250418663025, "learning_rate": 0.0001, "loss": 1.2908, "mean_token_accuracy": 0.642844557762146, "num_tokens": 43604295.0, "step": 105 }, { "epoch": 0.18596491228070175, "grad_norm": 1.1038587093353271, "learning_rate": 0.0001, "loss": 1.2963, "mean_token_accuracy": 0.6419985294342041, "num_tokens": 44008348.0, "step": 106 }, { "epoch": 0.18771929824561404, "grad_norm": 0.928559422492981, "learning_rate": 0.0001, "loss": 1.3107, "mean_token_accuracy": 0.6373360753059387, "num_tokens": 44444613.0, "step": 107 }, { "epoch": 0.18947368421052632, "grad_norm": 1.0053200721740723, "learning_rate": 0.0001, "loss": 1.2887, "mean_token_accuracy": 0.6448897123336792, "num_tokens": 44875312.0, "step": 108 }, { "epoch": 0.1912280701754386, "grad_norm": 0.9399821758270264, "learning_rate": 0.0001, "loss": 1.2996, "mean_token_accuracy": 0.6389566659927368, "num_tokens": 45273670.0, "step": 109 }, { "epoch": 0.19298245614035087, "grad_norm": 1.2514432668685913, "learning_rate": 0.0001, "loss": 1.2788, "mean_token_accuracy": 0.6447431445121765, "num_tokens": 45696861.0, "step": 110 }, { "epoch": 0.19473684210526315, "grad_norm": 0.9928343892097473, "learning_rate": 0.0001, "loss": 1.303, "mean_token_accuracy": 0.639816403388977, "num_tokens": 46115387.0, "step": 111 }, { "epoch": 0.19649122807017544, "grad_norm": 1.0918611288070679, "learning_rate": 0.0001, "loss": 1.2904, "mean_token_accuracy": 0.6424538493156433, "num_tokens": 46521933.0, "step": 112 }, { "epoch": 0.19824561403508772, "grad_norm": 1.1192419528961182, "learning_rate": 0.0001, "loss": 1.3263, "mean_token_accuracy": 0.634386420249939, "num_tokens": 46941357.0, "step": 113 }, { "epoch": 0.2, "grad_norm": 0.9753395318984985, "learning_rate": 0.0001, "loss": 1.2792, "mean_token_accuracy": 0.6461848020553589, "num_tokens": 47343683.0, "step": 114 }, { "epoch": 0.20175438596491227, "grad_norm": 0.8872193694114685, "learning_rate": 0.0001, "loss": 1.2928, "mean_token_accuracy": 0.6433566808700562, "num_tokens": 47787665.0, "step": 115 }, { "epoch": 0.20350877192982456, "grad_norm": 0.9394273161888123, "learning_rate": 0.0001, "loss": 1.2839, "mean_token_accuracy": 0.643855094909668, "num_tokens": 48190313.0, "step": 116 }, { "epoch": 0.20526315789473684, "grad_norm": 1.136915922164917, "learning_rate": 0.0001, "loss": 1.2904, "mean_token_accuracy": 0.6421671509742737, "num_tokens": 48630247.0, "step": 117 }, { "epoch": 0.20701754385964913, "grad_norm": 0.9522098898887634, "learning_rate": 0.0001, "loss": 1.2942, "mean_token_accuracy": 0.6419311761856079, "num_tokens": 49009657.0, "step": 118 }, { "epoch": 0.20877192982456141, "grad_norm": 1.1538357734680176, "learning_rate": 0.0001, "loss": 1.2708, "mean_token_accuracy": 0.6458480358123779, "num_tokens": 49398873.0, "step": 119 }, { "epoch": 0.21052631578947367, "grad_norm": 0.9239334464073181, "learning_rate": 0.0001, "loss": 1.2768, "mean_token_accuracy": 0.6459267139434814, "num_tokens": 49804381.0, "step": 120 }, { "epoch": 0.21228070175438596, "grad_norm": 0.9793084859848022, "learning_rate": 0.0001, "loss": 1.2712, "mean_token_accuracy": 0.6456162929534912, "num_tokens": 50213766.0, "step": 121 }, { "epoch": 0.21403508771929824, "grad_norm": 1.1136904954910278, "learning_rate": 0.0001, "loss": 1.2877, "mean_token_accuracy": 0.6435809135437012, "num_tokens": 50625155.0, "step": 122 }, { "epoch": 0.21578947368421053, "grad_norm": 0.8962170481681824, "learning_rate": 0.0001, "loss": 1.2929, "mean_token_accuracy": 0.6421340703964233, "num_tokens": 51028769.0, "step": 123 }, { "epoch": 0.21754385964912282, "grad_norm": 1.0955440998077393, "learning_rate": 0.0001, "loss": 1.2801, "mean_token_accuracy": 0.6436057090759277, "num_tokens": 51442144.0, "step": 124 }, { "epoch": 0.21929824561403508, "grad_norm": 0.9009307622909546, "learning_rate": 0.0001, "loss": 1.2709, "mean_token_accuracy": 0.6477597951889038, "num_tokens": 51847840.0, "step": 125 }, { "epoch": 0.22105263157894736, "grad_norm": 1.0885659456253052, "learning_rate": 0.0001, "loss": 1.2771, "mean_token_accuracy": 0.6435371041297913, "num_tokens": 52284344.0, "step": 126 }, { "epoch": 0.22280701754385965, "grad_norm": 0.92705899477005, "learning_rate": 0.0001, "loss": 1.3212, "mean_token_accuracy": 0.6340687870979309, "num_tokens": 52685505.0, "step": 127 }, { "epoch": 0.22456140350877193, "grad_norm": 0.9139009118080139, "learning_rate": 0.0001, "loss": 1.2932, "mean_token_accuracy": 0.6407190561294556, "num_tokens": 53110059.0, "step": 128 }, { "epoch": 0.22631578947368422, "grad_norm": 0.8279791474342346, "learning_rate": 0.0001, "loss": 1.2659, "mean_token_accuracy": 0.6474618911743164, "num_tokens": 53502719.0, "step": 129 }, { "epoch": 0.22807017543859648, "grad_norm": 0.9933703541755676, "learning_rate": 0.0001, "loss": 1.2681, "mean_token_accuracy": 0.6483220458030701, "num_tokens": 53917317.0, "step": 130 }, { "epoch": 0.22982456140350876, "grad_norm": 0.887478768825531, "learning_rate": 0.0001, "loss": 1.3002, "mean_token_accuracy": 0.6406154632568359, "num_tokens": 54338305.0, "step": 131 }, { "epoch": 0.23157894736842105, "grad_norm": 0.8612638711929321, "learning_rate": 0.0001, "loss": 1.2774, "mean_token_accuracy": 0.6442515850067139, "num_tokens": 54754376.0, "step": 132 }, { "epoch": 0.23333333333333334, "grad_norm": 0.850595235824585, "learning_rate": 0.0001, "loss": 1.3222, "mean_token_accuracy": 0.6351276636123657, "num_tokens": 55177668.0, "step": 133 }, { "epoch": 0.23508771929824562, "grad_norm": 1.1265441179275513, "learning_rate": 0.0001, "loss": 1.2869, "mean_token_accuracy": 0.6440436840057373, "num_tokens": 55588413.0, "step": 134 }, { "epoch": 0.23684210526315788, "grad_norm": 0.8181601762771606, "learning_rate": 0.0001, "loss": 1.2689, "mean_token_accuracy": 0.6486095190048218, "num_tokens": 56002661.0, "step": 135 }, { "epoch": 0.23859649122807017, "grad_norm": 0.9597206115722656, "learning_rate": 0.0001, "loss": 1.2685, "mean_token_accuracy": 0.6476383209228516, "num_tokens": 56399978.0, "step": 136 }, { "epoch": 0.24035087719298245, "grad_norm": 0.9021192193031311, "learning_rate": 0.0001, "loss": 1.277, "mean_token_accuracy": 0.6456701159477234, "num_tokens": 56805057.0, "step": 137 }, { "epoch": 0.24210526315789474, "grad_norm": 0.9269475936889648, "learning_rate": 0.0001, "loss": 1.2937, "mean_token_accuracy": 0.6424517631530762, "num_tokens": 57221548.0, "step": 138 }, { "epoch": 0.24385964912280703, "grad_norm": 0.9395855069160461, "learning_rate": 0.0001, "loss": 1.2905, "mean_token_accuracy": 0.6421390175819397, "num_tokens": 57619205.0, "step": 139 }, { "epoch": 0.24561403508771928, "grad_norm": 1.0334845781326294, "learning_rate": 0.0001, "loss": 1.2727, "mean_token_accuracy": 0.6469994783401489, "num_tokens": 58029104.0, "step": 140 }, { "epoch": 0.24736842105263157, "grad_norm": 1.080823302268982, "learning_rate": 0.0001, "loss": 1.3086, "mean_token_accuracy": 0.6387298107147217, "num_tokens": 58444372.0, "step": 141 }, { "epoch": 0.24912280701754386, "grad_norm": 0.8953016400337219, "learning_rate": 0.0001, "loss": 1.3045, "mean_token_accuracy": 0.6378800272941589, "num_tokens": 58859539.0, "step": 142 }, { "epoch": 0.25087719298245614, "grad_norm": 0.8567958474159241, "learning_rate": 0.0001, "loss": 1.3121, "mean_token_accuracy": 0.6372794508934021, "num_tokens": 59268101.0, "step": 143 }, { "epoch": 0.25263157894736843, "grad_norm": 1.158692479133606, "learning_rate": 0.0001, "loss": 1.2691, "mean_token_accuracy": 0.6473510265350342, "num_tokens": 59708311.0, "step": 144 }, { "epoch": 0.2543859649122807, "grad_norm": 0.9232509732246399, "learning_rate": 0.0001, "loss": 1.2682, "mean_token_accuracy": 0.648154616355896, "num_tokens": 60135806.0, "step": 145 }, { "epoch": 0.256140350877193, "grad_norm": 0.9411163330078125, "learning_rate": 0.0001, "loss": 1.3155, "mean_token_accuracy": 0.6371098756790161, "num_tokens": 60546162.0, "step": 146 }, { "epoch": 0.2578947368421053, "grad_norm": 1.013136863708496, "learning_rate": 0.0001, "loss": 1.2655, "mean_token_accuracy": 0.646664023399353, "num_tokens": 60977521.0, "step": 147 }, { "epoch": 0.2596491228070175, "grad_norm": 1.1551271677017212, "learning_rate": 0.0001, "loss": 1.2798, "mean_token_accuracy": 0.6451727747917175, "num_tokens": 61372998.0, "step": 148 }, { "epoch": 0.2614035087719298, "grad_norm": 0.8795229196548462, "learning_rate": 0.0001, "loss": 1.2982, "mean_token_accuracy": 0.6401211023330688, "num_tokens": 61781320.0, "step": 149 }, { "epoch": 0.2631578947368421, "grad_norm": 0.965307891368866, "learning_rate": 0.0001, "loss": 1.2788, "mean_token_accuracy": 0.6439850330352783, "num_tokens": 62199535.0, "step": 150 }, { "epoch": 0.2649122807017544, "grad_norm": 0.9804089665412903, "learning_rate": 0.0001, "loss": 1.2765, "mean_token_accuracy": 0.6456645727157593, "num_tokens": 62632976.0, "step": 151 }, { "epoch": 0.26666666666666666, "grad_norm": 0.9098561406135559, "learning_rate": 0.0001, "loss": 1.2564, "mean_token_accuracy": 0.6501985192298889, "num_tokens": 63043041.0, "step": 152 }, { "epoch": 0.26842105263157895, "grad_norm": 0.7934507727622986, "learning_rate": 0.0001, "loss": 1.2638, "mean_token_accuracy": 0.6469926834106445, "num_tokens": 63453330.0, "step": 153 }, { "epoch": 0.27017543859649124, "grad_norm": 1.0823460817337036, "learning_rate": 0.0001, "loss": 1.2983, "mean_token_accuracy": 0.6407555341720581, "num_tokens": 63864814.0, "step": 154 }, { "epoch": 0.2719298245614035, "grad_norm": 0.7126585841178894, "learning_rate": 0.0001, "loss": 1.2528, "mean_token_accuracy": 0.6504640579223633, "num_tokens": 64280965.0, "step": 155 }, { "epoch": 0.2736842105263158, "grad_norm": 0.9585691690444946, "learning_rate": 0.0001, "loss": 1.2686, "mean_token_accuracy": 0.6472254395484924, "num_tokens": 64672526.0, "step": 156 }, { "epoch": 0.2754385964912281, "grad_norm": 0.752656102180481, "learning_rate": 0.0001, "loss": 1.2474, "mean_token_accuracy": 0.6523414850234985, "num_tokens": 65077313.0, "step": 157 }, { "epoch": 0.2771929824561403, "grad_norm": 0.9288751482963562, "learning_rate": 0.0001, "loss": 1.2624, "mean_token_accuracy": 0.646138072013855, "num_tokens": 65487692.0, "step": 158 }, { "epoch": 0.2789473684210526, "grad_norm": 0.8809065222740173, "learning_rate": 0.0001, "loss": 1.2607, "mean_token_accuracy": 0.6489483118057251, "num_tokens": 65905410.0, "step": 159 }, { "epoch": 0.2807017543859649, "grad_norm": 0.9240980744361877, "learning_rate": 0.0001, "loss": 1.2958, "mean_token_accuracy": 0.6388487219810486, "num_tokens": 66310475.0, "step": 160 }, { "epoch": 0.2824561403508772, "grad_norm": 0.8388931751251221, "learning_rate": 0.0001, "loss": 1.2838, "mean_token_accuracy": 0.6431634426116943, "num_tokens": 66727367.0, "step": 161 }, { "epoch": 0.28421052631578947, "grad_norm": 0.8820334076881409, "learning_rate": 0.0001, "loss": 1.303, "mean_token_accuracy": 0.63872230052948, "num_tokens": 67154750.0, "step": 162 }, { "epoch": 0.28596491228070176, "grad_norm": 0.8385342359542847, "learning_rate": 0.0001, "loss": 1.2632, "mean_token_accuracy": 0.6464277505874634, "num_tokens": 67571403.0, "step": 163 }, { "epoch": 0.28771929824561404, "grad_norm": 0.8737322092056274, "learning_rate": 0.0001, "loss": 1.248, "mean_token_accuracy": 0.6486573219299316, "num_tokens": 67947644.0, "step": 164 }, { "epoch": 0.2894736842105263, "grad_norm": 0.8021559119224548, "learning_rate": 0.0001, "loss": 1.2716, "mean_token_accuracy": 0.644127607345581, "num_tokens": 68362596.0, "step": 165 }, { "epoch": 0.2912280701754386, "grad_norm": 0.7599727511405945, "learning_rate": 0.0001, "loss": 1.2511, "mean_token_accuracy": 0.6510897874832153, "num_tokens": 68768455.0, "step": 166 }, { "epoch": 0.2929824561403509, "grad_norm": 0.8421241044998169, "learning_rate": 0.0001, "loss": 1.2839, "mean_token_accuracy": 0.6423199772834778, "num_tokens": 69182971.0, "step": 167 }, { "epoch": 0.29473684210526313, "grad_norm": 0.8853815793991089, "learning_rate": 0.0001, "loss": 1.2698, "mean_token_accuracy": 0.6462802886962891, "num_tokens": 69633484.0, "step": 168 }, { "epoch": 0.2964912280701754, "grad_norm": 1.0347827672958374, "learning_rate": 0.0001, "loss": 1.2935, "mean_token_accuracy": 0.6394780874252319, "num_tokens": 70069352.0, "step": 169 }, { "epoch": 0.2982456140350877, "grad_norm": 0.8993912935256958, "learning_rate": 0.0001, "loss": 1.2689, "mean_token_accuracy": 0.6471203565597534, "num_tokens": 70478616.0, "step": 170 }, { "epoch": 0.3, "grad_norm": 0.8886722922325134, "learning_rate": 0.0001, "loss": 1.2943, "mean_token_accuracy": 0.6398030519485474, "num_tokens": 70905952.0, "step": 171 }, { "epoch": 0.3017543859649123, "grad_norm": 1.1024540662765503, "learning_rate": 0.0001, "loss": 1.2799, "mean_token_accuracy": 0.6444812417030334, "num_tokens": 71317263.0, "step": 172 }, { "epoch": 0.30350877192982456, "grad_norm": 0.8578200340270996, "learning_rate": 0.0001, "loss": 1.2851, "mean_token_accuracy": 0.6433255672454834, "num_tokens": 71764261.0, "step": 173 }, { "epoch": 0.30526315789473685, "grad_norm": 0.9540239572525024, "learning_rate": 0.0001, "loss": 1.2777, "mean_token_accuracy": 0.644009530544281, "num_tokens": 72179518.0, "step": 174 }, { "epoch": 0.30701754385964913, "grad_norm": 0.9197105169296265, "learning_rate": 0.0001, "loss": 1.2664, "mean_token_accuracy": 0.6461474895477295, "num_tokens": 72594932.0, "step": 175 }, { "epoch": 0.3087719298245614, "grad_norm": 0.7414684891700745, "learning_rate": 0.0001, "loss": 1.2688, "mean_token_accuracy": 0.6475616097450256, "num_tokens": 73005473.0, "step": 176 }, { "epoch": 0.3105263157894737, "grad_norm": 0.9558865427970886, "learning_rate": 0.0001, "loss": 1.2673, "mean_token_accuracy": 0.6460444927215576, "num_tokens": 73397415.0, "step": 177 }, { "epoch": 0.312280701754386, "grad_norm": 0.9229446053504944, "learning_rate": 0.0001, "loss": 1.2738, "mean_token_accuracy": 0.6450560092926025, "num_tokens": 73793964.0, "step": 178 }, { "epoch": 0.3140350877192982, "grad_norm": 0.7821291089057922, "learning_rate": 0.0001, "loss": 1.2489, "mean_token_accuracy": 0.6500634551048279, "num_tokens": 74195270.0, "step": 179 }, { "epoch": 0.3157894736842105, "grad_norm": 0.7419561743736267, "learning_rate": 0.0001, "loss": 1.2612, "mean_token_accuracy": 0.6482713222503662, "num_tokens": 74603802.0, "step": 180 }, { "epoch": 0.3175438596491228, "grad_norm": 0.7956511378288269, "learning_rate": 0.0001, "loss": 1.2644, "mean_token_accuracy": 0.6461498141288757, "num_tokens": 75026083.0, "step": 181 }, { "epoch": 0.3192982456140351, "grad_norm": 0.7759901285171509, "learning_rate": 0.0001, "loss": 1.2647, "mean_token_accuracy": 0.6459044218063354, "num_tokens": 75426052.0, "step": 182 }, { "epoch": 0.32105263157894737, "grad_norm": 0.8206394910812378, "learning_rate": 0.0001, "loss": 1.2661, "mean_token_accuracy": 0.646410346031189, "num_tokens": 75830225.0, "step": 183 }, { "epoch": 0.32280701754385965, "grad_norm": 0.9032196402549744, "learning_rate": 0.0001, "loss": 1.2677, "mean_token_accuracy": 0.64692223072052, "num_tokens": 76241940.0, "step": 184 }, { "epoch": 0.32456140350877194, "grad_norm": 0.7018728256225586, "learning_rate": 0.0001, "loss": 1.2505, "mean_token_accuracy": 0.6487954258918762, "num_tokens": 76663142.0, "step": 185 }, { "epoch": 0.3263157894736842, "grad_norm": 0.9026210904121399, "learning_rate": 0.0001, "loss": 1.2483, "mean_token_accuracy": 0.652391254901886, "num_tokens": 77102129.0, "step": 186 }, { "epoch": 0.3280701754385965, "grad_norm": 0.8878228068351746, "learning_rate": 0.0001, "loss": 1.2736, "mean_token_accuracy": 0.6432400941848755, "num_tokens": 77508487.0, "step": 187 }, { "epoch": 0.3298245614035088, "grad_norm": 0.9250103235244751, "learning_rate": 0.0001, "loss": 1.2426, "mean_token_accuracy": 0.651581883430481, "num_tokens": 77908351.0, "step": 188 }, { "epoch": 0.33157894736842103, "grad_norm": 0.6793785095214844, "learning_rate": 0.0001, "loss": 1.2453, "mean_token_accuracy": 0.6514815092086792, "num_tokens": 78320845.0, "step": 189 }, { "epoch": 0.3333333333333333, "grad_norm": 0.7402032017707825, "learning_rate": 0.0001, "loss": 1.2523, "mean_token_accuracy": 0.6495726704597473, "num_tokens": 78732383.0, "step": 190 }, { "epoch": 0.3350877192982456, "grad_norm": 0.9974339604377747, "learning_rate": 0.0001, "loss": 1.2545, "mean_token_accuracy": 0.6485158801078796, "num_tokens": 79123256.0, "step": 191 }, { "epoch": 0.3368421052631579, "grad_norm": 0.9054931998252869, "learning_rate": 0.0001, "loss": 1.2538, "mean_token_accuracy": 0.6507511734962463, "num_tokens": 79525484.0, "step": 192 }, { "epoch": 0.3385964912280702, "grad_norm": 0.7434863448143005, "learning_rate": 0.0001, "loss": 1.2615, "mean_token_accuracy": 0.6481617093086243, "num_tokens": 79955140.0, "step": 193 }, { "epoch": 0.34035087719298246, "grad_norm": 0.7779750823974609, "learning_rate": 0.0001, "loss": 1.2739, "mean_token_accuracy": 0.6448099613189697, "num_tokens": 80387845.0, "step": 194 }, { "epoch": 0.34210526315789475, "grad_norm": 0.8742808103561401, "learning_rate": 0.0001, "loss": 1.2674, "mean_token_accuracy": 0.6466174125671387, "num_tokens": 80810849.0, "step": 195 }, { "epoch": 0.34385964912280703, "grad_norm": 0.810045063495636, "learning_rate": 0.0001, "loss": 1.2722, "mean_token_accuracy": 0.6458349227905273, "num_tokens": 81226626.0, "step": 196 }, { "epoch": 0.3456140350877193, "grad_norm": 0.7127732634544373, "learning_rate": 0.0001, "loss": 1.2912, "mean_token_accuracy": 0.6405543684959412, "num_tokens": 81648045.0, "step": 197 }, { "epoch": 0.3473684210526316, "grad_norm": 0.8309784531593323, "learning_rate": 0.0001, "loss": 1.254, "mean_token_accuracy": 0.6493059396743774, "num_tokens": 82055621.0, "step": 198 }, { "epoch": 0.34912280701754383, "grad_norm": 0.8503166437149048, "learning_rate": 0.0001, "loss": 1.2772, "mean_token_accuracy": 0.6441330909729004, "num_tokens": 82454820.0, "step": 199 }, { "epoch": 0.3508771929824561, "grad_norm": 0.8834285736083984, "learning_rate": 0.0001, "loss": 1.2701, "mean_token_accuracy": 0.6456678509712219, "num_tokens": 82881912.0, "step": 200 }, { "epoch": 0.3526315789473684, "grad_norm": 0.7746639847755432, "learning_rate": 0.0001, "loss": 1.2731, "mean_token_accuracy": 0.6454135775566101, "num_tokens": 83294708.0, "step": 201 }, { "epoch": 0.3543859649122807, "grad_norm": 0.8626236915588379, "learning_rate": 0.0001, "loss": 1.2677, "mean_token_accuracy": 0.6472684144973755, "num_tokens": 83692153.0, "step": 202 }, { "epoch": 0.356140350877193, "grad_norm": 0.8129353523254395, "learning_rate": 0.0001, "loss": 1.2504, "mean_token_accuracy": 0.649857223033905, "num_tokens": 84106215.0, "step": 203 }, { "epoch": 0.35789473684210527, "grad_norm": 0.9501094818115234, "learning_rate": 0.0001, "loss": 1.2788, "mean_token_accuracy": 0.6440906524658203, "num_tokens": 84533326.0, "step": 204 }, { "epoch": 0.35964912280701755, "grad_norm": 0.7424087524414062, "learning_rate": 0.0001, "loss": 1.2663, "mean_token_accuracy": 0.6469358205795288, "num_tokens": 84958198.0, "step": 205 }, { "epoch": 0.36140350877192984, "grad_norm": 0.7956259846687317, "learning_rate": 0.0001, "loss": 1.2691, "mean_token_accuracy": 0.6459769010543823, "num_tokens": 85375870.0, "step": 206 }, { "epoch": 0.3631578947368421, "grad_norm": 0.7288737893104553, "learning_rate": 0.0001, "loss": 1.2707, "mean_token_accuracy": 0.6465530395507812, "num_tokens": 85800348.0, "step": 207 }, { "epoch": 0.3649122807017544, "grad_norm": 0.7138190865516663, "learning_rate": 0.0001, "loss": 1.2415, "mean_token_accuracy": 0.6529696583747864, "num_tokens": 86195245.0, "step": 208 }, { "epoch": 0.36666666666666664, "grad_norm": 0.9041345119476318, "learning_rate": 0.0001, "loss": 1.2673, "mean_token_accuracy": 0.6487336754798889, "num_tokens": 86599515.0, "step": 209 }, { "epoch": 0.3684210526315789, "grad_norm": 0.7553381323814392, "learning_rate": 0.0001, "loss": 1.298, "mean_token_accuracy": 0.6391161680221558, "num_tokens": 87039537.0, "step": 210 }, { "epoch": 0.3701754385964912, "grad_norm": 0.7526540160179138, "learning_rate": 0.0001, "loss": 1.2465, "mean_token_accuracy": 0.6537913084030151, "num_tokens": 87426029.0, "step": 211 }, { "epoch": 0.3719298245614035, "grad_norm": 0.9352124333381653, "learning_rate": 0.0001, "loss": 1.2655, "mean_token_accuracy": 0.6462576389312744, "num_tokens": 87817987.0, "step": 212 }, { "epoch": 0.3736842105263158, "grad_norm": 0.8342838883399963, "learning_rate": 0.0001, "loss": 1.2356, "mean_token_accuracy": 0.653677225112915, "num_tokens": 88218779.0, "step": 213 }, { "epoch": 0.37543859649122807, "grad_norm": 0.7606971263885498, "learning_rate": 0.0001, "loss": 1.2423, "mean_token_accuracy": 0.6509567499160767, "num_tokens": 88610670.0, "step": 214 }, { "epoch": 0.37719298245614036, "grad_norm": 0.9147993326187134, "learning_rate": 0.0001, "loss": 1.2777, "mean_token_accuracy": 0.6447807550430298, "num_tokens": 89031094.0, "step": 215 }, { "epoch": 0.37894736842105264, "grad_norm": 0.8798630833625793, "learning_rate": 0.0001, "loss": 1.282, "mean_token_accuracy": 0.6422438025474548, "num_tokens": 89465235.0, "step": 216 }, { "epoch": 0.38070175438596493, "grad_norm": 0.7571805119514465, "learning_rate": 0.0001, "loss": 1.2503, "mean_token_accuracy": 0.6498503684997559, "num_tokens": 89867145.0, "step": 217 }, { "epoch": 0.3824561403508772, "grad_norm": 0.9793193936347961, "learning_rate": 0.0001, "loss": 1.248, "mean_token_accuracy": 0.6518094539642334, "num_tokens": 90262494.0, "step": 218 }, { "epoch": 0.38421052631578945, "grad_norm": 0.871235728263855, "learning_rate": 0.0001, "loss": 1.2707, "mean_token_accuracy": 0.6453557014465332, "num_tokens": 90671131.0, "step": 219 }, { "epoch": 0.38596491228070173, "grad_norm": 0.7807226181030273, "learning_rate": 0.0001, "loss": 1.2607, "mean_token_accuracy": 0.6465795040130615, "num_tokens": 91092593.0, "step": 220 }, { "epoch": 0.387719298245614, "grad_norm": 0.9600160121917725, "learning_rate": 0.0001, "loss": 1.2791, "mean_token_accuracy": 0.6422662734985352, "num_tokens": 91501248.0, "step": 221 }, { "epoch": 0.3894736842105263, "grad_norm": 0.8549517393112183, "learning_rate": 0.0001, "loss": 1.243, "mean_token_accuracy": 0.6511504650115967, "num_tokens": 91913007.0, "step": 222 }, { "epoch": 0.3912280701754386, "grad_norm": 0.7951960563659668, "learning_rate": 0.0001, "loss": 1.2377, "mean_token_accuracy": 0.6538807153701782, "num_tokens": 92321188.0, "step": 223 }, { "epoch": 0.3929824561403509, "grad_norm": 0.8606045842170715, "learning_rate": 0.0001, "loss": 1.2678, "mean_token_accuracy": 0.645931601524353, "num_tokens": 92726934.0, "step": 224 }, { "epoch": 0.39473684210526316, "grad_norm": 0.7008436322212219, "learning_rate": 0.0001, "loss": 1.2384, "mean_token_accuracy": 0.6525442600250244, "num_tokens": 93128966.0, "step": 225 }, { "epoch": 0.39649122807017545, "grad_norm": 0.7526488304138184, "learning_rate": 0.0001, "loss": 1.2738, "mean_token_accuracy": 0.6442047357559204, "num_tokens": 93567917.0, "step": 226 }, { "epoch": 0.39824561403508774, "grad_norm": 0.8679794669151306, "learning_rate": 0.0001, "loss": 1.2548, "mean_token_accuracy": 0.6482324600219727, "num_tokens": 93979268.0, "step": 227 }, { "epoch": 0.4, "grad_norm": 0.8233749270439148, "learning_rate": 0.0001, "loss": 1.2215, "mean_token_accuracy": 0.6573722958564758, "num_tokens": 94367195.0, "step": 228 }, { "epoch": 0.4017543859649123, "grad_norm": 0.7261408567428589, "learning_rate": 0.0001, "loss": 1.2415, "mean_token_accuracy": 0.6515704989433289, "num_tokens": 94759639.0, "step": 229 }, { "epoch": 0.40350877192982454, "grad_norm": 0.7959755659103394, "learning_rate": 0.0001, "loss": 1.2722, "mean_token_accuracy": 0.6438157558441162, "num_tokens": 95191668.0, "step": 230 }, { "epoch": 0.4052631578947368, "grad_norm": 0.8794543147087097, "learning_rate": 0.0001, "loss": 1.2477, "mean_token_accuracy": 0.6511826515197754, "num_tokens": 95614874.0, "step": 231 }, { "epoch": 0.4070175438596491, "grad_norm": 0.7663288116455078, "learning_rate": 0.0001, "loss": 1.2175, "mean_token_accuracy": 0.656987190246582, "num_tokens": 95990695.0, "step": 232 }, { "epoch": 0.4087719298245614, "grad_norm": 0.7509688138961792, "learning_rate": 0.0001, "loss": 1.2395, "mean_token_accuracy": 0.6518152952194214, "num_tokens": 96377947.0, "step": 233 }, { "epoch": 0.4105263157894737, "grad_norm": 0.9182112812995911, "learning_rate": 0.0001, "loss": 1.2567, "mean_token_accuracy": 0.6489981412887573, "num_tokens": 96787894.0, "step": 234 }, { "epoch": 0.41228070175438597, "grad_norm": 0.8123442530632019, "learning_rate": 0.0001, "loss": 1.2541, "mean_token_accuracy": 0.6488667726516724, "num_tokens": 97203762.0, "step": 235 }, { "epoch": 0.41403508771929826, "grad_norm": 0.8581697344779968, "learning_rate": 0.0001, "loss": 1.2595, "mean_token_accuracy": 0.6481375694274902, "num_tokens": 97598494.0, "step": 236 }, { "epoch": 0.41578947368421054, "grad_norm": 0.8051207065582275, "learning_rate": 0.0001, "loss": 1.2763, "mean_token_accuracy": 0.644673228263855, "num_tokens": 98011465.0, "step": 237 }, { "epoch": 0.41754385964912283, "grad_norm": 0.7852127552032471, "learning_rate": 0.0001, "loss": 1.2765, "mean_token_accuracy": 0.6438398361206055, "num_tokens": 98447067.0, "step": 238 }, { "epoch": 0.4192982456140351, "grad_norm": 0.7962046265602112, "learning_rate": 0.0001, "loss": 1.2429, "mean_token_accuracy": 0.6508150100708008, "num_tokens": 98854782.0, "step": 239 }, { "epoch": 0.42105263157894735, "grad_norm": 0.8521065711975098, "learning_rate": 0.0001, "loss": 1.2499, "mean_token_accuracy": 0.64998859167099, "num_tokens": 99276423.0, "step": 240 }, { "epoch": 0.42280701754385963, "grad_norm": 0.8006791472434998, "learning_rate": 0.0001, "loss": 1.2523, "mean_token_accuracy": 0.65036940574646, "num_tokens": 99705527.0, "step": 241 }, { "epoch": 0.4245614035087719, "grad_norm": 0.6923927664756775, "learning_rate": 0.0001, "loss": 1.2698, "mean_token_accuracy": 0.645989179611206, "num_tokens": 100144851.0, "step": 242 }, { "epoch": 0.4263157894736842, "grad_norm": 0.8310588002204895, "learning_rate": 0.0001, "loss": 1.2131, "mean_token_accuracy": 0.6578903198242188, "num_tokens": 100546756.0, "step": 243 }, { "epoch": 0.4280701754385965, "grad_norm": 0.7767439484596252, "learning_rate": 0.0001, "loss": 1.2374, "mean_token_accuracy": 0.6534087061882019, "num_tokens": 100961732.0, "step": 244 }, { "epoch": 0.4298245614035088, "grad_norm": 0.7211782932281494, "learning_rate": 0.0001, "loss": 1.2627, "mean_token_accuracy": 0.6475375294685364, "num_tokens": 101396866.0, "step": 245 }, { "epoch": 0.43157894736842106, "grad_norm": 0.754098117351532, "learning_rate": 0.0001, "loss": 1.2535, "mean_token_accuracy": 0.6487017869949341, "num_tokens": 101823395.0, "step": 246 }, { "epoch": 0.43333333333333335, "grad_norm": 0.887698233127594, "learning_rate": 0.0001, "loss": 1.2372, "mean_token_accuracy": 0.6514610052108765, "num_tokens": 102218896.0, "step": 247 }, { "epoch": 0.43508771929824563, "grad_norm": 0.6688896417617798, "learning_rate": 0.0001, "loss": 1.2135, "mean_token_accuracy": 0.6570154428482056, "num_tokens": 102635943.0, "step": 248 }, { "epoch": 0.4368421052631579, "grad_norm": 0.6720183491706848, "learning_rate": 0.0001, "loss": 1.2314, "mean_token_accuracy": 0.653835654258728, "num_tokens": 103060426.0, "step": 249 }, { "epoch": 0.43859649122807015, "grad_norm": 0.6985954642295837, "learning_rate": 0.0001, "loss": 1.2302, "mean_token_accuracy": 0.6543055176734924, "num_tokens": 103480891.0, "step": 250 }, { "epoch": 0.44035087719298244, "grad_norm": 0.7861040234565735, "learning_rate": 0.0001, "loss": 1.2259, "mean_token_accuracy": 0.6543919444084167, "num_tokens": 103896368.0, "step": 251 }, { "epoch": 0.4421052631578947, "grad_norm": 0.7467155456542969, "learning_rate": 0.0001, "loss": 1.2425, "mean_token_accuracy": 0.6521680355072021, "num_tokens": 104318424.0, "step": 252 }, { "epoch": 0.443859649122807, "grad_norm": 0.689565896987915, "learning_rate": 0.0001, "loss": 1.2384, "mean_token_accuracy": 0.6529023051261902, "num_tokens": 104743917.0, "step": 253 }, { "epoch": 0.4456140350877193, "grad_norm": 0.8311668634414673, "learning_rate": 0.0001, "loss": 1.2286, "mean_token_accuracy": 0.6543080806732178, "num_tokens": 105146836.0, "step": 254 }, { "epoch": 0.4473684210526316, "grad_norm": 0.8047776818275452, "learning_rate": 0.0001, "loss": 1.2555, "mean_token_accuracy": 0.6480646133422852, "num_tokens": 105552302.0, "step": 255 }, { "epoch": 0.44912280701754387, "grad_norm": 0.6903892159461975, "learning_rate": 0.0001, "loss": 1.2878, "mean_token_accuracy": 0.6401119232177734, "num_tokens": 105963414.0, "step": 256 }, { "epoch": 0.45087719298245615, "grad_norm": 0.7000618577003479, "learning_rate": 0.0001, "loss": 1.2557, "mean_token_accuracy": 0.6474447846412659, "num_tokens": 106368736.0, "step": 257 }, { "epoch": 0.45263157894736844, "grad_norm": 0.7351795434951782, "learning_rate": 0.0001, "loss": 1.2237, "mean_token_accuracy": 0.6556580662727356, "num_tokens": 106769771.0, "step": 258 }, { "epoch": 0.4543859649122807, "grad_norm": 0.7257981300354004, "learning_rate": 0.0001, "loss": 1.2416, "mean_token_accuracy": 0.6521273255348206, "num_tokens": 107170029.0, "step": 259 }, { "epoch": 0.45614035087719296, "grad_norm": 0.8522328734397888, "learning_rate": 0.0001, "loss": 1.2301, "mean_token_accuracy": 0.6529244184494019, "num_tokens": 107576140.0, "step": 260 }, { "epoch": 0.45789473684210524, "grad_norm": 0.6672490835189819, "learning_rate": 0.0001, "loss": 1.2414, "mean_token_accuracy": 0.6505205631256104, "num_tokens": 108009067.0, "step": 261 }, { "epoch": 0.45964912280701753, "grad_norm": 0.8998327255249023, "learning_rate": 0.0001, "loss": 1.2365, "mean_token_accuracy": 0.6523317694664001, "num_tokens": 108434633.0, "step": 262 }, { "epoch": 0.4614035087719298, "grad_norm": 0.7883278727531433, "learning_rate": 0.0001, "loss": 1.2511, "mean_token_accuracy": 0.648801326751709, "num_tokens": 108878862.0, "step": 263 }, { "epoch": 0.4631578947368421, "grad_norm": 0.9719793796539307, "learning_rate": 0.0001, "loss": 1.2326, "mean_token_accuracy": 0.6538045406341553, "num_tokens": 109287222.0, "step": 264 }, { "epoch": 0.4649122807017544, "grad_norm": 0.6874752044677734, "learning_rate": 0.0001, "loss": 1.2319, "mean_token_accuracy": 0.6524173021316528, "num_tokens": 109693157.0, "step": 265 }, { "epoch": 0.4666666666666667, "grad_norm": 0.8174811601638794, "learning_rate": 0.0001, "loss": 1.2235, "mean_token_accuracy": 0.6545587182044983, "num_tokens": 110099662.0, "step": 266 }, { "epoch": 0.46842105263157896, "grad_norm": 0.7676987051963806, "learning_rate": 0.0001, "loss": 1.252, "mean_token_accuracy": 0.6494687795639038, "num_tokens": 110511759.0, "step": 267 }, { "epoch": 0.47017543859649125, "grad_norm": 0.7034929394721985, "learning_rate": 0.0001, "loss": 1.2328, "mean_token_accuracy": 0.653315544128418, "num_tokens": 110927626.0, "step": 268 }, { "epoch": 0.47192982456140353, "grad_norm": 0.6947440505027771, "learning_rate": 0.0001, "loss": 1.2451, "mean_token_accuracy": 0.6503375172615051, "num_tokens": 111371879.0, "step": 269 }, { "epoch": 0.47368421052631576, "grad_norm": 0.7659525871276855, "learning_rate": 0.0001, "loss": 1.251, "mean_token_accuracy": 0.6494304537773132, "num_tokens": 111784069.0, "step": 270 }, { "epoch": 0.47543859649122805, "grad_norm": 0.7740342617034912, "learning_rate": 0.0001, "loss": 1.2253, "mean_token_accuracy": 0.6542062759399414, "num_tokens": 112186870.0, "step": 271 }, { "epoch": 0.47719298245614034, "grad_norm": 0.65045565366745, "learning_rate": 0.0001, "loss": 1.2305, "mean_token_accuracy": 0.6532946228981018, "num_tokens": 112616585.0, "step": 272 }, { "epoch": 0.4789473684210526, "grad_norm": 0.7001651525497437, "learning_rate": 0.0001, "loss": 1.2247, "mean_token_accuracy": 0.654727041721344, "num_tokens": 113009798.0, "step": 273 }, { "epoch": 0.4807017543859649, "grad_norm": 0.6165850162506104, "learning_rate": 0.0001, "loss": 1.2174, "mean_token_accuracy": 0.6552860736846924, "num_tokens": 113410244.0, "step": 274 }, { "epoch": 0.4824561403508772, "grad_norm": 0.7424379587173462, "learning_rate": 0.0001, "loss": 1.2379, "mean_token_accuracy": 0.653006911277771, "num_tokens": 113805215.0, "step": 275 }, { "epoch": 0.4842105263157895, "grad_norm": 0.7236623167991638, "learning_rate": 0.0001, "loss": 1.2748, "mean_token_accuracy": 0.6438848972320557, "num_tokens": 114224914.0, "step": 276 }, { "epoch": 0.48596491228070177, "grad_norm": 0.6665499210357666, "learning_rate": 0.0001, "loss": 1.205, "mean_token_accuracy": 0.6590371131896973, "num_tokens": 114606381.0, "step": 277 }, { "epoch": 0.48771929824561405, "grad_norm": 0.6881427764892578, "learning_rate": 0.0001, "loss": 1.2456, "mean_token_accuracy": 0.6502711772918701, "num_tokens": 115021603.0, "step": 278 }, { "epoch": 0.48947368421052634, "grad_norm": 0.8498430848121643, "learning_rate": 0.0001, "loss": 1.2443, "mean_token_accuracy": 0.6494314670562744, "num_tokens": 115427417.0, "step": 279 }, { "epoch": 0.49122807017543857, "grad_norm": 0.724355936050415, "learning_rate": 0.0001, "loss": 1.2574, "mean_token_accuracy": 0.6479068398475647, "num_tokens": 115868301.0, "step": 280 }, { "epoch": 0.49298245614035086, "grad_norm": 0.6625252366065979, "learning_rate": 0.0001, "loss": 1.2128, "mean_token_accuracy": 0.6565937995910645, "num_tokens": 116278418.0, "step": 281 }, { "epoch": 0.49473684210526314, "grad_norm": 0.8329636454582214, "learning_rate": 0.0001, "loss": 1.2419, "mean_token_accuracy": 0.650545060634613, "num_tokens": 116681770.0, "step": 282 }, { "epoch": 0.4964912280701754, "grad_norm": 0.8298386335372925, "learning_rate": 0.0001, "loss": 1.2356, "mean_token_accuracy": 0.6532111167907715, "num_tokens": 117054940.0, "step": 283 }, { "epoch": 0.4982456140350877, "grad_norm": 0.7011889219284058, "learning_rate": 0.0001, "loss": 1.2418, "mean_token_accuracy": 0.6518392562866211, "num_tokens": 117477299.0, "step": 284 }, { "epoch": 0.5, "grad_norm": 0.710082471370697, "learning_rate": 0.0001, "loss": 1.2479, "mean_token_accuracy": 0.6494768857955933, "num_tokens": 117885331.0, "step": 285 }, { "epoch": 0.5017543859649123, "grad_norm": 0.8371219038963318, "learning_rate": 0.0001, "loss": 1.2577, "mean_token_accuracy": 0.6471771001815796, "num_tokens": 118307149.0, "step": 286 }, { "epoch": 0.5035087719298246, "grad_norm": 0.8411158919334412, "learning_rate": 0.0001, "loss": 1.219, "mean_token_accuracy": 0.6554292440414429, "num_tokens": 118697262.0, "step": 287 }, { "epoch": 0.5052631578947369, "grad_norm": 0.7115722298622131, "learning_rate": 0.0001, "loss": 1.2501, "mean_token_accuracy": 0.6482685804367065, "num_tokens": 119124053.0, "step": 288 }, { "epoch": 0.5070175438596491, "grad_norm": 0.6575236916542053, "learning_rate": 0.0001, "loss": 1.2555, "mean_token_accuracy": 0.6482589840888977, "num_tokens": 119569185.0, "step": 289 }, { "epoch": 0.5087719298245614, "grad_norm": 0.8516756892204285, "learning_rate": 0.0001, "loss": 1.2332, "mean_token_accuracy": 0.6534713506698608, "num_tokens": 119987270.0, "step": 290 }, { "epoch": 0.5105263157894737, "grad_norm": 0.7346055507659912, "learning_rate": 0.0001, "loss": 1.2347, "mean_token_accuracy": 0.6515175700187683, "num_tokens": 120377177.0, "step": 291 }, { "epoch": 0.512280701754386, "grad_norm": 0.6637006402015686, "learning_rate": 0.0001, "loss": 1.2241, "mean_token_accuracy": 0.6554847359657288, "num_tokens": 120794750.0, "step": 292 }, { "epoch": 0.5140350877192983, "grad_norm": 0.8050562143325806, "learning_rate": 0.0001, "loss": 1.2462, "mean_token_accuracy": 0.6506670713424683, "num_tokens": 121223883.0, "step": 293 }, { "epoch": 0.5157894736842106, "grad_norm": 0.7059856057167053, "learning_rate": 0.0001, "loss": 1.2287, "mean_token_accuracy": 0.6533610820770264, "num_tokens": 121644108.0, "step": 294 }, { "epoch": 0.5175438596491229, "grad_norm": 0.6939064860343933, "learning_rate": 0.0001, "loss": 1.2357, "mean_token_accuracy": 0.6511529684066772, "num_tokens": 122051612.0, "step": 295 }, { "epoch": 0.519298245614035, "grad_norm": 0.8220492601394653, "learning_rate": 0.0001, "loss": 1.2257, "mean_token_accuracy": 0.6545118093490601, "num_tokens": 122430847.0, "step": 296 }, { "epoch": 0.5210526315789473, "grad_norm": 0.7044985294342041, "learning_rate": 0.0001, "loss": 1.2309, "mean_token_accuracy": 0.654273271560669, "num_tokens": 122849234.0, "step": 297 }, { "epoch": 0.5228070175438596, "grad_norm": 0.8146756291389465, "learning_rate": 0.0001, "loss": 1.2489, "mean_token_accuracy": 0.6501311659812927, "num_tokens": 123258290.0, "step": 298 }, { "epoch": 0.5245614035087719, "grad_norm": 0.766899824142456, "learning_rate": 0.0001, "loss": 1.2436, "mean_token_accuracy": 0.6506932973861694, "num_tokens": 123663735.0, "step": 299 }, { "epoch": 0.5263157894736842, "grad_norm": 0.7193543910980225, "learning_rate": 0.0001, "loss": 1.2451, "mean_token_accuracy": 0.6491553783416748, "num_tokens": 124058727.0, "step": 300 }, { "epoch": 0.5280701754385965, "grad_norm": 0.6504607200622559, "learning_rate": 0.0001, "loss": 1.2455, "mean_token_accuracy": 0.6509132385253906, "num_tokens": 124484021.0, "step": 301 }, { "epoch": 0.5298245614035088, "grad_norm": 0.7661638259887695, "learning_rate": 0.0001, "loss": 1.2158, "mean_token_accuracy": 0.6565036177635193, "num_tokens": 124902607.0, "step": 302 }, { "epoch": 0.531578947368421, "grad_norm": 0.73735511302948, "learning_rate": 0.0001, "loss": 1.2408, "mean_token_accuracy": 0.6503796577453613, "num_tokens": 125338175.0, "step": 303 }, { "epoch": 0.5333333333333333, "grad_norm": 0.9022007584571838, "learning_rate": 0.0001, "loss": 1.2332, "mean_token_accuracy": 0.6524848937988281, "num_tokens": 125757431.0, "step": 304 }, { "epoch": 0.5350877192982456, "grad_norm": 0.6961904764175415, "learning_rate": 0.0001, "loss": 1.2217, "mean_token_accuracy": 0.6560453176498413, "num_tokens": 126155529.0, "step": 305 }, { "epoch": 0.5368421052631579, "grad_norm": 0.6821785569190979, "learning_rate": 0.0001, "loss": 1.223, "mean_token_accuracy": 0.6551496982574463, "num_tokens": 126572012.0, "step": 306 }, { "epoch": 0.5385964912280702, "grad_norm": 0.8659482002258301, "learning_rate": 0.0001, "loss": 1.2393, "mean_token_accuracy": 0.6508756875991821, "num_tokens": 126990573.0, "step": 307 }, { "epoch": 0.5403508771929825, "grad_norm": 0.6646002531051636, "learning_rate": 0.0001, "loss": 1.2173, "mean_token_accuracy": 0.6561950445175171, "num_tokens": 127418976.0, "step": 308 }, { "epoch": 0.5421052631578948, "grad_norm": 0.6923218369483948, "learning_rate": 0.0001, "loss": 1.1911, "mean_token_accuracy": 0.6624599695205688, "num_tokens": 127827614.0, "step": 309 }, { "epoch": 0.543859649122807, "grad_norm": 0.6864442825317383, "learning_rate": 0.0001, "loss": 1.2267, "mean_token_accuracy": 0.6538540720939636, "num_tokens": 128258313.0, "step": 310 }, { "epoch": 0.5456140350877193, "grad_norm": 0.7230309247970581, "learning_rate": 0.0001, "loss": 1.2495, "mean_token_accuracy": 0.6486865282058716, "num_tokens": 128702682.0, "step": 311 }, { "epoch": 0.5473684210526316, "grad_norm": 0.6914284825325012, "learning_rate": 0.0001, "loss": 1.2211, "mean_token_accuracy": 0.6558956503868103, "num_tokens": 129094449.0, "step": 312 }, { "epoch": 0.5491228070175439, "grad_norm": 0.6948025822639465, "learning_rate": 0.0001, "loss": 1.2232, "mean_token_accuracy": 0.6554611325263977, "num_tokens": 129492795.0, "step": 313 }, { "epoch": 0.5508771929824562, "grad_norm": 0.6883065104484558, "learning_rate": 0.0001, "loss": 1.225, "mean_token_accuracy": 0.6549999713897705, "num_tokens": 129923670.0, "step": 314 }, { "epoch": 0.5526315789473685, "grad_norm": 0.7065843939781189, "learning_rate": 0.0001, "loss": 1.211, "mean_token_accuracy": 0.6596200466156006, "num_tokens": 130333012.0, "step": 315 }, { "epoch": 0.5543859649122806, "grad_norm": 0.8073469996452332, "learning_rate": 0.0001, "loss": 1.2073, "mean_token_accuracy": 0.6588984727859497, "num_tokens": 130749305.0, "step": 316 }, { "epoch": 0.5561403508771929, "grad_norm": 0.8134505748748779, "learning_rate": 0.0001, "loss": 1.2544, "mean_token_accuracy": 0.6476566791534424, "num_tokens": 131169421.0, "step": 317 }, { "epoch": 0.5578947368421052, "grad_norm": 0.6765173077583313, "learning_rate": 0.0001, "loss": 1.2078, "mean_token_accuracy": 0.6600826978683472, "num_tokens": 131574788.0, "step": 318 }, { "epoch": 0.5596491228070175, "grad_norm": 0.7156663537025452, "learning_rate": 0.0001, "loss": 1.2267, "mean_token_accuracy": 0.6545342206954956, "num_tokens": 131984459.0, "step": 319 }, { "epoch": 0.5614035087719298, "grad_norm": 0.8103324174880981, "learning_rate": 0.0001, "loss": 1.2612, "mean_token_accuracy": 0.6481289267539978, "num_tokens": 132415165.0, "step": 320 }, { "epoch": 0.5631578947368421, "grad_norm": 0.742142915725708, "learning_rate": 0.0001, "loss": 1.2039, "mean_token_accuracy": 0.6596871018409729, "num_tokens": 132838824.0, "step": 321 }, { "epoch": 0.5649122807017544, "grad_norm": 0.7613045573234558, "learning_rate": 0.0001, "loss": 1.2095, "mean_token_accuracy": 0.6572511792182922, "num_tokens": 133247120.0, "step": 322 }, { "epoch": 0.5666666666666667, "grad_norm": 0.7817480564117432, "learning_rate": 0.0001, "loss": 1.2232, "mean_token_accuracy": 0.6545703411102295, "num_tokens": 133672706.0, "step": 323 }, { "epoch": 0.5684210526315789, "grad_norm": 0.6124296188354492, "learning_rate": 0.0001, "loss": 1.2342, "mean_token_accuracy": 0.6552791595458984, "num_tokens": 134079103.0, "step": 324 }, { "epoch": 0.5701754385964912, "grad_norm": 0.6886869668960571, "learning_rate": 0.0001, "loss": 1.2231, "mean_token_accuracy": 0.6547764539718628, "num_tokens": 134494826.0, "step": 325 }, { "epoch": 0.5719298245614035, "grad_norm": 0.6630454659461975, "learning_rate": 0.0001, "loss": 1.2345, "mean_token_accuracy": 0.6520872116088867, "num_tokens": 134919504.0, "step": 326 }, { "epoch": 0.5736842105263158, "grad_norm": 0.8173869252204895, "learning_rate": 0.0001, "loss": 1.2276, "mean_token_accuracy": 0.6533281207084656, "num_tokens": 135331962.0, "step": 327 }, { "epoch": 0.5754385964912281, "grad_norm": 0.6743276715278625, "learning_rate": 0.0001, "loss": 1.2281, "mean_token_accuracy": 0.6535520553588867, "num_tokens": 135745710.0, "step": 328 }, { "epoch": 0.5771929824561404, "grad_norm": 0.6731691360473633, "learning_rate": 0.0001, "loss": 1.2346, "mean_token_accuracy": 0.6528003215789795, "num_tokens": 136174418.0, "step": 329 }, { "epoch": 0.5789473684210527, "grad_norm": 0.6211588382720947, "learning_rate": 0.0001, "loss": 1.2329, "mean_token_accuracy": 0.653133749961853, "num_tokens": 136597949.0, "step": 330 }, { "epoch": 0.5807017543859649, "grad_norm": 0.8585658073425293, "learning_rate": 0.0001, "loss": 1.2506, "mean_token_accuracy": 0.648890495300293, "num_tokens": 137047696.0, "step": 331 }, { "epoch": 0.5824561403508772, "grad_norm": 0.8006256222724915, "learning_rate": 0.0001, "loss": 1.2119, "mean_token_accuracy": 0.658997118473053, "num_tokens": 137428701.0, "step": 332 }, { "epoch": 0.5842105263157895, "grad_norm": 0.692973792552948, "learning_rate": 0.0001, "loss": 1.2167, "mean_token_accuracy": 0.6570533514022827, "num_tokens": 137823083.0, "step": 333 }, { "epoch": 0.5859649122807018, "grad_norm": 0.7685320973396301, "learning_rate": 0.0001, "loss": 1.271, "mean_token_accuracy": 0.6423413157463074, "num_tokens": 138250591.0, "step": 334 }, { "epoch": 0.5877192982456141, "grad_norm": 0.7700155377388, "learning_rate": 0.0001, "loss": 1.2252, "mean_token_accuracy": 0.6543079614639282, "num_tokens": 138660562.0, "step": 335 }, { "epoch": 0.5894736842105263, "grad_norm": 0.7410191893577576, "learning_rate": 0.0001, "loss": 1.2156, "mean_token_accuracy": 0.6561688184738159, "num_tokens": 139077597.0, "step": 336 }, { "epoch": 0.5912280701754385, "grad_norm": 0.7632637619972229, "learning_rate": 0.0001, "loss": 1.235, "mean_token_accuracy": 0.6516165733337402, "num_tokens": 139482698.0, "step": 337 }, { "epoch": 0.5929824561403508, "grad_norm": 0.690731942653656, "learning_rate": 0.0001, "loss": 1.2408, "mean_token_accuracy": 0.6502476930618286, "num_tokens": 139906177.0, "step": 338 }, { "epoch": 0.5947368421052631, "grad_norm": 0.6513046026229858, "learning_rate": 0.0001, "loss": 1.2297, "mean_token_accuracy": 0.6529406905174255, "num_tokens": 140319741.0, "step": 339 }, { "epoch": 0.5964912280701754, "grad_norm": 0.6879235506057739, "learning_rate": 0.0001, "loss": 1.2234, "mean_token_accuracy": 0.6540219783782959, "num_tokens": 140731809.0, "step": 340 }, { "epoch": 0.5982456140350877, "grad_norm": 0.7240639925003052, "learning_rate": 0.0001, "loss": 1.2043, "mean_token_accuracy": 0.6592921018600464, "num_tokens": 141126659.0, "step": 341 }, { "epoch": 0.6, "grad_norm": 0.6559076905250549, "learning_rate": 0.0001, "loss": 1.2353, "mean_token_accuracy": 0.6514889001846313, "num_tokens": 141551619.0, "step": 342 }, { "epoch": 0.6017543859649123, "grad_norm": 0.7054679989814758, "learning_rate": 0.0001, "loss": 1.2225, "mean_token_accuracy": 0.6558979749679565, "num_tokens": 141970744.0, "step": 343 }, { "epoch": 0.6035087719298246, "grad_norm": 0.6867666244506836, "learning_rate": 0.0001, "loss": 1.2162, "mean_token_accuracy": 0.6566264629364014, "num_tokens": 142395401.0, "step": 344 }, { "epoch": 0.6052631578947368, "grad_norm": 0.6507348418235779, "learning_rate": 0.0001, "loss": 1.2244, "mean_token_accuracy": 0.6545271277427673, "num_tokens": 142827910.0, "step": 345 }, { "epoch": 0.6070175438596491, "grad_norm": 0.7520820498466492, "learning_rate": 0.0001, "loss": 1.2315, "mean_token_accuracy": 0.6522303819656372, "num_tokens": 143256693.0, "step": 346 }, { "epoch": 0.6087719298245614, "grad_norm": 0.7250421047210693, "learning_rate": 0.0001, "loss": 1.2126, "mean_token_accuracy": 0.6569870710372925, "num_tokens": 143670113.0, "step": 347 }, { "epoch": 0.6105263157894737, "grad_norm": 0.707240104675293, "learning_rate": 0.0001, "loss": 1.2337, "mean_token_accuracy": 0.6521450281143188, "num_tokens": 144085012.0, "step": 348 }, { "epoch": 0.612280701754386, "grad_norm": 0.6530799269676208, "learning_rate": 0.0001, "loss": 1.2366, "mean_token_accuracy": 0.650078296661377, "num_tokens": 144511304.0, "step": 349 }, { "epoch": 0.6140350877192983, "grad_norm": 0.7164869904518127, "learning_rate": 0.0001, "loss": 1.1957, "mean_token_accuracy": 0.6614063382148743, "num_tokens": 144903389.0, "step": 350 }, { "epoch": 0.6157894736842106, "grad_norm": 0.6941936612129211, "learning_rate": 0.0001, "loss": 1.2244, "mean_token_accuracy": 0.6538809537887573, "num_tokens": 145317796.0, "step": 351 }, { "epoch": 0.6175438596491228, "grad_norm": 0.5569853186607361, "learning_rate": 0.0001, "loss": 1.2303, "mean_token_accuracy": 0.6540141701698303, "num_tokens": 145728692.0, "step": 352 }, { "epoch": 0.6192982456140351, "grad_norm": 0.6453179121017456, "learning_rate": 0.0001, "loss": 1.2219, "mean_token_accuracy": 0.6558411121368408, "num_tokens": 146149428.0, "step": 353 }, { "epoch": 0.6210526315789474, "grad_norm": 0.7571195363998413, "learning_rate": 0.0001, "loss": 1.2371, "mean_token_accuracy": 0.6512084007263184, "num_tokens": 146567803.0, "step": 354 }, { "epoch": 0.6228070175438597, "grad_norm": 0.7026142477989197, "learning_rate": 0.0001, "loss": 1.2577, "mean_token_accuracy": 0.6465089917182922, "num_tokens": 146996287.0, "step": 355 }, { "epoch": 0.624561403508772, "grad_norm": 0.7396862506866455, "learning_rate": 0.0001, "loss": 1.207, "mean_token_accuracy": 0.657769501209259, "num_tokens": 147406032.0, "step": 356 }, { "epoch": 0.6263157894736842, "grad_norm": 0.7301826477050781, "learning_rate": 0.0001, "loss": 1.2549, "mean_token_accuracy": 0.648101270198822, "num_tokens": 147818711.0, "step": 357 }, { "epoch": 0.6280701754385964, "grad_norm": 0.6443963646888733, "learning_rate": 0.0001, "loss": 1.2247, "mean_token_accuracy": 0.6540451645851135, "num_tokens": 148224782.0, "step": 358 }, { "epoch": 0.6298245614035087, "grad_norm": 0.5962257981300354, "learning_rate": 0.0001, "loss": 1.2095, "mean_token_accuracy": 0.6578296422958374, "num_tokens": 148638041.0, "step": 359 }, { "epoch": 0.631578947368421, "grad_norm": 0.553277850151062, "learning_rate": 0.0001, "loss": 1.1954, "mean_token_accuracy": 0.6618661880493164, "num_tokens": 149034209.0, "step": 360 }, { "epoch": 0.6333333333333333, "grad_norm": 0.8141903281211853, "learning_rate": 0.0001, "loss": 1.2181, "mean_token_accuracy": 0.6552863121032715, "num_tokens": 149462197.0, "step": 361 }, { "epoch": 0.6350877192982456, "grad_norm": 0.6312337517738342, "learning_rate": 0.0001, "loss": 1.2237, "mean_token_accuracy": 0.6552015542984009, "num_tokens": 149874309.0, "step": 362 }, { "epoch": 0.6368421052631579, "grad_norm": 0.6863110661506653, "learning_rate": 0.0001, "loss": 1.2144, "mean_token_accuracy": 0.6574706435203552, "num_tokens": 150278354.0, "step": 363 }, { "epoch": 0.6385964912280702, "grad_norm": 0.7062144875526428, "learning_rate": 0.0001, "loss": 1.2309, "mean_token_accuracy": 0.6519078016281128, "num_tokens": 150696393.0, "step": 364 }, { "epoch": 0.6403508771929824, "grad_norm": 0.6141137480735779, "learning_rate": 0.0001, "loss": 1.2175, "mean_token_accuracy": 0.6562414169311523, "num_tokens": 151097966.0, "step": 365 }, { "epoch": 0.6421052631578947, "grad_norm": 0.6939074993133545, "learning_rate": 0.0001, "loss": 1.2115, "mean_token_accuracy": 0.6587799787521362, "num_tokens": 151487261.0, "step": 366 }, { "epoch": 0.643859649122807, "grad_norm": 0.6834867596626282, "learning_rate": 0.0001, "loss": 1.2492, "mean_token_accuracy": 0.6492800116539001, "num_tokens": 151915716.0, "step": 367 }, { "epoch": 0.6456140350877193, "grad_norm": 0.6845062971115112, "learning_rate": 0.0001, "loss": 1.2045, "mean_token_accuracy": 0.6578772664070129, "num_tokens": 152333716.0, "step": 368 }, { "epoch": 0.6473684210526316, "grad_norm": 0.6263954639434814, "learning_rate": 0.0001, "loss": 1.2285, "mean_token_accuracy": 0.6524069905281067, "num_tokens": 152758004.0, "step": 369 }, { "epoch": 0.6491228070175439, "grad_norm": 0.7604780793190002, "learning_rate": 0.0001, "loss": 1.2321, "mean_token_accuracy": 0.6509186029434204, "num_tokens": 153175726.0, "step": 370 }, { "epoch": 0.6508771929824562, "grad_norm": 0.6607220768928528, "learning_rate": 0.0001, "loss": 1.193, "mean_token_accuracy": 0.6612952947616577, "num_tokens": 153573526.0, "step": 371 }, { "epoch": 0.6526315789473685, "grad_norm": 0.7317623496055603, "learning_rate": 0.0001, "loss": 1.2233, "mean_token_accuracy": 0.6555420756340027, "num_tokens": 154001303.0, "step": 372 }, { "epoch": 0.6543859649122807, "grad_norm": 0.5643908381462097, "learning_rate": 0.0001, "loss": 1.1888, "mean_token_accuracy": 0.6617960929870605, "num_tokens": 154409874.0, "step": 373 }, { "epoch": 0.656140350877193, "grad_norm": 0.631582498550415, "learning_rate": 0.0001, "loss": 1.2192, "mean_token_accuracy": 0.6558030843734741, "num_tokens": 154826554.0, "step": 374 }, { "epoch": 0.6578947368421053, "grad_norm": 0.745689332485199, "learning_rate": 0.0001, "loss": 1.2146, "mean_token_accuracy": 0.6560918688774109, "num_tokens": 155230585.0, "step": 375 }, { "epoch": 0.6596491228070176, "grad_norm": 0.651474118232727, "learning_rate": 0.0001, "loss": 1.2055, "mean_token_accuracy": 0.6597638726234436, "num_tokens": 155636974.0, "step": 376 }, { "epoch": 0.6614035087719298, "grad_norm": 0.7227398753166199, "learning_rate": 0.0001, "loss": 1.2211, "mean_token_accuracy": 0.6551980972290039, "num_tokens": 156063068.0, "step": 377 }, { "epoch": 0.6631578947368421, "grad_norm": 0.6124153137207031, "learning_rate": 0.0001, "loss": 1.2435, "mean_token_accuracy": 0.6511872410774231, "num_tokens": 156510643.0, "step": 378 }, { "epoch": 0.6649122807017543, "grad_norm": 0.7193928360939026, "learning_rate": 0.0001, "loss": 1.2242, "mean_token_accuracy": 0.6536823511123657, "num_tokens": 156947423.0, "step": 379 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7923741936683655, "learning_rate": 0.0001, "loss": 1.2149, "mean_token_accuracy": 0.6561374664306641, "num_tokens": 157370426.0, "step": 380 }, { "epoch": 0.6684210526315789, "grad_norm": 0.7290387153625488, "learning_rate": 0.0001, "loss": 1.2132, "mean_token_accuracy": 0.6568667888641357, "num_tokens": 157782963.0, "step": 381 }, { "epoch": 0.6701754385964912, "grad_norm": 0.6192464232444763, "learning_rate": 0.0001, "loss": 1.2309, "mean_token_accuracy": 0.6526235342025757, "num_tokens": 158180387.0, "step": 382 }, { "epoch": 0.6719298245614035, "grad_norm": 0.7137374877929688, "learning_rate": 0.0001, "loss": 1.2328, "mean_token_accuracy": 0.6518644094467163, "num_tokens": 158582401.0, "step": 383 }, { "epoch": 0.6736842105263158, "grad_norm": 0.7550848126411438, "learning_rate": 0.0001, "loss": 1.2166, "mean_token_accuracy": 0.6573696136474609, "num_tokens": 158991919.0, "step": 384 }, { "epoch": 0.6754385964912281, "grad_norm": 0.6890254020690918, "learning_rate": 0.0001, "loss": 1.2107, "mean_token_accuracy": 0.6569054126739502, "num_tokens": 159393180.0, "step": 385 }, { "epoch": 0.6771929824561403, "grad_norm": 0.7258317470550537, "learning_rate": 0.0001, "loss": 1.2151, "mean_token_accuracy": 0.6549711227416992, "num_tokens": 159800687.0, "step": 386 }, { "epoch": 0.6789473684210526, "grad_norm": 0.7973881363868713, "learning_rate": 0.0001, "loss": 1.1913, "mean_token_accuracy": 0.6616858839988708, "num_tokens": 160206677.0, "step": 387 }, { "epoch": 0.6807017543859649, "grad_norm": 0.6781461238861084, "learning_rate": 0.0001, "loss": 1.2296, "mean_token_accuracy": 0.6531736850738525, "num_tokens": 160623973.0, "step": 388 }, { "epoch": 0.6824561403508772, "grad_norm": 0.8034713268280029, "learning_rate": 0.0001, "loss": 1.2306, "mean_token_accuracy": 0.6528879404067993, "num_tokens": 161033575.0, "step": 389 }, { "epoch": 0.6842105263157895, "grad_norm": 0.7085846066474915, "learning_rate": 0.0001, "loss": 1.1892, "mean_token_accuracy": 0.6614021062850952, "num_tokens": 161420334.0, "step": 390 }, { "epoch": 0.6859649122807018, "grad_norm": 0.712842583656311, "learning_rate": 0.0001, "loss": 1.2096, "mean_token_accuracy": 0.6570684909820557, "num_tokens": 161823214.0, "step": 391 }, { "epoch": 0.6877192982456141, "grad_norm": 0.6031337380409241, "learning_rate": 0.0001, "loss": 1.2131, "mean_token_accuracy": 0.6579930782318115, "num_tokens": 162253222.0, "step": 392 }, { "epoch": 0.6894736842105263, "grad_norm": 0.6571363806724548, "learning_rate": 0.0001, "loss": 1.2151, "mean_token_accuracy": 0.6550261974334717, "num_tokens": 162673396.0, "step": 393 }, { "epoch": 0.6912280701754386, "grad_norm": 0.590053915977478, "learning_rate": 0.0001, "loss": 1.1913, "mean_token_accuracy": 0.6606940031051636, "num_tokens": 163095701.0, "step": 394 }, { "epoch": 0.6929824561403509, "grad_norm": 0.660569429397583, "learning_rate": 0.0001, "loss": 1.2168, "mean_token_accuracy": 0.6552713513374329, "num_tokens": 163503487.0, "step": 395 }, { "epoch": 0.6947368421052632, "grad_norm": 0.5482744574546814, "learning_rate": 0.0001, "loss": 1.1966, "mean_token_accuracy": 0.6622109413146973, "num_tokens": 163908638.0, "step": 396 }, { "epoch": 0.6964912280701754, "grad_norm": 0.6649277210235596, "learning_rate": 0.0001, "loss": 1.2082, "mean_token_accuracy": 0.6560900211334229, "num_tokens": 164321664.0, "step": 397 }, { "epoch": 0.6982456140350877, "grad_norm": 0.6546705365180969, "learning_rate": 0.0001, "loss": 1.208, "mean_token_accuracy": 0.6577179431915283, "num_tokens": 164739198.0, "step": 398 }, { "epoch": 0.7, "grad_norm": 0.6374883651733398, "learning_rate": 0.0001, "loss": 1.1893, "mean_token_accuracy": 0.660727322101593, "num_tokens": 165130707.0, "step": 399 }, { "epoch": 0.7017543859649122, "grad_norm": 0.6626867055892944, "learning_rate": 0.0001, "loss": 1.2056, "mean_token_accuracy": 0.6570228934288025, "num_tokens": 165544679.0, "step": 400 }, { "epoch": 0.7035087719298245, "grad_norm": 0.648720920085907, "learning_rate": 0.0001, "loss": 1.1889, "mean_token_accuracy": 0.6612677574157715, "num_tokens": 165963969.0, "step": 401 }, { "epoch": 0.7052631578947368, "grad_norm": 0.6660583019256592, "learning_rate": 0.0001, "loss": 1.201, "mean_token_accuracy": 0.6595137119293213, "num_tokens": 166342479.0, "step": 402 }, { "epoch": 0.7070175438596491, "grad_norm": 0.6676925420761108, "learning_rate": 0.0001, "loss": 1.2178, "mean_token_accuracy": 0.655318558216095, "num_tokens": 166746524.0, "step": 403 }, { "epoch": 0.7087719298245614, "grad_norm": 0.6398362517356873, "learning_rate": 0.0001, "loss": 1.2217, "mean_token_accuracy": 0.6538881063461304, "num_tokens": 167166144.0, "step": 404 }, { "epoch": 0.7105263157894737, "grad_norm": 0.6486631035804749, "learning_rate": 0.0001, "loss": 1.2107, "mean_token_accuracy": 0.6568524837493896, "num_tokens": 167576053.0, "step": 405 }, { "epoch": 0.712280701754386, "grad_norm": 0.6971449851989746, "learning_rate": 0.0001, "loss": 1.2072, "mean_token_accuracy": 0.6588019132614136, "num_tokens": 168005716.0, "step": 406 }, { "epoch": 0.7140350877192982, "grad_norm": 0.5594667792320251, "learning_rate": 0.0001, "loss": 1.1815, "mean_token_accuracy": 0.6640152931213379, "num_tokens": 168425061.0, "step": 407 }, { "epoch": 0.7157894736842105, "grad_norm": 0.6978932619094849, "learning_rate": 0.0001, "loss": 1.2123, "mean_token_accuracy": 0.6560491323471069, "num_tokens": 168832540.0, "step": 408 }, { "epoch": 0.7175438596491228, "grad_norm": 0.577872097492218, "learning_rate": 0.0001, "loss": 1.1961, "mean_token_accuracy": 0.6605911254882812, "num_tokens": 169243355.0, "step": 409 }, { "epoch": 0.7192982456140351, "grad_norm": 0.6972746849060059, "learning_rate": 0.0001, "loss": 1.2263, "mean_token_accuracy": 0.6549021005630493, "num_tokens": 169678758.0, "step": 410 }, { "epoch": 0.7210526315789474, "grad_norm": 0.6528338193893433, "learning_rate": 0.0001, "loss": 1.2193, "mean_token_accuracy": 0.6543501615524292, "num_tokens": 170107843.0, "step": 411 }, { "epoch": 0.7228070175438597, "grad_norm": 0.6352643370628357, "learning_rate": 0.0001, "loss": 1.21, "mean_token_accuracy": 0.6568456292152405, "num_tokens": 170512414.0, "step": 412 }, { "epoch": 0.724561403508772, "grad_norm": 0.6633725762367249, "learning_rate": 0.0001, "loss": 1.2273, "mean_token_accuracy": 0.6531171798706055, "num_tokens": 170927891.0, "step": 413 }, { "epoch": 0.7263157894736842, "grad_norm": 0.7003793716430664, "learning_rate": 0.0001, "loss": 1.2471, "mean_token_accuracy": 0.6485984921455383, "num_tokens": 171347602.0, "step": 414 }, { "epoch": 0.7280701754385965, "grad_norm": 0.6166436076164246, "learning_rate": 0.0001, "loss": 1.1822, "mean_token_accuracy": 0.664222240447998, "num_tokens": 171764325.0, "step": 415 }, { "epoch": 0.7298245614035088, "grad_norm": 0.6370410323143005, "learning_rate": 0.0001, "loss": 1.2288, "mean_token_accuracy": 0.6530359387397766, "num_tokens": 172161316.0, "step": 416 }, { "epoch": 0.7315789473684211, "grad_norm": 0.5680028200149536, "learning_rate": 0.0001, "loss": 1.188, "mean_token_accuracy": 0.663171112537384, "num_tokens": 172557979.0, "step": 417 }, { "epoch": 0.7333333333333333, "grad_norm": 0.6317917704582214, "learning_rate": 0.0001, "loss": 1.2088, "mean_token_accuracy": 0.6587448120117188, "num_tokens": 172977293.0, "step": 418 }, { "epoch": 0.7350877192982456, "grad_norm": 0.6629990935325623, "learning_rate": 0.0001, "loss": 1.206, "mean_token_accuracy": 0.657719612121582, "num_tokens": 173386905.0, "step": 419 }, { "epoch": 0.7368421052631579, "grad_norm": 0.7318717241287231, "learning_rate": 0.0001, "loss": 1.1874, "mean_token_accuracy": 0.662236750125885, "num_tokens": 173790039.0, "step": 420 }, { "epoch": 0.7385964912280701, "grad_norm": 0.5909295678138733, "learning_rate": 0.0001, "loss": 1.1857, "mean_token_accuracy": 0.66287761926651, "num_tokens": 174200214.0, "step": 421 }, { "epoch": 0.7403508771929824, "grad_norm": 0.7244629859924316, "learning_rate": 0.0001, "loss": 1.198, "mean_token_accuracy": 0.6586729288101196, "num_tokens": 174594155.0, "step": 422 }, { "epoch": 0.7421052631578947, "grad_norm": 0.7065144777297974, "learning_rate": 0.0001, "loss": 1.189, "mean_token_accuracy": 0.6611475348472595, "num_tokens": 175025672.0, "step": 423 }, { "epoch": 0.743859649122807, "grad_norm": 0.6348630785942078, "learning_rate": 0.0001, "loss": 1.2285, "mean_token_accuracy": 0.6518391370773315, "num_tokens": 175452636.0, "step": 424 }, { "epoch": 0.7456140350877193, "grad_norm": 0.6401616930961609, "learning_rate": 0.0001, "loss": 1.2204, "mean_token_accuracy": 0.655087947845459, "num_tokens": 175879906.0, "step": 425 }, { "epoch": 0.7473684210526316, "grad_norm": 0.6971575617790222, "learning_rate": 0.0001, "loss": 1.2111, "mean_token_accuracy": 0.6562093496322632, "num_tokens": 176292162.0, "step": 426 }, { "epoch": 0.7491228070175439, "grad_norm": 0.6440587043762207, "learning_rate": 0.0001, "loss": 1.2012, "mean_token_accuracy": 0.6593471765518188, "num_tokens": 176720725.0, "step": 427 }, { "epoch": 0.7508771929824561, "grad_norm": 0.597520649433136, "learning_rate": 0.0001, "loss": 1.2284, "mean_token_accuracy": 0.6531672477722168, "num_tokens": 177161243.0, "step": 428 }, { "epoch": 0.7526315789473684, "grad_norm": 0.8046004772186279, "learning_rate": 0.0001, "loss": 1.1928, "mean_token_accuracy": 0.6594202518463135, "num_tokens": 177562007.0, "step": 429 }, { "epoch": 0.7543859649122807, "grad_norm": 0.6298813223838806, "learning_rate": 0.0001, "loss": 1.219, "mean_token_accuracy": 0.6546623706817627, "num_tokens": 177983323.0, "step": 430 }, { "epoch": 0.756140350877193, "grad_norm": 0.5731974840164185, "learning_rate": 0.0001, "loss": 1.2153, "mean_token_accuracy": 0.6574859619140625, "num_tokens": 178409419.0, "step": 431 }, { "epoch": 0.7578947368421053, "grad_norm": 0.7396548390388489, "learning_rate": 0.0001, "loss": 1.2113, "mean_token_accuracy": 0.6574329137802124, "num_tokens": 178832025.0, "step": 432 }, { "epoch": 0.7596491228070176, "grad_norm": 0.6398889422416687, "learning_rate": 0.0001, "loss": 1.2159, "mean_token_accuracy": 0.6554309129714966, "num_tokens": 179246477.0, "step": 433 }, { "epoch": 0.7614035087719299, "grad_norm": 0.6085229516029358, "learning_rate": 0.0001, "loss": 1.2216, "mean_token_accuracy": 0.6549739837646484, "num_tokens": 179666041.0, "step": 434 }, { "epoch": 0.7631578947368421, "grad_norm": 0.7816640734672546, "learning_rate": 0.0001, "loss": 1.2119, "mean_token_accuracy": 0.6565245389938354, "num_tokens": 180092225.0, "step": 435 }, { "epoch": 0.7649122807017544, "grad_norm": 0.8083506226539612, "learning_rate": 0.0001, "loss": 1.1961, "mean_token_accuracy": 0.6606351733207703, "num_tokens": 180498123.0, "step": 436 }, { "epoch": 0.7666666666666667, "grad_norm": 0.6019986271858215, "learning_rate": 0.0001, "loss": 1.1972, "mean_token_accuracy": 0.6611742377281189, "num_tokens": 180904735.0, "step": 437 }, { "epoch": 0.7684210526315789, "grad_norm": 0.6621778011322021, "learning_rate": 0.0001, "loss": 1.1987, "mean_token_accuracy": 0.6592767238616943, "num_tokens": 181297827.0, "step": 438 }, { "epoch": 0.7701754385964912, "grad_norm": 0.5817862749099731, "learning_rate": 0.0001, "loss": 1.2096, "mean_token_accuracy": 0.6584010124206543, "num_tokens": 181722096.0, "step": 439 }, { "epoch": 0.7719298245614035, "grad_norm": 0.6433981657028198, "learning_rate": 0.0001, "loss": 1.1872, "mean_token_accuracy": 0.6626437902450562, "num_tokens": 182124515.0, "step": 440 }, { "epoch": 0.7736842105263158, "grad_norm": 0.6573434472084045, "learning_rate": 0.0001, "loss": 1.2159, "mean_token_accuracy": 0.6551169157028198, "num_tokens": 182542265.0, "step": 441 }, { "epoch": 0.775438596491228, "grad_norm": 0.684744656085968, "learning_rate": 0.0001, "loss": 1.2095, "mean_token_accuracy": 0.6574633121490479, "num_tokens": 182942368.0, "step": 442 }, { "epoch": 0.7771929824561403, "grad_norm": 0.5961515307426453, "learning_rate": 0.0001, "loss": 1.2012, "mean_token_accuracy": 0.6580845713615417, "num_tokens": 183350652.0, "step": 443 }, { "epoch": 0.7789473684210526, "grad_norm": 0.7268422842025757, "learning_rate": 0.0001, "loss": 1.2082, "mean_token_accuracy": 0.657654345035553, "num_tokens": 183786290.0, "step": 444 }, { "epoch": 0.7807017543859649, "grad_norm": 0.7548661231994629, "learning_rate": 0.0001, "loss": 1.203, "mean_token_accuracy": 0.6581261157989502, "num_tokens": 184191985.0, "step": 445 }, { "epoch": 0.7824561403508772, "grad_norm": 0.589838981628418, "learning_rate": 0.0001, "loss": 1.2253, "mean_token_accuracy": 0.652956485748291, "num_tokens": 184617087.0, "step": 446 }, { "epoch": 0.7842105263157895, "grad_norm": 0.7901304960250854, "learning_rate": 0.0001, "loss": 1.2023, "mean_token_accuracy": 0.6594702005386353, "num_tokens": 185046113.0, "step": 447 }, { "epoch": 0.7859649122807018, "grad_norm": 0.681577205657959, "learning_rate": 0.0001, "loss": 1.1765, "mean_token_accuracy": 0.6648210287094116, "num_tokens": 185440210.0, "step": 448 }, { "epoch": 0.787719298245614, "grad_norm": 0.619105339050293, "learning_rate": 0.0001, "loss": 1.2151, "mean_token_accuracy": 0.6544394493103027, "num_tokens": 185866240.0, "step": 449 }, { "epoch": 0.7894736842105263, "grad_norm": 0.6568613648414612, "learning_rate": 0.0001, "loss": 1.1808, "mean_token_accuracy": 0.6645166277885437, "num_tokens": 186262559.0, "step": 450 }, { "epoch": 0.7912280701754386, "grad_norm": 0.6452411413192749, "learning_rate": 0.0001, "loss": 1.2289, "mean_token_accuracy": 0.6531310677528381, "num_tokens": 186677017.0, "step": 451 }, { "epoch": 0.7929824561403509, "grad_norm": 0.6799737215042114, "learning_rate": 0.0001, "loss": 1.2207, "mean_token_accuracy": 0.6556583046913147, "num_tokens": 187108135.0, "step": 452 }, { "epoch": 0.7947368421052632, "grad_norm": 0.5680040717124939, "learning_rate": 0.0001, "loss": 1.1886, "mean_token_accuracy": 0.6613532900810242, "num_tokens": 187533913.0, "step": 453 }, { "epoch": 0.7964912280701755, "grad_norm": 0.6380943655967712, "learning_rate": 0.0001, "loss": 1.2136, "mean_token_accuracy": 0.6577192544937134, "num_tokens": 187943777.0, "step": 454 }, { "epoch": 0.7982456140350878, "grad_norm": 0.5565281510353088, "learning_rate": 0.0001, "loss": 1.1941, "mean_token_accuracy": 0.6604122519493103, "num_tokens": 188365013.0, "step": 455 }, { "epoch": 0.8, "grad_norm": 0.6176914572715759, "learning_rate": 0.0001, "loss": 1.1957, "mean_token_accuracy": 0.6601607799530029, "num_tokens": 188805546.0, "step": 456 }, { "epoch": 0.8017543859649123, "grad_norm": 0.6163376569747925, "learning_rate": 0.0001, "loss": 1.1788, "mean_token_accuracy": 0.6634429097175598, "num_tokens": 189204261.0, "step": 457 }, { "epoch": 0.8035087719298246, "grad_norm": 0.6874009966850281, "learning_rate": 0.0001, "loss": 1.2061, "mean_token_accuracy": 0.6587145924568176, "num_tokens": 189609457.0, "step": 458 }, { "epoch": 0.8052631578947368, "grad_norm": 0.6584733724594116, "learning_rate": 0.0001, "loss": 1.2077, "mean_token_accuracy": 0.6568840742111206, "num_tokens": 190024904.0, "step": 459 }, { "epoch": 0.8070175438596491, "grad_norm": 0.554511547088623, "learning_rate": 0.0001, "loss": 1.1883, "mean_token_accuracy": 0.661857008934021, "num_tokens": 190435843.0, "step": 460 }, { "epoch": 0.8087719298245614, "grad_norm": 0.6625659465789795, "learning_rate": 0.0001, "loss": 1.209, "mean_token_accuracy": 0.6570659875869751, "num_tokens": 190844879.0, "step": 461 }, { "epoch": 0.8105263157894737, "grad_norm": 0.6230789422988892, "learning_rate": 0.0001, "loss": 1.1932, "mean_token_accuracy": 0.6605242490768433, "num_tokens": 191240668.0, "step": 462 }, { "epoch": 0.8122807017543859, "grad_norm": 0.5848865509033203, "learning_rate": 0.0001, "loss": 1.2055, "mean_token_accuracy": 0.6577123999595642, "num_tokens": 191649912.0, "step": 463 }, { "epoch": 0.8140350877192982, "grad_norm": 0.7131868600845337, "learning_rate": 0.0001, "loss": 1.1945, "mean_token_accuracy": 0.6616647839546204, "num_tokens": 192065093.0, "step": 464 }, { "epoch": 0.8157894736842105, "grad_norm": 0.620922863483429, "learning_rate": 0.0001, "loss": 1.2134, "mean_token_accuracy": 0.657670259475708, "num_tokens": 192458897.0, "step": 465 }, { "epoch": 0.8175438596491228, "grad_norm": 0.6825653910636902, "learning_rate": 0.0001, "loss": 1.2083, "mean_token_accuracy": 0.6574592590332031, "num_tokens": 192871837.0, "step": 466 }, { "epoch": 0.8192982456140351, "grad_norm": 0.649117648601532, "learning_rate": 0.0001, "loss": 1.1904, "mean_token_accuracy": 0.661907970905304, "num_tokens": 193297231.0, "step": 467 }, { "epoch": 0.8210526315789474, "grad_norm": 0.5843600630760193, "learning_rate": 0.0001, "loss": 1.1613, "mean_token_accuracy": 0.6669655442237854, "num_tokens": 193684403.0, "step": 468 }, { "epoch": 0.8228070175438597, "grad_norm": 0.6877574324607849, "learning_rate": 0.0001, "loss": 1.2131, "mean_token_accuracy": 0.6550207138061523, "num_tokens": 194113357.0, "step": 469 }, { "epoch": 0.8245614035087719, "grad_norm": 0.6516855955123901, "learning_rate": 0.0001, "loss": 1.1979, "mean_token_accuracy": 0.65838223695755, "num_tokens": 194526469.0, "step": 470 }, { "epoch": 0.8263157894736842, "grad_norm": 0.6000040769577026, "learning_rate": 0.0001, "loss": 1.2303, "mean_token_accuracy": 0.6509213447570801, "num_tokens": 194953909.0, "step": 471 }, { "epoch": 0.8280701754385965, "grad_norm": 0.6414221525192261, "learning_rate": 0.0001, "loss": 1.2276, "mean_token_accuracy": 0.6521142721176147, "num_tokens": 195388769.0, "step": 472 }, { "epoch": 0.8298245614035088, "grad_norm": 0.614547848701477, "learning_rate": 0.0001, "loss": 1.2174, "mean_token_accuracy": 0.6551234126091003, "num_tokens": 195818916.0, "step": 473 }, { "epoch": 0.8315789473684211, "grad_norm": 0.6391692161560059, "learning_rate": 0.0001, "loss": 1.1963, "mean_token_accuracy": 0.659719705581665, "num_tokens": 196233034.0, "step": 474 }, { "epoch": 0.8333333333333334, "grad_norm": 0.6614966988563538, "learning_rate": 0.0001, "loss": 1.2407, "mean_token_accuracy": 0.6485875844955444, "num_tokens": 196660034.0, "step": 475 }, { "epoch": 0.8350877192982457, "grad_norm": 0.5896729826927185, "learning_rate": 0.0001, "loss": 1.2188, "mean_token_accuracy": 0.6551612615585327, "num_tokens": 197080673.0, "step": 476 }, { "epoch": 0.8368421052631579, "grad_norm": 0.6428948044776917, "learning_rate": 0.0001, "loss": 1.1962, "mean_token_accuracy": 0.6604630947113037, "num_tokens": 197493003.0, "step": 477 }, { "epoch": 0.8385964912280702, "grad_norm": 0.6853853464126587, "learning_rate": 0.0001, "loss": 1.2116, "mean_token_accuracy": 0.6553216576576233, "num_tokens": 197923635.0, "step": 478 }, { "epoch": 0.8403508771929824, "grad_norm": 0.6877092719078064, "learning_rate": 0.0001, "loss": 1.2017, "mean_token_accuracy": 0.657435417175293, "num_tokens": 198339854.0, "step": 479 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5886791348457336, "learning_rate": 0.0001, "loss": 1.2414, "mean_token_accuracy": 0.6491390466690063, "num_tokens": 198768483.0, "step": 480 }, { "epoch": 0.843859649122807, "grad_norm": 0.8585889935493469, "learning_rate": 0.0001, "loss": 1.2168, "mean_token_accuracy": 0.6541799902915955, "num_tokens": 199190842.0, "step": 481 }, { "epoch": 0.8456140350877193, "grad_norm": 0.6527767181396484, "learning_rate": 0.0001, "loss": 1.2236, "mean_token_accuracy": 0.6539558172225952, "num_tokens": 199620649.0, "step": 482 }, { "epoch": 0.8473684210526315, "grad_norm": 0.6834746599197388, "learning_rate": 0.0001, "loss": 1.2015, "mean_token_accuracy": 0.6586301326751709, "num_tokens": 200023406.0, "step": 483 }, { "epoch": 0.8491228070175438, "grad_norm": 0.6827247142791748, "learning_rate": 0.0001, "loss": 1.178, "mean_token_accuracy": 0.6644470691680908, "num_tokens": 200430239.0, "step": 484 }, { "epoch": 0.8508771929824561, "grad_norm": 0.6491426229476929, "learning_rate": 0.0001, "loss": 1.1927, "mean_token_accuracy": 0.6621779799461365, "num_tokens": 200833864.0, "step": 485 }, { "epoch": 0.8526315789473684, "grad_norm": 0.6229031682014465, "learning_rate": 0.0001, "loss": 1.1923, "mean_token_accuracy": 0.6596782207489014, "num_tokens": 201241869.0, "step": 486 }, { "epoch": 0.8543859649122807, "grad_norm": 0.5779981017112732, "learning_rate": 0.0001, "loss": 1.1857, "mean_token_accuracy": 0.6614409685134888, "num_tokens": 201658653.0, "step": 487 }, { "epoch": 0.856140350877193, "grad_norm": 0.6096077561378479, "learning_rate": 0.0001, "loss": 1.2096, "mean_token_accuracy": 0.6574957370758057, "num_tokens": 202086503.0, "step": 488 }, { "epoch": 0.8578947368421053, "grad_norm": 0.7495996952056885, "learning_rate": 0.0001, "loss": 1.2005, "mean_token_accuracy": 0.6597346067428589, "num_tokens": 202509306.0, "step": 489 }, { "epoch": 0.8596491228070176, "grad_norm": 0.6209189295768738, "learning_rate": 0.0001, "loss": 1.1753, "mean_token_accuracy": 0.6628221273422241, "num_tokens": 202909087.0, "step": 490 }, { "epoch": 0.8614035087719298, "grad_norm": 0.563208281993866, "learning_rate": 0.0001, "loss": 1.2005, "mean_token_accuracy": 0.6574559211730957, "num_tokens": 203337615.0, "step": 491 }, { "epoch": 0.8631578947368421, "grad_norm": 0.6872074604034424, "learning_rate": 0.0001, "loss": 1.1982, "mean_token_accuracy": 0.6597882509231567, "num_tokens": 203754527.0, "step": 492 }, { "epoch": 0.8649122807017544, "grad_norm": 0.6505935192108154, "learning_rate": 0.0001, "loss": 1.1734, "mean_token_accuracy": 0.666144609451294, "num_tokens": 204166768.0, "step": 493 }, { "epoch": 0.8666666666666667, "grad_norm": 0.7290279269218445, "learning_rate": 0.0001, "loss": 1.1923, "mean_token_accuracy": 0.6601245403289795, "num_tokens": 204554076.0, "step": 494 }, { "epoch": 0.868421052631579, "grad_norm": 0.6451328992843628, "learning_rate": 0.0001, "loss": 1.2272, "mean_token_accuracy": 0.653006911277771, "num_tokens": 204962578.0, "step": 495 }, { "epoch": 0.8701754385964913, "grad_norm": 0.7413347363471985, "learning_rate": 0.0001, "loss": 1.153, "mean_token_accuracy": 0.6694173216819763, "num_tokens": 205358271.0, "step": 496 }, { "epoch": 0.8719298245614036, "grad_norm": 0.6787010431289673, "learning_rate": 0.0001, "loss": 1.2085, "mean_token_accuracy": 0.6561261415481567, "num_tokens": 205786890.0, "step": 497 }, { "epoch": 0.8736842105263158, "grad_norm": 0.6698117256164551, "learning_rate": 0.0001, "loss": 1.2019, "mean_token_accuracy": 0.6592704057693481, "num_tokens": 206193572.0, "step": 498 }, { "epoch": 0.875438596491228, "grad_norm": 0.6170295476913452, "learning_rate": 0.0001, "loss": 1.1723, "mean_token_accuracy": 0.6650887727737427, "num_tokens": 206607450.0, "step": 499 }, { "epoch": 0.8771929824561403, "grad_norm": 0.5921252965927124, "learning_rate": 0.0001, "loss": 1.1823, "mean_token_accuracy": 0.6633787155151367, "num_tokens": 207005126.0, "step": 500 }, { "epoch": 0.8789473684210526, "grad_norm": 0.69658362865448, "learning_rate": 0.0001, "loss": 1.1793, "mean_token_accuracy": 0.6646496057510376, "num_tokens": 207394794.0, "step": 501 }, { "epoch": 0.8807017543859649, "grad_norm": 0.6810624599456787, "learning_rate": 0.0001, "loss": 1.1979, "mean_token_accuracy": 0.6584210395812988, "num_tokens": 207783539.0, "step": 502 }, { "epoch": 0.8824561403508772, "grad_norm": 0.6264888644218445, "learning_rate": 0.0001, "loss": 1.2045, "mean_token_accuracy": 0.6583248376846313, "num_tokens": 208196198.0, "step": 503 }, { "epoch": 0.8842105263157894, "grad_norm": 0.6070482730865479, "learning_rate": 0.0001, "loss": 1.1995, "mean_token_accuracy": 0.6601771712303162, "num_tokens": 208602598.0, "step": 504 }, { "epoch": 0.8859649122807017, "grad_norm": 0.6856517791748047, "learning_rate": 0.0001, "loss": 1.1909, "mean_token_accuracy": 0.6614357233047485, "num_tokens": 209039139.0, "step": 505 }, { "epoch": 0.887719298245614, "grad_norm": 0.5697737336158752, "learning_rate": 0.0001, "loss": 1.1967, "mean_token_accuracy": 0.6589823961257935, "num_tokens": 209437950.0, "step": 506 }, { "epoch": 0.8894736842105263, "grad_norm": 0.7310987114906311, "learning_rate": 0.0001, "loss": 1.205, "mean_token_accuracy": 0.6580671668052673, "num_tokens": 209862559.0, "step": 507 }, { "epoch": 0.8912280701754386, "grad_norm": 0.6229117512702942, "learning_rate": 0.0001, "loss": 1.222, "mean_token_accuracy": 0.6536027789115906, "num_tokens": 210284769.0, "step": 508 }, { "epoch": 0.8929824561403509, "grad_norm": 0.5739285349845886, "learning_rate": 0.0001, "loss": 1.2059, "mean_token_accuracy": 0.6585639119148254, "num_tokens": 210708859.0, "step": 509 }, { "epoch": 0.8947368421052632, "grad_norm": 0.6239802837371826, "learning_rate": 0.0001, "loss": 1.1986, "mean_token_accuracy": 0.6589633822441101, "num_tokens": 211130168.0, "step": 510 }, { "epoch": 0.8964912280701754, "grad_norm": 0.6617391705513, "learning_rate": 0.0001, "loss": 1.2027, "mean_token_accuracy": 0.6577974557876587, "num_tokens": 211529753.0, "step": 511 }, { "epoch": 0.8982456140350877, "grad_norm": 0.638733983039856, "learning_rate": 0.0001, "loss": 1.2142, "mean_token_accuracy": 0.6540495157241821, "num_tokens": 211963430.0, "step": 512 }, { "epoch": 0.9, "grad_norm": 0.6008735299110413, "learning_rate": 0.0001, "loss": 1.2136, "mean_token_accuracy": 0.6559524536132812, "num_tokens": 212387404.0, "step": 513 }, { "epoch": 0.9017543859649123, "grad_norm": 0.6343475580215454, "learning_rate": 0.0001, "loss": 1.1718, "mean_token_accuracy": 0.6651860475540161, "num_tokens": 212802763.0, "step": 514 }, { "epoch": 0.9035087719298246, "grad_norm": 0.637675940990448, "learning_rate": 0.0001, "loss": 1.1694, "mean_token_accuracy": 0.6661834716796875, "num_tokens": 213219406.0, "step": 515 }, { "epoch": 0.9052631578947369, "grad_norm": 0.5518184900283813, "learning_rate": 0.0001, "loss": 1.1954, "mean_token_accuracy": 0.6603313684463501, "num_tokens": 213623710.0, "step": 516 }, { "epoch": 0.9070175438596492, "grad_norm": 0.6756175756454468, "learning_rate": 0.0001, "loss": 1.1701, "mean_token_accuracy": 0.6667043566703796, "num_tokens": 214053806.0, "step": 517 }, { "epoch": 0.9087719298245615, "grad_norm": 0.5964516401290894, "learning_rate": 0.0001, "loss": 1.2007, "mean_token_accuracy": 0.6573567390441895, "num_tokens": 214457394.0, "step": 518 }, { "epoch": 0.9105263157894737, "grad_norm": 0.745707094669342, "learning_rate": 0.0001, "loss": 1.1697, "mean_token_accuracy": 0.6656190156936646, "num_tokens": 214841472.0, "step": 519 }, { "epoch": 0.9122807017543859, "grad_norm": 0.5971705317497253, "learning_rate": 0.0001, "loss": 1.2061, "mean_token_accuracy": 0.656207799911499, "num_tokens": 215261046.0, "step": 520 }, { "epoch": 0.9140350877192982, "grad_norm": 0.7177700400352478, "learning_rate": 0.0001, "loss": 1.1753, "mean_token_accuracy": 0.6650264263153076, "num_tokens": 215664423.0, "step": 521 }, { "epoch": 0.9157894736842105, "grad_norm": 0.5945612788200378, "learning_rate": 0.0001, "loss": 1.1813, "mean_token_accuracy": 0.66515052318573, "num_tokens": 216072733.0, "step": 522 }, { "epoch": 0.9175438596491228, "grad_norm": 0.7161288857460022, "learning_rate": 0.0001, "loss": 1.1964, "mean_token_accuracy": 0.6598995923995972, "num_tokens": 216490157.0, "step": 523 }, { "epoch": 0.9192982456140351, "grad_norm": 0.6490321159362793, "learning_rate": 0.0001, "loss": 1.2151, "mean_token_accuracy": 0.6550993919372559, "num_tokens": 216933714.0, "step": 524 }, { "epoch": 0.9210526315789473, "grad_norm": 0.6328585743904114, "learning_rate": 0.0001, "loss": 1.2008, "mean_token_accuracy": 0.658663272857666, "num_tokens": 217360826.0, "step": 525 }, { "epoch": 0.9228070175438596, "grad_norm": 0.6045000553131104, "learning_rate": 0.0001, "loss": 1.1837, "mean_token_accuracy": 0.6624784469604492, "num_tokens": 217767964.0, "step": 526 }, { "epoch": 0.9245614035087719, "grad_norm": 0.5896552205085754, "learning_rate": 0.0001, "loss": 1.1785, "mean_token_accuracy": 0.663489043712616, "num_tokens": 218182502.0, "step": 527 }, { "epoch": 0.9263157894736842, "grad_norm": 0.6433465480804443, "learning_rate": 0.0001, "loss": 1.1866, "mean_token_accuracy": 0.662973940372467, "num_tokens": 218599164.0, "step": 528 }, { "epoch": 0.9280701754385965, "grad_norm": 0.6225712895393372, "learning_rate": 0.0001, "loss": 1.1732, "mean_token_accuracy": 0.6657634973526001, "num_tokens": 219038853.0, "step": 529 }, { "epoch": 0.9298245614035088, "grad_norm": 0.6584674715995789, "learning_rate": 0.0001, "loss": 1.1931, "mean_token_accuracy": 0.6607257127761841, "num_tokens": 219455763.0, "step": 530 }, { "epoch": 0.9315789473684211, "grad_norm": 0.5859020352363586, "learning_rate": 0.0001, "loss": 1.2043, "mean_token_accuracy": 0.657474935054779, "num_tokens": 219857096.0, "step": 531 }, { "epoch": 0.9333333333333333, "grad_norm": 0.6879558563232422, "learning_rate": 0.0001, "loss": 1.1761, "mean_token_accuracy": 0.6635361909866333, "num_tokens": 220269043.0, "step": 532 }, { "epoch": 0.9350877192982456, "grad_norm": 0.6866979002952576, "learning_rate": 0.0001, "loss": 1.2073, "mean_token_accuracy": 0.6566738486289978, "num_tokens": 220650778.0, "step": 533 }, { "epoch": 0.9368421052631579, "grad_norm": 0.6336923241615295, "learning_rate": 0.0001, "loss": 1.2039, "mean_token_accuracy": 0.6579070091247559, "num_tokens": 221055825.0, "step": 534 }, { "epoch": 0.9385964912280702, "grad_norm": 0.6081579327583313, "learning_rate": 0.0001, "loss": 1.2108, "mean_token_accuracy": 0.6550259590148926, "num_tokens": 221459868.0, "step": 535 }, { "epoch": 0.9403508771929825, "grad_norm": 0.6312009692192078, "learning_rate": 0.0001, "loss": 1.1702, "mean_token_accuracy": 0.6641944646835327, "num_tokens": 221860324.0, "step": 536 }, { "epoch": 0.9421052631578948, "grad_norm": 0.5887439250946045, "learning_rate": 0.0001, "loss": 1.1778, "mean_token_accuracy": 0.6638119220733643, "num_tokens": 222291251.0, "step": 537 }, { "epoch": 0.9438596491228071, "grad_norm": 0.543400764465332, "learning_rate": 0.0001, "loss": 1.1805, "mean_token_accuracy": 0.6627988815307617, "num_tokens": 222699559.0, "step": 538 }, { "epoch": 0.9456140350877194, "grad_norm": 0.5787383913993835, "learning_rate": 0.0001, "loss": 1.1914, "mean_token_accuracy": 0.6620433330535889, "num_tokens": 223156357.0, "step": 539 }, { "epoch": 0.9473684210526315, "grad_norm": 0.6597963571548462, "learning_rate": 0.0001, "loss": 1.1871, "mean_token_accuracy": 0.6626171469688416, "num_tokens": 223562411.0, "step": 540 }, { "epoch": 0.9491228070175438, "grad_norm": 0.5731210112571716, "learning_rate": 0.0001, "loss": 1.2078, "mean_token_accuracy": 0.6560046076774597, "num_tokens": 223988252.0, "step": 541 }, { "epoch": 0.9508771929824561, "grad_norm": 0.7036701440811157, "learning_rate": 0.0001, "loss": 1.1917, "mean_token_accuracy": 0.6613459587097168, "num_tokens": 224401217.0, "step": 542 }, { "epoch": 0.9526315789473684, "grad_norm": 0.5783252120018005, "learning_rate": 0.0001, "loss": 1.1757, "mean_token_accuracy": 0.6623378396034241, "num_tokens": 224807482.0, "step": 543 }, { "epoch": 0.9543859649122807, "grad_norm": 0.7617517113685608, "learning_rate": 0.0001, "loss": 1.1937, "mean_token_accuracy": 0.659681499004364, "num_tokens": 225220680.0, "step": 544 }, { "epoch": 0.956140350877193, "grad_norm": 0.6007680296897888, "learning_rate": 0.0001, "loss": 1.2401, "mean_token_accuracy": 0.6486543416976929, "num_tokens": 225640539.0, "step": 545 }, { "epoch": 0.9578947368421052, "grad_norm": 0.7272628545761108, "learning_rate": 0.0001, "loss": 1.1822, "mean_token_accuracy": 0.6626489162445068, "num_tokens": 226035924.0, "step": 546 }, { "epoch": 0.9596491228070175, "grad_norm": 0.700038492679596, "learning_rate": 0.0001, "loss": 1.1658, "mean_token_accuracy": 0.666049599647522, "num_tokens": 226425232.0, "step": 547 }, { "epoch": 0.9614035087719298, "grad_norm": 0.6490049958229065, "learning_rate": 0.0001, "loss": 1.1691, "mean_token_accuracy": 0.6649153828620911, "num_tokens": 226802383.0, "step": 548 }, { "epoch": 0.9631578947368421, "grad_norm": 0.7154028415679932, "learning_rate": 0.0001, "loss": 1.1986, "mean_token_accuracy": 0.6584769487380981, "num_tokens": 227227610.0, "step": 549 }, { "epoch": 0.9649122807017544, "grad_norm": 0.6601865887641907, "learning_rate": 0.0001, "loss": 1.2034, "mean_token_accuracy": 0.6581115126609802, "num_tokens": 227657882.0, "step": 550 }, { "epoch": 0.9666666666666667, "grad_norm": 0.6211066842079163, "learning_rate": 0.0001, "loss": 1.1749, "mean_token_accuracy": 0.6637754440307617, "num_tokens": 228081460.0, "step": 551 }, { "epoch": 0.968421052631579, "grad_norm": 0.6879007816314697, "learning_rate": 0.0001, "loss": 1.2181, "mean_token_accuracy": 0.6532254219055176, "num_tokens": 228507388.0, "step": 552 }, { "epoch": 0.9701754385964912, "grad_norm": 0.6297675371170044, "learning_rate": 0.0001, "loss": 1.2126, "mean_token_accuracy": 0.654970645904541, "num_tokens": 228935033.0, "step": 553 }, { "epoch": 0.9719298245614035, "grad_norm": 0.5917762517929077, "learning_rate": 0.0001, "loss": 1.1517, "mean_token_accuracy": 0.6693041920661926, "num_tokens": 229325690.0, "step": 554 }, { "epoch": 0.9736842105263158, "grad_norm": 0.6466293334960938, "learning_rate": 0.0001, "loss": 1.1573, "mean_token_accuracy": 0.6677490472793579, "num_tokens": 229726549.0, "step": 555 }, { "epoch": 0.9754385964912281, "grad_norm": 0.6341378688812256, "learning_rate": 0.0001, "loss": 1.185, "mean_token_accuracy": 0.6614177227020264, "num_tokens": 230122274.0, "step": 556 }, { "epoch": 0.9771929824561404, "grad_norm": 0.604850172996521, "learning_rate": 0.0001, "loss": 1.1959, "mean_token_accuracy": 0.6580736637115479, "num_tokens": 230526104.0, "step": 557 }, { "epoch": 0.9789473684210527, "grad_norm": 0.7436766624450684, "learning_rate": 0.0001, "loss": 1.1642, "mean_token_accuracy": 0.6674508452415466, "num_tokens": 230941199.0, "step": 558 }, { "epoch": 0.980701754385965, "grad_norm": 0.6362001895904541, "learning_rate": 0.0001, "loss": 1.1751, "mean_token_accuracy": 0.6646236181259155, "num_tokens": 231367872.0, "step": 559 }, { "epoch": 0.9824561403508771, "grad_norm": 0.6686745882034302, "learning_rate": 0.0001, "loss": 1.2065, "mean_token_accuracy": 0.6581393480300903, "num_tokens": 231804511.0, "step": 560 }, { "epoch": 0.9842105263157894, "grad_norm": 0.7186607718467712, "learning_rate": 0.0001, "loss": 1.1692, "mean_token_accuracy": 0.6657729148864746, "num_tokens": 232201554.0, "step": 561 }, { "epoch": 0.9859649122807017, "grad_norm": 0.5875235795974731, "learning_rate": 0.0001, "loss": 1.185, "mean_token_accuracy": 0.6623810529708862, "num_tokens": 232632247.0, "step": 562 }, { "epoch": 0.987719298245614, "grad_norm": 0.6285355687141418, "learning_rate": 0.0001, "loss": 1.1932, "mean_token_accuracy": 0.6584882140159607, "num_tokens": 233040299.0, "step": 563 }, { "epoch": 0.9894736842105263, "grad_norm": 0.6787013411521912, "learning_rate": 0.0001, "loss": 1.2017, "mean_token_accuracy": 0.6589356660842896, "num_tokens": 233476898.0, "step": 564 }, { "epoch": 0.9912280701754386, "grad_norm": 0.5261335372924805, "learning_rate": 0.0001, "loss": 1.1674, "mean_token_accuracy": 0.6651347279548645, "num_tokens": 233898203.0, "step": 565 }, { "epoch": 0.9929824561403509, "grad_norm": 0.6217278242111206, "learning_rate": 0.0001, "loss": 1.2212, "mean_token_accuracy": 0.653939962387085, "num_tokens": 234316491.0, "step": 566 }, { "epoch": 0.9947368421052631, "grad_norm": 0.6469559073448181, "learning_rate": 0.0001, "loss": 1.1848, "mean_token_accuracy": 0.6615195870399475, "num_tokens": 234725455.0, "step": 567 }, { "epoch": 0.9964912280701754, "grad_norm": 0.6558631062507629, "learning_rate": 0.0001, "loss": 1.173, "mean_token_accuracy": 0.6650323867797852, "num_tokens": 235152094.0, "step": 568 }, { "epoch": 0.9982456140350877, "grad_norm": 0.6159579157829285, "learning_rate": 0.0001, "loss": 1.1885, "mean_token_accuracy": 0.6604526042938232, "num_tokens": 235558911.0, "step": 569 }, { "epoch": 1.0, "grad_norm": 0.6799984574317932, "learning_rate": 0.0001, "loss": 1.1975, "mean_token_accuracy": 0.6584136486053467, "num_tokens": 235994347.0, "step": 570 }, { "epoch": 1.0, "step": 570, "total_flos": 1.377941890090926e+18, "train_loss": 1.2564991597543682, "train_runtime": 1307.2842, "train_samples_per_second": 111.621, "train_steps_per_second": 0.436 } ], "logging_steps": 1, "max_steps": 570, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 285, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.377941890090926e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }