{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 728, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027472527472527475, "grad_norm": 79.80936431884766, "learning_rate": 1.36986301369863e-07, "loss": 3.2006, "mean_token_accuracy": 0.5809080600738525, "step": 1 }, { "epoch": 0.005494505494505495, "grad_norm": 73.02354431152344, "learning_rate": 2.73972602739726e-07, "loss": 3.0485, "mean_token_accuracy": 0.5531686544418335, "step": 2 }, { "epoch": 0.008241758241758242, "grad_norm": 90.18173217773438, "learning_rate": 4.1095890410958903e-07, "loss": 3.3162, "mean_token_accuracy": 0.5780051350593567, "step": 3 }, { "epoch": 0.01098901098901099, "grad_norm": 89.79154205322266, "learning_rate": 5.47945205479452e-07, "loss": 3.2123, "mean_token_accuracy": 0.5886699557304382, "step": 4 }, { "epoch": 0.013736263736263736, "grad_norm": 83.19721984863281, "learning_rate": 6.849315068493151e-07, "loss": 3.1087, "mean_token_accuracy": 0.5597964525222778, "step": 5 }, { "epoch": 0.016483516483516484, "grad_norm": 75.79467010498047, "learning_rate": 8.219178082191781e-07, "loss": 3.4744, "mean_token_accuracy": 0.54666668176651, "step": 6 }, { "epoch": 0.019230769230769232, "grad_norm": 63.705596923828125, "learning_rate": 9.589041095890411e-07, "loss": 3.7404, "mean_token_accuracy": 0.48148149251937866, "step": 7 }, { "epoch": 0.02197802197802198, "grad_norm": 53.74971008300781, "learning_rate": 1.095890410958904e-06, "loss": 3.3282, "mean_token_accuracy": 0.5387324094772339, "step": 8 }, { "epoch": 0.024725274725274724, "grad_norm": 52.09133529663086, "learning_rate": 1.2328767123287673e-06, "loss": 3.1801, "mean_token_accuracy": 0.5777262449264526, "step": 9 }, { "epoch": 0.027472527472527472, "grad_norm": 37.37009048461914, "learning_rate": 1.3698630136986302e-06, "loss": 2.6745, "mean_token_accuracy": 0.6029082536697388, "step": 10 }, { "epoch": 0.03021978021978022, "grad_norm": 36.568485260009766, "learning_rate": 1.5068493150684932e-06, "loss": 2.8587, "mean_token_accuracy": 0.590347945690155, "step": 11 }, { "epoch": 0.03296703296703297, "grad_norm": 35.09193801879883, "learning_rate": 1.6438356164383561e-06, "loss": 2.4748, "mean_token_accuracy": 0.6369710564613342, "step": 12 }, { "epoch": 0.03571428571428571, "grad_norm": 29.399085998535156, "learning_rate": 1.7808219178082193e-06, "loss": 2.3455, "mean_token_accuracy": 0.6575654149055481, "step": 13 }, { "epoch": 0.038461538461538464, "grad_norm": 29.879207611083984, "learning_rate": 1.9178082191780823e-06, "loss": 2.9237, "mean_token_accuracy": 0.5609756112098694, "step": 14 }, { "epoch": 0.04120879120879121, "grad_norm": 31.87448501586914, "learning_rate": 2.0547945205479454e-06, "loss": 2.8315, "mean_token_accuracy": 0.5687830448150635, "step": 15 }, { "epoch": 0.04395604395604396, "grad_norm": 29.622562408447266, "learning_rate": 2.191780821917808e-06, "loss": 2.7541, "mean_token_accuracy": 0.6079999804496765, "step": 16 }, { "epoch": 0.046703296703296704, "grad_norm": 46.84980010986328, "learning_rate": 2.3287671232876713e-06, "loss": 2.2641, "mean_token_accuracy": 0.6080626845359802, "step": 17 }, { "epoch": 0.04945054945054945, "grad_norm": 34.44080352783203, "learning_rate": 2.4657534246575345e-06, "loss": 2.4124, "mean_token_accuracy": 0.6391075849533081, "step": 18 }, { "epoch": 0.0521978021978022, "grad_norm": 36.21349334716797, "learning_rate": 2.6027397260273973e-06, "loss": 2.2594, "mean_token_accuracy": 0.6270996928215027, "step": 19 }, { "epoch": 0.054945054945054944, "grad_norm": 39.15977096557617, "learning_rate": 2.7397260273972604e-06, "loss": 2.2868, "mean_token_accuracy": 0.6432291865348816, "step": 20 }, { "epoch": 0.057692307692307696, "grad_norm": 35.869197845458984, "learning_rate": 2.876712328767123e-06, "loss": 2.4556, "mean_token_accuracy": 0.6113360524177551, "step": 21 }, { "epoch": 0.06043956043956044, "grad_norm": 30.194963455200195, "learning_rate": 3.0136986301369864e-06, "loss": 2.436, "mean_token_accuracy": 0.6002538204193115, "step": 22 }, { "epoch": 0.06318681318681318, "grad_norm": 24.54381561279297, "learning_rate": 3.1506849315068495e-06, "loss": 2.6554, "mean_token_accuracy": 0.5426666736602783, "step": 23 }, { "epoch": 0.06593406593406594, "grad_norm": 19.29971694946289, "learning_rate": 3.2876712328767123e-06, "loss": 2.4589, "mean_token_accuracy": 0.5473411083221436, "step": 24 }, { "epoch": 0.06868131868131869, "grad_norm": 16.851980209350586, "learning_rate": 3.4246575342465754e-06, "loss": 2.2021, "mean_token_accuracy": 0.6083832383155823, "step": 25 }, { "epoch": 0.07142857142857142, "grad_norm": 17.44552230834961, "learning_rate": 3.5616438356164386e-06, "loss": 1.989, "mean_token_accuracy": 0.6418079137802124, "step": 26 }, { "epoch": 0.07417582417582418, "grad_norm": 17.299097061157227, "learning_rate": 3.6986301369863014e-06, "loss": 1.8649, "mean_token_accuracy": 0.6751946806907654, "step": 27 }, { "epoch": 0.07692307692307693, "grad_norm": 20.152931213378906, "learning_rate": 3.8356164383561645e-06, "loss": 1.9959, "mean_token_accuracy": 0.6616438627243042, "step": 28 }, { "epoch": 0.07967032967032966, "grad_norm": 16.22498321533203, "learning_rate": 3.972602739726027e-06, "loss": 1.7964, "mean_token_accuracy": 0.6655290126800537, "step": 29 }, { "epoch": 0.08241758241758242, "grad_norm": 16.668590545654297, "learning_rate": 4.109589041095891e-06, "loss": 2.1534, "mean_token_accuracy": 0.6357786655426025, "step": 30 }, { "epoch": 0.08516483516483517, "grad_norm": 17.106897354125977, "learning_rate": 4.246575342465754e-06, "loss": 2.0976, "mean_token_accuracy": 0.6054502129554749, "step": 31 }, { "epoch": 0.08791208791208792, "grad_norm": 17.229530334472656, "learning_rate": 4.383561643835616e-06, "loss": 2.0481, "mean_token_accuracy": 0.6231527328491211, "step": 32 }, { "epoch": 0.09065934065934066, "grad_norm": 15.264318466186523, "learning_rate": 4.52054794520548e-06, "loss": 1.9633, "mean_token_accuracy": 0.6321558952331543, "step": 33 }, { "epoch": 0.09340659340659341, "grad_norm": 16.209901809692383, "learning_rate": 4.657534246575343e-06, "loss": 1.9988, "mean_token_accuracy": 0.6246753334999084, "step": 34 }, { "epoch": 0.09615384615384616, "grad_norm": 15.802133560180664, "learning_rate": 4.7945205479452054e-06, "loss": 1.7998, "mean_token_accuracy": 0.6637499928474426, "step": 35 }, { "epoch": 0.0989010989010989, "grad_norm": 19.071365356445312, "learning_rate": 4.931506849315069e-06, "loss": 2.1914, "mean_token_accuracy": 0.6349413394927979, "step": 36 }, { "epoch": 0.10164835164835165, "grad_norm": 21.21844482421875, "learning_rate": 5.068493150684932e-06, "loss": 2.2208, "mean_token_accuracy": 0.5880758762359619, "step": 37 }, { "epoch": 0.1043956043956044, "grad_norm": 18.13629150390625, "learning_rate": 5.2054794520547945e-06, "loss": 1.9187, "mean_token_accuracy": 0.642346203327179, "step": 38 }, { "epoch": 0.10714285714285714, "grad_norm": 17.24445152282715, "learning_rate": 5.342465753424658e-06, "loss": 1.9784, "mean_token_accuracy": 0.6438547372817993, "step": 39 }, { "epoch": 0.10989010989010989, "grad_norm": 15.44080638885498, "learning_rate": 5.479452054794521e-06, "loss": 1.9891, "mean_token_accuracy": 0.6200241446495056, "step": 40 }, { "epoch": 0.11263736263736264, "grad_norm": 14.55709457397461, "learning_rate": 5.6164383561643845e-06, "loss": 1.7106, "mean_token_accuracy": 0.6681614518165588, "step": 41 }, { "epoch": 0.11538461538461539, "grad_norm": 15.768806457519531, "learning_rate": 5.753424657534246e-06, "loss": 2.1038, "mean_token_accuracy": 0.5827505588531494, "step": 42 }, { "epoch": 0.11813186813186813, "grad_norm": 16.35256576538086, "learning_rate": 5.89041095890411e-06, "loss": 2.0303, "mean_token_accuracy": 0.5822167992591858, "step": 43 }, { "epoch": 0.12087912087912088, "grad_norm": 16.935888290405273, "learning_rate": 6.027397260273973e-06, "loss": 1.7315, "mean_token_accuracy": 0.6608186960220337, "step": 44 }, { "epoch": 0.12362637362637363, "grad_norm": 17.174970626831055, "learning_rate": 6.164383561643836e-06, "loss": 1.9942, "mean_token_accuracy": 0.6288659572601318, "step": 45 }, { "epoch": 0.12637362637362637, "grad_norm": 19.020933151245117, "learning_rate": 6.301369863013699e-06, "loss": 1.9312, "mean_token_accuracy": 0.6689007878303528, "step": 46 }, { "epoch": 0.12912087912087913, "grad_norm": 17.157032012939453, "learning_rate": 6.438356164383563e-06, "loss": 2.0065, "mean_token_accuracy": 0.5868187546730042, "step": 47 }, { "epoch": 0.13186813186813187, "grad_norm": 15.546041488647461, "learning_rate": 6.5753424657534245e-06, "loss": 1.5726, "mean_token_accuracy": 0.7092882990837097, "step": 48 }, { "epoch": 0.1346153846153846, "grad_norm": 14.470401763916016, "learning_rate": 6.712328767123288e-06, "loss": 1.6003, "mean_token_accuracy": 0.681922197341919, "step": 49 }, { "epoch": 0.13736263736263737, "grad_norm": 15.666678428649902, "learning_rate": 6.849315068493151e-06, "loss": 1.6609, "mean_token_accuracy": 0.6787048578262329, "step": 50 }, { "epoch": 0.1401098901098901, "grad_norm": 14.968053817749023, "learning_rate": 6.9863013698630145e-06, "loss": 1.9766, "mean_token_accuracy": 0.6470588445663452, "step": 51 }, { "epoch": 0.14285714285714285, "grad_norm": 15.640029907226562, "learning_rate": 7.123287671232877e-06, "loss": 1.736, "mean_token_accuracy": 0.6700379252433777, "step": 52 }, { "epoch": 0.14560439560439561, "grad_norm": 17.240005493164062, "learning_rate": 7.260273972602741e-06, "loss": 1.6045, "mean_token_accuracy": 0.6543046236038208, "step": 53 }, { "epoch": 0.14835164835164835, "grad_norm": 17.649545669555664, "learning_rate": 7.397260273972603e-06, "loss": 2.0, "mean_token_accuracy": 0.6148648858070374, "step": 54 }, { "epoch": 0.1510989010989011, "grad_norm": 17.608076095581055, "learning_rate": 7.534246575342466e-06, "loss": 1.5678, "mean_token_accuracy": 0.7154255509376526, "step": 55 }, { "epoch": 0.15384615384615385, "grad_norm": 17.627878189086914, "learning_rate": 7.671232876712329e-06, "loss": 1.6404, "mean_token_accuracy": 0.6906377077102661, "step": 56 }, { "epoch": 0.1565934065934066, "grad_norm": 17.066679000854492, "learning_rate": 7.808219178082192e-06, "loss": 2.0689, "mean_token_accuracy": 0.6567164063453674, "step": 57 }, { "epoch": 0.15934065934065933, "grad_norm": 16.802358627319336, "learning_rate": 7.945205479452055e-06, "loss": 1.8222, "mean_token_accuracy": 0.6650062203407288, "step": 58 }, { "epoch": 0.1620879120879121, "grad_norm": 14.752166748046875, "learning_rate": 8.082191780821919e-06, "loss": 1.3144, "mean_token_accuracy": 0.7395243644714355, "step": 59 }, { "epoch": 0.16483516483516483, "grad_norm": 15.987241744995117, "learning_rate": 8.219178082191782e-06, "loss": 1.6505, "mean_token_accuracy": 0.6697613000869751, "step": 60 }, { "epoch": 0.16758241758241757, "grad_norm": 14.781291007995605, "learning_rate": 8.356164383561644e-06, "loss": 1.5079, "mean_token_accuracy": 0.6952491402626038, "step": 61 }, { "epoch": 0.17032967032967034, "grad_norm": 17.22585105895996, "learning_rate": 8.493150684931507e-06, "loss": 1.7571, "mean_token_accuracy": 0.67527174949646, "step": 62 }, { "epoch": 0.17307692307692307, "grad_norm": 14.363125801086426, "learning_rate": 8.63013698630137e-06, "loss": 1.2132, "mean_token_accuracy": 0.7480490803718567, "step": 63 }, { "epoch": 0.17582417582417584, "grad_norm": 14.252264976501465, "learning_rate": 8.767123287671233e-06, "loss": 1.3583, "mean_token_accuracy": 0.751387357711792, "step": 64 }, { "epoch": 0.17857142857142858, "grad_norm": 16.291038513183594, "learning_rate": 8.904109589041097e-06, "loss": 1.5959, "mean_token_accuracy": 0.7064676880836487, "step": 65 }, { "epoch": 0.1813186813186813, "grad_norm": 16.286724090576172, "learning_rate": 9.04109589041096e-06, "loss": 2.0605, "mean_token_accuracy": 0.5750834345817566, "step": 66 }, { "epoch": 0.18406593406593408, "grad_norm": 16.324674606323242, "learning_rate": 9.178082191780823e-06, "loss": 1.4098, "mean_token_accuracy": 0.7394179701805115, "step": 67 }, { "epoch": 0.18681318681318682, "grad_norm": 14.816543579101562, "learning_rate": 9.315068493150685e-06, "loss": 1.4708, "mean_token_accuracy": 0.7154285907745361, "step": 68 }, { "epoch": 0.18956043956043955, "grad_norm": 16.583890914916992, "learning_rate": 9.452054794520548e-06, "loss": 1.7538, "mean_token_accuracy": 0.677135705947876, "step": 69 }, { "epoch": 0.19230769230769232, "grad_norm": 14.951848030090332, "learning_rate": 9.589041095890411e-06, "loss": 1.5561, "mean_token_accuracy": 0.70525062084198, "step": 70 }, { "epoch": 0.19505494505494506, "grad_norm": 16.887245178222656, "learning_rate": 9.726027397260275e-06, "loss": 1.6847, "mean_token_accuracy": 0.6716621518135071, "step": 71 }, { "epoch": 0.1978021978021978, "grad_norm": 16.578994750976562, "learning_rate": 9.863013698630138e-06, "loss": 1.7824, "mean_token_accuracy": 0.644752025604248, "step": 72 }, { "epoch": 0.20054945054945056, "grad_norm": 17.51304817199707, "learning_rate": 1e-05, "loss": 1.4727, "mean_token_accuracy": 0.7205039858818054, "step": 73 }, { "epoch": 0.2032967032967033, "grad_norm": 17.006372451782227, "learning_rate": 9.999948239456138e-06, "loss": 2.0252, "mean_token_accuracy": 0.6419752836227417, "step": 74 }, { "epoch": 0.20604395604395603, "grad_norm": 13.299188613891602, "learning_rate": 9.999792959015282e-06, "loss": 1.3618, "mean_token_accuracy": 0.7111574411392212, "step": 75 }, { "epoch": 0.2087912087912088, "grad_norm": 16.785383224487305, "learning_rate": 9.999534162249611e-06, "loss": 1.6636, "mean_token_accuracy": 0.6866666674613953, "step": 76 }, { "epoch": 0.21153846153846154, "grad_norm": 16.0639591217041, "learning_rate": 9.999171855112667e-06, "loss": 1.6716, "mean_token_accuracy": 0.6938775777816772, "step": 77 }, { "epoch": 0.21428571428571427, "grad_norm": 14.496546745300293, "learning_rate": 9.998706045939206e-06, "loss": 1.5081, "mean_token_accuracy": 0.7064676880836487, "step": 78 }, { "epoch": 0.21703296703296704, "grad_norm": 14.92887020111084, "learning_rate": 9.998136745445027e-06, "loss": 1.5817, "mean_token_accuracy": 0.672703742980957, "step": 79 }, { "epoch": 0.21978021978021978, "grad_norm": 15.17734146118164, "learning_rate": 9.997463966726706e-06, "loss": 1.7535, "mean_token_accuracy": 0.614503800868988, "step": 80 }, { "epoch": 0.22252747252747251, "grad_norm": 13.309514045715332, "learning_rate": 9.996687725261309e-06, "loss": 1.4115, "mean_token_accuracy": 0.6910039186477661, "step": 81 }, { "epoch": 0.22527472527472528, "grad_norm": 13.710087776184082, "learning_rate": 9.995808038906024e-06, "loss": 1.9807, "mean_token_accuracy": 0.6172986030578613, "step": 82 }, { "epoch": 0.22802197802197802, "grad_norm": 11.803552627563477, "learning_rate": 9.994824927897763e-06, "loss": 1.3752, "mean_token_accuracy": 0.6950276494026184, "step": 83 }, { "epoch": 0.23076923076923078, "grad_norm": 10.76941204071045, "learning_rate": 9.993738414852683e-06, "loss": 1.3429, "mean_token_accuracy": 0.7200000286102295, "step": 84 }, { "epoch": 0.23351648351648352, "grad_norm": 12.579700469970703, "learning_rate": 9.992548524765677e-06, "loss": 1.6501, "mean_token_accuracy": 0.6551281809806824, "step": 85 }, { "epoch": 0.23626373626373626, "grad_norm": 12.145017623901367, "learning_rate": 9.991255285009794e-06, "loss": 1.7792, "mean_token_accuracy": 0.6391875743865967, "step": 86 }, { "epoch": 0.23901098901098902, "grad_norm": 12.107645034790039, "learning_rate": 9.989858725335608e-06, "loss": 1.7553, "mean_token_accuracy": 0.6398928761482239, "step": 87 }, { "epoch": 0.24175824175824176, "grad_norm": 11.869401931762695, "learning_rate": 9.988358877870536e-06, "loss": 1.4868, "mean_token_accuracy": 0.7045204043388367, "step": 88 }, { "epoch": 0.2445054945054945, "grad_norm": 11.478519439697266, "learning_rate": 9.986755777118095e-06, "loss": 1.6232, "mean_token_accuracy": 0.6606714725494385, "step": 89 }, { "epoch": 0.24725274725274726, "grad_norm": 12.422060012817383, "learning_rate": 9.985049459957121e-06, "loss": 1.1337, "mean_token_accuracy": 0.7422279715538025, "step": 90 }, { "epoch": 0.25, "grad_norm": 13.169363021850586, "learning_rate": 9.983239965640902e-06, "loss": 1.2939, "mean_token_accuracy": 0.7121729254722595, "step": 91 }, { "epoch": 0.25274725274725274, "grad_norm": 11.118330001831055, "learning_rate": 9.981327335796284e-06, "loss": 1.4643, "mean_token_accuracy": 0.6801661252975464, "step": 92 }, { "epoch": 0.2554945054945055, "grad_norm": 13.81788444519043, "learning_rate": 9.97931161442272e-06, "loss": 1.6353, "mean_token_accuracy": 0.6736111044883728, "step": 93 }, { "epoch": 0.25824175824175827, "grad_norm": 11.797772407531738, "learning_rate": 9.977192847891245e-06, "loss": 1.4335, "mean_token_accuracy": 0.6979038119316101, "step": 94 }, { "epoch": 0.260989010989011, "grad_norm": 11.760414123535156, "learning_rate": 9.974971084943421e-06, "loss": 1.5042, "mean_token_accuracy": 0.6891191601753235, "step": 95 }, { "epoch": 0.26373626373626374, "grad_norm": 12.482735633850098, "learning_rate": 9.972646376690204e-06, "loss": 1.3888, "mean_token_accuracy": 0.7099447250366211, "step": 96 }, { "epoch": 0.2664835164835165, "grad_norm": 11.861308097839355, "learning_rate": 9.970218776610781e-06, "loss": 1.6511, "mean_token_accuracy": 0.6784741282463074, "step": 97 }, { "epoch": 0.2692307692307692, "grad_norm": 11.980021476745605, "learning_rate": 9.967688340551328e-06, "loss": 1.4763, "mean_token_accuracy": 0.6922183632850647, "step": 98 }, { "epoch": 0.27197802197802196, "grad_norm": 12.542531967163086, "learning_rate": 9.965055126723733e-06, "loss": 1.7086, "mean_token_accuracy": 0.6587795615196228, "step": 99 }, { "epoch": 0.27472527472527475, "grad_norm": 11.29738998413086, "learning_rate": 9.962319195704253e-06, "loss": 1.2443, "mean_token_accuracy": 0.7068965435028076, "step": 100 }, { "epoch": 0.2774725274725275, "grad_norm": 10.83761978149414, "learning_rate": 9.959480610432126e-06, "loss": 1.3708, "mean_token_accuracy": 0.7224435806274414, "step": 101 }, { "epoch": 0.2802197802197802, "grad_norm": 11.421622276306152, "learning_rate": 9.956539436208109e-06, "loss": 1.2497, "mean_token_accuracy": 0.704336404800415, "step": 102 }, { "epoch": 0.28296703296703296, "grad_norm": 11.305917739868164, "learning_rate": 9.953495740692997e-06, "loss": 1.4686, "mean_token_accuracy": 0.6876543164253235, "step": 103 }, { "epoch": 0.2857142857142857, "grad_norm": 10.745238304138184, "learning_rate": 9.950349593906047e-06, "loss": 1.4166, "mean_token_accuracy": 0.695652186870575, "step": 104 }, { "epoch": 0.28846153846153844, "grad_norm": 10.237006187438965, "learning_rate": 9.947101068223379e-06, "loss": 1.1987, "mean_token_accuracy": 0.717255711555481, "step": 105 }, { "epoch": 0.29120879120879123, "grad_norm": 14.004864692687988, "learning_rate": 9.943750238376311e-06, "loss": 1.9631, "mean_token_accuracy": 0.5841726660728455, "step": 106 }, { "epoch": 0.29395604395604397, "grad_norm": 12.94350814819336, "learning_rate": 9.940297181449626e-06, "loss": 1.5451, "mean_token_accuracy": 0.6545681953430176, "step": 107 }, { "epoch": 0.2967032967032967, "grad_norm": 11.937135696411133, "learning_rate": 9.93674197687982e-06, "loss": 1.5099, "mean_token_accuracy": 0.6719636917114258, "step": 108 }, { "epoch": 0.29945054945054944, "grad_norm": 12.004481315612793, "learning_rate": 9.933084706453253e-06, "loss": 1.362, "mean_token_accuracy": 0.719239354133606, "step": 109 }, { "epoch": 0.3021978021978022, "grad_norm": 11.889949798583984, "learning_rate": 9.929325454304288e-06, "loss": 1.3806, "mean_token_accuracy": 0.7053072452545166, "step": 110 }, { "epoch": 0.30494505494505497, "grad_norm": 29.812204360961914, "learning_rate": 9.92546430691334e-06, "loss": 1.5889, "mean_token_accuracy": 0.6545681953430176, "step": 111 }, { "epoch": 0.3076923076923077, "grad_norm": 12.241945266723633, "learning_rate": 9.92150135310489e-06, "loss": 1.4933, "mean_token_accuracy": 0.6770833134651184, "step": 112 }, { "epoch": 0.31043956043956045, "grad_norm": 11.248332023620605, "learning_rate": 9.917436684045452e-06, "loss": 1.2714, "mean_token_accuracy": 0.6968973875045776, "step": 113 }, { "epoch": 0.3131868131868132, "grad_norm": 11.750696182250977, "learning_rate": 9.913270393241456e-06, "loss": 1.2923, "mean_token_accuracy": 0.7200000286102295, "step": 114 }, { "epoch": 0.3159340659340659, "grad_norm": 11.194052696228027, "learning_rate": 9.90900257653712e-06, "loss": 1.488, "mean_token_accuracy": 0.678185760974884, "step": 115 }, { "epoch": 0.31868131868131866, "grad_norm": 10.4213228225708, "learning_rate": 9.904633332112222e-06, "loss": 1.2861, "mean_token_accuracy": 0.6877682209014893, "step": 116 }, { "epoch": 0.32142857142857145, "grad_norm": 12.326902389526367, "learning_rate": 9.900162760479863e-06, "loss": 1.4382, "mean_token_accuracy": 0.7132667899131775, "step": 117 }, { "epoch": 0.3241758241758242, "grad_norm": 11.26202392578125, "learning_rate": 9.895590964484142e-06, "loss": 1.5174, "mean_token_accuracy": 0.6674786806106567, "step": 118 }, { "epoch": 0.3269230769230769, "grad_norm": 11.558907508850098, "learning_rate": 9.890918049297785e-06, "loss": 1.4946, "mean_token_accuracy": 0.6675094962120056, "step": 119 }, { "epoch": 0.32967032967032966, "grad_norm": 12.017938613891602, "learning_rate": 9.886144122419744e-06, "loss": 1.3344, "mean_token_accuracy": 0.7115135788917542, "step": 120 }, { "epoch": 0.3324175824175824, "grad_norm": 12.464367866516113, "learning_rate": 9.881269293672706e-06, "loss": 1.6151, "mean_token_accuracy": 0.6619115471839905, "step": 121 }, { "epoch": 0.33516483516483514, "grad_norm": 11.000343322753906, "learning_rate": 9.87629367520058e-06, "loss": 1.4913, "mean_token_accuracy": 0.6726457476615906, "step": 122 }, { "epoch": 0.33791208791208793, "grad_norm": 11.648791313171387, "learning_rate": 9.871217381465904e-06, "loss": 1.5451, "mean_token_accuracy": 0.6589806079864502, "step": 123 }, { "epoch": 0.34065934065934067, "grad_norm": 11.940444946289062, "learning_rate": 9.866040529247224e-06, "loss": 1.342, "mean_token_accuracy": 0.7100591659545898, "step": 124 }, { "epoch": 0.3434065934065934, "grad_norm": 11.262575149536133, "learning_rate": 9.860763237636397e-06, "loss": 1.3526, "mean_token_accuracy": 0.6929922103881836, "step": 125 }, { "epoch": 0.34615384615384615, "grad_norm": 10.675347328186035, "learning_rate": 9.855385628035866e-06, "loss": 1.0643, "mean_token_accuracy": 0.7545126080513, "step": 126 }, { "epoch": 0.3489010989010989, "grad_norm": 12.187606811523438, "learning_rate": 9.849907824155847e-06, "loss": 1.4871, "mean_token_accuracy": 0.6959620118141174, "step": 127 }, { "epoch": 0.3516483516483517, "grad_norm": 12.919577598571777, "learning_rate": 9.844329952011506e-06, "loss": 1.8205, "mean_token_accuracy": 0.6162227392196655, "step": 128 }, { "epoch": 0.3543956043956044, "grad_norm": 12.444584846496582, "learning_rate": 9.838652139920032e-06, "loss": 1.5858, "mean_token_accuracy": 0.6922005414962769, "step": 129 }, { "epoch": 0.35714285714285715, "grad_norm": 11.21202564239502, "learning_rate": 9.832874518497718e-06, "loss": 1.2042, "mean_token_accuracy": 0.7413793206214905, "step": 130 }, { "epoch": 0.3598901098901099, "grad_norm": 10.988180160522461, "learning_rate": 9.826997220656925e-06, "loss": 1.289, "mean_token_accuracy": 0.7080214023590088, "step": 131 }, { "epoch": 0.3626373626373626, "grad_norm": 12.33310604095459, "learning_rate": 9.821020381603052e-06, "loss": 1.4773, "mean_token_accuracy": 0.6791208982467651, "step": 132 }, { "epoch": 0.36538461538461536, "grad_norm": 11.341861724853516, "learning_rate": 9.814944138831402e-06, "loss": 1.4338, "mean_token_accuracy": 0.6756476759910583, "step": 133 }, { "epoch": 0.36813186813186816, "grad_norm": 11.409241676330566, "learning_rate": 9.808768632124033e-06, "loss": 1.6764, "mean_token_accuracy": 0.6054852604866028, "step": 134 }, { "epoch": 0.3708791208791209, "grad_norm": 12.015110969543457, "learning_rate": 9.802494003546537e-06, "loss": 1.4402, "mean_token_accuracy": 0.7052023410797119, "step": 135 }, { "epoch": 0.37362637362637363, "grad_norm": 10.703707695007324, "learning_rate": 9.79612039744478e-06, "loss": 1.3956, "mean_token_accuracy": 0.6877990365028381, "step": 136 }, { "epoch": 0.37637362637362637, "grad_norm": 9.813762664794922, "learning_rate": 9.789647960441567e-06, "loss": 1.0888, "mean_token_accuracy": 0.7485515475273132, "step": 137 }, { "epoch": 0.3791208791208791, "grad_norm": 11.24870777130127, "learning_rate": 9.78307684143328e-06, "loss": 1.5276, "mean_token_accuracy": 0.6972833275794983, "step": 138 }, { "epoch": 0.38186813186813184, "grad_norm": 13.590780258178711, "learning_rate": 9.77640719158645e-06, "loss": 1.5919, "mean_token_accuracy": 0.6567164063453674, "step": 139 }, { "epoch": 0.38461538461538464, "grad_norm": 13.572460174560547, "learning_rate": 9.769639164334279e-06, "loss": 1.7118, "mean_token_accuracy": 0.6849489808082581, "step": 140 }, { "epoch": 0.3873626373626374, "grad_norm": 11.5997896194458, "learning_rate": 9.76277291537311e-06, "loss": 1.279, "mean_token_accuracy": 0.7191435694694519, "step": 141 }, { "epoch": 0.3901098901098901, "grad_norm": 10.374162673950195, "learning_rate": 9.75580860265884e-06, "loss": 1.0923, "mean_token_accuracy": 0.7603210806846619, "step": 142 }, { "epoch": 0.39285714285714285, "grad_norm": 11.554888725280762, "learning_rate": 9.748746386403308e-06, "loss": 1.2049, "mean_token_accuracy": 0.7256177067756653, "step": 143 }, { "epoch": 0.3956043956043956, "grad_norm": 12.095945358276367, "learning_rate": 9.741586429070574e-06, "loss": 1.772, "mean_token_accuracy": 0.6329723000526428, "step": 144 }, { "epoch": 0.3983516483516483, "grad_norm": 11.040619850158691, "learning_rate": 9.73432889537321e-06, "loss": 1.2994, "mean_token_accuracy": 0.7119628190994263, "step": 145 }, { "epoch": 0.4010989010989011, "grad_norm": 13.869733810424805, "learning_rate": 9.726973952268507e-06, "loss": 1.9615, "mean_token_accuracy": 0.5833333134651184, "step": 146 }, { "epoch": 0.40384615384615385, "grad_norm": 16.08857536315918, "learning_rate": 9.719521768954615e-06, "loss": 1.767, "mean_token_accuracy": 0.6537982821464539, "step": 147 }, { "epoch": 0.4065934065934066, "grad_norm": 11.085786819458008, "learning_rate": 9.71197251686668e-06, "loss": 1.4405, "mean_token_accuracy": 0.6847457885742188, "step": 148 }, { "epoch": 0.40934065934065933, "grad_norm": 11.921771049499512, "learning_rate": 9.704326369672872e-06, "loss": 1.092, "mean_token_accuracy": 0.7311015129089355, "step": 149 }, { "epoch": 0.41208791208791207, "grad_norm": 12.389166831970215, "learning_rate": 9.696583503270409e-06, "loss": 1.3396, "mean_token_accuracy": 0.6969001293182373, "step": 150 }, { "epoch": 0.41483516483516486, "grad_norm": 12.603693008422852, "learning_rate": 9.688744095781501e-06, "loss": 1.494, "mean_token_accuracy": 0.6674938201904297, "step": 151 }, { "epoch": 0.4175824175824176, "grad_norm": 12.303590774536133, "learning_rate": 9.680808327549261e-06, "loss": 1.3313, "mean_token_accuracy": 0.7104136943817139, "step": 152 }, { "epoch": 0.42032967032967034, "grad_norm": 11.780409812927246, "learning_rate": 9.672776381133541e-06, "loss": 1.3221, "mean_token_accuracy": 0.7101449370384216, "step": 153 }, { "epoch": 0.4230769230769231, "grad_norm": 11.474019050598145, "learning_rate": 9.664648441306753e-06, "loss": 1.5544, "mean_token_accuracy": 0.6828644275665283, "step": 154 }, { "epoch": 0.4258241758241758, "grad_norm": 11.8672456741333, "learning_rate": 9.656424695049597e-06, "loss": 1.6288, "mean_token_accuracy": 0.6559263467788696, "step": 155 }, { "epoch": 0.42857142857142855, "grad_norm": 11.574856758117676, "learning_rate": 9.648105331546778e-06, "loss": 1.1779, "mean_token_accuracy": 0.7253766059875488, "step": 156 }, { "epoch": 0.43131868131868134, "grad_norm": 13.268657684326172, "learning_rate": 9.639690542182643e-06, "loss": 1.6477, "mean_token_accuracy": 0.6597131490707397, "step": 157 }, { "epoch": 0.4340659340659341, "grad_norm": 11.048521995544434, "learning_rate": 9.631180520536778e-06, "loss": 1.476, "mean_token_accuracy": 0.703125, "step": 158 }, { "epoch": 0.4368131868131868, "grad_norm": 10.296670913696289, "learning_rate": 9.622575462379562e-06, "loss": 1.128, "mean_token_accuracy": 0.7398906946182251, "step": 159 }, { "epoch": 0.43956043956043955, "grad_norm": 10.901885986328125, "learning_rate": 9.613875565667655e-06, "loss": 1.4029, "mean_token_accuracy": 0.6959537863731384, "step": 160 }, { "epoch": 0.4423076923076923, "grad_norm": 11.685770988464355, "learning_rate": 9.605081030539453e-06, "loss": 1.4721, "mean_token_accuracy": 0.6764705777168274, "step": 161 }, { "epoch": 0.44505494505494503, "grad_norm": 12.1082763671875, "learning_rate": 9.596192059310475e-06, "loss": 1.4389, "mean_token_accuracy": 0.7186761498451233, "step": 162 }, { "epoch": 0.4478021978021978, "grad_norm": 11.559240341186523, "learning_rate": 9.587208856468715e-06, "loss": 1.5186, "mean_token_accuracy": 0.6510416865348816, "step": 163 }, { "epoch": 0.45054945054945056, "grad_norm": 11.276078224182129, "learning_rate": 9.578131628669936e-06, "loss": 1.3336, "mean_token_accuracy": 0.7079530358314514, "step": 164 }, { "epoch": 0.4532967032967033, "grad_norm": 13.037327766418457, "learning_rate": 9.568960584732912e-06, "loss": 1.5361, "mean_token_accuracy": 0.6537467837333679, "step": 165 }, { "epoch": 0.45604395604395603, "grad_norm": 13.014458656311035, "learning_rate": 9.559695935634636e-06, "loss": 1.7156, "mean_token_accuracy": 0.6452054977416992, "step": 166 }, { "epoch": 0.45879120879120877, "grad_norm": 12.075533866882324, "learning_rate": 9.550337894505446e-06, "loss": 1.2952, "mean_token_accuracy": 0.7117853164672852, "step": 167 }, { "epoch": 0.46153846153846156, "grad_norm": 12.357090950012207, "learning_rate": 9.540886676624145e-06, "loss": 1.3997, "mean_token_accuracy": 0.6951871514320374, "step": 168 }, { "epoch": 0.4642857142857143, "grad_norm": 11.073513984680176, "learning_rate": 9.531342499413034e-06, "loss": 1.1314, "mean_token_accuracy": 0.7375144958496094, "step": 169 }, { "epoch": 0.46703296703296704, "grad_norm": 12.422121047973633, "learning_rate": 9.521705582432915e-06, "loss": 1.7004, "mean_token_accuracy": 0.6406821012496948, "step": 170 }, { "epoch": 0.4697802197802198, "grad_norm": 11.14840030670166, "learning_rate": 9.511976147378038e-06, "loss": 1.3761, "mean_token_accuracy": 0.709113597869873, "step": 171 }, { "epoch": 0.4725274725274725, "grad_norm": 11.885308265686035, "learning_rate": 9.502154418071002e-06, "loss": 1.2528, "mean_token_accuracy": 0.7129629850387573, "step": 172 }, { "epoch": 0.47527472527472525, "grad_norm": 11.952164649963379, "learning_rate": 9.492240620457609e-06, "loss": 1.2345, "mean_token_accuracy": 0.7546296119689941, "step": 173 }, { "epoch": 0.47802197802197804, "grad_norm": 12.823383331298828, "learning_rate": 9.48223498260166e-06, "loss": 1.3316, "mean_token_accuracy": 0.7302383780479431, "step": 174 }, { "epoch": 0.4807692307692308, "grad_norm": 11.896831512451172, "learning_rate": 9.472137734679715e-06, "loss": 1.5243, "mean_token_accuracy": 0.6763284802436829, "step": 175 }, { "epoch": 0.4835164835164835, "grad_norm": 12.501947402954102, "learning_rate": 9.461949108975794e-06, "loss": 1.896, "mean_token_accuracy": 0.6086404323577881, "step": 176 }, { "epoch": 0.48626373626373626, "grad_norm": 11.727622032165527, "learning_rate": 9.45166933987603e-06, "loss": 1.306, "mean_token_accuracy": 0.7123456597328186, "step": 177 }, { "epoch": 0.489010989010989, "grad_norm": 11.941061019897461, "learning_rate": 9.44129866386329e-06, "loss": 1.2725, "mean_token_accuracy": 0.7091412544250488, "step": 178 }, { "epoch": 0.49175824175824173, "grad_norm": 13.788084030151367, "learning_rate": 9.430837319511718e-06, "loss": 1.6164, "mean_token_accuracy": 0.6568502187728882, "step": 179 }, { "epoch": 0.4945054945054945, "grad_norm": 10.0413818359375, "learning_rate": 9.420285547481257e-06, "loss": 1.3372, "mean_token_accuracy": 0.6839577555656433, "step": 180 }, { "epoch": 0.49725274725274726, "grad_norm": 11.216383934020996, "learning_rate": 9.409643590512116e-06, "loss": 1.3565, "mean_token_accuracy": 0.7152317762374878, "step": 181 }, { "epoch": 0.5, "grad_norm": 12.224161148071289, "learning_rate": 9.398911693419168e-06, "loss": 1.2158, "mean_token_accuracy": 0.7243436574935913, "step": 182 }, { "epoch": 0.5027472527472527, "grad_norm": 10.415431022644043, "learning_rate": 9.388090103086344e-06, "loss": 1.2495, "mean_token_accuracy": 0.7525423765182495, "step": 183 }, { "epoch": 0.5054945054945055, "grad_norm": 12.435955047607422, "learning_rate": 9.37717906846093e-06, "loss": 1.3597, "mean_token_accuracy": 0.7139272093772888, "step": 184 }, { "epoch": 0.5082417582417582, "grad_norm": 11.92297649383545, "learning_rate": 9.366178840547853e-06, "loss": 1.2254, "mean_token_accuracy": 0.7317647337913513, "step": 185 }, { "epoch": 0.510989010989011, "grad_norm": 11.196688652038574, "learning_rate": 9.355089672403905e-06, "loss": 1.2686, "mean_token_accuracy": 0.7352586984634399, "step": 186 }, { "epoch": 0.5137362637362637, "grad_norm": 11.09434986114502, "learning_rate": 9.343911819131918e-06, "loss": 1.1463, "mean_token_accuracy": 0.7363420724868774, "step": 187 }, { "epoch": 0.5164835164835165, "grad_norm": 10.482327461242676, "learning_rate": 9.332645537874901e-06, "loss": 1.2708, "mean_token_accuracy": 0.7420091032981873, "step": 188 }, { "epoch": 0.5192307692307693, "grad_norm": 11.183061599731445, "learning_rate": 9.321291087810115e-06, "loss": 1.1896, "mean_token_accuracy": 0.7237977981567383, "step": 189 }, { "epoch": 0.521978021978022, "grad_norm": 12.56871223449707, "learning_rate": 9.309848730143122e-06, "loss": 1.4552, "mean_token_accuracy": 0.6875817179679871, "step": 190 }, { "epoch": 0.5247252747252747, "grad_norm": 12.282219886779785, "learning_rate": 9.298318728101769e-06, "loss": 1.6814, "mean_token_accuracy": 0.6299212574958801, "step": 191 }, { "epoch": 0.5274725274725275, "grad_norm": 11.732946395874023, "learning_rate": 9.286701346930134e-06, "loss": 1.3322, "mean_token_accuracy": 0.6991474032402039, "step": 192 }, { "epoch": 0.5302197802197802, "grad_norm": 11.771052360534668, "learning_rate": 9.274996853882426e-06, "loss": 1.5943, "mean_token_accuracy": 0.6880733966827393, "step": 193 }, { "epoch": 0.532967032967033, "grad_norm": 12.6982421875, "learning_rate": 9.263205518216834e-06, "loss": 1.4098, "mean_token_accuracy": 0.6879063844680786, "step": 194 }, { "epoch": 0.5357142857142857, "grad_norm": 12.095486640930176, "learning_rate": 9.251327611189333e-06, "loss": 1.4982, "mean_token_accuracy": 0.6825208067893982, "step": 195 }, { "epoch": 0.5384615384615384, "grad_norm": 10.93848991394043, "learning_rate": 9.239363406047446e-06, "loss": 1.3762, "mean_token_accuracy": 0.6978672742843628, "step": 196 }, { "epoch": 0.5412087912087912, "grad_norm": 10.177602767944336, "learning_rate": 9.227313178023962e-06, "loss": 1.2093, "mean_token_accuracy": 0.7204058766365051, "step": 197 }, { "epoch": 0.5439560439560439, "grad_norm": 11.405716896057129, "learning_rate": 9.21517720433059e-06, "loss": 1.4351, "mean_token_accuracy": 0.692307710647583, "step": 198 }, { "epoch": 0.5467032967032966, "grad_norm": 11.433396339416504, "learning_rate": 9.202955764151597e-06, "loss": 1.2367, "mean_token_accuracy": 0.7333333492279053, "step": 199 }, { "epoch": 0.5494505494505495, "grad_norm": 11.622626304626465, "learning_rate": 9.190649138637378e-06, "loss": 1.3771, "mean_token_accuracy": 0.707454264163971, "step": 200 }, { "epoch": 0.5521978021978022, "grad_norm": 10.980391502380371, "learning_rate": 9.178257610897996e-06, "loss": 1.3038, "mean_token_accuracy": 0.6971365809440613, "step": 201 }, { "epoch": 0.554945054945055, "grad_norm": 11.408574104309082, "learning_rate": 9.16578146599665e-06, "loss": 1.2083, "mean_token_accuracy": 0.7157106995582581, "step": 202 }, { "epoch": 0.5576923076923077, "grad_norm": 11.354791641235352, "learning_rate": 9.153220990943147e-06, "loss": 1.4099, "mean_token_accuracy": 0.7070063948631287, "step": 203 }, { "epoch": 0.5604395604395604, "grad_norm": 11.774231910705566, "learning_rate": 9.140576474687263e-06, "loss": 1.4604, "mean_token_accuracy": 0.6878378391265869, "step": 204 }, { "epoch": 0.5631868131868132, "grad_norm": 10.277194023132324, "learning_rate": 9.127848208112135e-06, "loss": 1.1073, "mean_token_accuracy": 0.7497337460517883, "step": 205 }, { "epoch": 0.5659340659340659, "grad_norm": 11.545296669006348, "learning_rate": 9.115036484027537e-06, "loss": 1.4562, "mean_token_accuracy": 0.7032085657119751, "step": 206 }, { "epoch": 0.5686813186813187, "grad_norm": 12.219311714172363, "learning_rate": 9.10214159716316e-06, "loss": 1.4314, "mean_token_accuracy": 0.6811988949775696, "step": 207 }, { "epoch": 0.5714285714285714, "grad_norm": 12.188143730163574, "learning_rate": 9.08916384416183e-06, "loss": 1.4823, "mean_token_accuracy": 0.6658130884170532, "step": 208 }, { "epoch": 0.5741758241758241, "grad_norm": 11.3258056640625, "learning_rate": 9.076103523572685e-06, "loss": 1.1717, "mean_token_accuracy": 0.7044854760169983, "step": 209 }, { "epoch": 0.5769230769230769, "grad_norm": 10.30764102935791, "learning_rate": 9.0629609358443e-06, "loss": 1.0972, "mean_token_accuracy": 0.7533252835273743, "step": 210 }, { "epoch": 0.5796703296703297, "grad_norm": 12.15485954284668, "learning_rate": 9.049736383317777e-06, "loss": 1.4388, "mean_token_accuracy": 0.6901408433914185, "step": 211 }, { "epoch": 0.5824175824175825, "grad_norm": 10.234277725219727, "learning_rate": 9.0364301702198e-06, "loss": 1.6417, "mean_token_accuracy": 0.6692913174629211, "step": 212 }, { "epoch": 0.5851648351648352, "grad_norm": 10.58019733428955, "learning_rate": 9.023042602655624e-06, "loss": 1.2144, "mean_token_accuracy": 0.7276995182037354, "step": 213 }, { "epoch": 0.5879120879120879, "grad_norm": 11.242633819580078, "learning_rate": 9.009573988602042e-06, "loss": 1.3192, "mean_token_accuracy": 0.6910569071769714, "step": 214 }, { "epoch": 0.5906593406593407, "grad_norm": 14.42585277557373, "learning_rate": 8.99602463790029e-06, "loss": 1.7263, "mean_token_accuracy": 0.6614060401916504, "step": 215 }, { "epoch": 0.5934065934065934, "grad_norm": 12.312402725219727, "learning_rate": 8.98239486224893e-06, "loss": 1.2264, "mean_token_accuracy": 0.7335203289985657, "step": 216 }, { "epoch": 0.5961538461538461, "grad_norm": 11.044991493225098, "learning_rate": 8.968684975196673e-06, "loss": 1.6332, "mean_token_accuracy": 0.6666666865348816, "step": 217 }, { "epoch": 0.5989010989010989, "grad_norm": 12.098809242248535, "learning_rate": 8.954895292135171e-06, "loss": 1.3326, "mean_token_accuracy": 0.715923547744751, "step": 218 }, { "epoch": 0.6016483516483516, "grad_norm": 10.962403297424316, "learning_rate": 8.94102613029175e-06, "loss": 1.1455, "mean_token_accuracy": 0.7470588088035583, "step": 219 }, { "epoch": 0.6043956043956044, "grad_norm": 13.998690605163574, "learning_rate": 8.927077808722127e-06, "loss": 1.5056, "mean_token_accuracy": 0.6980891823768616, "step": 220 }, { "epoch": 0.6071428571428571, "grad_norm": 13.010772705078125, "learning_rate": 8.913050648303064e-06, "loss": 1.5572, "mean_token_accuracy": 0.6395863890647888, "step": 221 }, { "epoch": 0.6098901098901099, "grad_norm": 11.286635398864746, "learning_rate": 8.898944971724983e-06, "loss": 1.5845, "mean_token_accuracy": 0.6471264362335205, "step": 222 }, { "epoch": 0.6126373626373627, "grad_norm": 10.34750747680664, "learning_rate": 8.884761103484548e-06, "loss": 1.31, "mean_token_accuracy": 0.7002262473106384, "step": 223 }, { "epoch": 0.6153846153846154, "grad_norm": 11.606867790222168, "learning_rate": 8.870499369877194e-06, "loss": 1.2979, "mean_token_accuracy": 0.7079856991767883, "step": 224 }, { "epoch": 0.6181318681318682, "grad_norm": 11.105297088623047, "learning_rate": 8.85616009898963e-06, "loss": 1.1587, "mean_token_accuracy": 0.7566539645195007, "step": 225 }, { "epoch": 0.6208791208791209, "grad_norm": 11.666239738464355, "learning_rate": 8.841743620692279e-06, "loss": 1.1036, "mean_token_accuracy": 0.7243674993515015, "step": 226 }, { "epoch": 0.6236263736263736, "grad_norm": 11.083747863769531, "learning_rate": 8.827250266631704e-06, "loss": 1.2543, "mean_token_accuracy": 0.7404980063438416, "step": 227 }, { "epoch": 0.6263736263736264, "grad_norm": 10.682806968688965, "learning_rate": 8.81268037022296e-06, "loss": 1.2809, "mean_token_accuracy": 0.711448609828949, "step": 228 }, { "epoch": 0.6291208791208791, "grad_norm": 9.957572937011719, "learning_rate": 8.798034266641948e-06, "loss": 1.226, "mean_token_accuracy": 0.7218863368034363, "step": 229 }, { "epoch": 0.6318681318681318, "grad_norm": 11.039960861206055, "learning_rate": 8.783312292817681e-06, "loss": 1.3115, "mean_token_accuracy": 0.722300112247467, "step": 230 }, { "epoch": 0.6346153846153846, "grad_norm": 12.31747055053711, "learning_rate": 8.768514787424548e-06, "loss": 1.6285, "mean_token_accuracy": 0.643410861492157, "step": 231 }, { "epoch": 0.6373626373626373, "grad_norm": 11.145745277404785, "learning_rate": 8.753642090874516e-06, "loss": 1.1148, "mean_token_accuracy": 0.7636138796806335, "step": 232 }, { "epoch": 0.6401098901098901, "grad_norm": 13.070597648620605, "learning_rate": 8.7386945453093e-06, "loss": 1.5973, "mean_token_accuracy": 0.6937984228134155, "step": 233 }, { "epoch": 0.6428571428571429, "grad_norm": 10.66708755493164, "learning_rate": 8.723672494592497e-06, "loss": 0.9787, "mean_token_accuracy": 0.7600487470626831, "step": 234 }, { "epoch": 0.6456043956043956, "grad_norm": 11.117300033569336, "learning_rate": 8.708576284301668e-06, "loss": 1.2219, "mean_token_accuracy": 0.7341614961624146, "step": 235 }, { "epoch": 0.6483516483516484, "grad_norm": 11.09400749206543, "learning_rate": 8.693406261720392e-06, "loss": 1.3723, "mean_token_accuracy": 0.7157622575759888, "step": 236 }, { "epoch": 0.6510989010989011, "grad_norm": 11.477158546447754, "learning_rate": 8.67816277583028e-06, "loss": 1.0769, "mean_token_accuracy": 0.7450980544090271, "step": 237 }, { "epoch": 0.6538461538461539, "grad_norm": 13.270724296569824, "learning_rate": 8.66284617730294e-06, "loss": 1.5841, "mean_token_accuracy": 0.6744487881660461, "step": 238 }, { "epoch": 0.6565934065934066, "grad_norm": 10.172012329101562, "learning_rate": 8.647456818491912e-06, "loss": 1.1889, "mean_token_accuracy": 0.7316538691520691, "step": 239 }, { "epoch": 0.6593406593406593, "grad_norm": 12.0361967086792, "learning_rate": 8.63199505342457e-06, "loss": 1.1995, "mean_token_accuracy": 0.740841269493103, "step": 240 }, { "epoch": 0.6620879120879121, "grad_norm": 12.539520263671875, "learning_rate": 8.616461237793962e-06, "loss": 1.2821, "mean_token_accuracy": 0.7157490253448486, "step": 241 }, { "epoch": 0.6648351648351648, "grad_norm": 10.934772491455078, "learning_rate": 8.600855728950645e-06, "loss": 1.373, "mean_token_accuracy": 0.7006896734237671, "step": 242 }, { "epoch": 0.6675824175824175, "grad_norm": 8.917642593383789, "learning_rate": 8.585178885894451e-06, "loss": 1.314, "mean_token_accuracy": 0.7070063948631287, "step": 243 }, { "epoch": 0.6703296703296703, "grad_norm": 11.78652572631836, "learning_rate": 8.569431069266236e-06, "loss": 1.5815, "mean_token_accuracy": 0.6636713743209839, "step": 244 }, { "epoch": 0.6730769230769231, "grad_norm": 10.088194847106934, "learning_rate": 8.553612641339577e-06, "loss": 1.3323, "mean_token_accuracy": 0.6963562965393066, "step": 245 }, { "epoch": 0.6758241758241759, "grad_norm": 10.238805770874023, "learning_rate": 8.537723966012444e-06, "loss": 1.0912, "mean_token_accuracy": 0.7577962875366211, "step": 246 }, { "epoch": 0.6785714285714286, "grad_norm": 10.34375, "learning_rate": 8.521765408798828e-06, "loss": 1.2058, "mean_token_accuracy": 0.7399346828460693, "step": 247 }, { "epoch": 0.6813186813186813, "grad_norm": 10.886113166809082, "learning_rate": 8.505737336820327e-06, "loss": 1.3668, "mean_token_accuracy": 0.7172582745552063, "step": 248 }, { "epoch": 0.6840659340659341, "grad_norm": 10.41820240020752, "learning_rate": 8.48964011879771e-06, "loss": 1.0878, "mean_token_accuracy": 0.7511904835700989, "step": 249 }, { "epoch": 0.6868131868131868, "grad_norm": 11.025300025939941, "learning_rate": 8.473474125042424e-06, "loss": 1.2859, "mean_token_accuracy": 0.7300613522529602, "step": 250 }, { "epoch": 0.6895604395604396, "grad_norm": 11.369904518127441, "learning_rate": 8.457239727448083e-06, "loss": 1.3666, "mean_token_accuracy": 0.676508367061615, "step": 251 }, { "epoch": 0.6923076923076923, "grad_norm": 13.08974838256836, "learning_rate": 8.440937299481906e-06, "loss": 1.5691, "mean_token_accuracy": 0.6548347473144531, "step": 252 }, { "epoch": 0.695054945054945, "grad_norm": 12.096307754516602, "learning_rate": 8.424567216176132e-06, "loss": 1.4822, "mean_token_accuracy": 0.6430939435958862, "step": 253 }, { "epoch": 0.6978021978021978, "grad_norm": 33.42136001586914, "learning_rate": 8.408129854119395e-06, "loss": 1.4324, "mean_token_accuracy": 0.6841530203819275, "step": 254 }, { "epoch": 0.7005494505494505, "grad_norm": 10.152215957641602, "learning_rate": 8.391625591448044e-06, "loss": 1.4295, "mean_token_accuracy": 0.6865127682685852, "step": 255 }, { "epoch": 0.7032967032967034, "grad_norm": 10.057063102722168, "learning_rate": 8.375054807837466e-06, "loss": 1.2096, "mean_token_accuracy": 0.7173252105712891, "step": 256 }, { "epoch": 0.7060439560439561, "grad_norm": 9.629958152770996, "learning_rate": 8.358417884493336e-06, "loss": 1.179, "mean_token_accuracy": 0.725824773311615, "step": 257 }, { "epoch": 0.7087912087912088, "grad_norm": 10.766592979431152, "learning_rate": 8.341715204142854e-06, "loss": 1.0063, "mean_token_accuracy": 0.7549751400947571, "step": 258 }, { "epoch": 0.7115384615384616, "grad_norm": 11.319416999816895, "learning_rate": 8.324947151025941e-06, "loss": 1.0844, "mean_token_accuracy": 0.75, "step": 259 }, { "epoch": 0.7142857142857143, "grad_norm": 11.977400779724121, "learning_rate": 8.308114110886397e-06, "loss": 1.5588, "mean_token_accuracy": 0.656862735748291, "step": 260 }, { "epoch": 0.717032967032967, "grad_norm": 16.977502822875977, "learning_rate": 8.291216470963026e-06, "loss": 0.9755, "mean_token_accuracy": 0.7794561982154846, "step": 261 }, { "epoch": 0.7197802197802198, "grad_norm": 11.817448616027832, "learning_rate": 8.274254619980728e-06, "loss": 1.2085, "mean_token_accuracy": 0.7370466589927673, "step": 262 }, { "epoch": 0.7225274725274725, "grad_norm": 12.315993309020996, "learning_rate": 8.257228948141569e-06, "loss": 1.3312, "mean_token_accuracy": 0.6979020833969116, "step": 263 }, { "epoch": 0.7252747252747253, "grad_norm": 12.558300018310547, "learning_rate": 8.24013984711578e-06, "loss": 1.2436, "mean_token_accuracy": 0.7176966071128845, "step": 264 }, { "epoch": 0.728021978021978, "grad_norm": 18.78453254699707, "learning_rate": 8.22298771003277e-06, "loss": 1.3972, "mean_token_accuracy": 0.7003994584083557, "step": 265 }, { "epoch": 0.7307692307692307, "grad_norm": 11.667128562927246, "learning_rate": 8.205772931472068e-06, "loss": 1.2294, "mean_token_accuracy": 0.7415881752967834, "step": 266 }, { "epoch": 0.7335164835164835, "grad_norm": 11.029336929321289, "learning_rate": 8.188495907454253e-06, "loss": 1.3288, "mean_token_accuracy": 0.7267441749572754, "step": 267 }, { "epoch": 0.7362637362637363, "grad_norm": 11.00023365020752, "learning_rate": 8.171157035431842e-06, "loss": 1.4495, "mean_token_accuracy": 0.6796338558197021, "step": 268 }, { "epoch": 0.739010989010989, "grad_norm": 12.021223068237305, "learning_rate": 8.153756714280143e-06, "loss": 1.5251, "mean_token_accuracy": 0.6521739363670349, "step": 269 }, { "epoch": 0.7417582417582418, "grad_norm": 11.635795593261719, "learning_rate": 8.13629534428808e-06, "loss": 1.488, "mean_token_accuracy": 0.6854636669158936, "step": 270 }, { "epoch": 0.7445054945054945, "grad_norm": 11.152154922485352, "learning_rate": 8.118773327148994e-06, "loss": 0.9805, "mean_token_accuracy": 0.7692307829856873, "step": 271 }, { "epoch": 0.7472527472527473, "grad_norm": 11.193713188171387, "learning_rate": 8.101191065951388e-06, "loss": 1.482, "mean_token_accuracy": 0.6787003874778748, "step": 272 }, { "epoch": 0.75, "grad_norm": 10.59757137298584, "learning_rate": 8.083548965169663e-06, "loss": 1.2458, "mean_token_accuracy": 0.7222856879234314, "step": 273 }, { "epoch": 0.7527472527472527, "grad_norm": 11.860565185546875, "learning_rate": 8.065847430654813e-06, "loss": 1.3717, "mean_token_accuracy": 0.6764705777168274, "step": 274 }, { "epoch": 0.7554945054945055, "grad_norm": 12.620820045471191, "learning_rate": 8.048086869625081e-06, "loss": 1.3039, "mean_token_accuracy": 0.6946778893470764, "step": 275 }, { "epoch": 0.7582417582417582, "grad_norm": 10.615486145019531, "learning_rate": 8.0302676906566e-06, "loss": 1.0608, "mean_token_accuracy": 0.7779126167297363, "step": 276 }, { "epoch": 0.760989010989011, "grad_norm": 11.107016563415527, "learning_rate": 8.012390303673994e-06, "loss": 1.4002, "mean_token_accuracy": 0.6892052292823792, "step": 277 }, { "epoch": 0.7637362637362637, "grad_norm": 13.737344741821289, "learning_rate": 7.994455119940936e-06, "loss": 1.5131, "mean_token_accuracy": 0.6741293668746948, "step": 278 }, { "epoch": 0.7664835164835165, "grad_norm": 14.99168586730957, "learning_rate": 7.976462552050696e-06, "loss": 1.3036, "mean_token_accuracy": 0.6979637742042542, "step": 279 }, { "epoch": 0.7692307692307693, "grad_norm": 11.542069435119629, "learning_rate": 7.958413013916657e-06, "loss": 1.3555, "mean_token_accuracy": 0.7072808146476746, "step": 280 }, { "epoch": 0.771978021978022, "grad_norm": 11.233433723449707, "learning_rate": 7.94030692076278e-06, "loss": 1.2378, "mean_token_accuracy": 0.714981734752655, "step": 281 }, { "epoch": 0.7747252747252747, "grad_norm": 12.117866516113281, "learning_rate": 7.92214468911405e-06, "loss": 1.3431, "mean_token_accuracy": 0.6927152276039124, "step": 282 }, { "epoch": 0.7774725274725275, "grad_norm": 12.193938255310059, "learning_rate": 7.903926736786908e-06, "loss": 1.2657, "mean_token_accuracy": 0.7244094610214233, "step": 283 }, { "epoch": 0.7802197802197802, "grad_norm": 10.754172325134277, "learning_rate": 7.885653482879632e-06, "loss": 1.1887, "mean_token_accuracy": 0.7329341173171997, "step": 284 }, { "epoch": 0.782967032967033, "grad_norm": 10.909982681274414, "learning_rate": 7.867325347762694e-06, "loss": 1.1202, "mean_token_accuracy": 0.7314148545265198, "step": 285 }, { "epoch": 0.7857142857142857, "grad_norm": 10.36418628692627, "learning_rate": 7.848942753069087e-06, "loss": 1.2096, "mean_token_accuracy": 0.7448856830596924, "step": 286 }, { "epoch": 0.7884615384615384, "grad_norm": 10.146966934204102, "learning_rate": 7.830506121684633e-06, "loss": 1.1391, "mean_token_accuracy": 0.7517985701560974, "step": 287 }, { "epoch": 0.7912087912087912, "grad_norm": 10.96828842163086, "learning_rate": 7.812015877738254e-06, "loss": 1.5554, "mean_token_accuracy": 0.6592317223548889, "step": 288 }, { "epoch": 0.7939560439560439, "grad_norm": 10.668539047241211, "learning_rate": 7.793472446592203e-06, "loss": 1.1544, "mean_token_accuracy": 0.7598944306373596, "step": 289 }, { "epoch": 0.7967032967032966, "grad_norm": 10.111360549926758, "learning_rate": 7.774876254832303e-06, "loss": 1.2486, "mean_token_accuracy": 0.7079261541366577, "step": 290 }, { "epoch": 0.7994505494505495, "grad_norm": 10.784381866455078, "learning_rate": 7.756227730258103e-06, "loss": 1.0485, "mean_token_accuracy": 0.7605294585227966, "step": 291 }, { "epoch": 0.8021978021978022, "grad_norm": 12.186707496643066, "learning_rate": 7.737527301873056e-06, "loss": 1.367, "mean_token_accuracy": 0.7229254841804504, "step": 292 }, { "epoch": 0.804945054945055, "grad_norm": 10.163819313049316, "learning_rate": 7.718775399874655e-06, "loss": 1.046, "mean_token_accuracy": 0.7559523582458496, "step": 293 }, { "epoch": 0.8076923076923077, "grad_norm": 11.084784507751465, "learning_rate": 7.699972455644516e-06, "loss": 1.1147, "mean_token_accuracy": 0.7242128252983093, "step": 294 }, { "epoch": 0.8104395604395604, "grad_norm": 9.741537094116211, "learning_rate": 7.681118901738471e-06, "loss": 1.0944, "mean_token_accuracy": 0.7550111413002014, "step": 295 }, { "epoch": 0.8131868131868132, "grad_norm": 10.739821434020996, "learning_rate": 7.662215171876609e-06, "loss": 1.0543, "mean_token_accuracy": 0.755667507648468, "step": 296 }, { "epoch": 0.8159340659340659, "grad_norm": 10.812918663024902, "learning_rate": 7.643261700933305e-06, "loss": 1.0856, "mean_token_accuracy": 0.7506459951400757, "step": 297 }, { "epoch": 0.8186813186813187, "grad_norm": 10.698429107666016, "learning_rate": 7.624258924927209e-06, "loss": 1.3178, "mean_token_accuracy": 0.6916950941085815, "step": 298 }, { "epoch": 0.8214285714285714, "grad_norm": 11.797857284545898, "learning_rate": 7.605207281011219e-06, "loss": 1.3815, "mean_token_accuracy": 0.7045769691467285, "step": 299 }, { "epoch": 0.8241758241758241, "grad_norm": 10.237780570983887, "learning_rate": 7.5861072074624254e-06, "loss": 1.2108, "mean_token_accuracy": 0.7392900586128235, "step": 300 }, { "epoch": 0.8269230769230769, "grad_norm": 11.791534423828125, "learning_rate": 7.566959143672023e-06, "loss": 1.3458, "mean_token_accuracy": 0.6741440296173096, "step": 301 }, { "epoch": 0.8296703296703297, "grad_norm": 11.488512992858887, "learning_rate": 7.5477635301352115e-06, "loss": 1.2632, "mean_token_accuracy": 0.7227332592010498, "step": 302 }, { "epoch": 0.8324175824175825, "grad_norm": 11.239252090454102, "learning_rate": 7.528520808441058e-06, "loss": 1.3094, "mean_token_accuracy": 0.7037887573242188, "step": 303 }, { "epoch": 0.8351648351648352, "grad_norm": 11.873623847961426, "learning_rate": 7.509231421262333e-06, "loss": 1.2245, "mean_token_accuracy": 0.7310924530029297, "step": 304 }, { "epoch": 0.8379120879120879, "grad_norm": 9.97056770324707, "learning_rate": 7.489895812345335e-06, "loss": 1.1895, "mean_token_accuracy": 0.7085106372833252, "step": 305 }, { "epoch": 0.8406593406593407, "grad_norm": 10.721016883850098, "learning_rate": 7.470514426499681e-06, "loss": 0.9412, "mean_token_accuracy": 0.7770859003067017, "step": 306 }, { "epoch": 0.8434065934065934, "grad_norm": 10.802633285522461, "learning_rate": 7.451087709588069e-06, "loss": 1.1512, "mean_token_accuracy": 0.7355460524559021, "step": 307 }, { "epoch": 0.8461538461538461, "grad_norm": 11.425252914428711, "learning_rate": 7.431616108516022e-06, "loss": 1.406, "mean_token_accuracy": 0.6849925518035889, "step": 308 }, { "epoch": 0.8489010989010989, "grad_norm": 9.682330131530762, "learning_rate": 7.4121000712216165e-06, "loss": 1.0936, "mean_token_accuracy": 0.7689873576164246, "step": 309 }, { "epoch": 0.8516483516483516, "grad_norm": 11.848267555236816, "learning_rate": 7.392540046665161e-06, "loss": 1.4753, "mean_token_accuracy": 0.680701732635498, "step": 310 }, { "epoch": 0.8543956043956044, "grad_norm": 10.513921737670898, "learning_rate": 7.372936484818884e-06, "loss": 1.1327, "mean_token_accuracy": 0.7203166484832764, "step": 311 }, { "epoch": 0.8571428571428571, "grad_norm": 10.796121597290039, "learning_rate": 7.353289836656574e-06, "loss": 1.1179, "mean_token_accuracy": 0.7421451807022095, "step": 312 }, { "epoch": 0.8598901098901099, "grad_norm": 9.850513458251953, "learning_rate": 7.333600554143204e-06, "loss": 1.4695, "mean_token_accuracy": 0.6694129705429077, "step": 313 }, { "epoch": 0.8626373626373627, "grad_norm": 10.93211555480957, "learning_rate": 7.313869090224542e-06, "loss": 1.4422, "mean_token_accuracy": 0.6784530282020569, "step": 314 }, { "epoch": 0.8653846153846154, "grad_norm": 9.837514877319336, "learning_rate": 7.29409589881672e-06, "loss": 1.3123, "mean_token_accuracy": 0.7209039330482483, "step": 315 }, { "epoch": 0.8681318681318682, "grad_norm": 11.812731742858887, "learning_rate": 7.274281434795804e-06, "loss": 1.4742, "mean_token_accuracy": 0.6945205330848694, "step": 316 }, { "epoch": 0.8708791208791209, "grad_norm": 12.415311813354492, "learning_rate": 7.254426153987315e-06, "loss": 1.4583, "mean_token_accuracy": 0.6542416214942932, "step": 317 }, { "epoch": 0.8736263736263736, "grad_norm": 9.164080619812012, "learning_rate": 7.234530513155762e-06, "loss": 1.2034, "mean_token_accuracy": 0.7345678806304932, "step": 318 }, { "epoch": 0.8763736263736264, "grad_norm": 10.568696022033691, "learning_rate": 7.214594969994115e-06, "loss": 1.3091, "mean_token_accuracy": 0.6913425326347351, "step": 319 }, { "epoch": 0.8791208791208791, "grad_norm": 11.376129150390625, "learning_rate": 7.1946199831132905e-06, "loss": 1.1427, "mean_token_accuracy": 0.730867326259613, "step": 320 }, { "epoch": 0.8818681318681318, "grad_norm": 10.715977668762207, "learning_rate": 7.174606012031591e-06, "loss": 1.052, "mean_token_accuracy": 0.7385203838348389, "step": 321 }, { "epoch": 0.8846153846153846, "grad_norm": 10.006759643554688, "learning_rate": 7.154553517164139e-06, "loss": 1.1576, "mean_token_accuracy": 0.7474167346954346, "step": 322 }, { "epoch": 0.8873626373626373, "grad_norm": 11.648452758789062, "learning_rate": 7.134462959812287e-06, "loss": 1.107, "mean_token_accuracy": 0.7347995042800903, "step": 323 }, { "epoch": 0.8901098901098901, "grad_norm": 11.925621032714844, "learning_rate": 7.114334802153003e-06, "loss": 1.8605, "mean_token_accuracy": 0.5874384045600891, "step": 324 }, { "epoch": 0.8928571428571429, "grad_norm": 11.52076244354248, "learning_rate": 7.094169507228236e-06, "loss": 1.3594, "mean_token_accuracy": 0.7149321436882019, "step": 325 }, { "epoch": 0.8956043956043956, "grad_norm": 11.350264549255371, "learning_rate": 7.0739675389342665e-06, "loss": 1.2657, "mean_token_accuracy": 0.7225130796432495, "step": 326 }, { "epoch": 0.8983516483516484, "grad_norm": 9.646256446838379, "learning_rate": 7.053729362011034e-06, "loss": 1.0548, "mean_token_accuracy": 0.7713310718536377, "step": 327 }, { "epoch": 0.9010989010989011, "grad_norm": 11.415197372436523, "learning_rate": 7.033455442031451e-06, "loss": 1.1871, "mean_token_accuracy": 0.7532097101211548, "step": 328 }, { "epoch": 0.9038461538461539, "grad_norm": 11.284040451049805, "learning_rate": 7.0131462453906785e-06, "loss": 1.3412, "mean_token_accuracy": 0.7117437720298767, "step": 329 }, { "epoch": 0.9065934065934066, "grad_norm": 12.890923500061035, "learning_rate": 6.9928022392954175e-06, "loss": 1.502, "mean_token_accuracy": 0.6699716448783875, "step": 330 }, { "epoch": 0.9093406593406593, "grad_norm": 11.523416519165039, "learning_rate": 6.972423891753136e-06, "loss": 1.2782, "mean_token_accuracy": 0.7075351476669312, "step": 331 }, { "epoch": 0.9120879120879121, "grad_norm": 11.033285140991211, "learning_rate": 6.9520116715613315e-06, "loss": 1.4068, "mean_token_accuracy": 0.6863979697227478, "step": 332 }, { "epoch": 0.9148351648351648, "grad_norm": 12.092578887939453, "learning_rate": 6.9315660482967185e-06, "loss": 1.3415, "mean_token_accuracy": 0.7046035528182983, "step": 333 }, { "epoch": 0.9175824175824175, "grad_norm": 9.621912002563477, "learning_rate": 6.9110874923044445e-06, "loss": 0.9547, "mean_token_accuracy": 0.7870036363601685, "step": 334 }, { "epoch": 0.9203296703296703, "grad_norm": 10.068906784057617, "learning_rate": 6.890576474687264e-06, "loss": 1.0756, "mean_token_accuracy": 0.7344632744789124, "step": 335 }, { "epoch": 0.9230769230769231, "grad_norm": 10.981210708618164, "learning_rate": 6.8700334672947e-06, "loss": 1.2718, "mean_token_accuracy": 0.7097902297973633, "step": 336 }, { "epoch": 0.9258241758241759, "grad_norm": 10.970985412597656, "learning_rate": 6.849458942712189e-06, "loss": 1.245, "mean_token_accuracy": 0.7150127291679382, "step": 337 }, { "epoch": 0.9285714285714286, "grad_norm": 9.914239883422852, "learning_rate": 6.828853374250213e-06, "loss": 1.1096, "mean_token_accuracy": 0.7284382581710815, "step": 338 }, { "epoch": 0.9313186813186813, "grad_norm": 11.386443138122559, "learning_rate": 6.8082172359334085e-06, "loss": 1.1512, "mean_token_accuracy": 0.7243902683258057, "step": 339 }, { "epoch": 0.9340659340659341, "grad_norm": 11.182723999023438, "learning_rate": 6.7875510024896595e-06, "loss": 1.0592, "mean_token_accuracy": 0.7604035139083862, "step": 340 }, { "epoch": 0.9368131868131868, "grad_norm": 10.25452709197998, "learning_rate": 6.766855149339182e-06, "loss": 1.2342, "mean_token_accuracy": 0.7211981415748596, "step": 341 }, { "epoch": 0.9395604395604396, "grad_norm": 11.961226463317871, "learning_rate": 6.746130152583581e-06, "loss": 1.3183, "mean_token_accuracy": 0.7117726802825928, "step": 342 }, { "epoch": 0.9423076923076923, "grad_norm": 10.968716621398926, "learning_rate": 6.725376488994904e-06, "loss": 1.08, "mean_token_accuracy": 0.7506666779518127, "step": 343 }, { "epoch": 0.945054945054945, "grad_norm": 11.551229476928711, "learning_rate": 6.704594636004669e-06, "loss": 1.3396, "mean_token_accuracy": 0.729619562625885, "step": 344 }, { "epoch": 0.9478021978021978, "grad_norm": 11.677756309509277, "learning_rate": 6.683785071692877e-06, "loss": 1.3994, "mean_token_accuracy": 0.6937062740325928, "step": 345 }, { "epoch": 0.9505494505494505, "grad_norm": 13.50044059753418, "learning_rate": 6.662948274777031e-06, "loss": 1.3472, "mean_token_accuracy": 0.6719576716423035, "step": 346 }, { "epoch": 0.9532967032967034, "grad_norm": 10.820409774780273, "learning_rate": 6.642084724601101e-06, "loss": 1.0404, "mean_token_accuracy": 0.7569352984428406, "step": 347 }, { "epoch": 0.9560439560439561, "grad_norm": 11.08627700805664, "learning_rate": 6.6211949011245116e-06, "loss": 1.0583, "mean_token_accuracy": 0.740359902381897, "step": 348 }, { "epoch": 0.9587912087912088, "grad_norm": 11.280929565429688, "learning_rate": 6.6002792849110966e-06, "loss": 1.2419, "mean_token_accuracy": 0.7247820496559143, "step": 349 }, { "epoch": 0.9615384615384616, "grad_norm": 11.241645812988281, "learning_rate": 6.579338357118039e-06, "loss": 1.1482, "mean_token_accuracy": 0.7217597961425781, "step": 350 }, { "epoch": 0.9642857142857143, "grad_norm": 10.611461639404297, "learning_rate": 6.558372599484817e-06, "loss": 1.1699, "mean_token_accuracy": 0.7283372282981873, "step": 351 }, { "epoch": 0.967032967032967, "grad_norm": 11.097305297851562, "learning_rate": 6.537382494322101e-06, "loss": 1.1236, "mean_token_accuracy": 0.7349397540092468, "step": 352 }, { "epoch": 0.9697802197802198, "grad_norm": 11.588397026062012, "learning_rate": 6.516368524500673e-06, "loss": 1.2617, "mean_token_accuracy": 0.7199453711509705, "step": 353 }, { "epoch": 0.9725274725274725, "grad_norm": 11.946807861328125, "learning_rate": 6.495331173440315e-06, "loss": 1.1311, "mean_token_accuracy": 0.736912727355957, "step": 354 }, { "epoch": 0.9752747252747253, "grad_norm": 10.510326385498047, "learning_rate": 6.474270925098685e-06, "loss": 1.0399, "mean_token_accuracy": 0.7677852511405945, "step": 355 }, { "epoch": 0.978021978021978, "grad_norm": 10.737903594970703, "learning_rate": 6.453188263960186e-06, "loss": 0.9998, "mean_token_accuracy": 0.772020697593689, "step": 356 }, { "epoch": 0.9807692307692307, "grad_norm": 11.893712997436523, "learning_rate": 6.432083675024823e-06, "loss": 1.5014, "mean_token_accuracy": 0.701265811920166, "step": 357 }, { "epoch": 0.9835164835164835, "grad_norm": 10.858631134033203, "learning_rate": 6.410957643797039e-06, "loss": 1.1924, "mean_token_accuracy": 0.7433525919914246, "step": 358 }, { "epoch": 0.9862637362637363, "grad_norm": 11.224592208862305, "learning_rate": 6.389810656274553e-06, "loss": 1.3194, "mean_token_accuracy": 0.7270560264587402, "step": 359 }, { "epoch": 0.989010989010989, "grad_norm": 11.10081958770752, "learning_rate": 6.368643198937176e-06, "loss": 1.1145, "mean_token_accuracy": 0.734375, "step": 360 }, { "epoch": 0.9917582417582418, "grad_norm": 10.680919647216797, "learning_rate": 6.347455758735622e-06, "loss": 1.1355, "mean_token_accuracy": 0.7137681245803833, "step": 361 }, { "epoch": 0.9945054945054945, "grad_norm": 12.283355712890625, "learning_rate": 6.326248823080302e-06, "loss": 1.2164, "mean_token_accuracy": 0.7481805086135864, "step": 362 }, { "epoch": 0.9972527472527473, "grad_norm": 10.820568084716797, "learning_rate": 6.305022879830115e-06, "loss": 1.1457, "mean_token_accuracy": 0.739130437374115, "step": 363 }, { "epoch": 1.0, "grad_norm": 11.383041381835938, "learning_rate": 6.283778417281226e-06, "loss": 1.1409, "mean_token_accuracy": 0.735052764415741, "step": 364 }, { "epoch": 1.0027472527472527, "grad_norm": 8.176081657409668, "learning_rate": 6.262515924155826e-06, "loss": 0.5403, "mean_token_accuracy": 0.8576388955116272, "step": 365 }, { "epoch": 1.0054945054945055, "grad_norm": 11.837261199951172, "learning_rate": 6.2412358895908975e-06, "loss": 1.077, "mean_token_accuracy": 0.7554417252540588, "step": 366 }, { "epoch": 1.0082417582417582, "grad_norm": 7.267159461975098, "learning_rate": 6.219938803126958e-06, "loss": 0.4787, "mean_token_accuracy": 0.8890160322189331, "step": 367 }, { "epoch": 1.010989010989011, "grad_norm": 7.923352241516113, "learning_rate": 6.198625154696797e-06, "loss": 0.5756, "mean_token_accuracy": 0.8782935738563538, "step": 368 }, { "epoch": 1.0137362637362637, "grad_norm": 8.488197326660156, "learning_rate": 6.177295434614207e-06, "loss": 0.6727, "mean_token_accuracy": 0.8545688390731812, "step": 369 }, { "epoch": 1.0164835164835164, "grad_norm": 9.032209396362305, "learning_rate": 6.155950133562705e-06, "loss": 0.7759, "mean_token_accuracy": 0.8111273646354675, "step": 370 }, { "epoch": 1.0192307692307692, "grad_norm": 8.959623336791992, "learning_rate": 6.134589742584243e-06, "loss": 0.7806, "mean_token_accuracy": 0.80402010679245, "step": 371 }, { "epoch": 1.021978021978022, "grad_norm": 7.533729076385498, "learning_rate": 6.113214753067911e-06, "loss": 0.6383, "mean_token_accuracy": 0.8503740429878235, "step": 372 }, { "epoch": 1.0247252747252746, "grad_norm": 7.809054374694824, "learning_rate": 6.091825656738636e-06, "loss": 0.6482, "mean_token_accuracy": 0.8529411554336548, "step": 373 }, { "epoch": 1.0274725274725274, "grad_norm": 6.753856658935547, "learning_rate": 6.070422945645865e-06, "loss": 0.4424, "mean_token_accuracy": 0.9049773812294006, "step": 374 }, { "epoch": 1.0302197802197801, "grad_norm": 7.692054748535156, "learning_rate": 6.049007112152249e-06, "loss": 0.5784, "mean_token_accuracy": 0.8744344115257263, "step": 375 }, { "epoch": 1.032967032967033, "grad_norm": 8.008811950683594, "learning_rate": 6.027578648922319e-06, "loss": 0.5913, "mean_token_accuracy": 0.8524971008300781, "step": 376 }, { "epoch": 1.0357142857142858, "grad_norm": 7.686079978942871, "learning_rate": 6.006138048911146e-06, "loss": 0.3791, "mean_token_accuracy": 0.903930127620697, "step": 377 }, { "epoch": 1.0384615384615385, "grad_norm": 7.934598445892334, "learning_rate": 5.984685805353001e-06, "loss": 0.5688, "mean_token_accuracy": 0.8726027607917786, "step": 378 }, { "epoch": 1.0412087912087913, "grad_norm": 9.844480514526367, "learning_rate": 5.963222411750017e-06, "loss": 0.6188, "mean_token_accuracy": 0.8668596148490906, "step": 379 }, { "epoch": 1.043956043956044, "grad_norm": 7.320831298828125, "learning_rate": 5.941748361860828e-06, "loss": 0.5387, "mean_token_accuracy": 0.8523409366607666, "step": 380 }, { "epoch": 1.0467032967032968, "grad_norm": 8.960830688476562, "learning_rate": 5.920264149689213e-06, "loss": 0.7356, "mean_token_accuracy": 0.8335607051849365, "step": 381 }, { "epoch": 1.0494505494505495, "grad_norm": 9.879374504089355, "learning_rate": 5.898770269472728e-06, "loss": 0.6628, "mean_token_accuracy": 0.8490284085273743, "step": 382 }, { "epoch": 1.0521978021978022, "grad_norm": 10.704723358154297, "learning_rate": 5.877267215671345e-06, "loss": 0.8533, "mean_token_accuracy": 0.8047493696212769, "step": 383 }, { "epoch": 1.054945054945055, "grad_norm": 7.644108772277832, "learning_rate": 5.855755482956065e-06, "loss": 0.4996, "mean_token_accuracy": 0.8881909251213074, "step": 384 }, { "epoch": 1.0576923076923077, "grad_norm": 8.324013710021973, "learning_rate": 5.834235566197551e-06, "loss": 0.5251, "mean_token_accuracy": 0.8783930540084839, "step": 385 }, { "epoch": 1.0604395604395604, "grad_norm": 7.874902725219727, "learning_rate": 5.812707960454731e-06, "loss": 0.4971, "mean_token_accuracy": 0.884567141532898, "step": 386 }, { "epoch": 1.0631868131868132, "grad_norm": 7.967838764190674, "learning_rate": 5.791173160963419e-06, "loss": 0.5384, "mean_token_accuracy": 0.8828025460243225, "step": 387 }, { "epoch": 1.065934065934066, "grad_norm": 8.472578048706055, "learning_rate": 5.769631663124923e-06, "loss": 0.7063, "mean_token_accuracy": 0.8527131676673889, "step": 388 }, { "epoch": 1.0686813186813187, "grad_norm": 8.554250717163086, "learning_rate": 5.748083962494637e-06, "loss": 0.6025, "mean_token_accuracy": 0.8615210056304932, "step": 389 }, { "epoch": 1.0714285714285714, "grad_norm": 7.113428592681885, "learning_rate": 5.7265305547706516e-06, "loss": 0.4874, "mean_token_accuracy": 0.8806941509246826, "step": 390 }, { "epoch": 1.0741758241758241, "grad_norm": 7.222031593322754, "learning_rate": 5.704971935782348e-06, "loss": 0.4854, "mean_token_accuracy": 0.8786717653274536, "step": 391 }, { "epoch": 1.0769230769230769, "grad_norm": 8.575010299682617, "learning_rate": 5.68340860147899e-06, "loss": 0.5006, "mean_token_accuracy": 0.8834196925163269, "step": 392 }, { "epoch": 1.0796703296703296, "grad_norm": 8.167390823364258, "learning_rate": 5.661841047918318e-06, "loss": 0.5515, "mean_token_accuracy": 0.8620283007621765, "step": 393 }, { "epoch": 1.0824175824175823, "grad_norm": 8.286924362182617, "learning_rate": 5.640269771255126e-06, "loss": 0.5343, "mean_token_accuracy": 0.8598971962928772, "step": 394 }, { "epoch": 1.085164835164835, "grad_norm": 9.173430442810059, "learning_rate": 5.6186952677298705e-06, "loss": 0.6477, "mean_token_accuracy": 0.8426828980445862, "step": 395 }, { "epoch": 1.0879120879120878, "grad_norm": 8.196123123168945, "learning_rate": 5.597118033657231e-06, "loss": 0.637, "mean_token_accuracy": 0.8584905862808228, "step": 396 }, { "epoch": 1.0906593406593406, "grad_norm": 8.771379470825195, "learning_rate": 5.5755385654147084e-06, "loss": 0.5563, "mean_token_accuracy": 0.8635236024856567, "step": 397 }, { "epoch": 1.0934065934065935, "grad_norm": 9.898123741149902, "learning_rate": 5.5539573594311945e-06, "loss": 0.7194, "mean_token_accuracy": 0.822277843952179, "step": 398 }, { "epoch": 1.0961538461538463, "grad_norm": 8.823380470275879, "learning_rate": 5.53237491217556e-06, "loss": 0.6639, "mean_token_accuracy": 0.8430851101875305, "step": 399 }, { "epoch": 1.098901098901099, "grad_norm": 12.10927963256836, "learning_rate": 5.510791720145232e-06, "loss": 0.7262, "mean_token_accuracy": 0.8203753232955933, "step": 400 }, { "epoch": 1.1016483516483517, "grad_norm": 7.197577476501465, "learning_rate": 5.489208279854769e-06, "loss": 0.5494, "mean_token_accuracy": 0.8551502227783203, "step": 401 }, { "epoch": 1.1043956043956045, "grad_norm": 8.447471618652344, "learning_rate": 5.467625087824442e-06, "loss": 0.5537, "mean_token_accuracy": 0.8692810535430908, "step": 402 }, { "epoch": 1.1071428571428572, "grad_norm": 8.937617301940918, "learning_rate": 5.446042640568809e-06, "loss": 0.7289, "mean_token_accuracy": 0.8333333134651184, "step": 403 }, { "epoch": 1.10989010989011, "grad_norm": 8.270979881286621, "learning_rate": 5.424461434585293e-06, "loss": 0.6222, "mean_token_accuracy": 0.846905529499054, "step": 404 }, { "epoch": 1.1126373626373627, "grad_norm": 7.838191986083984, "learning_rate": 5.40288196634277e-06, "loss": 0.6071, "mean_token_accuracy": 0.8605230450630188, "step": 405 }, { "epoch": 1.1153846153846154, "grad_norm": 8.771527290344238, "learning_rate": 5.381304732270131e-06, "loss": 0.5886, "mean_token_accuracy": 0.8531374931335449, "step": 406 }, { "epoch": 1.1181318681318682, "grad_norm": 7.3298563957214355, "learning_rate": 5.359730228744876e-06, "loss": 0.5052, "mean_token_accuracy": 0.8785796165466309, "step": 407 }, { "epoch": 1.120879120879121, "grad_norm": 8.253058433532715, "learning_rate": 5.3381589520816855e-06, "loss": 0.6519, "mean_token_accuracy": 0.8469135761260986, "step": 408 }, { "epoch": 1.1236263736263736, "grad_norm": 9.247401237487793, "learning_rate": 5.31659139852101e-06, "loss": 0.7053, "mean_token_accuracy": 0.8385093212127686, "step": 409 }, { "epoch": 1.1263736263736264, "grad_norm": 8.64867877960205, "learning_rate": 5.295028064217653e-06, "loss": 0.6317, "mean_token_accuracy": 0.84624844789505, "step": 410 }, { "epoch": 1.129120879120879, "grad_norm": 8.751404762268066, "learning_rate": 5.27346944522935e-06, "loss": 0.5933, "mean_token_accuracy": 0.8576826453208923, "step": 411 }, { "epoch": 1.1318681318681318, "grad_norm": 8.154433250427246, "learning_rate": 5.2519160375053645e-06, "loss": 0.4434, "mean_token_accuracy": 0.9011124968528748, "step": 412 }, { "epoch": 1.1346153846153846, "grad_norm": 8.975844383239746, "learning_rate": 5.230368336875078e-06, "loss": 0.7341, "mean_token_accuracy": 0.8397710919380188, "step": 413 }, { "epoch": 1.1373626373626373, "grad_norm": 10.191604614257812, "learning_rate": 5.2088268390365804e-06, "loss": 0.6342, "mean_token_accuracy": 0.8444666266441345, "step": 414 }, { "epoch": 1.14010989010989, "grad_norm": 8.440400123596191, "learning_rate": 5.187292039545271e-06, "loss": 0.6542, "mean_token_accuracy": 0.8637640476226807, "step": 415 }, { "epoch": 1.1428571428571428, "grad_norm": 7.392390727996826, "learning_rate": 5.1657644338024525e-06, "loss": 0.6651, "mean_token_accuracy": 0.8404371738433838, "step": 416 }, { "epoch": 1.1456043956043955, "grad_norm": 8.26939582824707, "learning_rate": 5.144244517043936e-06, "loss": 0.5253, "mean_token_accuracy": 0.8696275353431702, "step": 417 }, { "epoch": 1.1483516483516483, "grad_norm": 8.410235404968262, "learning_rate": 5.122732784328657e-06, "loss": 0.6554, "mean_token_accuracy": 0.842737078666687, "step": 418 }, { "epoch": 1.151098901098901, "grad_norm": 8.329333305358887, "learning_rate": 5.1012297305272725e-06, "loss": 0.6661, "mean_token_accuracy": 0.8437118530273438, "step": 419 }, { "epoch": 1.1538461538461537, "grad_norm": 7.471403121948242, "learning_rate": 5.0797358503107875e-06, "loss": 0.5257, "mean_token_accuracy": 0.8748419880867004, "step": 420 }, { "epoch": 1.1565934065934065, "grad_norm": 8.804765701293945, "learning_rate": 5.058251638139173e-06, "loss": 0.7146, "mean_token_accuracy": 0.8457831144332886, "step": 421 }, { "epoch": 1.1593406593406592, "grad_norm": 8.508563041687012, "learning_rate": 5.036777588249983e-06, "loss": 0.6724, "mean_token_accuracy": 0.8466413021087646, "step": 422 }, { "epoch": 1.1620879120879122, "grad_norm": 8.738518714904785, "learning_rate": 5.015314194647001e-06, "loss": 0.7564, "mean_token_accuracy": 0.8212974071502686, "step": 423 }, { "epoch": 1.164835164835165, "grad_norm": 8.125940322875977, "learning_rate": 4.9938619510888575e-06, "loss": 0.5889, "mean_token_accuracy": 0.879807710647583, "step": 424 }, { "epoch": 1.1675824175824177, "grad_norm": 7.5376081466674805, "learning_rate": 4.972421351077682e-06, "loss": 0.5469, "mean_token_accuracy": 0.8646616339683533, "step": 425 }, { "epoch": 1.1703296703296704, "grad_norm": 8.586243629455566, "learning_rate": 4.950992887847752e-06, "loss": 0.6722, "mean_token_accuracy": 0.8421768546104431, "step": 426 }, { "epoch": 1.1730769230769231, "grad_norm": 9.106139183044434, "learning_rate": 4.929577054354138e-06, "loss": 0.6973, "mean_token_accuracy": 0.8415716290473938, "step": 427 }, { "epoch": 1.1758241758241759, "grad_norm": 8.343849182128906, "learning_rate": 4.908174343261366e-06, "loss": 0.6343, "mean_token_accuracy": 0.8487690687179565, "step": 428 }, { "epoch": 1.1785714285714286, "grad_norm": 8.809746742248535, "learning_rate": 4.88678524693209e-06, "loss": 0.7209, "mean_token_accuracy": 0.8351351618766785, "step": 429 }, { "epoch": 1.1813186813186813, "grad_norm": 8.554579734802246, "learning_rate": 4.865410257415758e-06, "loss": 0.8525, "mean_token_accuracy": 0.7745803594589233, "step": 430 }, { "epoch": 1.184065934065934, "grad_norm": 6.8353495597839355, "learning_rate": 4.844049866437296e-06, "loss": 0.5986, "mean_token_accuracy": 0.8746867179870605, "step": 431 }, { "epoch": 1.1868131868131868, "grad_norm": 8.058353424072266, "learning_rate": 4.822704565385796e-06, "loss": 0.8536, "mean_token_accuracy": 0.8027842044830322, "step": 432 }, { "epoch": 1.1895604395604396, "grad_norm": 8.545801162719727, "learning_rate": 4.801374845303205e-06, "loss": 0.7223, "mean_token_accuracy": 0.8521836400032043, "step": 433 }, { "epoch": 1.1923076923076923, "grad_norm": 8.993727684020996, "learning_rate": 4.780061196873044e-06, "loss": 0.7401, "mean_token_accuracy": 0.8204181790351868, "step": 434 }, { "epoch": 1.195054945054945, "grad_norm": 7.437719345092773, "learning_rate": 4.758764110409103e-06, "loss": 0.7573, "mean_token_accuracy": 0.8167487978935242, "step": 435 }, { "epoch": 1.1978021978021978, "grad_norm": 7.946503162384033, "learning_rate": 4.737484075844175e-06, "loss": 0.5876, "mean_token_accuracy": 0.8464567065238953, "step": 436 }, { "epoch": 1.2005494505494505, "grad_norm": 7.993088245391846, "learning_rate": 4.7162215827187765e-06, "loss": 0.5828, "mean_token_accuracy": 0.8666666746139526, "step": 437 }, { "epoch": 1.2032967032967032, "grad_norm": 8.567098617553711, "learning_rate": 4.694977120169886e-06, "loss": 0.6235, "mean_token_accuracy": 0.8593023419380188, "step": 438 }, { "epoch": 1.206043956043956, "grad_norm": 8.035146713256836, "learning_rate": 4.6737511769197e-06, "loss": 0.6591, "mean_token_accuracy": 0.8387516140937805, "step": 439 }, { "epoch": 1.2087912087912087, "grad_norm": 9.358229637145996, "learning_rate": 4.65254424126438e-06, "loss": 0.4838, "mean_token_accuracy": 0.8591022491455078, "step": 440 }, { "epoch": 1.2115384615384615, "grad_norm": 7.417166709899902, "learning_rate": 4.631356801062824e-06, "loss": 0.5559, "mean_token_accuracy": 0.881313145160675, "step": 441 }, { "epoch": 1.2142857142857142, "grad_norm": 8.637767791748047, "learning_rate": 4.6101893437254485e-06, "loss": 0.5632, "mean_token_accuracy": 0.8659658432006836, "step": 442 }, { "epoch": 1.2170329670329672, "grad_norm": 7.665891170501709, "learning_rate": 4.5890423562029605e-06, "loss": 0.6366, "mean_token_accuracy": 0.863930881023407, "step": 443 }, { "epoch": 1.2197802197802199, "grad_norm": 8.18127155303955, "learning_rate": 4.567916324975178e-06, "loss": 0.5943, "mean_token_accuracy": 0.8619354963302612, "step": 444 }, { "epoch": 1.2225274725274726, "grad_norm": 7.742138862609863, "learning_rate": 4.546811736039814e-06, "loss": 0.6785, "mean_token_accuracy": 0.8467432856559753, "step": 445 }, { "epoch": 1.2252747252747254, "grad_norm": 7.575841426849365, "learning_rate": 4.525729074901316e-06, "loss": 0.5459, "mean_token_accuracy": 0.8661518692970276, "step": 446 }, { "epoch": 1.228021978021978, "grad_norm": 8.588372230529785, "learning_rate": 4.504668826559687e-06, "loss": 0.6267, "mean_token_accuracy": 0.8547717928886414, "step": 447 }, { "epoch": 1.2307692307692308, "grad_norm": 8.191335678100586, "learning_rate": 4.483631475499329e-06, "loss": 0.603, "mean_token_accuracy": 0.8422301411628723, "step": 448 }, { "epoch": 1.2335164835164836, "grad_norm": 7.857209205627441, "learning_rate": 4.4626175056779005e-06, "loss": 0.7386, "mean_token_accuracy": 0.8307873010635376, "step": 449 }, { "epoch": 1.2362637362637363, "grad_norm": 7.451523303985596, "learning_rate": 4.441627400515185e-06, "loss": 0.5647, "mean_token_accuracy": 0.8670588135719299, "step": 450 }, { "epoch": 1.239010989010989, "grad_norm": 7.155614376068115, "learning_rate": 4.420661642881961e-06, "loss": 0.545, "mean_token_accuracy": 0.874709963798523, "step": 451 }, { "epoch": 1.2417582417582418, "grad_norm": 6.548577785491943, "learning_rate": 4.399720715088906e-06, "loss": 0.4698, "mean_token_accuracy": 0.892816424369812, "step": 452 }, { "epoch": 1.2445054945054945, "grad_norm": 7.577704906463623, "learning_rate": 4.378805098875491e-06, "loss": 0.6068, "mean_token_accuracy": 0.851767361164093, "step": 453 }, { "epoch": 1.2472527472527473, "grad_norm": 8.976188659667969, "learning_rate": 4.357915275398901e-06, "loss": 0.8297, "mean_token_accuracy": 0.8129205703735352, "step": 454 }, { "epoch": 1.25, "grad_norm": 7.827886581420898, "learning_rate": 4.33705172522297e-06, "loss": 0.7283, "mean_token_accuracy": 0.8148936033248901, "step": 455 }, { "epoch": 1.2527472527472527, "grad_norm": 6.6771931648254395, "learning_rate": 4.316214928307125e-06, "loss": 0.4674, "mean_token_accuracy": 0.8864168524742126, "step": 456 }, { "epoch": 1.2554945054945055, "grad_norm": 8.46672248840332, "learning_rate": 4.295405363995333e-06, "loss": 0.5265, "mean_token_accuracy": 0.8727770447731018, "step": 457 }, { "epoch": 1.2582417582417582, "grad_norm": 8.40459156036377, "learning_rate": 4.274623511005098e-06, "loss": 0.6493, "mean_token_accuracy": 0.8478260636329651, "step": 458 }, { "epoch": 1.260989010989011, "grad_norm": 8.433124542236328, "learning_rate": 4.25386984741642e-06, "loss": 0.568, "mean_token_accuracy": 0.8743386268615723, "step": 459 }, { "epoch": 1.2637362637362637, "grad_norm": 7.261688709259033, "learning_rate": 4.2331448506608196e-06, "loss": 0.485, "mean_token_accuracy": 0.8756371140480042, "step": 460 }, { "epoch": 1.2664835164835164, "grad_norm": 8.551594734191895, "learning_rate": 4.212448997510341e-06, "loss": 0.627, "mean_token_accuracy": 0.851037859916687, "step": 461 }, { "epoch": 1.2692307692307692, "grad_norm": 7.449108123779297, "learning_rate": 4.191782764066592e-06, "loss": 0.74, "mean_token_accuracy": 0.8116805553436279, "step": 462 }, { "epoch": 1.271978021978022, "grad_norm": 8.490302085876465, "learning_rate": 4.171146625749788e-06, "loss": 0.6644, "mean_token_accuracy": 0.8355408310890198, "step": 463 }, { "epoch": 1.2747252747252746, "grad_norm": 7.6355180740356445, "learning_rate": 4.150541057287814e-06, "loss": 0.5712, "mean_token_accuracy": 0.872826099395752, "step": 464 }, { "epoch": 1.2774725274725274, "grad_norm": 8.184901237487793, "learning_rate": 4.129966532705302e-06, "loss": 0.4505, "mean_token_accuracy": 0.8914955854415894, "step": 465 }, { "epoch": 1.2802197802197801, "grad_norm": 22.74942398071289, "learning_rate": 4.109423525312738e-06, "loss": 0.8091, "mean_token_accuracy": 0.8078431487083435, "step": 466 }, { "epoch": 1.2829670329670328, "grad_norm": 8.367789268493652, "learning_rate": 4.088912507695556e-06, "loss": 0.6103, "mean_token_accuracy": 0.8558322191238403, "step": 467 }, { "epoch": 1.2857142857142856, "grad_norm": 7.367910861968994, "learning_rate": 4.068433951703284e-06, "loss": 0.4561, "mean_token_accuracy": 0.8980070352554321, "step": 468 }, { "epoch": 1.2884615384615383, "grad_norm": 10.204851150512695, "learning_rate": 4.04798832843867e-06, "loss": 0.7424, "mean_token_accuracy": 0.8163265585899353, "step": 469 }, { "epoch": 1.2912087912087913, "grad_norm": 7.735379219055176, "learning_rate": 4.027576108246863e-06, "loss": 0.5422, "mean_token_accuracy": 0.8828213810920715, "step": 470 }, { "epoch": 1.293956043956044, "grad_norm": 8.325048446655273, "learning_rate": 4.007197760704586e-06, "loss": 0.7073, "mean_token_accuracy": 0.8321759104728699, "step": 471 }, { "epoch": 1.2967032967032968, "grad_norm": 7.73416805267334, "learning_rate": 3.986853754609323e-06, "loss": 0.5546, "mean_token_accuracy": 0.8792401552200317, "step": 472 }, { "epoch": 1.2994505494505495, "grad_norm": 8.063698768615723, "learning_rate": 3.96654455796855e-06, "loss": 0.6846, "mean_token_accuracy": 0.8486292958259583, "step": 473 }, { "epoch": 1.3021978021978022, "grad_norm": 7.270962715148926, "learning_rate": 3.946270637988967e-06, "loss": 0.6298, "mean_token_accuracy": 0.8434125185012817, "step": 474 }, { "epoch": 1.304945054945055, "grad_norm": 6.182555675506592, "learning_rate": 3.926032461065735e-06, "loss": 0.4491, "mean_token_accuracy": 0.8908342123031616, "step": 475 }, { "epoch": 1.3076923076923077, "grad_norm": 8.628203392028809, "learning_rate": 3.9058304927717665e-06, "loss": 0.6467, "mean_token_accuracy": 0.8459495306015015, "step": 476 }, { "epoch": 1.3104395604395604, "grad_norm": 6.947534084320068, "learning_rate": 3.885665197847e-06, "loss": 0.4605, "mean_token_accuracy": 0.891922652721405, "step": 477 }, { "epoch": 1.3131868131868132, "grad_norm": 7.754703044891357, "learning_rate": 3.865537040187714e-06, "loss": 0.5706, "mean_token_accuracy": 0.8667481541633606, "step": 478 }, { "epoch": 1.315934065934066, "grad_norm": 8.421850204467773, "learning_rate": 3.845446482835864e-06, "loss": 0.5796, "mean_token_accuracy": 0.8535211086273193, "step": 479 }, { "epoch": 1.3186813186813187, "grad_norm": 7.543336391448975, "learning_rate": 3.825393987968412e-06, "loss": 0.5661, "mean_token_accuracy": 0.8620689511299133, "step": 480 }, { "epoch": 1.3214285714285714, "grad_norm": 7.205345153808594, "learning_rate": 3.8053800168867117e-06, "loss": 0.502, "mean_token_accuracy": 0.8709288239479065, "step": 481 }, { "epoch": 1.3241758241758241, "grad_norm": 7.576408386230469, "learning_rate": 3.7854050300058865e-06, "loss": 0.6592, "mean_token_accuracy": 0.8460508584976196, "step": 482 }, { "epoch": 1.3269230769230769, "grad_norm": 9.292501449584961, "learning_rate": 3.765469486844239e-06, "loss": 0.8449, "mean_token_accuracy": 0.8119080066680908, "step": 483 }, { "epoch": 1.3296703296703296, "grad_norm": 7.648805141448975, "learning_rate": 3.745573846012687e-06, "loss": 0.4898, "mean_token_accuracy": 0.8732572793960571, "step": 484 }, { "epoch": 1.3324175824175823, "grad_norm": 7.9348530769348145, "learning_rate": 3.7257185652041994e-06, "loss": 0.5911, "mean_token_accuracy": 0.8699284195899963, "step": 485 }, { "epoch": 1.335164835164835, "grad_norm": 8.589659690856934, "learning_rate": 3.705904101183281e-06, "loss": 0.5973, "mean_token_accuracy": 0.8438576459884644, "step": 486 }, { "epoch": 1.337912087912088, "grad_norm": 8.7539701461792, "learning_rate": 3.6861309097754595e-06, "loss": 0.7594, "mean_token_accuracy": 0.8237704634666443, "step": 487 }, { "epoch": 1.3406593406593408, "grad_norm": 8.811039924621582, "learning_rate": 3.6663994458567977e-06, "loss": 0.6975, "mean_token_accuracy": 0.8472585082054138, "step": 488 }, { "epoch": 1.3434065934065935, "grad_norm": 7.768817901611328, "learning_rate": 3.646710163343429e-06, "loss": 0.6072, "mean_token_accuracy": 0.8522167205810547, "step": 489 }, { "epoch": 1.3461538461538463, "grad_norm": 6.7450079917907715, "learning_rate": 3.6270635151811175e-06, "loss": 0.433, "mean_token_accuracy": 0.9020737409591675, "step": 490 }, { "epoch": 1.348901098901099, "grad_norm": 7.87033224105835, "learning_rate": 3.60745995333484e-06, "loss": 0.622, "mean_token_accuracy": 0.8647594451904297, "step": 491 }, { "epoch": 1.3516483516483517, "grad_norm": 8.751251220703125, "learning_rate": 3.5878999287783866e-06, "loss": 0.7387, "mean_token_accuracy": 0.8346773982048035, "step": 492 }, { "epoch": 1.3543956043956045, "grad_norm": 8.54961109161377, "learning_rate": 3.5683838914839795e-06, "loss": 0.6551, "mean_token_accuracy": 0.8329238295555115, "step": 493 }, { "epoch": 1.3571428571428572, "grad_norm": 7.03640079498291, "learning_rate": 3.5489122904119332e-06, "loss": 0.4917, "mean_token_accuracy": 0.884529173374176, "step": 494 }, { "epoch": 1.35989010989011, "grad_norm": 147.5779571533203, "learning_rate": 3.52948557350032e-06, "loss": 0.7133, "mean_token_accuracy": 0.8239277601242065, "step": 495 }, { "epoch": 1.3626373626373627, "grad_norm": 5.867126941680908, "learning_rate": 3.510104187654666e-06, "loss": 0.3257, "mean_token_accuracy": 0.9185360074043274, "step": 496 }, { "epoch": 1.3653846153846154, "grad_norm": 8.838720321655273, "learning_rate": 3.490768578737669e-06, "loss": 0.5831, "mean_token_accuracy": 0.8387096524238586, "step": 497 }, { "epoch": 1.3681318681318682, "grad_norm": 7.148526668548584, "learning_rate": 3.471479191558944e-06, "loss": 0.432, "mean_token_accuracy": 0.8999999761581421, "step": 498 }, { "epoch": 1.370879120879121, "grad_norm": 8.821269989013672, "learning_rate": 3.452236469864789e-06, "loss": 0.8936, "mean_token_accuracy": 0.7850574851036072, "step": 499 }, { "epoch": 1.3736263736263736, "grad_norm": 8.092583656311035, "learning_rate": 3.433040856327979e-06, "loss": 0.6344, "mean_token_accuracy": 0.8361244201660156, "step": 500 }, { "epoch": 1.3763736263736264, "grad_norm": 6.676445007324219, "learning_rate": 3.413892792537577e-06, "loss": 0.3909, "mean_token_accuracy": 0.9048811197280884, "step": 501 }, { "epoch": 1.379120879120879, "grad_norm": 8.763235092163086, "learning_rate": 3.394792718988783e-06, "loss": 0.6677, "mean_token_accuracy": 0.8345499038696289, "step": 502 }, { "epoch": 1.3818681318681318, "grad_norm": 8.628216743469238, "learning_rate": 3.3757410750727933e-06, "loss": 0.7486, "mean_token_accuracy": 0.8381071090698242, "step": 503 }, { "epoch": 1.3846153846153846, "grad_norm": 8.193002700805664, "learning_rate": 3.356738299066695e-06, "loss": 0.6746, "mean_token_accuracy": 0.8546798229217529, "step": 504 }, { "epoch": 1.3873626373626373, "grad_norm": 7.721636772155762, "learning_rate": 3.3377848281233916e-06, "loss": 0.4957, "mean_token_accuracy": 0.8779149651527405, "step": 505 }, { "epoch": 1.39010989010989, "grad_norm": 7.719691753387451, "learning_rate": 3.318881098261533e-06, "loss": 0.6541, "mean_token_accuracy": 0.8626444339752197, "step": 506 }, { "epoch": 1.3928571428571428, "grad_norm": 8.396798133850098, "learning_rate": 3.300027544355485e-06, "loss": 0.5343, "mean_token_accuracy": 0.8785046935081482, "step": 507 }, { "epoch": 1.3956043956043955, "grad_norm": 9.754472732543945, "learning_rate": 3.2812246001253455e-06, "loss": 0.7571, "mean_token_accuracy": 0.8223087191581726, "step": 508 }, { "epoch": 1.3983516483516483, "grad_norm": 8.206571578979492, "learning_rate": 3.262472698126944e-06, "loss": 0.7667, "mean_token_accuracy": 0.8345771431922913, "step": 509 }, { "epoch": 1.401098901098901, "grad_norm": 8.819429397583008, "learning_rate": 3.2437722697418995e-06, "loss": 0.7533, "mean_token_accuracy": 0.8287752866744995, "step": 510 }, { "epoch": 1.4038461538461537, "grad_norm": 6.980717182159424, "learning_rate": 3.225123745167699e-06, "loss": 0.4373, "mean_token_accuracy": 0.898408830165863, "step": 511 }, { "epoch": 1.4065934065934065, "grad_norm": 7.27022647857666, "learning_rate": 3.206527553407795e-06, "loss": 0.4416, "mean_token_accuracy": 0.9033613204956055, "step": 512 }, { "epoch": 1.4093406593406592, "grad_norm": 6.68601655960083, "learning_rate": 3.1879841222617484e-06, "loss": 0.5126, "mean_token_accuracy": 0.8735891580581665, "step": 513 }, { "epoch": 1.412087912087912, "grad_norm": 7.937134265899658, "learning_rate": 3.169493878315369e-06, "loss": 0.5541, "mean_token_accuracy": 0.8812351822853088, "step": 514 }, { "epoch": 1.414835164835165, "grad_norm": 8.167470932006836, "learning_rate": 3.151057246930914e-06, "loss": 0.6355, "mean_token_accuracy": 0.8437935709953308, "step": 515 }, { "epoch": 1.4175824175824177, "grad_norm": 7.2766594886779785, "learning_rate": 3.1326746522373073e-06, "loss": 0.4797, "mean_token_accuracy": 0.8883495330810547, "step": 516 }, { "epoch": 1.4203296703296704, "grad_norm": 8.126523971557617, "learning_rate": 3.114346517120369e-06, "loss": 0.6775, "mean_token_accuracy": 0.8465011119842529, "step": 517 }, { "epoch": 1.4230769230769231, "grad_norm": 9.549711227416992, "learning_rate": 3.0960732632130923e-06, "loss": 0.7806, "mean_token_accuracy": 0.798353910446167, "step": 518 }, { "epoch": 1.4258241758241759, "grad_norm": 8.250473022460938, "learning_rate": 3.077855310885952e-06, "loss": 0.6848, "mean_token_accuracy": 0.8284251093864441, "step": 519 }, { "epoch": 1.4285714285714286, "grad_norm": 8.499979019165039, "learning_rate": 3.0596930792372227e-06, "loss": 0.5562, "mean_token_accuracy": 0.8656361699104309, "step": 520 }, { "epoch": 1.4313186813186813, "grad_norm": 8.121464729309082, "learning_rate": 3.0415869860833436e-06, "loss": 0.661, "mean_token_accuracy": 0.8321428298950195, "step": 521 }, { "epoch": 1.434065934065934, "grad_norm": 8.008270263671875, "learning_rate": 3.0235374479493053e-06, "loss": 0.6003, "mean_token_accuracy": 0.8410689234733582, "step": 522 }, { "epoch": 1.4368131868131868, "grad_norm": 7.292999267578125, "learning_rate": 3.0055448800590674e-06, "loss": 0.5489, "mean_token_accuracy": 0.8599537014961243, "step": 523 }, { "epoch": 1.4395604395604396, "grad_norm": 7.336670875549316, "learning_rate": 2.987609696326008e-06, "loss": 0.4491, "mean_token_accuracy": 0.8777633309364319, "step": 524 }, { "epoch": 1.4423076923076923, "grad_norm": 8.703383445739746, "learning_rate": 2.9697323093434006e-06, "loss": 0.6206, "mean_token_accuracy": 0.8430232405662537, "step": 525 }, { "epoch": 1.445054945054945, "grad_norm": 8.316123962402344, "learning_rate": 2.951913130374919e-06, "loss": 0.663, "mean_token_accuracy": 0.8420427441596985, "step": 526 }, { "epoch": 1.4478021978021978, "grad_norm": 8.570442199707031, "learning_rate": 2.934152569345189e-06, "loss": 0.595, "mean_token_accuracy": 0.868789792060852, "step": 527 }, { "epoch": 1.4505494505494505, "grad_norm": 7.871905326843262, "learning_rate": 2.9164510348303366e-06, "loss": 0.6954, "mean_token_accuracy": 0.8410193920135498, "step": 528 }, { "epoch": 1.4532967032967032, "grad_norm": 7.661655902862549, "learning_rate": 2.898808934048613e-06, "loss": 0.6222, "mean_token_accuracy": 0.8458781242370605, "step": 529 }, { "epoch": 1.456043956043956, "grad_norm": 9.568252563476562, "learning_rate": 2.8812266728510075e-06, "loss": 0.5718, "mean_token_accuracy": 0.8805555701255798, "step": 530 }, { "epoch": 1.4587912087912087, "grad_norm": 7.459258556365967, "learning_rate": 2.8637046557119217e-06, "loss": 0.645, "mean_token_accuracy": 0.8480725884437561, "step": 531 }, { "epoch": 1.4615384615384617, "grad_norm": 8.410935401916504, "learning_rate": 2.84624328571986e-06, "loss": 0.686, "mean_token_accuracy": 0.8389512896537781, "step": 532 }, { "epoch": 1.4642857142857144, "grad_norm": 7.343708515167236, "learning_rate": 2.8288429645681604e-06, "loss": 0.4948, "mean_token_accuracy": 0.8748450875282288, "step": 533 }, { "epoch": 1.4670329670329672, "grad_norm": 8.968671798706055, "learning_rate": 2.811504092545748e-06, "loss": 0.6255, "mean_token_accuracy": 0.851640522480011, "step": 534 }, { "epoch": 1.4697802197802199, "grad_norm": 8.382376670837402, "learning_rate": 2.794227068527934e-06, "loss": 0.5991, "mean_token_accuracy": 0.8453188538551331, "step": 535 }, { "epoch": 1.4725274725274726, "grad_norm": 9.791983604431152, "learning_rate": 2.7770122899672314e-06, "loss": 0.9362, "mean_token_accuracy": 0.7942283749580383, "step": 536 }, { "epoch": 1.4752747252747254, "grad_norm": 8.932596206665039, "learning_rate": 2.759860152884222e-06, "loss": 0.6614, "mean_token_accuracy": 0.8456549644470215, "step": 537 }, { "epoch": 1.478021978021978, "grad_norm": 10.183149337768555, "learning_rate": 2.742771051858435e-06, "loss": 0.7975, "mean_token_accuracy": 0.8182989954948425, "step": 538 }, { "epoch": 1.4807692307692308, "grad_norm": 7.038871765136719, "learning_rate": 2.7257453800192724e-06, "loss": 0.5672, "mean_token_accuracy": 0.8578838109970093, "step": 539 }, { "epoch": 1.4835164835164836, "grad_norm": 6.658552169799805, "learning_rate": 2.708783529036977e-06, "loss": 0.4442, "mean_token_accuracy": 0.8848684430122375, "step": 540 }, { "epoch": 1.4862637362637363, "grad_norm": 8.20215129852295, "learning_rate": 2.691885889113606e-06, "loss": 0.7322, "mean_token_accuracy": 0.831932783126831, "step": 541 }, { "epoch": 1.489010989010989, "grad_norm": 9.458049774169922, "learning_rate": 2.675052848974059e-06, "loss": 0.7799, "mean_token_accuracy": 0.8369565010070801, "step": 542 }, { "epoch": 1.4917582417582418, "grad_norm": 9.558260917663574, "learning_rate": 2.6582847958571466e-06, "loss": 0.7362, "mean_token_accuracy": 0.8473091125488281, "step": 543 }, { "epoch": 1.4945054945054945, "grad_norm": 8.83701229095459, "learning_rate": 2.6415821155066657e-06, "loss": 0.693, "mean_token_accuracy": 0.8454810380935669, "step": 544 }, { "epoch": 1.4972527472527473, "grad_norm": 8.02824592590332, "learning_rate": 2.6249451921625355e-06, "loss": 0.6736, "mean_token_accuracy": 0.8448660969734192, "step": 545 }, { "epoch": 1.5, "grad_norm": 6.946861267089844, "learning_rate": 2.608374408551958e-06, "loss": 0.5594, "mean_token_accuracy": 0.8766839504241943, "step": 546 }, { "epoch": 1.5027472527472527, "grad_norm": 6.514960289001465, "learning_rate": 2.5918701458806074e-06, "loss": 0.4437, "mean_token_accuracy": 0.8953744769096375, "step": 547 }, { "epoch": 1.5054945054945055, "grad_norm": 7.499314785003662, "learning_rate": 2.575432783823869e-06, "loss": 0.5534, "mean_token_accuracy": 0.8698453903198242, "step": 548 }, { "epoch": 1.5082417582417582, "grad_norm": 7.372369766235352, "learning_rate": 2.5590627005180974e-06, "loss": 0.5977, "mean_token_accuracy": 0.8597701191902161, "step": 549 }, { "epoch": 1.510989010989011, "grad_norm": 6.4079365730285645, "learning_rate": 2.5427602725519185e-06, "loss": 0.5101, "mean_token_accuracy": 0.89012211561203, "step": 550 }, { "epoch": 1.5137362637362637, "grad_norm": 8.462462425231934, "learning_rate": 2.526525874957577e-06, "loss": 0.7414, "mean_token_accuracy": 0.8104650974273682, "step": 551 }, { "epoch": 1.5164835164835164, "grad_norm": 7.26004695892334, "learning_rate": 2.510359881202291e-06, "loss": 0.5279, "mean_token_accuracy": 0.8739837408065796, "step": 552 }, { "epoch": 1.5192307692307692, "grad_norm": 9.653335571289062, "learning_rate": 2.4942626631796737e-06, "loss": 0.6012, "mean_token_accuracy": 0.854411780834198, "step": 553 }, { "epoch": 1.521978021978022, "grad_norm": 8.012587547302246, "learning_rate": 2.4782345912011746e-06, "loss": 0.7189, "mean_token_accuracy": 0.8267270922660828, "step": 554 }, { "epoch": 1.5247252747252746, "grad_norm": 6.256516456604004, "learning_rate": 2.4622760339875586e-06, "loss": 0.4029, "mean_token_accuracy": 0.9051833152770996, "step": 555 }, { "epoch": 1.5274725274725274, "grad_norm": 7.887014865875244, "learning_rate": 2.4463873586604266e-06, "loss": 0.6252, "mean_token_accuracy": 0.8506731986999512, "step": 556 }, { "epoch": 1.5302197802197801, "grad_norm": 8.715618133544922, "learning_rate": 2.430568930733765e-06, "loss": 0.6903, "mean_token_accuracy": 0.8444130420684814, "step": 557 }, { "epoch": 1.5329670329670328, "grad_norm": 7.426331996917725, "learning_rate": 2.4148211141055495e-06, "loss": 0.6732, "mean_token_accuracy": 0.8408644199371338, "step": 558 }, { "epoch": 1.5357142857142856, "grad_norm": 7.998394966125488, "learning_rate": 2.399144271049357e-06, "loss": 0.5477, "mean_token_accuracy": 0.8670658469200134, "step": 559 }, { "epoch": 1.5384615384615383, "grad_norm": 8.026217460632324, "learning_rate": 2.383538762206038e-06, "loss": 0.5915, "mean_token_accuracy": 0.8620296716690063, "step": 560 }, { "epoch": 1.541208791208791, "grad_norm": 8.00511360168457, "learning_rate": 2.3680049465754314e-06, "loss": 0.5861, "mean_token_accuracy": 0.8640661835670471, "step": 561 }, { "epoch": 1.5439560439560438, "grad_norm": 8.416573524475098, "learning_rate": 2.3525431815080895e-06, "loss": 0.6603, "mean_token_accuracy": 0.8461538553237915, "step": 562 }, { "epoch": 1.5467032967032965, "grad_norm": 8.667383193969727, "learning_rate": 2.337153822697061e-06, "loss": 0.6265, "mean_token_accuracy": 0.8599269390106201, "step": 563 }, { "epoch": 1.5494505494505495, "grad_norm": 8.645866394042969, "learning_rate": 2.3218372241697207e-06, "loss": 0.7027, "mean_token_accuracy": 0.8388969302177429, "step": 564 }, { "epoch": 1.5521978021978022, "grad_norm": 8.266505241394043, "learning_rate": 2.306593738279609e-06, "loss": 0.596, "mean_token_accuracy": 0.8659549355506897, "step": 565 }, { "epoch": 1.554945054945055, "grad_norm": 7.0971550941467285, "learning_rate": 2.291423715698334e-06, "loss": 0.4934, "mean_token_accuracy": 0.8758782148361206, "step": 566 }, { "epoch": 1.5576923076923077, "grad_norm": 7.9168806076049805, "learning_rate": 2.276327505407505e-06, "loss": 0.5401, "mean_token_accuracy": 0.8573486804962158, "step": 567 }, { "epoch": 1.5604395604395604, "grad_norm": 7.8917059898376465, "learning_rate": 2.2613054546907007e-06, "loss": 0.531, "mean_token_accuracy": 0.8639125227928162, "step": 568 }, { "epoch": 1.5631868131868132, "grad_norm": 7.845230579376221, "learning_rate": 2.2463579091254865e-06, "loss": 0.6296, "mean_token_accuracy": 0.8465408682823181, "step": 569 }, { "epoch": 1.565934065934066, "grad_norm": 7.656919479370117, "learning_rate": 2.2314852125754546e-06, "loss": 0.4849, "mean_token_accuracy": 0.891581654548645, "step": 570 }, { "epoch": 1.5686813186813187, "grad_norm": 7.512231826782227, "learning_rate": 2.2166877071823195e-06, "loss": 0.4747, "mean_token_accuracy": 0.8897150158882141, "step": 571 }, { "epoch": 1.5714285714285714, "grad_norm": 7.536767482757568, "learning_rate": 2.201965733358053e-06, "loss": 0.6517, "mean_token_accuracy": 0.8303167223930359, "step": 572 }, { "epoch": 1.5741758241758241, "grad_norm": 6.979121208190918, "learning_rate": 2.1873196297770407e-06, "loss": 0.4826, "mean_token_accuracy": 0.8690476417541504, "step": 573 }, { "epoch": 1.5769230769230769, "grad_norm": 7.652553081512451, "learning_rate": 2.172749733368299e-06, "loss": 0.6127, "mean_token_accuracy": 0.8521836400032043, "step": 574 }, { "epoch": 1.5796703296703298, "grad_norm": 9.753581047058105, "learning_rate": 2.158256379307722e-06, "loss": 1.0171, "mean_token_accuracy": 0.7713936567306519, "step": 575 }, { "epoch": 1.5824175824175826, "grad_norm": 7.93673038482666, "learning_rate": 2.143839901010372e-06, "loss": 0.6781, "mean_token_accuracy": 0.8463227152824402, "step": 576 }, { "epoch": 1.5851648351648353, "grad_norm": 7.431739330291748, "learning_rate": 2.1295006301228067e-06, "loss": 0.5165, "mean_token_accuracy": 0.8890290260314941, "step": 577 }, { "epoch": 1.587912087912088, "grad_norm": 7.179192543029785, "learning_rate": 2.1152388965154536e-06, "loss": 0.5525, "mean_token_accuracy": 0.8607305884361267, "step": 578 }, { "epoch": 1.5906593406593408, "grad_norm": 8.471110343933105, "learning_rate": 2.101055028275018e-06, "loss": 0.8447, "mean_token_accuracy": 0.805443525314331, "step": 579 }, { "epoch": 1.5934065934065935, "grad_norm": 7.559426784515381, "learning_rate": 2.0869493516969373e-06, "loss": 0.6594, "mean_token_accuracy": 0.8257080316543579, "step": 580 }, { "epoch": 1.5961538461538463, "grad_norm": 7.421477317810059, "learning_rate": 2.0729221912778736e-06, "loss": 0.4922, "mean_token_accuracy": 0.8662053346633911, "step": 581 }, { "epoch": 1.598901098901099, "grad_norm": 7.897608757019043, "learning_rate": 2.0589738697082518e-06, "loss": 0.6287, "mean_token_accuracy": 0.8559006452560425, "step": 582 }, { "epoch": 1.6016483516483517, "grad_norm": 7.810044288635254, "learning_rate": 2.0451047078648316e-06, "loss": 0.6793, "mean_token_accuracy": 0.8473118543624878, "step": 583 }, { "epoch": 1.6043956043956045, "grad_norm": 7.520912170410156, "learning_rate": 2.031315024803327e-06, "loss": 0.3915, "mean_token_accuracy": 0.9056203365325928, "step": 584 }, { "epoch": 1.6071428571428572, "grad_norm": 8.198314666748047, "learning_rate": 2.0176051377510707e-06, "loss": 0.6472, "mean_token_accuracy": 0.8204225301742554, "step": 585 }, { "epoch": 1.60989010989011, "grad_norm": 7.389269828796387, "learning_rate": 2.003975362099711e-06, "loss": 0.5766, "mean_token_accuracy": 0.8601484894752502, "step": 586 }, { "epoch": 1.6126373626373627, "grad_norm": 7.9077887535095215, "learning_rate": 1.9904260113979594e-06, "loss": 0.5486, "mean_token_accuracy": 0.8645833134651184, "step": 587 }, { "epoch": 1.6153846153846154, "grad_norm": 7.626746654510498, "learning_rate": 1.9769573973443767e-06, "loss": 0.6089, "mean_token_accuracy": 0.8592411279678345, "step": 588 }, { "epoch": 1.6181318681318682, "grad_norm": 6.979987144470215, "learning_rate": 1.9635698297802006e-06, "loss": 0.3991, "mean_token_accuracy": 0.887499988079071, "step": 589 }, { "epoch": 1.620879120879121, "grad_norm": 7.271255016326904, "learning_rate": 1.9502636166822253e-06, "loss": 0.5482, "mean_token_accuracy": 0.874015748500824, "step": 590 }, { "epoch": 1.6236263736263736, "grad_norm": 8.575961112976074, "learning_rate": 1.9370390641557034e-06, "loss": 0.7048, "mean_token_accuracy": 0.8509485125541687, "step": 591 }, { "epoch": 1.6263736263736264, "grad_norm": 8.057097434997559, "learning_rate": 1.923896476427315e-06, "loss": 0.6843, "mean_token_accuracy": 0.8231292366981506, "step": 592 }, { "epoch": 1.629120879120879, "grad_norm": 6.481530666351318, "learning_rate": 1.9108361558381695e-06, "loss": 0.4746, "mean_token_accuracy": 0.8867470026016235, "step": 593 }, { "epoch": 1.6318681318681318, "grad_norm": 8.327600479125977, "learning_rate": 1.8978584028368418e-06, "loss": 0.6566, "mean_token_accuracy": 0.8510638475418091, "step": 594 }, { "epoch": 1.6346153846153846, "grad_norm": 7.571817398071289, "learning_rate": 1.8849635159724644e-06, "loss": 0.6381, "mean_token_accuracy": 0.867986798286438, "step": 595 }, { "epoch": 1.6373626373626373, "grad_norm": 8.101214408874512, "learning_rate": 1.8721517918878663e-06, "loss": 0.6557, "mean_token_accuracy": 0.8449612259864807, "step": 596 }, { "epoch": 1.64010989010989, "grad_norm": 7.908919334411621, "learning_rate": 1.8594235253127373e-06, "loss": 0.5748, "mean_token_accuracy": 0.8552787899971008, "step": 597 }, { "epoch": 1.6428571428571428, "grad_norm": 8.520292282104492, "learning_rate": 1.8467790090568554e-06, "loss": 0.6607, "mean_token_accuracy": 0.8362652063369751, "step": 598 }, { "epoch": 1.6456043956043955, "grad_norm": 8.188483238220215, "learning_rate": 1.8342185340033496e-06, "loss": 0.6737, "mean_token_accuracy": 0.8428927659988403, "step": 599 }, { "epoch": 1.6483516483516483, "grad_norm": 8.296211242675781, "learning_rate": 1.8217423891020058e-06, "loss": 0.5965, "mean_token_accuracy": 0.8537930846214294, "step": 600 }, { "epoch": 1.651098901098901, "grad_norm": 7.898365497589111, "learning_rate": 1.8093508613626221e-06, "loss": 0.6133, "mean_token_accuracy": 0.8399999737739563, "step": 601 }, { "epoch": 1.6538461538461537, "grad_norm": 7.735372066497803, "learning_rate": 1.7970442358484049e-06, "loss": 0.5112, "mean_token_accuracy": 0.8731428384780884, "step": 602 }, { "epoch": 1.6565934065934065, "grad_norm": 9.300334930419922, "learning_rate": 1.7848227956694119e-06, "loss": 0.7739, "mean_token_accuracy": 0.8144853711128235, "step": 603 }, { "epoch": 1.6593406593406592, "grad_norm": 7.729982376098633, "learning_rate": 1.7726868219760407e-06, "loss": 0.5127, "mean_token_accuracy": 0.8717647194862366, "step": 604 }, { "epoch": 1.662087912087912, "grad_norm": 8.184350967407227, "learning_rate": 1.7606365939525544e-06, "loss": 0.6079, "mean_token_accuracy": 0.8677042722702026, "step": 605 }, { "epoch": 1.6648351648351647, "grad_norm": 7.653561115264893, "learning_rate": 1.7486723888106689e-06, "loss": 0.5513, "mean_token_accuracy": 0.8632371425628662, "step": 606 }, { "epoch": 1.6675824175824174, "grad_norm": 7.327295303344727, "learning_rate": 1.736794481783168e-06, "loss": 0.6586, "mean_token_accuracy": 0.8426023125648499, "step": 607 }, { "epoch": 1.6703296703296702, "grad_norm": 9.70237922668457, "learning_rate": 1.7250031461175751e-06, "loss": 0.5927, "mean_token_accuracy": 0.8830645084381104, "step": 608 }, { "epoch": 1.6730769230769231, "grad_norm": 8.11240005493164, "learning_rate": 1.713298653069867e-06, "loss": 0.6267, "mean_token_accuracy": 0.8669354915618896, "step": 609 }, { "epoch": 1.6758241758241759, "grad_norm": 7.044604778289795, "learning_rate": 1.7016812718982315e-06, "loss": 0.6068, "mean_token_accuracy": 0.8408163189888, "step": 610 }, { "epoch": 1.6785714285714286, "grad_norm": 8.983325958251953, "learning_rate": 1.6901512698568798e-06, "loss": 0.6311, "mean_token_accuracy": 0.8347339034080505, "step": 611 }, { "epoch": 1.6813186813186813, "grad_norm": 6.355010032653809, "learning_rate": 1.678708912189887e-06, "loss": 0.4312, "mean_token_accuracy": 0.8883978128433228, "step": 612 }, { "epoch": 1.684065934065934, "grad_norm": 7.719866752624512, "learning_rate": 1.6673544621251005e-06, "loss": 0.5471, "mean_token_accuracy": 0.8897338509559631, "step": 613 }, { "epoch": 1.6868131868131868, "grad_norm": 7.775828838348389, "learning_rate": 1.6560881808680824e-06, "loss": 0.5938, "mean_token_accuracy": 0.8479212522506714, "step": 614 }, { "epoch": 1.6895604395604396, "grad_norm": 6.961485385894775, "learning_rate": 1.6449103275960967e-06, "loss": 0.5051, "mean_token_accuracy": 0.8725274801254272, "step": 615 }, { "epoch": 1.6923076923076923, "grad_norm": 7.267236232757568, "learning_rate": 1.633821159452148e-06, "loss": 0.5444, "mean_token_accuracy": 0.8766006827354431, "step": 616 }, { "epoch": 1.695054945054945, "grad_norm": 8.290145874023438, "learning_rate": 1.6228209315390716e-06, "loss": 0.5968, "mean_token_accuracy": 0.8578553795814514, "step": 617 }, { "epoch": 1.6978021978021978, "grad_norm": 7.283031940460205, "learning_rate": 1.611909896913657e-06, "loss": 0.5454, "mean_token_accuracy": 0.8671775460243225, "step": 618 }, { "epoch": 1.7005494505494505, "grad_norm": 6.956562519073486, "learning_rate": 1.6010883065808318e-06, "loss": 0.5201, "mean_token_accuracy": 0.8701456189155579, "step": 619 }, { "epoch": 1.7032967032967035, "grad_norm": 7.430464267730713, "learning_rate": 1.5903564094878857e-06, "loss": 0.5259, "mean_token_accuracy": 0.88968825340271, "step": 620 }, { "epoch": 1.7060439560439562, "grad_norm": 8.572484016418457, "learning_rate": 1.5797144525187433e-06, "loss": 0.5999, "mean_token_accuracy": 0.8608695864677429, "step": 621 }, { "epoch": 1.708791208791209, "grad_norm": 7.143974781036377, "learning_rate": 1.5691626804882837e-06, "loss": 0.5746, "mean_token_accuracy": 0.8566392660140991, "step": 622 }, { "epoch": 1.7115384615384617, "grad_norm": 7.603835582733154, "learning_rate": 1.5587013361367126e-06, "loss": 0.4557, "mean_token_accuracy": 0.9003517031669617, "step": 623 }, { "epoch": 1.7142857142857144, "grad_norm": 8.094274520874023, "learning_rate": 1.5483306601239708e-06, "loss": 0.6486, "mean_token_accuracy": 0.8408796787261963, "step": 624 }, { "epoch": 1.7170329670329672, "grad_norm": 9.519272804260254, "learning_rate": 1.5380508910242099e-06, "loss": 0.775, "mean_token_accuracy": 0.8139534592628479, "step": 625 }, { "epoch": 1.7197802197802199, "grad_norm": 10.531563758850098, "learning_rate": 1.527862265320287e-06, "loss": 0.8133, "mean_token_accuracy": 0.8015267252922058, "step": 626 }, { "epoch": 1.7225274725274726, "grad_norm": 7.760357856750488, "learning_rate": 1.5177650173983415e-06, "loss": 0.5966, "mean_token_accuracy": 0.8634311556816101, "step": 627 }, { "epoch": 1.7252747252747254, "grad_norm": 8.748411178588867, "learning_rate": 1.507759379542393e-06, "loss": 0.6808, "mean_token_accuracy": 0.8243430256843567, "step": 628 }, { "epoch": 1.728021978021978, "grad_norm": 7.159116268157959, "learning_rate": 1.4978455819289994e-06, "loss": 0.5778, "mean_token_accuracy": 0.8440366983413696, "step": 629 }, { "epoch": 1.7307692307692308, "grad_norm": 8.136507987976074, "learning_rate": 1.4880238526219635e-06, "loss": 0.6099, "mean_token_accuracy": 0.8495787978172302, "step": 630 }, { "epoch": 1.7335164835164836, "grad_norm": 7.160614013671875, "learning_rate": 1.4782944175670857e-06, "loss": 0.5372, "mean_token_accuracy": 0.8724604845046997, "step": 631 }, { "epoch": 1.7362637362637363, "grad_norm": 7.923597812652588, "learning_rate": 1.4686575005869663e-06, "loss": 0.5452, "mean_token_accuracy": 0.8799019455909729, "step": 632 }, { "epoch": 1.739010989010989, "grad_norm": 8.121793746948242, "learning_rate": 1.459113323375856e-06, "loss": 0.5996, "mean_token_accuracy": 0.8438761830329895, "step": 633 }, { "epoch": 1.7417582417582418, "grad_norm": 7.278065204620361, "learning_rate": 1.4496621054945545e-06, "loss": 0.5784, "mean_token_accuracy": 0.8584905862808228, "step": 634 }, { "epoch": 1.7445054945054945, "grad_norm": 6.142355918884277, "learning_rate": 1.4403040643653657e-06, "loss": 0.3751, "mean_token_accuracy": 0.9079254269599915, "step": 635 }, { "epoch": 1.7472527472527473, "grad_norm": 7.723720550537109, "learning_rate": 1.4310394152670886e-06, "loss": 0.6285, "mean_token_accuracy": 0.8589263558387756, "step": 636 }, { "epoch": 1.75, "grad_norm": 6.810434341430664, "learning_rate": 1.4218683713300653e-06, "loss": 0.4836, "mean_token_accuracy": 0.8887559771537781, "step": 637 }, { "epoch": 1.7527472527472527, "grad_norm": 6.936891555786133, "learning_rate": 1.4127911435312857e-06, "loss": 0.5311, "mean_token_accuracy": 0.8746702075004578, "step": 638 }, { "epoch": 1.7554945054945055, "grad_norm": 7.792840003967285, "learning_rate": 1.4038079406895261e-06, "loss": 0.7044, "mean_token_accuracy": 0.8347205519676208, "step": 639 }, { "epoch": 1.7582417582417582, "grad_norm": 7.776517868041992, "learning_rate": 1.3949189694605486e-06, "loss": 0.538, "mean_token_accuracy": 0.8720445036888123, "step": 640 }, { "epoch": 1.760989010989011, "grad_norm": 6.852481365203857, "learning_rate": 1.3861244343323466e-06, "loss": 0.4217, "mean_token_accuracy": 0.8895630836486816, "step": 641 }, { "epoch": 1.7637362637362637, "grad_norm": 6.889708518981934, "learning_rate": 1.3774245376204407e-06, "loss": 0.419, "mean_token_accuracy": 0.8810572624206543, "step": 642 }, { "epoch": 1.7664835164835164, "grad_norm": 7.797403335571289, "learning_rate": 1.3688194794632236e-06, "loss": 0.5792, "mean_token_accuracy": 0.8663366436958313, "step": 643 }, { "epoch": 1.7692307692307692, "grad_norm": 7.262906551361084, "learning_rate": 1.3603094578173587e-06, "loss": 0.5168, "mean_token_accuracy": 0.8837209343910217, "step": 644 }, { "epoch": 1.771978021978022, "grad_norm": 6.884127140045166, "learning_rate": 1.3518946684532224e-06, "loss": 0.3545, "mean_token_accuracy": 0.9082462191581726, "step": 645 }, { "epoch": 1.7747252747252746, "grad_norm": 7.663375377655029, "learning_rate": 1.3435753049504041e-06, "loss": 0.5832, "mean_token_accuracy": 0.862023651599884, "step": 646 }, { "epoch": 1.7774725274725274, "grad_norm": 8.101320266723633, "learning_rate": 1.3353515586932497e-06, "loss": 0.6461, "mean_token_accuracy": 0.8532731533050537, "step": 647 }, { "epoch": 1.7802197802197801, "grad_norm": 6.119275093078613, "learning_rate": 1.32722361886646e-06, "loss": 0.4742, "mean_token_accuracy": 0.8779661059379578, "step": 648 }, { "epoch": 1.7829670329670328, "grad_norm": 6.471920490264893, "learning_rate": 1.3191916724507415e-06, "loss": 0.3851, "mean_token_accuracy": 0.8980815410614014, "step": 649 }, { "epoch": 1.7857142857142856, "grad_norm": 7.637779712677002, "learning_rate": 1.3112559042184993e-06, "loss": 0.6389, "mean_token_accuracy": 0.8456140160560608, "step": 650 }, { "epoch": 1.7884615384615383, "grad_norm": 6.639374256134033, "learning_rate": 1.3034164967295929e-06, "loss": 0.4687, "mean_token_accuracy": 0.8847059011459351, "step": 651 }, { "epoch": 1.791208791208791, "grad_norm": 7.081116199493408, "learning_rate": 1.2956736303271292e-06, "loss": 0.4666, "mean_token_accuracy": 0.8781321048736572, "step": 652 }, { "epoch": 1.7939560439560438, "grad_norm": 7.375399589538574, "learning_rate": 1.2880274831333211e-06, "loss": 0.5556, "mean_token_accuracy": 0.8748639822006226, "step": 653 }, { "epoch": 1.7967032967032965, "grad_norm": 7.636590957641602, "learning_rate": 1.2804782310453842e-06, "loss": 0.503, "mean_token_accuracy": 0.8553008437156677, "step": 654 }, { "epoch": 1.7994505494505495, "grad_norm": 7.556530952453613, "learning_rate": 1.2730260477314943e-06, "loss": 0.5379, "mean_token_accuracy": 0.8660826086997986, "step": 655 }, { "epoch": 1.8021978021978022, "grad_norm": 7.6223320960998535, "learning_rate": 1.2656711046267891e-06, "loss": 0.5053, "mean_token_accuracy": 0.8884976506233215, "step": 656 }, { "epoch": 1.804945054945055, "grad_norm": 7.420393466949463, "learning_rate": 1.2584135709294283e-06, "loss": 0.5047, "mean_token_accuracy": 0.8751696348190308, "step": 657 }, { "epoch": 1.8076923076923077, "grad_norm": 7.559555530548096, "learning_rate": 1.2512536135966938e-06, "loss": 0.6115, "mean_token_accuracy": 0.8485838770866394, "step": 658 }, { "epoch": 1.8104395604395604, "grad_norm": 8.376801490783691, "learning_rate": 1.2441913973411594e-06, "loss": 0.5606, "mean_token_accuracy": 0.8773333430290222, "step": 659 }, { "epoch": 1.8131868131868132, "grad_norm": 6.666446685791016, "learning_rate": 1.2372270846268935e-06, "loss": 0.4313, "mean_token_accuracy": 0.8859060406684875, "step": 660 }, { "epoch": 1.815934065934066, "grad_norm": 7.478560447692871, "learning_rate": 1.2303608356657226e-06, "loss": 0.5867, "mean_token_accuracy": 0.8590604066848755, "step": 661 }, { "epoch": 1.8186813186813187, "grad_norm": 7.4808855056762695, "learning_rate": 1.223592808413551e-06, "loss": 0.5184, "mean_token_accuracy": 0.8727959990501404, "step": 662 }, { "epoch": 1.8214285714285714, "grad_norm": 8.410175323486328, "learning_rate": 1.216923158566721e-06, "loss": 0.6202, "mean_token_accuracy": 0.8604027032852173, "step": 663 }, { "epoch": 1.8241758241758241, "grad_norm": 8.690070152282715, "learning_rate": 1.2103520395584339e-06, "loss": 0.6715, "mean_token_accuracy": 0.8455089926719666, "step": 664 }, { "epoch": 1.8269230769230769, "grad_norm": 8.344685554504395, "learning_rate": 1.2038796025552207e-06, "loss": 0.5889, "mean_token_accuracy": 0.8775757551193237, "step": 665 }, { "epoch": 1.8296703296703298, "grad_norm": 7.113524913787842, "learning_rate": 1.1975059964534628e-06, "loss": 0.4772, "mean_token_accuracy": 0.8787062168121338, "step": 666 }, { "epoch": 1.8324175824175826, "grad_norm": 9.631033897399902, "learning_rate": 1.191231367875969e-06, "loss": 0.5952, "mean_token_accuracy": 0.8740053176879883, "step": 667 }, { "epoch": 1.8351648351648353, "grad_norm": 7.207862854003906, "learning_rate": 1.1850558611685998e-06, "loss": 0.47, "mean_token_accuracy": 0.8942093253135681, "step": 668 }, { "epoch": 1.837912087912088, "grad_norm": 7.906933307647705, "learning_rate": 1.178979618396949e-06, "loss": 0.5486, "mean_token_accuracy": 0.8479042053222656, "step": 669 }, { "epoch": 1.8406593406593408, "grad_norm": 7.593602657318115, "learning_rate": 1.173002779343075e-06, "loss": 0.5137, "mean_token_accuracy": 0.8721351027488708, "step": 670 }, { "epoch": 1.8434065934065935, "grad_norm": 6.665176868438721, "learning_rate": 1.167125481502284e-06, "loss": 0.5351, "mean_token_accuracy": 0.8627219200134277, "step": 671 }, { "epoch": 1.8461538461538463, "grad_norm": 7.5192670822143555, "learning_rate": 1.1613478600799688e-06, "loss": 0.5553, "mean_token_accuracy": 0.8680142760276794, "step": 672 }, { "epoch": 1.848901098901099, "grad_norm": 7.737743377685547, "learning_rate": 1.1556700479884969e-06, "loss": 0.4324, "mean_token_accuracy": 0.8904638886451721, "step": 673 }, { "epoch": 1.8516483516483517, "grad_norm": 7.322444438934326, "learning_rate": 1.150092175844153e-06, "loss": 0.486, "mean_token_accuracy": 0.8801897764205933, "step": 674 }, { "epoch": 1.8543956043956045, "grad_norm": 8.201929092407227, "learning_rate": 1.1446143719641354e-06, "loss": 0.5865, "mean_token_accuracy": 0.852223813533783, "step": 675 }, { "epoch": 1.8571428571428572, "grad_norm": 7.934874057769775, "learning_rate": 1.1392367623636041e-06, "loss": 0.7429, "mean_token_accuracy": 0.8293269276618958, "step": 676 }, { "epoch": 1.85989010989011, "grad_norm": 7.658584117889404, "learning_rate": 1.133959470752779e-06, "loss": 0.5866, "mean_token_accuracy": 0.8679039478302002, "step": 677 }, { "epoch": 1.8626373626373627, "grad_norm": 7.818109512329102, "learning_rate": 1.1287826185340987e-06, "loss": 0.5487, "mean_token_accuracy": 0.8636363744735718, "step": 678 }, { "epoch": 1.8653846153846154, "grad_norm": 7.738321304321289, "learning_rate": 1.1237063247994219e-06, "loss": 0.6939, "mean_token_accuracy": 0.8329596519470215, "step": 679 }, { "epoch": 1.8681318681318682, "grad_norm": 8.202414512634277, "learning_rate": 1.1187307063272948e-06, "loss": 0.5474, "mean_token_accuracy": 0.8687415719032288, "step": 680 }, { "epoch": 1.870879120879121, "grad_norm": 9.126557350158691, "learning_rate": 1.1138558775802582e-06, "loss": 0.6274, "mean_token_accuracy": 0.8616071343421936, "step": 681 }, { "epoch": 1.8736263736263736, "grad_norm": 7.537760257720947, "learning_rate": 1.1090819507022166e-06, "loss": 0.5639, "mean_token_accuracy": 0.8717948794364929, "step": 682 }, { "epoch": 1.8763736263736264, "grad_norm": 7.776443004608154, "learning_rate": 1.1044090355158607e-06, "loss": 0.5634, "mean_token_accuracy": 0.8497174978256226, "step": 683 }, { "epoch": 1.879120879120879, "grad_norm": 7.746001720428467, "learning_rate": 1.0998372395201377e-06, "loss": 0.5863, "mean_token_accuracy": 0.8390297889709473, "step": 684 }, { "epoch": 1.8818681318681318, "grad_norm": 8.56049633026123, "learning_rate": 1.0953666678877789e-06, "loss": 0.5126, "mean_token_accuracy": 0.8790435791015625, "step": 685 }, { "epoch": 1.8846153846153846, "grad_norm": 7.443101406097412, "learning_rate": 1.0909974234628826e-06, "loss": 0.48, "mean_token_accuracy": 0.8778833150863647, "step": 686 }, { "epoch": 1.8873626373626373, "grad_norm": 8.6889066696167, "learning_rate": 1.0867296067585444e-06, "loss": 0.7503, "mean_token_accuracy": 0.832425057888031, "step": 687 }, { "epoch": 1.89010989010989, "grad_norm": 8.396735191345215, "learning_rate": 1.0825633159545498e-06, "loss": 0.6225, "mean_token_accuracy": 0.8434886336326599, "step": 688 }, { "epoch": 1.8928571428571428, "grad_norm": 6.919631481170654, "learning_rate": 1.0784986468951102e-06, "loss": 0.5102, "mean_token_accuracy": 0.8629807829856873, "step": 689 }, { "epoch": 1.8956043956043955, "grad_norm": 8.00849723815918, "learning_rate": 1.0745356930866608e-06, "loss": 0.6416, "mean_token_accuracy": 0.8387942314147949, "step": 690 }, { "epoch": 1.8983516483516483, "grad_norm": 6.977931499481201, "learning_rate": 1.0706745456957125e-06, "loss": 0.4322, "mean_token_accuracy": 0.9055214524269104, "step": 691 }, { "epoch": 1.901098901098901, "grad_norm": 8.84980583190918, "learning_rate": 1.0669152935467473e-06, "loss": 0.8483, "mean_token_accuracy": 0.8108747005462646, "step": 692 }, { "epoch": 1.9038461538461537, "grad_norm": 7.389832019805908, "learning_rate": 1.0632580231201816e-06, "loss": 0.5073, "mean_token_accuracy": 0.8797468543052673, "step": 693 }, { "epoch": 1.9065934065934065, "grad_norm": 8.551029205322266, "learning_rate": 1.0597028185503741e-06, "loss": 0.7209, "mean_token_accuracy": 0.8173515796661377, "step": 694 }, { "epoch": 1.9093406593406592, "grad_norm": 8.817502975463867, "learning_rate": 1.0562497616236902e-06, "loss": 0.5205, "mean_token_accuracy": 0.8672566413879395, "step": 695 }, { "epoch": 1.912087912087912, "grad_norm": 7.306185722351074, "learning_rate": 1.0528989317766207e-06, "loss": 0.567, "mean_token_accuracy": 0.8645319938659668, "step": 696 }, { "epoch": 1.9148351648351647, "grad_norm": 6.0902228355407715, "learning_rate": 1.0496504060939541e-06, "loss": 0.4767, "mean_token_accuracy": 0.8842975497245789, "step": 697 }, { "epoch": 1.9175824175824174, "grad_norm": 6.566694736480713, "learning_rate": 1.0465042593070051e-06, "loss": 0.4539, "mean_token_accuracy": 0.8871151804924011, "step": 698 }, { "epoch": 1.9203296703296702, "grad_norm": 7.715193748474121, "learning_rate": 1.0434605637918922e-06, "loss": 0.4615, "mean_token_accuracy": 0.8861111402511597, "step": 699 }, { "epoch": 1.9230769230769231, "grad_norm": 6.1946821212768555, "learning_rate": 1.040519389567876e-06, "loss": 0.3546, "mean_token_accuracy": 0.9018912315368652, "step": 700 }, { "epoch": 1.9258241758241759, "grad_norm": 8.542852401733398, "learning_rate": 1.0376808042957467e-06, "loss": 0.7264, "mean_token_accuracy": 0.8218673467636108, "step": 701 }, { "epoch": 1.9285714285714286, "grad_norm": 8.205451011657715, "learning_rate": 1.0349448732762673e-06, "loss": 0.6035, "mean_token_accuracy": 0.8728476762771606, "step": 702 }, { "epoch": 1.9313186813186813, "grad_norm": 8.581854820251465, "learning_rate": 1.032311659448672e-06, "loss": 0.6127, "mean_token_accuracy": 0.8588390350341797, "step": 703 }, { "epoch": 1.934065934065934, "grad_norm": 6.2623443603515625, "learning_rate": 1.0297812233892193e-06, "loss": 0.4931, "mean_token_accuracy": 0.8853362798690796, "step": 704 }, { "epoch": 1.9368131868131868, "grad_norm": 8.019908905029297, "learning_rate": 1.0273536233097956e-06, "loss": 0.622, "mean_token_accuracy": 0.8511363863945007, "step": 705 }, { "epoch": 1.9395604395604396, "grad_norm": 6.778481960296631, "learning_rate": 1.02502891505658e-06, "loss": 0.5602, "mean_token_accuracy": 0.8511837720870972, "step": 706 }, { "epoch": 1.9423076923076923, "grad_norm": 7.289844512939453, "learning_rate": 1.0228071521087555e-06, "loss": 0.5142, "mean_token_accuracy": 0.8857837319374084, "step": 707 }, { "epoch": 1.945054945054945, "grad_norm": 7.7085466384887695, "learning_rate": 1.0206883855772813e-06, "loss": 0.6465, "mean_token_accuracy": 0.8451536893844604, "step": 708 }, { "epoch": 1.9478021978021978, "grad_norm": 8.750447273254395, "learning_rate": 1.0186726642037172e-06, "loss": 0.6184, "mean_token_accuracy": 0.8487874269485474, "step": 709 }, { "epoch": 1.9505494505494505, "grad_norm": 9.00527286529541, "learning_rate": 1.0167600343591e-06, "loss": 0.7001, "mean_token_accuracy": 0.8461538553237915, "step": 710 }, { "epoch": 1.9532967032967035, "grad_norm": 7.398614406585693, "learning_rate": 1.0149505400428795e-06, "loss": 0.5246, "mean_token_accuracy": 0.869927167892456, "step": 711 }, { "epoch": 1.9560439560439562, "grad_norm": 7.224842548370361, "learning_rate": 1.0132442228819047e-06, "loss": 0.4347, "mean_token_accuracy": 0.9007731676101685, "step": 712 }, { "epoch": 1.958791208791209, "grad_norm": 6.304765224456787, "learning_rate": 1.0116411221294663e-06, "loss": 0.3831, "mean_token_accuracy": 0.9077936410903931, "step": 713 }, { "epoch": 1.9615384615384617, "grad_norm": 8.755301475524902, "learning_rate": 1.0101412746643932e-06, "loss": 0.6125, "mean_token_accuracy": 0.8540462255477905, "step": 714 }, { "epoch": 1.9642857142857144, "grad_norm": 8.6292724609375, "learning_rate": 1.0087447149902067e-06, "loss": 0.6039, "mean_token_accuracy": 0.8370370268821716, "step": 715 }, { "epoch": 1.9670329670329672, "grad_norm": 7.7700347900390625, "learning_rate": 1.0074514752343238e-06, "loss": 0.5602, "mean_token_accuracy": 0.8450184464454651, "step": 716 }, { "epoch": 1.9697802197802199, "grad_norm": 8.478242874145508, "learning_rate": 1.0062615851473182e-06, "loss": 0.6344, "mean_token_accuracy": 0.8534798622131348, "step": 717 }, { "epoch": 1.9725274725274726, "grad_norm": 8.109445571899414, "learning_rate": 1.0051750721022387e-06, "loss": 0.5387, "mean_token_accuracy": 0.8751530051231384, "step": 718 }, { "epoch": 1.9752747252747254, "grad_norm": 6.251338958740234, "learning_rate": 1.0041919610939768e-06, "loss": 0.3721, "mean_token_accuracy": 0.9060240983963013, "step": 719 }, { "epoch": 1.978021978021978, "grad_norm": 8.324958801269531, "learning_rate": 1.0033122747386922e-06, "loss": 0.6845, "mean_token_accuracy": 0.8313725590705872, "step": 720 }, { "epoch": 1.9807692307692308, "grad_norm": 8.283183097839355, "learning_rate": 1.0025360332732942e-06, "loss": 0.5906, "mean_token_accuracy": 0.8554913401603699, "step": 721 }, { "epoch": 1.9835164835164836, "grad_norm": 8.054919242858887, "learning_rate": 1.0018632545549739e-06, "loss": 0.7021, "mean_token_accuracy": 0.8413878679275513, "step": 722 }, { "epoch": 1.9862637362637363, "grad_norm": 7.945398807525635, "learning_rate": 1.0012939540607945e-06, "loss": 0.5362, "mean_token_accuracy": 0.8812903165817261, "step": 723 }, { "epoch": 1.989010989010989, "grad_norm": 7.639397144317627, "learning_rate": 1.0008281448873346e-06, "loss": 0.5045, "mean_token_accuracy": 0.871408998966217, "step": 724 }, { "epoch": 1.9917582417582418, "grad_norm": 7.862728595733643, "learning_rate": 1.0004658377503893e-06, "loss": 0.6753, "mean_token_accuracy": 0.8276283740997314, "step": 725 }, { "epoch": 1.9945054945054945, "grad_norm": 8.535736083984375, "learning_rate": 1.0002070409847193e-06, "loss": 0.6547, "mean_token_accuracy": 0.8429203629493713, "step": 726 }, { "epoch": 1.9972527472527473, "grad_norm": 6.102328300476074, "learning_rate": 1.0000517605438636e-06, "loss": 0.3124, "mean_token_accuracy": 0.9138134717941284, "step": 727 }, { "epoch": 2.0, "grad_norm": 6.113708972930908, "learning_rate": 1.0000000000000002e-06, "loss": 0.3288, "mean_token_accuracy": 0.9122137427330017, "step": 728 }, { "epoch": 2.0, "step": 728, "total_flos": 4.595247209544417e+17, "train_loss": 1.052156428714375, "train_runtime": 2874.0296, "train_samples_per_second": 8.099, "train_steps_per_second": 0.253 } ], "logging_steps": 1, "max_steps": 728, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.595247209544417e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }