Files
P2-split2_prob_Qwen3-8B-Bas…/trainer_state.json
ModelHub XC e3ffeabc7e 初始化项目,由ModelHub XC社区提供模型
Model: Hyeongwon/P2-split2_prob_Qwen3-8B-Base_0325-01
Source: Original Platform
2026-05-13 05:45:20 +08:00

7604 lines
214 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 756,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5635986328125,
"epoch": 0.003968253968253968,
"grad_norm": 5.863266347518074,
"learning_rate": 0.0,
"loss": 1.3929,
"mean_token_accuracy": 0.6520986258983612,
"num_tokens": 436822.0,
"step": 1
},
{
"entropy": 0.571868896484375,
"epoch": 0.007936507936507936,
"grad_norm": 5.943034119425208,
"learning_rate": 5.263157894736843e-07,
"loss": 1.3984,
"mean_token_accuracy": 0.6573778251186013,
"num_tokens": 849869.0,
"step": 2
},
{
"entropy": 0.571136474609375,
"epoch": 0.011904761904761904,
"grad_norm": 5.9888348315878766,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.4019,
"mean_token_accuracy": 0.6531417248770595,
"num_tokens": 1257883.0,
"step": 3
},
{
"entropy": 0.56817626953125,
"epoch": 0.015873015873015872,
"grad_norm": 5.819974339041837,
"learning_rate": 1.5789473684210526e-06,
"loss": 1.3961,
"mean_token_accuracy": 0.6506756190210581,
"num_tokens": 1710146.0,
"step": 4
},
{
"entropy": 0.563323974609375,
"epoch": 0.01984126984126984,
"grad_norm": 5.674858276690005,
"learning_rate": 2.105263157894737e-06,
"loss": 1.3737,
"mean_token_accuracy": 0.6581529462710023,
"num_tokens": 2138902.0,
"step": 5
},
{
"entropy": 0.583770751953125,
"epoch": 0.023809523809523808,
"grad_norm": 5.287171943303066,
"learning_rate": 2.631578947368421e-06,
"loss": 1.3562,
"mean_token_accuracy": 0.6605429640039802,
"num_tokens": 2560005.0,
"step": 6
},
{
"entropy": 0.5577392578125,
"epoch": 0.027777777777777776,
"grad_norm": 4.925117375179032,
"learning_rate": 3.157894736842105e-06,
"loss": 1.3458,
"mean_token_accuracy": 0.6580116618424654,
"num_tokens": 3004121.0,
"step": 7
},
{
"entropy": 0.57269287109375,
"epoch": 0.031746031746031744,
"grad_norm": 4.510966194233729,
"learning_rate": 3.6842105263157896e-06,
"loss": 1.303,
"mean_token_accuracy": 0.6702660601586103,
"num_tokens": 3457966.0,
"step": 8
},
{
"entropy": 0.56524658203125,
"epoch": 0.03571428571428571,
"grad_norm": 4.257257337401794,
"learning_rate": 4.210526315789474e-06,
"loss": 1.2854,
"mean_token_accuracy": 0.6731634242460132,
"num_tokens": 3902759.0,
"step": 9
},
{
"entropy": 0.585052490234375,
"epoch": 0.03968253968253968,
"grad_norm": 3.560753211544556,
"learning_rate": 4.736842105263158e-06,
"loss": 1.1912,
"mean_token_accuracy": 0.6834962824359536,
"num_tokens": 4321827.0,
"step": 10
},
{
"entropy": 0.56036376953125,
"epoch": 0.04365079365079365,
"grad_norm": 3.440982638815655,
"learning_rate": 5.263157894736842e-06,
"loss": 1.129,
"mean_token_accuracy": 0.7041243137791753,
"num_tokens": 4748195.0,
"step": 11
},
{
"entropy": 0.5570068359375,
"epoch": 0.047619047619047616,
"grad_norm": 3.196606172568719,
"learning_rate": 5.789473684210527e-06,
"loss": 1.128,
"mean_token_accuracy": 0.7001799792051315,
"num_tokens": 5188122.0,
"step": 12
},
{
"entropy": 0.53070068359375,
"epoch": 0.051587301587301584,
"grad_norm": 4.635446866713048,
"learning_rate": 6.31578947368421e-06,
"loss": 1.0401,
"mean_token_accuracy": 0.7150256410241127,
"num_tokens": 5615040.0,
"step": 13
},
{
"entropy": 0.53387451171875,
"epoch": 0.05555555555555555,
"grad_norm": 4.895076624528778,
"learning_rate": 6.842105263157896e-06,
"loss": 1.029,
"mean_token_accuracy": 0.7148290146142244,
"num_tokens": 6042413.0,
"step": 14
},
{
"entropy": 0.546905517578125,
"epoch": 0.05952380952380952,
"grad_norm": 3.838947346620084,
"learning_rate": 7.368421052631579e-06,
"loss": 0.9875,
"mean_token_accuracy": 0.725364712998271,
"num_tokens": 6468019.0,
"step": 15
},
{
"entropy": 0.54766845703125,
"epoch": 0.06349206349206349,
"grad_norm": 3.52284752091451,
"learning_rate": 7.894736842105265e-06,
"loss": 0.9743,
"mean_token_accuracy": 0.7267373474314809,
"num_tokens": 6898441.0,
"step": 16
},
{
"entropy": 0.5467529296875,
"epoch": 0.06746031746031746,
"grad_norm": 3.0694966842969307,
"learning_rate": 8.421052631578948e-06,
"loss": 0.918,
"mean_token_accuracy": 0.7363277673721313,
"num_tokens": 7333054.0,
"step": 17
},
{
"entropy": 0.5263671875,
"epoch": 0.07142857142857142,
"grad_norm": 3.842330950557049,
"learning_rate": 8.947368421052632e-06,
"loss": 0.9026,
"mean_token_accuracy": 0.7431545937433839,
"num_tokens": 7794638.0,
"step": 18
},
{
"entropy": 0.5352783203125,
"epoch": 0.07539682539682539,
"grad_norm": 3.361580339127847,
"learning_rate": 9.473684210526315e-06,
"loss": 0.9166,
"mean_token_accuracy": 0.7366319699212909,
"num_tokens": 8237624.0,
"step": 19
},
{
"entropy": 0.5377197265625,
"epoch": 0.07936507936507936,
"grad_norm": 2.6450589328361254,
"learning_rate": 1e-05,
"loss": 0.8844,
"mean_token_accuracy": 0.7442264417186379,
"num_tokens": 8673402.0,
"step": 20
},
{
"entropy": 0.5318603515625,
"epoch": 0.08333333333333333,
"grad_norm": 2.5182823829653525,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.8645,
"mean_token_accuracy": 0.74986263923347,
"num_tokens": 9121387.0,
"step": 21
},
{
"entropy": 0.547149658203125,
"epoch": 0.0873015873015873,
"grad_norm": 2.169285955075468,
"learning_rate": 1.105263157894737e-05,
"loss": 0.8049,
"mean_token_accuracy": 0.7641561925411224,
"num_tokens": 9525436.0,
"step": 22
},
{
"entropy": 0.54248046875,
"epoch": 0.09126984126984126,
"grad_norm": 2.2658426207555955,
"learning_rate": 1.1578947368421053e-05,
"loss": 0.8038,
"mean_token_accuracy": 0.7617505192756653,
"num_tokens": 9932011.0,
"step": 23
},
{
"entropy": 0.53009033203125,
"epoch": 0.09523809523809523,
"grad_norm": 2.082747079461424,
"learning_rate": 1.2105263157894737e-05,
"loss": 0.797,
"mean_token_accuracy": 0.7638106672093272,
"num_tokens": 10358777.0,
"step": 24
},
{
"entropy": 0.535430908203125,
"epoch": 0.0992063492063492,
"grad_norm": 2.1722194195956828,
"learning_rate": 1.263157894736842e-05,
"loss": 0.7812,
"mean_token_accuracy": 0.7707258444279432,
"num_tokens": 10773051.0,
"step": 25
},
{
"entropy": 0.521759033203125,
"epoch": 0.10317460317460317,
"grad_norm": 2.2209577423566316,
"learning_rate": 1.3157894736842108e-05,
"loss": 0.7645,
"mean_token_accuracy": 0.7706983601674438,
"num_tokens": 11211677.0,
"step": 26
},
{
"entropy": 0.52227783203125,
"epoch": 0.10714285714285714,
"grad_norm": 1.8746595697038755,
"learning_rate": 1.3684210526315791e-05,
"loss": 0.7243,
"mean_token_accuracy": 0.7795716691762209,
"num_tokens": 11628779.0,
"step": 27
},
{
"entropy": 0.512603759765625,
"epoch": 0.1111111111111111,
"grad_norm": 1.9181519214362959,
"learning_rate": 1.4210526315789475e-05,
"loss": 0.7589,
"mean_token_accuracy": 0.7727855974808335,
"num_tokens": 12067363.0,
"step": 28
},
{
"entropy": 0.499237060546875,
"epoch": 0.11507936507936507,
"grad_norm": 1.8295943705658484,
"learning_rate": 1.4736842105263159e-05,
"loss": 0.7531,
"mean_token_accuracy": 0.7741835163906217,
"num_tokens": 12507159.0,
"step": 29
},
{
"entropy": 0.499603271484375,
"epoch": 0.11904761904761904,
"grad_norm": 1.6179644350101932,
"learning_rate": 1.5263157894736846e-05,
"loss": 0.7344,
"mean_token_accuracy": 0.7773478422313929,
"num_tokens": 12945458.0,
"step": 30
},
{
"entropy": 0.51458740234375,
"epoch": 0.12301587301587301,
"grad_norm": 1.8610284952179306,
"learning_rate": 1.578947368421053e-05,
"loss": 0.7118,
"mean_token_accuracy": 0.7809475539252162,
"num_tokens": 13370514.0,
"step": 31
},
{
"entropy": 0.497283935546875,
"epoch": 0.12698412698412698,
"grad_norm": 1.8670029587267238,
"learning_rate": 1.6315789473684213e-05,
"loss": 0.7114,
"mean_token_accuracy": 0.7820066763088107,
"num_tokens": 13815066.0,
"step": 32
},
{
"entropy": 0.503814697265625,
"epoch": 0.13095238095238096,
"grad_norm": 1.7232235737579489,
"learning_rate": 1.6842105263157896e-05,
"loss": 0.7034,
"mean_token_accuracy": 0.7832089820876718,
"num_tokens": 14240312.0,
"step": 33
},
{
"entropy": 0.49395751953125,
"epoch": 0.1349206349206349,
"grad_norm": 1.5626721336124045,
"learning_rate": 1.736842105263158e-05,
"loss": 0.6996,
"mean_token_accuracy": 0.785805162973702,
"num_tokens": 14685173.0,
"step": 34
},
{
"entropy": 0.499114990234375,
"epoch": 0.1388888888888889,
"grad_norm": 1.5550361833914306,
"learning_rate": 1.7894736842105264e-05,
"loss": 0.6808,
"mean_token_accuracy": 0.7907448643818498,
"num_tokens": 15099914.0,
"step": 35
},
{
"entropy": 0.494049072265625,
"epoch": 0.14285714285714285,
"grad_norm": 1.6976994127074492,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.6869,
"mean_token_accuracy": 0.7874529659748077,
"num_tokens": 15522062.0,
"step": 36
},
{
"entropy": 0.49200439453125,
"epoch": 0.14682539682539683,
"grad_norm": 1.6731263798294116,
"learning_rate": 1.894736842105263e-05,
"loss": 0.6716,
"mean_token_accuracy": 0.7911530267447233,
"num_tokens": 15955138.0,
"step": 37
},
{
"entropy": 0.494598388671875,
"epoch": 0.15079365079365079,
"grad_norm": 1.6340216417965139,
"learning_rate": 1.9473684210526318e-05,
"loss": 0.6583,
"mean_token_accuracy": 0.7931346474215388,
"num_tokens": 16388252.0,
"step": 38
},
{
"entropy": 0.4937744140625,
"epoch": 0.15476190476190477,
"grad_norm": 1.5462189800304016,
"learning_rate": 2e-05,
"loss": 0.6748,
"mean_token_accuracy": 0.7880920702591538,
"num_tokens": 16821287.0,
"step": 39
},
{
"entropy": 0.4898681640625,
"epoch": 0.15873015873015872,
"grad_norm": 1.5616598366947274,
"learning_rate": 1.999990427614762e-05,
"loss": 0.6311,
"mean_token_accuracy": 0.7991781169548631,
"num_tokens": 17235205.0,
"step": 40
},
{
"entropy": 0.48675537109375,
"epoch": 0.1626984126984127,
"grad_norm": 1.646272088858081,
"learning_rate": 1.999961710642308e-05,
"loss": 0.65,
"mean_token_accuracy": 0.7964185178279877,
"num_tokens": 17657156.0,
"step": 41
},
{
"entropy": 0.4864501953125,
"epoch": 0.16666666666666666,
"grad_norm": 1.7240469107631835,
"learning_rate": 1.999913849632419e-05,
"loss": 0.6519,
"mean_token_accuracy": 0.7951665250584483,
"num_tokens": 18090069.0,
"step": 42
},
{
"entropy": 0.480987548828125,
"epoch": 0.17063492063492064,
"grad_norm": 1.6378737913794217,
"learning_rate": 1.9998468455013825e-05,
"loss": 0.6596,
"mean_token_accuracy": 0.7942898478358984,
"num_tokens": 18542578.0,
"step": 43
},
{
"entropy": 0.494598388671875,
"epoch": 0.1746031746031746,
"grad_norm": 1.5436306701146438,
"learning_rate": 1.999760699531977e-05,
"loss": 0.6298,
"mean_token_accuracy": 0.8005202021449804,
"num_tokens": 18955962.0,
"step": 44
},
{
"entropy": 0.49298095703125,
"epoch": 0.17857142857142858,
"grad_norm": 1.4015532613832098,
"learning_rate": 1.9996554133734473e-05,
"loss": 0.6231,
"mean_token_accuracy": 0.8032774887979031,
"num_tokens": 19379353.0,
"step": 45
},
{
"entropy": 0.49383544921875,
"epoch": 0.18253968253968253,
"grad_norm": 1.3858501151489995,
"learning_rate": 1.9995309890414735e-05,
"loss": 0.6116,
"mean_token_accuracy": 0.8062875410541892,
"num_tokens": 19812261.0,
"step": 46
},
{
"entropy": 0.5113525390625,
"epoch": 0.1865079365079365,
"grad_norm": 1.4607281864296495,
"learning_rate": 1.99938742891813e-05,
"loss": 0.6135,
"mean_token_accuracy": 0.8027496039867401,
"num_tokens": 20210235.0,
"step": 47
},
{
"entropy": 0.4896240234375,
"epoch": 0.19047619047619047,
"grad_norm": 1.532717597565806,
"learning_rate": 1.9992247357518428e-05,
"loss": 0.619,
"mean_token_accuracy": 0.8026178050786257,
"num_tokens": 20647709.0,
"step": 48
},
{
"entropy": 0.491119384765625,
"epoch": 0.19444444444444445,
"grad_norm": 1.4410103415538014,
"learning_rate": 1.9990429126573353e-05,
"loss": 0.6121,
"mean_token_accuracy": 0.8036608071997762,
"num_tokens": 21065897.0,
"step": 49
},
{
"entropy": 0.499908447265625,
"epoch": 0.1984126984126984,
"grad_norm": 1.563783097879555,
"learning_rate": 1.9988419631155686e-05,
"loss": 0.6192,
"mean_token_accuracy": 0.8009361904114485,
"num_tokens": 21486371.0,
"step": 50
},
{
"entropy": 0.50067138671875,
"epoch": 0.20238095238095238,
"grad_norm": 1.725707634668624,
"learning_rate": 1.9986218909736758e-05,
"loss": 0.6139,
"mean_token_accuracy": 0.8020936474204063,
"num_tokens": 21909309.0,
"step": 51
},
{
"entropy": 0.504180908203125,
"epoch": 0.20634920634920634,
"grad_norm": 1.3228518418004556,
"learning_rate": 1.9983827004448875e-05,
"loss": 0.6003,
"mean_token_accuracy": 0.8073256686329842,
"num_tokens": 22318414.0,
"step": 52
},
{
"entropy": 0.500946044921875,
"epoch": 0.21031746031746032,
"grad_norm": 1.720867937204593,
"learning_rate": 1.9981243961084516e-05,
"loss": 0.5856,
"mean_token_accuracy": 0.8099770434200764,
"num_tokens": 22719471.0,
"step": 53
},
{
"entropy": 0.49346923828125,
"epoch": 0.21428571428571427,
"grad_norm": 1.3398077758817923,
"learning_rate": 1.997846982909545e-05,
"loss": 0.591,
"mean_token_accuracy": 0.8068837188184261,
"num_tokens": 23134604.0,
"step": 54
},
{
"entropy": 0.480987548828125,
"epoch": 0.21825396825396826,
"grad_norm": 1.4838615278854144,
"learning_rate": 1.99755046615918e-05,
"loss": 0.5942,
"mean_token_accuracy": 0.8071342799812555,
"num_tokens": 23571548.0,
"step": 55
},
{
"entropy": 0.4854736328125,
"epoch": 0.2222222222222222,
"grad_norm": 1.4492711420273152,
"learning_rate": 1.9972348515341018e-05,
"loss": 0.6042,
"mean_token_accuracy": 0.8067643223330379,
"num_tokens": 24009770.0,
"step": 56
},
{
"entropy": 0.484954833984375,
"epoch": 0.2261904761904762,
"grad_norm": 1.256987618745555,
"learning_rate": 1.9969001450766795e-05,
"loss": 0.6043,
"mean_token_accuracy": 0.805039519444108,
"num_tokens": 24447544.0,
"step": 57
},
{
"entropy": 0.495086669921875,
"epoch": 0.23015873015873015,
"grad_norm": 1.3407462818472953,
"learning_rate": 1.996546353194792e-05,
"loss": 0.5971,
"mean_token_accuracy": 0.8080601003021002,
"num_tokens": 24869806.0,
"step": 58
},
{
"entropy": 0.473907470703125,
"epoch": 0.23412698412698413,
"grad_norm": 1.3879658160131176,
"learning_rate": 1.9961734826617033e-05,
"loss": 0.6012,
"mean_token_accuracy": 0.8073396803811193,
"num_tokens": 25337885.0,
"step": 59
},
{
"entropy": 0.47698974609375,
"epoch": 0.23809523809523808,
"grad_norm": 1.2797357075630325,
"learning_rate": 1.9957815406159344e-05,
"loss": 0.586,
"mean_token_accuracy": 0.8098131148144603,
"num_tokens": 25762999.0,
"step": 60
},
{
"entropy": 0.480926513671875,
"epoch": 0.24206349206349206,
"grad_norm": 1.492286782191842,
"learning_rate": 1.995370534561125e-05,
"loss": 0.5819,
"mean_token_accuracy": 0.8118512397632003,
"num_tokens": 26204122.0,
"step": 61
},
{
"entropy": 0.46820068359375,
"epoch": 0.24603174603174602,
"grad_norm": 1.2844128245812518,
"learning_rate": 1.994940472365893e-05,
"loss": 0.567,
"mean_token_accuracy": 0.8152421358972788,
"num_tokens": 26651412.0,
"step": 62
},
{
"entropy": 0.481475830078125,
"epoch": 0.25,
"grad_norm": 1.1956700400619351,
"learning_rate": 1.9944913622636798e-05,
"loss": 0.5794,
"mean_token_accuracy": 0.8128518350422382,
"num_tokens": 27080137.0,
"step": 63
},
{
"entropy": 0.470306396484375,
"epoch": 0.25396825396825395,
"grad_norm": 1.4972705474023489,
"learning_rate": 1.994023212852595e-05,
"loss": 0.5791,
"mean_token_accuracy": 0.8129686992615461,
"num_tokens": 27520069.0,
"step": 64
},
{
"entropy": 0.47222900390625,
"epoch": 0.25793650793650796,
"grad_norm": 1.4814711931573392,
"learning_rate": 1.993536033095252e-05,
"loss": 0.5814,
"mean_token_accuracy": 0.8099355883896351,
"num_tokens": 27954154.0,
"step": 65
},
{
"entropy": 0.48028564453125,
"epoch": 0.2619047619047619,
"grad_norm": 1.546117863417093,
"learning_rate": 1.9930298323185945e-05,
"loss": 0.5858,
"mean_token_accuracy": 0.809964569285512,
"num_tokens": 28367541.0,
"step": 66
},
{
"entropy": 0.4788818359375,
"epoch": 0.26587301587301587,
"grad_norm": 1.3505593123937785,
"learning_rate": 1.9925046202137215e-05,
"loss": 0.5555,
"mean_token_accuracy": 0.815067121759057,
"num_tokens": 28779140.0,
"step": 67
},
{
"entropy": 0.4736328125,
"epoch": 0.2698412698412698,
"grad_norm": 1.5267113437685296,
"learning_rate": 1.9919604068356978e-05,
"loss": 0.5792,
"mean_token_accuracy": 0.8099933639168739,
"num_tokens": 29217570.0,
"step": 68
},
{
"entropy": 0.4664306640625,
"epoch": 0.27380952380952384,
"grad_norm": 1.2597555449754034,
"learning_rate": 1.991397202603363e-05,
"loss": 0.5556,
"mean_token_accuracy": 0.8194932043552399,
"num_tokens": 29656943.0,
"step": 69
},
{
"entropy": 0.46923828125,
"epoch": 0.2777777777777778,
"grad_norm": 1.4010912673765619,
"learning_rate": 1.9908150182991338e-05,
"loss": 0.5801,
"mean_token_accuracy": 0.8125716717913747,
"num_tokens": 30088869.0,
"step": 70
},
{
"entropy": 0.460540771484375,
"epoch": 0.28174603174603174,
"grad_norm": 1.10011210824897,
"learning_rate": 1.9902138650687943e-05,
"loss": 0.5787,
"mean_token_accuracy": 0.8111176574602723,
"num_tokens": 30550317.0,
"step": 71
},
{
"entropy": 0.466552734375,
"epoch": 0.2857142857142857,
"grad_norm": 1.4150609712955267,
"learning_rate": 1.9895937544212856e-05,
"loss": 0.5603,
"mean_token_accuracy": 0.816674031317234,
"num_tokens": 30959821.0,
"step": 72
},
{
"entropy": 0.462890625,
"epoch": 0.2896825396825397,
"grad_norm": 1.2951602254642691,
"learning_rate": 1.9889546982284833e-05,
"loss": 0.58,
"mean_token_accuracy": 0.8122155498713255,
"num_tokens": 31412639.0,
"step": 73
},
{
"entropy": 0.47247314453125,
"epoch": 0.29365079365079366,
"grad_norm": 1.4230925187441965,
"learning_rate": 1.988296708724972e-05,
"loss": 0.5739,
"mean_token_accuracy": 0.8140842840075493,
"num_tokens": 31830767.0,
"step": 74
},
{
"entropy": 0.468292236328125,
"epoch": 0.2976190476190476,
"grad_norm": 1.346368649645527,
"learning_rate": 1.987619798507809e-05,
"loss": 0.5668,
"mean_token_accuracy": 0.8116888217628002,
"num_tokens": 32267329.0,
"step": 75
},
{
"entropy": 0.470458984375,
"epoch": 0.30158730158730157,
"grad_norm": 1.2828384645809423,
"learning_rate": 1.986923980536286e-05,
"loss": 0.5668,
"mean_token_accuracy": 0.8176631266251206,
"num_tokens": 32692726.0,
"step": 76
},
{
"entropy": 0.47137451171875,
"epoch": 0.3055555555555556,
"grad_norm": 1.2453331184538057,
"learning_rate": 1.9862092681316774e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8217762364074588,
"num_tokens": 33108936.0,
"step": 77
},
{
"entropy": 0.453704833984375,
"epoch": 0.30952380952380953,
"grad_norm": 1.138281360826344,
"learning_rate": 1.9854756749769893e-05,
"loss": 0.5451,
"mean_token_accuracy": 0.8192528188228607,
"num_tokens": 33543076.0,
"step": 78
},
{
"entropy": 0.455780029296875,
"epoch": 0.3134920634920635,
"grad_norm": 1.3113762371169908,
"learning_rate": 1.984723215116693e-05,
"loss": 0.559,
"mean_token_accuracy": 0.8173351967707276,
"num_tokens": 33980897.0,
"step": 79
},
{
"entropy": 0.461700439453125,
"epoch": 0.31746031746031744,
"grad_norm": 1.231273184699265,
"learning_rate": 1.9839519029564608e-05,
"loss": 0.5545,
"mean_token_accuracy": 0.8189000273123384,
"num_tokens": 34404352.0,
"step": 80
},
{
"entropy": 0.476165771484375,
"epoch": 0.32142857142857145,
"grad_norm": 1.3255041013441944,
"learning_rate": 1.983161753262886e-05,
"loss": 0.5591,
"mean_token_accuracy": 0.8155939728021622,
"num_tokens": 34815122.0,
"step": 81
},
{
"entropy": 0.479095458984375,
"epoch": 0.3253968253968254,
"grad_norm": 1.144243415774409,
"learning_rate": 1.982352781163204e-05,
"loss": 0.5569,
"mean_token_accuracy": 0.8167071230709553,
"num_tokens": 35236933.0,
"step": 82
},
{
"entropy": 0.468017578125,
"epoch": 0.32936507936507936,
"grad_norm": 1.3470659575750075,
"learning_rate": 1.9815250021449998e-05,
"loss": 0.5557,
"mean_token_accuracy": 0.8185685835778713,
"num_tokens": 35664506.0,
"step": 83
},
{
"entropy": 0.46673583984375,
"epoch": 0.3333333333333333,
"grad_norm": 1.2993064977541513,
"learning_rate": 1.980678432055913e-05,
"loss": 0.555,
"mean_token_accuracy": 0.8149783732369542,
"num_tokens": 36088050.0,
"step": 84
},
{
"entropy": 0.472625732421875,
"epoch": 0.3373015873015873,
"grad_norm": 1.260442517304779,
"learning_rate": 1.9798130871033322e-05,
"loss": 0.5511,
"mean_token_accuracy": 0.818051096983254,
"num_tokens": 36501235.0,
"step": 85
},
{
"entropy": 0.455963134765625,
"epoch": 0.3412698412698413,
"grad_norm": 1.2542494850010633,
"learning_rate": 1.9789289838540897e-05,
"loss": 0.554,
"mean_token_accuracy": 0.8185935420915484,
"num_tokens": 36942407.0,
"step": 86
},
{
"entropy": 0.456634521484375,
"epoch": 0.34523809523809523,
"grad_norm": 1.3539802412376234,
"learning_rate": 1.9780261392341383e-05,
"loss": 0.5516,
"mean_token_accuracy": 0.8191157821565866,
"num_tokens": 37381260.0,
"step": 87
},
{
"entropy": 0.4647216796875,
"epoch": 0.3492063492063492,
"grad_norm": 1.2106246481197491,
"learning_rate": 1.9771045705282313e-05,
"loss": 0.5564,
"mean_token_accuracy": 0.8167914487421513,
"num_tokens": 37803882.0,
"step": 88
},
{
"entropy": 0.4688720703125,
"epoch": 0.3531746031746032,
"grad_norm": 1.4482065495179155,
"learning_rate": 1.9761642953795896e-05,
"loss": 0.549,
"mean_token_accuracy": 0.8173990342766047,
"num_tokens": 38227635.0,
"step": 89
},
{
"entropy": 0.464630126953125,
"epoch": 0.35714285714285715,
"grad_norm": 1.483279776234404,
"learning_rate": 1.975205331789566e-05,
"loss": 0.5646,
"mean_token_accuracy": 0.8164498396217823,
"num_tokens": 38667329.0,
"step": 90
},
{
"entropy": 0.4603271484375,
"epoch": 0.3611111111111111,
"grad_norm": 1.1991682622509072,
"learning_rate": 1.9742276981172978e-05,
"loss": 0.5558,
"mean_token_accuracy": 0.816204615868628,
"num_tokens": 39107330.0,
"step": 91
},
{
"entropy": 0.464508056640625,
"epoch": 0.36507936507936506,
"grad_norm": 1.0687403883123787,
"learning_rate": 1.973231413079357e-05,
"loss": 0.5331,
"mean_token_accuracy": 0.8214370720088482,
"num_tokens": 39524166.0,
"step": 92
},
{
"entropy": 0.4556884765625,
"epoch": 0.36904761904761907,
"grad_norm": 1.212069109316228,
"learning_rate": 1.9722164957493925e-05,
"loss": 0.5358,
"mean_token_accuracy": 0.823941863141954,
"num_tokens": 39952174.0,
"step": 93
},
{
"entropy": 0.45452880859375,
"epoch": 0.373015873015873,
"grad_norm": 1.1912124705725184,
"learning_rate": 1.971182965557763e-05,
"loss": 0.5462,
"mean_token_accuracy": 0.8201974583789706,
"num_tokens": 40389693.0,
"step": 94
},
{
"entropy": 0.45831298828125,
"epoch": 0.376984126984127,
"grad_norm": 1.097985862600317,
"learning_rate": 1.9701308422911674e-05,
"loss": 0.5417,
"mean_token_accuracy": 0.8212789446115494,
"num_tokens": 40815730.0,
"step": 95
},
{
"entropy": 0.472564697265625,
"epoch": 0.38095238095238093,
"grad_norm": 1.0506974161366276,
"learning_rate": 1.969060146092264e-05,
"loss": 0.5464,
"mean_token_accuracy": 0.821564057841897,
"num_tokens": 41231841.0,
"step": 96
},
{
"entropy": 0.477783203125,
"epoch": 0.38492063492063494,
"grad_norm": 1.052268920405974,
"learning_rate": 1.967970897459286e-05,
"loss": 0.5561,
"mean_token_accuracy": 0.8177454238757491,
"num_tokens": 41650119.0,
"step": 97
},
{
"entropy": 0.462188720703125,
"epoch": 0.3888888888888889,
"grad_norm": 1.07343802824294,
"learning_rate": 1.966863117245648e-05,
"loss": 0.5351,
"mean_token_accuracy": 0.8210580609738827,
"num_tokens": 42082897.0,
"step": 98
},
{
"entropy": 0.47021484375,
"epoch": 0.39285714285714285,
"grad_norm": 0.9505719077699497,
"learning_rate": 1.9657368266595477e-05,
"loss": 0.5309,
"mean_token_accuracy": 0.8261024495586753,
"num_tokens": 42519361.0,
"step": 99
},
{
"entropy": 0.468353271484375,
"epoch": 0.3968253968253968,
"grad_norm": 1.118827385837718,
"learning_rate": 1.964592047263561e-05,
"loss": 0.5289,
"mean_token_accuracy": 0.8250956162810326,
"num_tokens": 42941458.0,
"step": 100
},
{
"entropy": 0.459075927734375,
"epoch": 0.4007936507936508,
"grad_norm": 1.0805587142732163,
"learning_rate": 1.9634288009742254e-05,
"loss": 0.5345,
"mean_token_accuracy": 0.822862328030169,
"num_tokens": 43370971.0,
"step": 101
},
{
"entropy": 0.461395263671875,
"epoch": 0.40476190476190477,
"grad_norm": 1.11495068252665,
"learning_rate": 1.9622471100616253e-05,
"loss": 0.5358,
"mean_token_accuracy": 0.8217748673632741,
"num_tokens": 43801380.0,
"step": 102
},
{
"entropy": 0.45855712890625,
"epoch": 0.4087301587301587,
"grad_norm": 1.140917585323956,
"learning_rate": 1.961046997148961e-05,
"loss": 0.5482,
"mean_token_accuracy": 0.8196941930800676,
"num_tokens": 44245066.0,
"step": 103
},
{
"entropy": 0.47332763671875,
"epoch": 0.4126984126984127,
"grad_norm": 1.2030195624026125,
"learning_rate": 1.959828485212119e-05,
"loss": 0.546,
"mean_token_accuracy": 0.8202573778107762,
"num_tokens": 44671335.0,
"step": 104
},
{
"entropy": 0.48388671875,
"epoch": 0.4166666666666667,
"grad_norm": 1.2332127945633589,
"learning_rate": 1.958591597579231e-05,
"loss": 0.5427,
"mean_token_accuracy": 0.8179315272718668,
"num_tokens": 45066930.0,
"step": 105
},
{
"entropy": 0.463653564453125,
"epoch": 0.42063492063492064,
"grad_norm": 1.1113355845745616,
"learning_rate": 1.957336357930227e-05,
"loss": 0.5331,
"mean_token_accuracy": 0.8212415920570493,
"num_tokens": 45508166.0,
"step": 106
},
{
"entropy": 0.458984375,
"epoch": 0.4246031746031746,
"grad_norm": 1.0863705725170332,
"learning_rate": 1.9560627902963808e-05,
"loss": 0.5484,
"mean_token_accuracy": 0.8195863580331206,
"num_tokens": 45960298.0,
"step": 107
},
{
"entropy": 0.46014404296875,
"epoch": 0.42857142857142855,
"grad_norm": 1.0877758605686885,
"learning_rate": 1.9547709190598538e-05,
"loss": 0.5359,
"mean_token_accuracy": 0.8214817009866238,
"num_tokens": 46398974.0,
"step": 108
},
{
"entropy": 0.451568603515625,
"epoch": 0.43253968253968256,
"grad_norm": 1.0427249445385431,
"learning_rate": 1.9534607689532236e-05,
"loss": 0.5438,
"mean_token_accuracy": 0.8201778931543231,
"num_tokens": 46837899.0,
"step": 109
},
{
"entropy": 0.451446533203125,
"epoch": 0.4365079365079365,
"grad_norm": 1.175695289729488,
"learning_rate": 1.9521323650590135e-05,
"loss": 0.5498,
"mean_token_accuracy": 0.8169540381059051,
"num_tokens": 47311746.0,
"step": 110
},
{
"entropy": 0.457611083984375,
"epoch": 0.44047619047619047,
"grad_norm": 1.1077962748274401,
"learning_rate": 1.950785732809211e-05,
"loss": 0.5247,
"mean_token_accuracy": 0.8267239183187485,
"num_tokens": 47744340.0,
"step": 111
},
{
"entropy": 0.45452880859375,
"epoch": 0.4444444444444444,
"grad_norm": 0.9399177463048327,
"learning_rate": 1.9494208979847814e-05,
"loss": 0.5204,
"mean_token_accuracy": 0.826555940322578,
"num_tokens": 48177761.0,
"step": 112
},
{
"entropy": 0.464630126953125,
"epoch": 0.44841269841269843,
"grad_norm": 1.1098464998441184,
"learning_rate": 1.9480378867151746e-05,
"loss": 0.5415,
"mean_token_accuracy": 0.8196114804595709,
"num_tokens": 48604700.0,
"step": 113
},
{
"entropy": 0.464874267578125,
"epoch": 0.4523809523809524,
"grad_norm": 1.0854538207166011,
"learning_rate": 1.9466367254778234e-05,
"loss": 0.5224,
"mean_token_accuracy": 0.8268638197332621,
"num_tokens": 49026375.0,
"step": 114
},
{
"entropy": 0.449798583984375,
"epoch": 0.45634920634920634,
"grad_norm": 1.17364696539316,
"learning_rate": 1.9452174410976383e-05,
"loss": 0.5367,
"mean_token_accuracy": 0.8208001255989075,
"num_tokens": 49467267.0,
"step": 115
},
{
"entropy": 0.461151123046875,
"epoch": 0.4603174603174603,
"grad_norm": 0.9873427369389342,
"learning_rate": 1.943780060746493e-05,
"loss": 0.5353,
"mean_token_accuracy": 0.8211417645215988,
"num_tokens": 49889163.0,
"step": 116
},
{
"entropy": 0.462799072265625,
"epoch": 0.4642857142857143,
"grad_norm": 1.0735045578812574,
"learning_rate": 1.9423246119427044e-05,
"loss": 0.5143,
"mean_token_accuracy": 0.8246986707672477,
"num_tokens": 50300460.0,
"step": 117
},
{
"entropy": 0.454498291015625,
"epoch": 0.46825396825396826,
"grad_norm": 1.2028864034535025,
"learning_rate": 1.940851122550506e-05,
"loss": 0.536,
"mean_token_accuracy": 0.8228262066841125,
"num_tokens": 50735361.0,
"step": 118
},
{
"entropy": 0.458526611328125,
"epoch": 0.4722222222222222,
"grad_norm": 1.0533705874526058,
"learning_rate": 1.9393596207795135e-05,
"loss": 0.5221,
"mean_token_accuracy": 0.8254814920946956,
"num_tokens": 51149122.0,
"step": 119
},
{
"entropy": 0.453338623046875,
"epoch": 0.47619047619047616,
"grad_norm": 1.0854374369386737,
"learning_rate": 1.9378501351841864e-05,
"loss": 0.5267,
"mean_token_accuracy": 0.8232422964647412,
"num_tokens": 51597577.0,
"step": 120
},
{
"entropy": 0.442718505859375,
"epoch": 0.4801587301587302,
"grad_norm": 0.9631721841884465,
"learning_rate": 1.93632269466328e-05,
"loss": 0.53,
"mean_token_accuracy": 0.8238179190084338,
"num_tokens": 52043666.0,
"step": 121
},
{
"entropy": 0.4471435546875,
"epoch": 0.48412698412698413,
"grad_norm": 1.0190972188396954,
"learning_rate": 1.934777328459292e-05,
"loss": 0.5315,
"mean_token_accuracy": 0.8235808834433556,
"num_tokens": 52482382.0,
"step": 122
},
{
"entropy": 0.45355224609375,
"epoch": 0.4880952380952381,
"grad_norm": 0.9437542356299639,
"learning_rate": 1.933214066157904e-05,
"loss": 0.5263,
"mean_token_accuracy": 0.8229744052514434,
"num_tokens": 52931115.0,
"step": 123
},
{
"entropy": 0.451507568359375,
"epoch": 0.49206349206349204,
"grad_norm": 1.0834515748734725,
"learning_rate": 1.9316329376874146e-05,
"loss": 0.53,
"mean_token_accuracy": 0.8217291543260217,
"num_tokens": 53370750.0,
"step": 124
},
{
"entropy": 0.45770263671875,
"epoch": 0.49603174603174605,
"grad_norm": 1.0001575216738139,
"learning_rate": 1.930033973318164e-05,
"loss": 0.5209,
"mean_token_accuracy": 0.8262900058180094,
"num_tokens": 53798951.0,
"step": 125
},
{
"entropy": 0.446929931640625,
"epoch": 0.5,
"grad_norm": 0.984702450663036,
"learning_rate": 1.9284172036619597e-05,
"loss": 0.5122,
"mean_token_accuracy": 0.8294054577127099,
"num_tokens": 54235189.0,
"step": 126
},
{
"entropy": 0.447174072265625,
"epoch": 0.503968253968254,
"grad_norm": 0.9634718133480947,
"learning_rate": 1.926782659671484e-05,
"loss": 0.5118,
"mean_token_accuracy": 0.8281444823369384,
"num_tokens": 54642925.0,
"step": 127
},
{
"entropy": 0.452545166015625,
"epoch": 0.5079365079365079,
"grad_norm": 0.9876659774467254,
"learning_rate": 1.9251303726397076e-05,
"loss": 0.5216,
"mean_token_accuracy": 0.8251809384673834,
"num_tokens": 55066936.0,
"step": 128
},
{
"entropy": 0.4573974609375,
"epoch": 0.5119047619047619,
"grad_norm": 0.9632797656676275,
"learning_rate": 1.9234603741992864e-05,
"loss": 0.5165,
"mean_token_accuracy": 0.8273195894435048,
"num_tokens": 55484500.0,
"step": 129
},
{
"entropy": 0.44158935546875,
"epoch": 0.5158730158730159,
"grad_norm": 0.9175209493730221,
"learning_rate": 1.9217726963219567e-05,
"loss": 0.5209,
"mean_token_accuracy": 0.8260001242160797,
"num_tokens": 55922405.0,
"step": 130
},
{
"entropy": 0.4527587890625,
"epoch": 0.5198412698412699,
"grad_norm": 1.062087131423626,
"learning_rate": 1.9200673713179245e-05,
"loss": 0.5299,
"mean_token_accuracy": 0.8199161011725664,
"num_tokens": 56355834.0,
"step": 131
},
{
"entropy": 0.452056884765625,
"epoch": 0.5238095238095238,
"grad_norm": 0.964250067282172,
"learning_rate": 1.9183444318352458e-05,
"loss": 0.509,
"mean_token_accuracy": 0.8293298603966832,
"num_tokens": 56770275.0,
"step": 132
},
{
"entropy": 0.44903564453125,
"epoch": 0.5277777777777778,
"grad_norm": 0.9853678148235807,
"learning_rate": 1.9166039108592008e-05,
"loss": 0.5289,
"mean_token_accuracy": 0.8244264824315906,
"num_tokens": 57203160.0,
"step": 133
},
{
"entropy": 0.44952392578125,
"epoch": 0.5317460317460317,
"grad_norm": 1.0255963315554393,
"learning_rate": 1.9148458417116645e-05,
"loss": 0.5217,
"mean_token_accuracy": 0.8221221547573805,
"num_tokens": 57627870.0,
"step": 134
},
{
"entropy": 0.44769287109375,
"epoch": 0.5357142857142857,
"grad_norm": 1.048457109780583,
"learning_rate": 1.9130702580504678e-05,
"loss": 0.5239,
"mean_token_accuracy": 0.8261872082948685,
"num_tokens": 58059792.0,
"step": 135
},
{
"entropy": 0.452117919921875,
"epoch": 0.5396825396825397,
"grad_norm": 0.9254591817527784,
"learning_rate": 1.911277193868751e-05,
"loss": 0.5071,
"mean_token_accuracy": 0.8290363997220993,
"num_tokens": 58469884.0,
"step": 136
},
{
"entropy": 0.44451904296875,
"epoch": 0.5436507936507936,
"grad_norm": 1.0048803588236515,
"learning_rate": 1.9094666834943177e-05,
"loss": 0.5088,
"mean_token_accuracy": 0.8265325101092458,
"num_tokens": 58908403.0,
"step": 137
},
{
"entropy": 0.4510498046875,
"epoch": 0.5476190476190477,
"grad_norm": 0.8959703325073818,
"learning_rate": 1.9076387615889728e-05,
"loss": 0.509,
"mean_token_accuracy": 0.8270564498379827,
"num_tokens": 59323796.0,
"step": 138
},
{
"entropy": 0.4468994140625,
"epoch": 0.5515873015873016,
"grad_norm": 0.9671186022782653,
"learning_rate": 1.9057934631478616e-05,
"loss": 0.5097,
"mean_token_accuracy": 0.8285402255132794,
"num_tokens": 59751885.0,
"step": 139
},
{
"entropy": 0.44732666015625,
"epoch": 0.5555555555555556,
"grad_norm": 1.0314928737522524,
"learning_rate": 1.903930823498799e-05,
"loss": 0.4979,
"mean_token_accuracy": 0.8310831068083644,
"num_tokens": 60183841.0,
"step": 140
},
{
"entropy": 0.447723388671875,
"epoch": 0.5595238095238095,
"grad_norm": 0.9892945139524538,
"learning_rate": 1.9020508783015942e-05,
"loss": 0.5197,
"mean_token_accuracy": 0.8258982775732875,
"num_tokens": 60605801.0,
"step": 141
},
{
"entropy": 0.447662353515625,
"epoch": 0.5634920634920635,
"grad_norm": 0.9568747475172027,
"learning_rate": 1.9001536635473664e-05,
"loss": 0.5162,
"mean_token_accuracy": 0.8260063044726849,
"num_tokens": 61048601.0,
"step": 142
},
{
"entropy": 0.46087646484375,
"epoch": 0.5674603174603174,
"grad_norm": 1.0859536458388588,
"learning_rate": 1.898239215557856e-05,
"loss": 0.5123,
"mean_token_accuracy": 0.8280721958726645,
"num_tokens": 61446873.0,
"step": 143
},
{
"entropy": 0.448699951171875,
"epoch": 0.5714285714285714,
"grad_norm": 1.0019498607421429,
"learning_rate": 1.8963075709847308e-05,
"loss": 0.5278,
"mean_token_accuracy": 0.8225747244432569,
"num_tokens": 61887912.0,
"step": 144
},
{
"entropy": 0.454803466796875,
"epoch": 0.5753968253968254,
"grad_norm": 0.9611573189972261,
"learning_rate": 1.894358766808883e-05,
"loss": 0.5125,
"mean_token_accuracy": 0.8298147981986403,
"num_tokens": 62316051.0,
"step": 145
},
{
"entropy": 0.440704345703125,
"epoch": 0.5793650793650794,
"grad_norm": 1.0453009129466901,
"learning_rate": 1.892392840339721e-05,
"loss": 0.5101,
"mean_token_accuracy": 0.8277928857132792,
"num_tokens": 62741342.0,
"step": 146
},
{
"entropy": 0.445953369140625,
"epoch": 0.5833333333333334,
"grad_norm": 0.9253514323420257,
"learning_rate": 1.8904098292144556e-05,
"loss": 0.5111,
"mean_token_accuracy": 0.8307394320145249,
"num_tokens": 63164890.0,
"step": 147
},
{
"entropy": 0.4366455078125,
"epoch": 0.5873015873015873,
"grad_norm": 1.0178074892348359,
"learning_rate": 1.8884097713973798e-05,
"loss": 0.514,
"mean_token_accuracy": 0.8273154394701123,
"num_tokens": 63594617.0,
"step": 148
},
{
"entropy": 0.44122314453125,
"epoch": 0.5912698412698413,
"grad_norm": 0.9605119211913561,
"learning_rate": 1.8863927051791418e-05,
"loss": 0.5227,
"mean_token_accuracy": 0.8260425301268697,
"num_tokens": 64050293.0,
"step": 149
},
{
"entropy": 0.440643310546875,
"epoch": 0.5952380952380952,
"grad_norm": 0.9479991030852958,
"learning_rate": 1.884358669176011e-05,
"loss": 0.4982,
"mean_token_accuracy": 0.8325884817168117,
"num_tokens": 64467695.0,
"step": 150
},
{
"entropy": 0.444976806640625,
"epoch": 0.5992063492063492,
"grad_norm": 0.964607540439781,
"learning_rate": 1.88230770232914e-05,
"loss": 0.5142,
"mean_token_accuracy": 0.825270832516253,
"num_tokens": 64908198.0,
"step": 151
},
{
"entropy": 0.4447021484375,
"epoch": 0.6031746031746031,
"grad_norm": 0.9804608822405252,
"learning_rate": 1.8802398439038175e-05,
"loss": 0.503,
"mean_token_accuracy": 0.8305519446730614,
"num_tokens": 65333788.0,
"step": 152
},
{
"entropy": 0.45428466796875,
"epoch": 0.6071428571428571,
"grad_norm": 0.9727870339013585,
"learning_rate": 1.8781551334887204e-05,
"loss": 0.494,
"mean_token_accuracy": 0.833051766268909,
"num_tokens": 65748782.0,
"step": 153
},
{
"entropy": 0.44195556640625,
"epoch": 0.6111111111111112,
"grad_norm": 1.0608760680511125,
"learning_rate": 1.876053610995149e-05,
"loss": 0.5093,
"mean_token_accuracy": 0.8293332532048225,
"num_tokens": 66178918.0,
"step": 154
},
{
"entropy": 0.44537353515625,
"epoch": 0.6150793650793651,
"grad_norm": 1.0235107189518842,
"learning_rate": 1.87393531665627e-05,
"loss": 0.5184,
"mean_token_accuracy": 0.828137094154954,
"num_tokens": 66603248.0,
"step": 155
},
{
"entropy": 0.438262939453125,
"epoch": 0.6190476190476191,
"grad_norm": 0.9010906323539429,
"learning_rate": 1.8718002910263426e-05,
"loss": 0.5053,
"mean_token_accuracy": 0.8307396033778787,
"num_tokens": 67052342.0,
"step": 156
},
{
"entropy": 0.43731689453125,
"epoch": 0.623015873015873,
"grad_norm": 0.9986738769179647,
"learning_rate": 1.869648574979942e-05,
"loss": 0.5065,
"mean_token_accuracy": 0.8303233273327351,
"num_tokens": 67476890.0,
"step": 157
},
{
"entropy": 0.440155029296875,
"epoch": 0.626984126984127,
"grad_norm": 0.8657633291877197,
"learning_rate": 1.8674802097111784e-05,
"loss": 0.5059,
"mean_token_accuracy": 0.8300606962293386,
"num_tokens": 67913391.0,
"step": 158
},
{
"entropy": 0.448150634765625,
"epoch": 0.6309523809523809,
"grad_norm": 0.9352113389947035,
"learning_rate": 1.865295236732907e-05,
"loss": 0.4949,
"mean_token_accuracy": 0.833710327744484,
"num_tokens": 68339443.0,
"step": 159
},
{
"entropy": 0.4429931640625,
"epoch": 0.6349206349206349,
"grad_norm": 0.8523834865258976,
"learning_rate": 1.8630936978759337e-05,
"loss": 0.5092,
"mean_token_accuracy": 0.8283061692491174,
"num_tokens": 68772115.0,
"step": 160
},
{
"entropy": 0.43743896484375,
"epoch": 0.6388888888888888,
"grad_norm": 0.9636363540162411,
"learning_rate": 1.8608756352882152e-05,
"loss": 0.4984,
"mean_token_accuracy": 0.8309094673022628,
"num_tokens": 69198272.0,
"step": 161
},
{
"entropy": 0.43994140625,
"epoch": 0.6428571428571429,
"grad_norm": 0.8966734535109344,
"learning_rate": 1.85864109143405e-05,
"loss": 0.4921,
"mean_token_accuracy": 0.8349130833521485,
"num_tokens": 69611653.0,
"step": 162
},
{
"entropy": 0.448516845703125,
"epoch": 0.6468253968253969,
"grad_norm": 0.8821880241673743,
"learning_rate": 1.8563901090932673e-05,
"loss": 0.5232,
"mean_token_accuracy": 0.8246152186766267,
"num_tokens": 70054917.0,
"step": 163
},
{
"entropy": 0.437896728515625,
"epoch": 0.6507936507936508,
"grad_norm": 0.9223123901727143,
"learning_rate": 1.854122731360408e-05,
"loss": 0.5027,
"mean_token_accuracy": 0.8312725247815251,
"num_tokens": 70496952.0,
"step": 164
},
{
"entropy": 0.43646240234375,
"epoch": 0.6547619047619048,
"grad_norm": 0.8939813183437758,
"learning_rate": 1.851839001643898e-05,
"loss": 0.4959,
"mean_token_accuracy": 0.8323455331847072,
"num_tokens": 70919899.0,
"step": 165
},
{
"entropy": 0.43939208984375,
"epoch": 0.6587301587301587,
"grad_norm": 0.9020946161100143,
"learning_rate": 1.8495389636652185e-05,
"loss": 0.4995,
"mean_token_accuracy": 0.8345548948273063,
"num_tokens": 71343921.0,
"step": 166
},
{
"entropy": 0.436492919921875,
"epoch": 0.6626984126984127,
"grad_norm": 0.907744568728475,
"learning_rate": 1.847222661458069e-05,
"loss": 0.5191,
"mean_token_accuracy": 0.8266001716256142,
"num_tokens": 71808905.0,
"step": 167
},
{
"entropy": 0.442840576171875,
"epoch": 0.6666666666666666,
"grad_norm": 0.9515785083438172,
"learning_rate": 1.8448901393675233e-05,
"loss": 0.5001,
"mean_token_accuracy": 0.8303157305344939,
"num_tokens": 72240608.0,
"step": 168
},
{
"entropy": 0.44061279296875,
"epoch": 0.6706349206349206,
"grad_norm": 1.0721428605965826,
"learning_rate": 1.8425414420491817e-05,
"loss": 0.492,
"mean_token_accuracy": 0.832602908834815,
"num_tokens": 72668688.0,
"step": 169
},
{
"entropy": 0.439117431640625,
"epoch": 0.6746031746031746,
"grad_norm": 0.823986080877254,
"learning_rate": 1.8401766144683145e-05,
"loss": 0.5066,
"mean_token_accuracy": 0.8304582042619586,
"num_tokens": 73118452.0,
"step": 170
},
{
"entropy": 0.435211181640625,
"epoch": 0.6785714285714286,
"grad_norm": 0.9329819325690418,
"learning_rate": 1.8377957018990043e-05,
"loss": 0.5083,
"mean_token_accuracy": 0.8277195170521736,
"num_tokens": 73569120.0,
"step": 171
},
{
"entropy": 0.433624267578125,
"epoch": 0.6825396825396826,
"grad_norm": 0.9469634105889654,
"learning_rate": 1.8353987499232747e-05,
"loss": 0.5,
"mean_token_accuracy": 0.830720316618681,
"num_tokens": 73991069.0,
"step": 172
},
{
"entropy": 0.44415283203125,
"epoch": 0.6865079365079365,
"grad_norm": 0.8437243008063409,
"learning_rate": 1.8329858044302212e-05,
"loss": 0.4914,
"mean_token_accuracy": 0.8340719323605299,
"num_tokens": 74406271.0,
"step": 173
},
{
"entropy": 0.442138671875,
"epoch": 0.6904761904761905,
"grad_norm": 0.9448325053412396,
"learning_rate": 1.830556911615132e-05,
"loss": 0.4984,
"mean_token_accuracy": 0.8315248852595687,
"num_tokens": 74839901.0,
"step": 174
},
{
"entropy": 0.435211181640625,
"epoch": 0.6944444444444444,
"grad_norm": 0.8082342371498417,
"learning_rate": 1.8281121179786024e-05,
"loss": 0.4941,
"mean_token_accuracy": 0.8327573603019118,
"num_tokens": 75280578.0,
"step": 175
},
{
"entropy": 0.436492919921875,
"epoch": 0.6984126984126984,
"grad_norm": 0.9182267774958506,
"learning_rate": 1.825651470325645e-05,
"loss": 0.5185,
"mean_token_accuracy": 0.8261669343337417,
"num_tokens": 75749725.0,
"step": 176
},
{
"entropy": 0.441497802734375,
"epoch": 0.7023809523809523,
"grad_norm": 0.8607237067960308,
"learning_rate": 1.823175015764795e-05,
"loss": 0.5041,
"mean_token_accuracy": 0.8318371307104826,
"num_tokens": 76175668.0,
"step": 177
},
{
"entropy": 0.443450927734375,
"epoch": 0.7063492063492064,
"grad_norm": 0.8618633112263342,
"learning_rate": 1.8206828017072057e-05,
"loss": 0.4985,
"mean_token_accuracy": 0.8302229046821594,
"num_tokens": 76593690.0,
"step": 178
},
{
"entropy": 0.441619873046875,
"epoch": 0.7103174603174603,
"grad_norm": 0.9015146702404145,
"learning_rate": 1.818174875865744e-05,
"loss": 0.491,
"mean_token_accuracy": 0.8343524327501655,
"num_tokens": 77009261.0,
"step": 179
},
{
"entropy": 0.441009521484375,
"epoch": 0.7142857142857143,
"grad_norm": 0.9212502815359641,
"learning_rate": 1.815651286254074e-05,
"loss": 0.5019,
"mean_token_accuracy": 0.8306312058120966,
"num_tokens": 77431030.0,
"step": 180
},
{
"entropy": 0.435089111328125,
"epoch": 0.7182539682539683,
"grad_norm": 0.8731300242197482,
"learning_rate": 1.8131120811857398e-05,
"loss": 0.4858,
"mean_token_accuracy": 0.8352022236213088,
"num_tokens": 77852655.0,
"step": 181
},
{
"entropy": 0.44256591796875,
"epoch": 0.7222222222222222,
"grad_norm": 0.8959355020911784,
"learning_rate": 1.81055730927324e-05,
"loss": 0.4998,
"mean_token_accuracy": 0.8333287099376321,
"num_tokens": 78278605.0,
"step": 182
},
{
"entropy": 0.43603515625,
"epoch": 0.7261904761904762,
"grad_norm": 0.9116496627423212,
"learning_rate": 1.8079870194270958e-05,
"loss": 0.4855,
"mean_token_accuracy": 0.8339688014239073,
"num_tokens": 78721098.0,
"step": 183
},
{
"entropy": 0.439605712890625,
"epoch": 0.7301587301587301,
"grad_norm": 0.9140622005319157,
"learning_rate": 1.8054012608549167e-05,
"loss": 0.4963,
"mean_token_accuracy": 0.8326618708670139,
"num_tokens": 79154216.0,
"step": 184
},
{
"entropy": 0.433380126953125,
"epoch": 0.7341269841269841,
"grad_norm": 0.8229034336908294,
"learning_rate": 1.802800083060457e-05,
"loss": 0.4938,
"mean_token_accuracy": 0.8325842721387744,
"num_tokens": 79599332.0,
"step": 185
},
{
"entropy": 0.43524169921875,
"epoch": 0.7380952380952381,
"grad_norm": 0.897020159567663,
"learning_rate": 1.8001835358426688e-05,
"loss": 0.5046,
"mean_token_accuracy": 0.8298712829127908,
"num_tokens": 80039204.0,
"step": 186
},
{
"entropy": 0.4283447265625,
"epoch": 0.7420634920634921,
"grad_norm": 0.8527620690748114,
"learning_rate": 1.7975516692947478e-05,
"loss": 0.4862,
"mean_token_accuracy": 0.8363011125475168,
"num_tokens": 80472128.0,
"step": 187
},
{
"entropy": 0.434478759765625,
"epoch": 0.746031746031746,
"grad_norm": 0.8892589600328908,
"learning_rate": 1.7949045338031744e-05,
"loss": 0.5016,
"mean_token_accuracy": 0.830377884209156,
"num_tokens": 80910348.0,
"step": 188
},
{
"entropy": 0.435333251953125,
"epoch": 0.75,
"grad_norm": 0.8402522023302489,
"learning_rate": 1.7922421800467515e-05,
"loss": 0.4981,
"mean_token_accuracy": 0.830297333188355,
"num_tokens": 81336350.0,
"step": 189
},
{
"entropy": 0.43426513671875,
"epoch": 0.753968253968254,
"grad_norm": 0.8661830298081237,
"learning_rate": 1.7895646589956294e-05,
"loss": 0.4933,
"mean_token_accuracy": 0.8319168901070952,
"num_tokens": 81765325.0,
"step": 190
},
{
"entropy": 0.45111083984375,
"epoch": 0.7579365079365079,
"grad_norm": 0.9513754499577816,
"learning_rate": 1.7868720219103343e-05,
"loss": 0.4966,
"mean_token_accuracy": 0.8327370900660753,
"num_tokens": 82191876.0,
"step": 191
},
{
"entropy": 0.443572998046875,
"epoch": 0.7619047619047619,
"grad_norm": 0.8565906413499814,
"learning_rate": 1.7841643203407854e-05,
"loss": 0.4729,
"mean_token_accuracy": 0.8389169629663229,
"num_tokens": 82611125.0,
"step": 192
},
{
"entropy": 0.43267822265625,
"epoch": 0.7658730158730159,
"grad_norm": 0.9135365351092013,
"learning_rate": 1.7814416061253076e-05,
"loss": 0.4998,
"mean_token_accuracy": 0.8300598934292793,
"num_tokens": 83051815.0,
"step": 193
},
{
"entropy": 0.431640625,
"epoch": 0.7698412698412699,
"grad_norm": 0.8467497209932042,
"learning_rate": 1.77870393138964e-05,
"loss": 0.4929,
"mean_token_accuracy": 0.8316225642338395,
"num_tokens": 83488021.0,
"step": 194
},
{
"entropy": 0.43865966796875,
"epoch": 0.7738095238095238,
"grad_norm": 0.8533008256293745,
"learning_rate": 1.7759513485459367e-05,
"loss": 0.4911,
"mean_token_accuracy": 0.8322532493621111,
"num_tokens": 83902519.0,
"step": 195
},
{
"entropy": 0.433837890625,
"epoch": 0.7777777777777778,
"grad_norm": 0.9223169889850834,
"learning_rate": 1.7731839102917646e-05,
"loss": 0.4917,
"mean_token_accuracy": 0.8330741114914417,
"num_tokens": 84321775.0,
"step": 196
},
{
"entropy": 0.431854248046875,
"epoch": 0.7817460317460317,
"grad_norm": 0.8693783829283632,
"learning_rate": 1.7704016696090936e-05,
"loss": 0.4877,
"mean_token_accuracy": 0.8349456917494535,
"num_tokens": 84745911.0,
"step": 197
},
{
"entropy": 0.433502197265625,
"epoch": 0.7857142857142857,
"grad_norm": 0.8213507259820778,
"learning_rate": 1.7676046797632834e-05,
"loss": 0.4712,
"mean_token_accuracy": 0.8394572427496314,
"num_tokens": 85167087.0,
"step": 198
},
{
"entropy": 0.443817138671875,
"epoch": 0.7896825396825397,
"grad_norm": 0.8331545963147762,
"learning_rate": 1.7647929943020625e-05,
"loss": 0.4933,
"mean_token_accuracy": 0.833775763399899,
"num_tokens": 85588482.0,
"step": 199
},
{
"entropy": 0.427947998046875,
"epoch": 0.7936507936507936,
"grad_norm": 0.8194052047164025,
"learning_rate": 1.7619666670545034e-05,
"loss": 0.4887,
"mean_token_accuracy": 0.8344959514215589,
"num_tokens": 86009383.0,
"step": 200
},
{
"entropy": 0.433013916015625,
"epoch": 0.7976190476190477,
"grad_norm": 0.8009988751245896,
"learning_rate": 1.759125752129993e-05,
"loss": 0.4961,
"mean_token_accuracy": 0.8323280559852719,
"num_tokens": 86449600.0,
"step": 201
},
{
"entropy": 0.433624267578125,
"epoch": 0.8015873015873016,
"grad_norm": 0.8209561833032096,
"learning_rate": 1.7562703039171955e-05,
"loss": 0.4747,
"mean_token_accuracy": 0.8379244077950716,
"num_tokens": 86862628.0,
"step": 202
},
{
"entropy": 0.4326171875,
"epoch": 0.8055555555555556,
"grad_norm": 0.8915807469586918,
"learning_rate": 1.753400377083011e-05,
"loss": 0.49,
"mean_token_accuracy": 0.8344454681500793,
"num_tokens": 87295233.0,
"step": 203
},
{
"entropy": 0.431121826171875,
"epoch": 0.8095238095238095,
"grad_norm": 0.8241124494920523,
"learning_rate": 1.7505160265715303e-05,
"loss": 0.4813,
"mean_token_accuracy": 0.8365888074040413,
"num_tokens": 87713395.0,
"step": 204
},
{
"entropy": 0.432586669921875,
"epoch": 0.8134920634920635,
"grad_norm": 0.8140817202619633,
"learning_rate": 1.747617307602982e-05,
"loss": 0.5055,
"mean_token_accuracy": 0.830372148193419,
"num_tokens": 88158822.0,
"step": 205
},
{
"entropy": 0.430633544921875,
"epoch": 0.8174603174603174,
"grad_norm": 0.9065008583130683,
"learning_rate": 1.7447042756726756e-05,
"loss": 0.4892,
"mean_token_accuracy": 0.8338504349812865,
"num_tokens": 88602545.0,
"step": 206
},
{
"entropy": 0.435455322265625,
"epoch": 0.8214285714285714,
"grad_norm": 0.8128045472312435,
"learning_rate": 1.741776986549938e-05,
"loss": 0.4704,
"mean_token_accuracy": 0.8403382319957018,
"num_tokens": 89025796.0,
"step": 207
},
{
"entropy": 0.43255615234375,
"epoch": 0.8253968253968254,
"grad_norm": 0.8022720959148042,
"learning_rate": 1.7388354962770488e-05,
"loss": 0.4908,
"mean_token_accuracy": 0.8343631466850638,
"num_tokens": 89444255.0,
"step": 208
},
{
"entropy": 0.43310546875,
"epoch": 0.8293650793650794,
"grad_norm": 0.9092929881385042,
"learning_rate": 1.735879861168163e-05,
"loss": 0.4971,
"mean_token_accuracy": 0.8356021726503968,
"num_tokens": 89894143.0,
"step": 209
},
{
"entropy": 0.4346923828125,
"epoch": 0.8333333333333334,
"grad_norm": 0.7549249190997407,
"learning_rate": 1.7329101378082374e-05,
"loss": 0.4546,
"mean_token_accuracy": 0.8430411163717508,
"num_tokens": 90312774.0,
"step": 210
},
{
"entropy": 0.4388427734375,
"epoch": 0.8373015873015873,
"grad_norm": 0.8727254647412117,
"learning_rate": 1.729926383051943e-05,
"loss": 0.4784,
"mean_token_accuracy": 0.8370981393381953,
"num_tokens": 90741787.0,
"step": 211
},
{
"entropy": 0.431640625,
"epoch": 0.8412698412698413,
"grad_norm": 0.7541804775023891,
"learning_rate": 1.7269286540225805e-05,
"loss": 0.4704,
"mean_token_accuracy": 0.8386409590020776,
"num_tokens": 91177447.0,
"step": 212
},
{
"entropy": 0.43267822265625,
"epoch": 0.8452380952380952,
"grad_norm": 0.7677395619634457,
"learning_rate": 1.723917008110984e-05,
"loss": 0.4759,
"mean_token_accuracy": 0.8381857760250568,
"num_tokens": 91607165.0,
"step": 213
},
{
"entropy": 0.43829345703125,
"epoch": 0.8492063492063492,
"grad_norm": 0.8426223853041325,
"learning_rate": 1.720891502974423e-05,
"loss": 0.4774,
"mean_token_accuracy": 0.8353362819179893,
"num_tokens": 92026164.0,
"step": 214
},
{
"entropy": 0.432647705078125,
"epoch": 0.8531746031746031,
"grad_norm": 0.7956595213704394,
"learning_rate": 1.7178521965354992e-05,
"loss": 0.4736,
"mean_token_accuracy": 0.8389335246756673,
"num_tokens": 92463989.0,
"step": 215
},
{
"entropy": 0.430908203125,
"epoch": 0.8571428571428571,
"grad_norm": 0.7978027639386832,
"learning_rate": 1.714799146981037e-05,
"loss": 0.4745,
"mean_token_accuracy": 0.8379769828170538,
"num_tokens": 92891631.0,
"step": 216
},
{
"entropy": 0.43951416015625,
"epoch": 0.8611111111111112,
"grad_norm": 0.7515092463448632,
"learning_rate": 1.7117324127609686e-05,
"loss": 0.4838,
"mean_token_accuracy": 0.83737269975245,
"num_tokens": 93322969.0,
"step": 217
},
{
"entropy": 0.436614990234375,
"epoch": 0.8650793650793651,
"grad_norm": 0.7917014918299389,
"learning_rate": 1.7086520525872173e-05,
"loss": 0.4737,
"mean_token_accuracy": 0.8372634230181575,
"num_tokens": 93760535.0,
"step": 218
},
{
"entropy": 0.431365966796875,
"epoch": 0.8690476190476191,
"grad_norm": 0.7226980931603464,
"learning_rate": 1.7055581254325716e-05,
"loss": 0.468,
"mean_token_accuracy": 0.8401279039680958,
"num_tokens": 94172398.0,
"step": 219
},
{
"entropy": 0.43701171875,
"epoch": 0.873015873015873,
"grad_norm": 0.7637809745171817,
"learning_rate": 1.7024506905295566e-05,
"loss": 0.4819,
"mean_token_accuracy": 0.836346473544836,
"num_tokens": 94599260.0,
"step": 220
},
{
"entropy": 0.44158935546875,
"epoch": 0.876984126984127,
"grad_norm": 0.8053881064314574,
"learning_rate": 1.6993298073693005e-05,
"loss": 0.477,
"mean_token_accuracy": 0.8353390069678426,
"num_tokens": 95019433.0,
"step": 221
},
{
"entropy": 0.44500732421875,
"epoch": 0.8809523809523809,
"grad_norm": 0.7574417737692334,
"learning_rate": 1.6961955357003948e-05,
"loss": 0.4732,
"mean_token_accuracy": 0.8381653232499957,
"num_tokens": 95425799.0,
"step": 222
},
{
"entropy": 0.436981201171875,
"epoch": 0.8849206349206349,
"grad_norm": 0.8024393653209613,
"learning_rate": 1.693047935527751e-05,
"loss": 0.4736,
"mean_token_accuracy": 0.8379595559090376,
"num_tokens": 95822890.0,
"step": 223
},
{
"entropy": 0.43756103515625,
"epoch": 0.8888888888888888,
"grad_norm": 0.8215854969202835,
"learning_rate": 1.6898870671114527e-05,
"loss": 0.4883,
"mean_token_accuracy": 0.8364003216847777,
"num_tokens": 96260271.0,
"step": 224
},
{
"entropy": 0.4278564453125,
"epoch": 0.8928571428571429,
"grad_norm": 0.7668608306492521,
"learning_rate": 1.6867129909656e-05,
"loss": 0.4783,
"mean_token_accuracy": 0.8366113835945725,
"num_tokens": 96696652.0,
"step": 225
},
{
"entropy": 0.436859130859375,
"epoch": 0.8968253968253969,
"grad_norm": 0.8321480081512098,
"learning_rate": 1.6835257678571515e-05,
"loss": 0.4763,
"mean_token_accuracy": 0.8387005385011435,
"num_tokens": 97135925.0,
"step": 226
},
{
"entropy": 0.4412841796875,
"epoch": 0.9007936507936508,
"grad_norm": 0.8357475643591474,
"learning_rate": 1.680325458804763e-05,
"loss": 0.4969,
"mean_token_accuracy": 0.8320699343457818,
"num_tokens": 97573810.0,
"step": 227
},
{
"entropy": 0.431304931640625,
"epoch": 0.9047619047619048,
"grad_norm": 0.779656796305732,
"learning_rate": 1.6771121250776163e-05,
"loss": 0.465,
"mean_token_accuracy": 0.8408154509961605,
"num_tokens": 98011108.0,
"step": 228
},
{
"entropy": 0.43701171875,
"epoch": 0.9087301587301587,
"grad_norm": 0.8611557820075499,
"learning_rate": 1.6738858281942477e-05,
"loss": 0.4637,
"mean_token_accuracy": 0.8399258134886622,
"num_tokens": 98441631.0,
"step": 229
},
{
"entropy": 0.440643310546875,
"epoch": 0.9126984126984127,
"grad_norm": 0.8547304328052301,
"learning_rate": 1.6706466299213718e-05,
"loss": 0.4763,
"mean_token_accuracy": 0.8364039584994316,
"num_tokens": 98873029.0,
"step": 230
},
{
"entropy": 0.437896728515625,
"epoch": 0.9166666666666666,
"grad_norm": 0.8763515952964974,
"learning_rate": 1.6673945922726945e-05,
"loss": 0.4784,
"mean_token_accuracy": 0.8354263128712773,
"num_tokens": 99296106.0,
"step": 231
},
{
"entropy": 0.43878173828125,
"epoch": 0.9206349206349206,
"grad_norm": 0.814646978847238,
"learning_rate": 1.6641297775077313e-05,
"loss": 0.4772,
"mean_token_accuracy": 0.8371500456705689,
"num_tokens": 99734864.0,
"step": 232
},
{
"entropy": 0.43292236328125,
"epoch": 0.9246031746031746,
"grad_norm": 0.8062733622515471,
"learning_rate": 1.660852248130611e-05,
"loss": 0.4863,
"mean_token_accuracy": 0.8347219526767731,
"num_tokens": 100174517.0,
"step": 233
},
{
"entropy": 0.43914794921875,
"epoch": 0.9285714285714286,
"grad_norm": 0.8754059145656132,
"learning_rate": 1.6575620668888812e-05,
"loss": 0.4732,
"mean_token_accuracy": 0.837783177383244,
"num_tokens": 100606926.0,
"step": 234
},
{
"entropy": 0.43682861328125,
"epoch": 0.9325396825396826,
"grad_norm": 0.8404842546931222,
"learning_rate": 1.6542592967723065e-05,
"loss": 0.4661,
"mean_token_accuracy": 0.8383018802851439,
"num_tokens": 101020425.0,
"step": 235
},
{
"entropy": 0.43560791015625,
"epoch": 0.9365079365079365,
"grad_norm": 0.8076832265015242,
"learning_rate": 1.6509440010116634e-05,
"loss": 0.4723,
"mean_token_accuracy": 0.8383181607350707,
"num_tokens": 101447599.0,
"step": 236
},
{
"entropy": 0.435455322265625,
"epoch": 0.9404761904761905,
"grad_norm": 0.8241311788244491,
"learning_rate": 1.6476162430775278e-05,
"loss": 0.4663,
"mean_token_accuracy": 0.8403345802798867,
"num_tokens": 101851986.0,
"step": 237
},
{
"entropy": 0.42706298828125,
"epoch": 0.9444444444444444,
"grad_norm": 0.8537310664275062,
"learning_rate": 1.6442760866790616e-05,
"loss": 0.4719,
"mean_token_accuracy": 0.8388460287824273,
"num_tokens": 102275358.0,
"step": 238
},
{
"entropy": 0.43780517578125,
"epoch": 0.9484126984126984,
"grad_norm": 0.811058555828741,
"learning_rate": 1.6409235957627926e-05,
"loss": 0.4673,
"mean_token_accuracy": 0.8385212691500783,
"num_tokens": 102688415.0,
"step": 239
},
{
"entropy": 0.43798828125,
"epoch": 0.9523809523809523,
"grad_norm": 0.9404107957388771,
"learning_rate": 1.6375588345113895e-05,
"loss": 0.4716,
"mean_token_accuracy": 0.8381083710119128,
"num_tokens": 103113293.0,
"step": 240
},
{
"entropy": 0.429534912109375,
"epoch": 0.9563492063492064,
"grad_norm": 0.8268811238645964,
"learning_rate": 1.6341818673424342e-05,
"loss": 0.4738,
"mean_token_accuracy": 0.838194428011775,
"num_tokens": 103566779.0,
"step": 241
},
{
"entropy": 0.4266357421875,
"epoch": 0.9603174603174603,
"grad_norm": 0.9050037047902241,
"learning_rate": 1.630792758907189e-05,
"loss": 0.4782,
"mean_token_accuracy": 0.8364842068403959,
"num_tokens": 104000550.0,
"step": 242
},
{
"entropy": 0.436737060546875,
"epoch": 0.9642857142857143,
"grad_norm": 0.8094767952175125,
"learning_rate": 1.6273915740893557e-05,
"loss": 0.476,
"mean_token_accuracy": 0.8376884264871478,
"num_tokens": 104429372.0,
"step": 243
},
{
"entropy": 0.42962646484375,
"epoch": 0.9682539682539683,
"grad_norm": 0.8904444225202598,
"learning_rate": 1.6239783780038374e-05,
"loss": 0.4686,
"mean_token_accuracy": 0.8404411617666483,
"num_tokens": 104859286.0,
"step": 244
},
{
"entropy": 0.438262939453125,
"epoch": 0.9722222222222222,
"grad_norm": 0.8056975230627009,
"learning_rate": 1.6205532359954905e-05,
"loss": 0.4771,
"mean_token_accuracy": 0.8340575834736228,
"num_tokens": 105281017.0,
"step": 245
},
{
"entropy": 0.423126220703125,
"epoch": 0.9761904761904762,
"grad_norm": 0.7784171562070645,
"learning_rate": 1.6171162136378716e-05,
"loss": 0.4669,
"mean_token_accuracy": 0.8392490344122052,
"num_tokens": 105729675.0,
"step": 246
},
{
"entropy": 0.42401123046875,
"epoch": 0.9801587301587301,
"grad_norm": 0.8062608875027727,
"learning_rate": 1.6136673767319853e-05,
"loss": 0.4712,
"mean_token_accuracy": 0.8378767920657992,
"num_tokens": 106168119.0,
"step": 247
},
{
"entropy": 0.422515869140625,
"epoch": 0.9841269841269841,
"grad_norm": 0.7922278955543418,
"learning_rate": 1.6102067913050227e-05,
"loss": 0.4687,
"mean_token_accuracy": 0.8400682499632239,
"num_tokens": 106603968.0,
"step": 248
},
{
"entropy": 0.421356201171875,
"epoch": 0.9880952380952381,
"grad_norm": 0.7435919501708257,
"learning_rate": 1.606734523609097e-05,
"loss": 0.4712,
"mean_token_accuracy": 0.8397826086729765,
"num_tokens": 107054152.0,
"step": 249
},
{
"entropy": 0.423004150390625,
"epoch": 0.9920634920634921,
"grad_norm": 0.7325977788312235,
"learning_rate": 1.603250640119977e-05,
"loss": 0.4609,
"mean_token_accuracy": 0.8421022659167647,
"num_tokens": 107495007.0,
"step": 250
},
{
"entropy": 0.4324951171875,
"epoch": 0.996031746031746,
"grad_norm": 0.8289315109880487,
"learning_rate": 1.5997552075358122e-05,
"loss": 0.4783,
"mean_token_accuracy": 0.8354923082515597,
"num_tokens": 107928758.0,
"step": 251
},
{
"entropy": 0.427337646484375,
"epoch": 1.0,
"grad_norm": 0.7226694679539868,
"learning_rate": 1.5962482927758568e-05,
"loss": 0.4732,
"mean_token_accuracy": 0.8377068918198347,
"num_tokens": 108364335.0,
"step": 252
},
{
"entropy": 0.443572998046875,
"epoch": 1.003968253968254,
"grad_norm": 0.7341598629462971,
"learning_rate": 1.592729962979189e-05,
"loss": 0.4341,
"mean_token_accuracy": 0.8490036567673087,
"num_tokens": 108775586.0,
"step": 253
},
{
"entropy": 0.433013916015625,
"epoch": 1.007936507936508,
"grad_norm": 0.6958749153405379,
"learning_rate": 1.589200285503426e-05,
"loss": 0.4311,
"mean_token_accuracy": 0.850931248627603,
"num_tokens": 109202665.0,
"step": 254
},
{
"entropy": 0.42047119140625,
"epoch": 1.0119047619047619,
"grad_norm": 0.830299124303698,
"learning_rate": 1.585659327923432e-05,
"loss": 0.4343,
"mean_token_accuracy": 0.8494839882478118,
"num_tokens": 109629975.0,
"step": 255
},
{
"entropy": 0.41961669921875,
"epoch": 1.0158730158730158,
"grad_norm": 0.7427828462108962,
"learning_rate": 1.582107158030027e-05,
"loss": 0.4332,
"mean_token_accuracy": 0.8491029348224401,
"num_tokens": 110061605.0,
"step": 256
},
{
"entropy": 0.42138671875,
"epoch": 1.0198412698412698,
"grad_norm": 0.6632051825171987,
"learning_rate": 1.5785438438286892e-05,
"loss": 0.4215,
"mean_token_accuracy": 0.8531029289588332,
"num_tokens": 110487591.0,
"step": 257
},
{
"entropy": 0.422271728515625,
"epoch": 1.0238095238095237,
"grad_norm": 0.7208685537266245,
"learning_rate": 1.574969453538251e-05,
"loss": 0.4367,
"mean_token_accuracy": 0.8492154879495502,
"num_tokens": 110924491.0,
"step": 258
},
{
"entropy": 0.422119140625,
"epoch": 1.0277777777777777,
"grad_norm": 0.7622696147016514,
"learning_rate": 1.5713840555895937e-05,
"loss": 0.4322,
"mean_token_accuracy": 0.8504317132756114,
"num_tokens": 111351831.0,
"step": 259
},
{
"entropy": 0.42340087890625,
"epoch": 1.0317460317460316,
"grad_norm": 0.7232679832518091,
"learning_rate": 1.567787718624338e-05,
"loss": 0.4315,
"mean_token_accuracy": 0.8494284749031067,
"num_tokens": 111773832.0,
"step": 260
},
{
"entropy": 0.424407958984375,
"epoch": 1.0357142857142858,
"grad_norm": 0.7348492552631081,
"learning_rate": 1.5641805114935297e-05,
"loss": 0.4229,
"mean_token_accuracy": 0.8527503348886967,
"num_tokens": 112210334.0,
"step": 261
},
{
"entropy": 0.424346923828125,
"epoch": 1.0396825396825398,
"grad_norm": 0.7224982909642063,
"learning_rate": 1.560562503256322e-05,
"loss": 0.4295,
"mean_token_accuracy": 0.8483862616121769,
"num_tokens": 112637470.0,
"step": 262
},
{
"entropy": 0.421417236328125,
"epoch": 1.0436507936507937,
"grad_norm": 0.7968828029144892,
"learning_rate": 1.556933763178651e-05,
"loss": 0.4338,
"mean_token_accuracy": 0.8488446967676282,
"num_tokens": 113069179.0,
"step": 263
},
{
"entropy": 0.427215576171875,
"epoch": 1.0476190476190477,
"grad_norm": 0.6927882582463685,
"learning_rate": 1.5532943607319143e-05,
"loss": 0.4348,
"mean_token_accuracy": 0.8495698990300298,
"num_tokens": 113501590.0,
"step": 264
},
{
"entropy": 0.423583984375,
"epoch": 1.0515873015873016,
"grad_norm": 0.6855557306353498,
"learning_rate": 1.5496443655916348e-05,
"loss": 0.4302,
"mean_token_accuracy": 0.8517827754840255,
"num_tokens": 113935337.0,
"step": 265
},
{
"entropy": 0.424652099609375,
"epoch": 1.0555555555555556,
"grad_norm": 0.7439838182634214,
"learning_rate": 1.5459838476361326e-05,
"loss": 0.4339,
"mean_token_accuracy": 0.8486921405419707,
"num_tokens": 114360533.0,
"step": 266
},
{
"entropy": 0.416717529296875,
"epoch": 1.0595238095238095,
"grad_norm": 0.7237479220972823,
"learning_rate": 1.5423128769451832e-05,
"loss": 0.4282,
"mean_token_accuracy": 0.8520865635946393,
"num_tokens": 114792424.0,
"step": 267
},
{
"entropy": 0.41729736328125,
"epoch": 1.0634920634920635,
"grad_norm": 0.7635204529254467,
"learning_rate": 1.5386315237986785e-05,
"loss": 0.4366,
"mean_token_accuracy": 0.8476630486547947,
"num_tokens": 115231953.0,
"step": 268
},
{
"entropy": 0.420654296875,
"epoch": 1.0674603174603174,
"grad_norm": 0.6400753342281554,
"learning_rate": 1.5349398586752794e-05,
"loss": 0.4309,
"mean_token_accuracy": 0.8499054629355669,
"num_tokens": 115664522.0,
"step": 269
},
{
"entropy": 0.417083740234375,
"epoch": 1.0714285714285714,
"grad_norm": 0.8123384521312528,
"learning_rate": 1.5312379522510666e-05,
"loss": 0.4301,
"mean_token_accuracy": 0.8492474015802145,
"num_tokens": 116092221.0,
"step": 270
},
{
"entropy": 0.421173095703125,
"epoch": 1.0753968253968254,
"grad_norm": 0.7019628203672783,
"learning_rate": 1.52752587539819e-05,
"loss": 0.4274,
"mean_token_accuracy": 0.8491615150123835,
"num_tokens": 116534679.0,
"step": 271
},
{
"entropy": 0.4229736328125,
"epoch": 1.0793650793650793,
"grad_norm": 0.7506293935142412,
"learning_rate": 1.5238036991835085e-05,
"loss": 0.438,
"mean_token_accuracy": 0.8483763262629509,
"num_tokens": 116984739.0,
"step": 272
},
{
"entropy": 0.418182373046875,
"epoch": 1.0833333333333333,
"grad_norm": 0.81916726768363,
"learning_rate": 1.5200714948672313e-05,
"loss": 0.4329,
"mean_token_accuracy": 0.8497389126569033,
"num_tokens": 117413323.0,
"step": 273
},
{
"entropy": 0.412933349609375,
"epoch": 1.0873015873015872,
"grad_norm": 0.7157560717096457,
"learning_rate": 1.5163293339015535e-05,
"loss": 0.4333,
"mean_token_accuracy": 0.8479102049022913,
"num_tokens": 117852991.0,
"step": 274
},
{
"entropy": 0.42327880859375,
"epoch": 1.0912698412698412,
"grad_norm": 0.7802341946060086,
"learning_rate": 1.512577287929288e-05,
"loss": 0.438,
"mean_token_accuracy": 0.8477771393954754,
"num_tokens": 118264477.0,
"step": 275
},
{
"entropy": 0.41497802734375,
"epoch": 1.0952380952380953,
"grad_norm": 0.7955621612100239,
"learning_rate": 1.5088154287824934e-05,
"loss": 0.4264,
"mean_token_accuracy": 0.8529663607478142,
"num_tokens": 118696927.0,
"step": 276
},
{
"entropy": 0.409820556640625,
"epoch": 1.0992063492063493,
"grad_norm": 0.7201767225753009,
"learning_rate": 1.5050438284811001e-05,
"loss": 0.4352,
"mean_token_accuracy": 0.847323065623641,
"num_tokens": 119143061.0,
"step": 277
},
{
"entropy": 0.417938232421875,
"epoch": 1.1031746031746033,
"grad_norm": 0.7480446387717942,
"learning_rate": 1.5012625592315298e-05,
"loss": 0.4259,
"mean_token_accuracy": 0.8503220491111279,
"num_tokens": 119569613.0,
"step": 278
},
{
"entropy": 0.417694091796875,
"epoch": 1.1071428571428572,
"grad_norm": 0.8503377095008527,
"learning_rate": 1.4974716934253146e-05,
"loss": 0.4295,
"mean_token_accuracy": 0.8494290672242641,
"num_tokens": 119990044.0,
"step": 279
},
{
"entropy": 0.405731201171875,
"epoch": 1.1111111111111112,
"grad_norm": 0.8068032460856993,
"learning_rate": 1.4936713036377102e-05,
"loss": 0.4352,
"mean_token_accuracy": 0.8492538705468178,
"num_tokens": 120447089.0,
"step": 280
},
{
"entropy": 0.41217041015625,
"epoch": 1.1150793650793651,
"grad_norm": 0.789218202429046,
"learning_rate": 1.4898614626263066e-05,
"loss": 0.4152,
"mean_token_accuracy": 0.8548848666250706,
"num_tokens": 120882118.0,
"step": 281
},
{
"entropy": 0.415618896484375,
"epoch": 1.119047619047619,
"grad_norm": 0.7218191501417075,
"learning_rate": 1.4860422433296363e-05,
"loss": 0.4317,
"mean_token_accuracy": 0.8495792560279369,
"num_tokens": 121314886.0,
"step": 282
},
{
"entropy": 0.4183349609375,
"epoch": 1.123015873015873,
"grad_norm": 0.7503416099168865,
"learning_rate": 1.4822137188657752e-05,
"loss": 0.4197,
"mean_token_accuracy": 0.8535007536411285,
"num_tokens": 121731396.0,
"step": 283
},
{
"entropy": 0.413421630859375,
"epoch": 1.126984126984127,
"grad_norm": 0.7881593510234792,
"learning_rate": 1.4783759625309454e-05,
"loss": 0.4241,
"mean_token_accuracy": 0.851870458573103,
"num_tokens": 122167617.0,
"step": 284
},
{
"entropy": 0.4091796875,
"epoch": 1.130952380952381,
"grad_norm": 0.6940859902467295,
"learning_rate": 1.474529047798112e-05,
"loss": 0.4272,
"mean_token_accuracy": 0.8504150630906224,
"num_tokens": 122593508.0,
"step": 285
},
{
"entropy": 0.411651611328125,
"epoch": 1.1349206349206349,
"grad_norm": 0.7885889108343426,
"learning_rate": 1.4706730483155738e-05,
"loss": 0.4288,
"mean_token_accuracy": 0.8491430478170514,
"num_tokens": 123013840.0,
"step": 286
},
{
"entropy": 0.4080810546875,
"epoch": 1.1388888888888888,
"grad_norm": 0.7779729557331533,
"learning_rate": 1.4668080379055563e-05,
"loss": 0.4192,
"mean_token_accuracy": 0.853282954543829,
"num_tokens": 123444043.0,
"step": 287
},
{
"entropy": 0.410888671875,
"epoch": 1.1428571428571428,
"grad_norm": 0.7541972725693249,
"learning_rate": 1.4629340905627964e-05,
"loss": 0.4172,
"mean_token_accuracy": 0.8533556731417775,
"num_tokens": 123876490.0,
"step": 288
},
{
"entropy": 0.414825439453125,
"epoch": 1.1468253968253967,
"grad_norm": 0.7466215675753664,
"learning_rate": 1.4590512804531272e-05,
"loss": 0.4226,
"mean_token_accuracy": 0.8501791479066014,
"num_tokens": 124297019.0,
"step": 289
},
{
"entropy": 0.41717529296875,
"epoch": 1.1507936507936507,
"grad_norm": 0.7693301960113025,
"learning_rate": 1.4551596819120564e-05,
"loss": 0.4292,
"mean_token_accuracy": 0.8512964397668839,
"num_tokens": 124713314.0,
"step": 290
},
{
"entropy": 0.415802001953125,
"epoch": 1.1547619047619047,
"grad_norm": 0.7246395498891883,
"learning_rate": 1.4512593694433455e-05,
"loss": 0.4277,
"mean_token_accuracy": 0.8493410600349307,
"num_tokens": 125125061.0,
"step": 291
},
{
"entropy": 0.4139404296875,
"epoch": 1.1587301587301586,
"grad_norm": 0.7560965179980317,
"learning_rate": 1.447350417717581e-05,
"loss": 0.4436,
"mean_token_accuracy": 0.8454991178587079,
"num_tokens": 125564746.0,
"step": 292
},
{
"entropy": 0.40960693359375,
"epoch": 1.1626984126984128,
"grad_norm": 0.7077121780226429,
"learning_rate": 1.4434329015707468e-05,
"loss": 0.418,
"mean_token_accuracy": 0.8543984591960907,
"num_tokens": 125992351.0,
"step": 293
},
{
"entropy": 0.41363525390625,
"epoch": 1.1666666666666667,
"grad_norm": 0.7114122184270399,
"learning_rate": 1.4395068960027903e-05,
"loss": 0.4184,
"mean_token_accuracy": 0.852596671320498,
"num_tokens": 126415997.0,
"step": 294
},
{
"entropy": 0.418243408203125,
"epoch": 1.1706349206349207,
"grad_norm": 0.7588297506038938,
"learning_rate": 1.435572476176187e-05,
"loss": 0.4344,
"mean_token_accuracy": 0.8491576574742794,
"num_tokens": 126851930.0,
"step": 295
},
{
"entropy": 0.41839599609375,
"epoch": 1.1746031746031746,
"grad_norm": 0.7000410652898528,
"learning_rate": 1.4316297174145018e-05,
"loss": 0.4359,
"mean_token_accuracy": 0.847908278927207,
"num_tokens": 127285760.0,
"step": 296
},
{
"entropy": 0.426849365234375,
"epoch": 1.1785714285714286,
"grad_norm": 0.7000110970011031,
"learning_rate": 1.427678695200945e-05,
"loss": 0.447,
"mean_token_accuracy": 0.8466757563874125,
"num_tokens": 127724762.0,
"step": 297
},
{
"entropy": 0.412445068359375,
"epoch": 1.1825396825396826,
"grad_norm": 0.7729410408403453,
"learning_rate": 1.4237194851769318e-05,
"loss": 0.4245,
"mean_token_accuracy": 0.853807931765914,
"num_tokens": 128153685.0,
"step": 298
},
{
"entropy": 0.416259765625,
"epoch": 1.1865079365079365,
"grad_norm": 0.7363431945766964,
"learning_rate": 1.4197521631406279e-05,
"loss": 0.4234,
"mean_token_accuracy": 0.8529269192367792,
"num_tokens": 128574985.0,
"step": 299
},
{
"entropy": 0.4239501953125,
"epoch": 1.1904761904761905,
"grad_norm": 0.7757786686903285,
"learning_rate": 1.4157768050455038e-05,
"loss": 0.4238,
"mean_token_accuracy": 0.8496858030557632,
"num_tokens": 128990738.0,
"step": 300
},
{
"entropy": 0.419342041015625,
"epoch": 1.1944444444444444,
"grad_norm": 0.7832399458969364,
"learning_rate": 1.4117934869988776e-05,
"loss": 0.4214,
"mean_token_accuracy": 0.8544543124735355,
"num_tokens": 129413253.0,
"step": 301
},
{
"entropy": 0.417449951171875,
"epoch": 1.1984126984126984,
"grad_norm": 0.7164361411308464,
"learning_rate": 1.4078022852604591e-05,
"loss": 0.448,
"mean_token_accuracy": 0.8454335303977132,
"num_tokens": 129848900.0,
"step": 302
},
{
"entropy": 0.419342041015625,
"epoch": 1.2023809523809523,
"grad_norm": 0.741247005990124,
"learning_rate": 1.4038032762408897e-05,
"loss": 0.4252,
"mean_token_accuracy": 0.8511590985581279,
"num_tokens": 130278951.0,
"step": 303
},
{
"entropy": 0.42132568359375,
"epoch": 1.2063492063492063,
"grad_norm": 0.8114658165541835,
"learning_rate": 1.3997965365002789e-05,
"loss": 0.4318,
"mean_token_accuracy": 0.8505065925419331,
"num_tokens": 130724709.0,
"step": 304
},
{
"entropy": 0.421905517578125,
"epoch": 1.2103174603174602,
"grad_norm": 0.6859694143798629,
"learning_rate": 1.3957821427467392e-05,
"loss": 0.4091,
"mean_token_accuracy": 0.8572418540716171,
"num_tokens": 131151665.0,
"step": 305
},
{
"entropy": 0.41455078125,
"epoch": 1.2142857142857142,
"grad_norm": 0.7754845945066426,
"learning_rate": 1.3917601718349183e-05,
"loss": 0.4175,
"mean_token_accuracy": 0.8536632917821407,
"num_tokens": 131582811.0,
"step": 306
},
{
"entropy": 0.416259765625,
"epoch": 1.2182539682539684,
"grad_norm": 0.7186546230914753,
"learning_rate": 1.3877307007645256e-05,
"loss": 0.427,
"mean_token_accuracy": 0.8525044862180948,
"num_tokens": 131998529.0,
"step": 307
},
{
"entropy": 0.4093017578125,
"epoch": 1.2222222222222223,
"grad_norm": 0.7164454164150232,
"learning_rate": 1.3836938066788599e-05,
"loss": 0.4198,
"mean_token_accuracy": 0.8528330260887742,
"num_tokens": 132429743.0,
"step": 308
},
{
"entropy": 0.41455078125,
"epoch": 1.2261904761904763,
"grad_norm": 0.7241248979958975,
"learning_rate": 1.3796495668633325e-05,
"loss": 0.4265,
"mean_token_accuracy": 0.850708675570786,
"num_tokens": 132864811.0,
"step": 309
},
{
"entropy": 0.4132080078125,
"epoch": 1.2301587301587302,
"grad_norm": 0.7207326099891552,
"learning_rate": 1.3755980587439857e-05,
"loss": 0.4318,
"mean_token_accuracy": 0.8507328238338232,
"num_tokens": 133291943.0,
"step": 310
},
{
"entropy": 0.41790771484375,
"epoch": 1.2341269841269842,
"grad_norm": 0.7273508243242796,
"learning_rate": 1.3715393598860129e-05,
"loss": 0.4246,
"mean_token_accuracy": 0.8514404995366931,
"num_tokens": 133728664.0,
"step": 311
},
{
"entropy": 0.4178466796875,
"epoch": 1.2380952380952381,
"grad_norm": 0.70087947356439,
"learning_rate": 1.367473547992272e-05,
"loss": 0.4159,
"mean_token_accuracy": 0.8550357017666101,
"num_tokens": 134149814.0,
"step": 312
},
{
"entropy": 0.419586181640625,
"epoch": 1.242063492063492,
"grad_norm": 0.6639795743514464,
"learning_rate": 1.3634007009017986e-05,
"loss": 0.4151,
"mean_token_accuracy": 0.853931562975049,
"num_tokens": 134567037.0,
"step": 313
},
{
"entropy": 0.416595458984375,
"epoch": 1.246031746031746,
"grad_norm": 0.7100431406730954,
"learning_rate": 1.3593208965883156e-05,
"loss": 0.4137,
"mean_token_accuracy": 0.8554408671334386,
"num_tokens": 134989406.0,
"step": 314
},
{
"entropy": 0.4085693359375,
"epoch": 1.25,
"grad_norm": 0.7105214659480911,
"learning_rate": 1.3552342131587399e-05,
"loss": 0.4025,
"mean_token_accuracy": 0.8589053172618151,
"num_tokens": 135405949.0,
"step": 315
},
{
"entropy": 0.408843994140625,
"epoch": 1.253968253968254,
"grad_norm": 0.7899492973598554,
"learning_rate": 1.351140728851688e-05,
"loss": 0.4253,
"mean_token_accuracy": 0.8534569833427668,
"num_tokens": 135832642.0,
"step": 316
},
{
"entropy": 0.4046630859375,
"epoch": 1.257936507936508,
"grad_norm": 0.6960132729213381,
"learning_rate": 1.3470405220359773e-05,
"loss": 0.4211,
"mean_token_accuracy": 0.8528056116774678,
"num_tokens": 136281104.0,
"step": 317
},
{
"entropy": 0.41253662109375,
"epoch": 1.2619047619047619,
"grad_norm": 0.7421504637929449,
"learning_rate": 1.3429336712091258e-05,
"loss": 0.4113,
"mean_token_accuracy": 0.8566197585314512,
"num_tokens": 136724748.0,
"step": 318
},
{
"entropy": 0.410736083984375,
"epoch": 1.2658730158730158,
"grad_norm": 0.75910615139755,
"learning_rate": 1.3388202549958507e-05,
"loss": 0.4167,
"mean_token_accuracy": 0.8541540773585439,
"num_tokens": 137144383.0,
"step": 319
},
{
"entropy": 0.4176025390625,
"epoch": 1.2698412698412698,
"grad_norm": 0.7176046910830354,
"learning_rate": 1.334700352146561e-05,
"loss": 0.4221,
"mean_token_accuracy": 0.8523535262793303,
"num_tokens": 137570382.0,
"step": 320
},
{
"entropy": 0.4130859375,
"epoch": 1.2738095238095237,
"grad_norm": 0.7147216223748529,
"learning_rate": 1.3305740415358506e-05,
"loss": 0.4255,
"mean_token_accuracy": 0.8524302830919623,
"num_tokens": 138002323.0,
"step": 321
},
{
"entropy": 0.412811279296875,
"epoch": 1.2777777777777777,
"grad_norm": 0.7746193751855323,
"learning_rate": 1.3264414021609899e-05,
"loss": 0.4271,
"mean_token_accuracy": 0.8531954158097506,
"num_tokens": 138431194.0,
"step": 322
},
{
"entropy": 0.41534423828125,
"epoch": 1.2817460317460316,
"grad_norm": 0.6884329097906199,
"learning_rate": 1.3223025131404106e-05,
"loss": 0.4116,
"mean_token_accuracy": 0.8547803815454245,
"num_tokens": 138863136.0,
"step": 323
},
{
"entropy": 0.410064697265625,
"epoch": 1.2857142857142856,
"grad_norm": 0.7440216530883886,
"learning_rate": 1.3181574537121933e-05,
"loss": 0.4058,
"mean_token_accuracy": 0.8586990479379892,
"num_tokens": 139287890.0,
"step": 324
},
{
"entropy": 0.412506103515625,
"epoch": 1.2896825396825398,
"grad_norm": 0.7042933191716969,
"learning_rate": 1.3140063032325491e-05,
"loss": 0.4269,
"mean_token_accuracy": 0.852539798244834,
"num_tokens": 139730663.0,
"step": 325
},
{
"entropy": 0.40789794921875,
"epoch": 1.2936507936507937,
"grad_norm": 0.7053569226738333,
"learning_rate": 1.3098491411743014e-05,
"loss": 0.4203,
"mean_token_accuracy": 0.8511422863230109,
"num_tokens": 140160179.0,
"step": 326
},
{
"entropy": 0.41094970703125,
"epoch": 1.2976190476190477,
"grad_norm": 0.7260221861550915,
"learning_rate": 1.3056860471253639e-05,
"loss": 0.4148,
"mean_token_accuracy": 0.8542684894055128,
"num_tokens": 140577958.0,
"step": 327
},
{
"entropy": 0.40771484375,
"epoch": 1.3015873015873016,
"grad_norm": 0.6945091615927053,
"learning_rate": 1.3015171007872161e-05,
"loss": 0.4327,
"mean_token_accuracy": 0.8508248487487435,
"num_tokens": 141002875.0,
"step": 328
},
{
"entropy": 0.410186767578125,
"epoch": 1.3055555555555556,
"grad_norm": 0.7190051589443747,
"learning_rate": 1.297342381973379e-05,
"loss": 0.4144,
"mean_token_accuracy": 0.855740231461823,
"num_tokens": 141425392.0,
"step": 329
},
{
"entropy": 0.412933349609375,
"epoch": 1.3095238095238095,
"grad_norm": 0.7230421897639797,
"learning_rate": 1.2931619706078862e-05,
"loss": 0.4101,
"mean_token_accuracy": 0.8563672862946987,
"num_tokens": 141858286.0,
"step": 330
},
{
"entropy": 0.416717529296875,
"epoch": 1.3134920634920635,
"grad_norm": 0.7164485626421909,
"learning_rate": 1.2889759467237532e-05,
"loss": 0.4104,
"mean_token_accuracy": 0.8578307218849659,
"num_tokens": 142279417.0,
"step": 331
},
{
"entropy": 0.4114990234375,
"epoch": 1.3174603174603174,
"grad_norm": 0.6758312572105224,
"learning_rate": 1.2847843904614474e-05,
"loss": 0.4122,
"mean_token_accuracy": 0.8550651278346777,
"num_tokens": 142698339.0,
"step": 332
},
{
"entropy": 0.409271240234375,
"epoch": 1.3214285714285714,
"grad_norm": 0.8043515777393827,
"learning_rate": 1.2805873820673509e-05,
"loss": 0.4097,
"mean_token_accuracy": 0.8561445344239473,
"num_tokens": 143128013.0,
"step": 333
},
{
"entropy": 0.4112548828125,
"epoch": 1.3253968253968254,
"grad_norm": 0.7169697385515053,
"learning_rate": 1.2763850018922257e-05,
"loss": 0.4106,
"mean_token_accuracy": 0.8560521546751261,
"num_tokens": 143561112.0,
"step": 334
},
{
"entropy": 0.412353515625,
"epoch": 1.3293650793650793,
"grad_norm": 0.7437379976550189,
"learning_rate": 1.2721773303896765e-05,
"loss": 0.4195,
"mean_token_accuracy": 0.8526777876541018,
"num_tokens": 143970890.0,
"step": 335
},
{
"entropy": 0.408599853515625,
"epoch": 1.3333333333333333,
"grad_norm": 0.8092416715710883,
"learning_rate": 1.2679644481146081e-05,
"loss": 0.4168,
"mean_token_accuracy": 0.8542767520993948,
"num_tokens": 144390223.0,
"step": 336
},
{
"entropy": 0.407928466796875,
"epoch": 1.3373015873015874,
"grad_norm": 0.7460569230441075,
"learning_rate": 1.2637464357216847e-05,
"loss": 0.4298,
"mean_token_accuracy": 0.8512799562886357,
"num_tokens": 144839957.0,
"step": 337
},
{
"entropy": 0.412689208984375,
"epoch": 1.3412698412698414,
"grad_norm": 0.7141238772525242,
"learning_rate": 1.2595233739637851e-05,
"loss": 0.4296,
"mean_token_accuracy": 0.8526173504069448,
"num_tokens": 145276276.0,
"step": 338
},
{
"entropy": 0.41314697265625,
"epoch": 1.3452380952380953,
"grad_norm": 0.7625177516979067,
"learning_rate": 1.2552953436904578e-05,
"loss": 0.4318,
"mean_token_accuracy": 0.8507062029093504,
"num_tokens": 145722320.0,
"step": 339
},
{
"entropy": 0.416168212890625,
"epoch": 1.3492063492063493,
"grad_norm": 0.7323762900468244,
"learning_rate": 1.2510624258463719e-05,
"loss": 0.4102,
"mean_token_accuracy": 0.8566265497356653,
"num_tokens": 146148957.0,
"step": 340
},
{
"entropy": 0.419464111328125,
"epoch": 1.3531746031746033,
"grad_norm": 0.720246750942426,
"learning_rate": 1.246824701469768e-05,
"loss": 0.4241,
"mean_token_accuracy": 0.8515134025365114,
"num_tokens": 146580318.0,
"step": 341
},
{
"entropy": 0.413848876953125,
"epoch": 1.3571428571428572,
"grad_norm": 0.7263345728177462,
"learning_rate": 1.2425822516909065e-05,
"loss": 0.4106,
"mean_token_accuracy": 0.8557441309094429,
"num_tokens": 146999892.0,
"step": 342
},
{
"entropy": 0.411956787109375,
"epoch": 1.3611111111111112,
"grad_norm": 0.6886575308922511,
"learning_rate": 1.2383351577305148e-05,
"loss": 0.4141,
"mean_token_accuracy": 0.8551986450329423,
"num_tokens": 147436184.0,
"step": 343
},
{
"entropy": 0.401458740234375,
"epoch": 1.3650793650793651,
"grad_norm": 0.7573997962027809,
"learning_rate": 1.2340835008982315e-05,
"loss": 0.4188,
"mean_token_accuracy": 0.8537988383322954,
"num_tokens": 147888947.0,
"step": 344
},
{
"entropy": 0.412353515625,
"epoch": 1.369047619047619,
"grad_norm": 0.702714358199938,
"learning_rate": 1.2298273625910512e-05,
"loss": 0.4268,
"mean_token_accuracy": 0.8513675974681973,
"num_tokens": 148330624.0,
"step": 345
},
{
"entropy": 0.41815185546875,
"epoch": 1.373015873015873,
"grad_norm": 0.7125776426540186,
"learning_rate": 1.2255668242917651e-05,
"loss": 0.431,
"mean_token_accuracy": 0.8508031954988837,
"num_tokens": 148771994.0,
"step": 346
},
{
"entropy": 0.41412353515625,
"epoch": 1.376984126984127,
"grad_norm": 0.7116129020149227,
"learning_rate": 1.2213019675674008e-05,
"loss": 0.4131,
"mean_token_accuracy": 0.8551193429157138,
"num_tokens": 149203608.0,
"step": 347
},
{
"entropy": 0.4124755859375,
"epoch": 1.380952380952381,
"grad_norm": 0.6802674847268556,
"learning_rate": 1.2170328740676613e-05,
"loss": 0.4155,
"mean_token_accuracy": 0.8542332891374826,
"num_tokens": 149626353.0,
"step": 348
},
{
"entropy": 0.412689208984375,
"epoch": 1.3849206349206349,
"grad_norm": 0.7065345082327148,
"learning_rate": 1.2127596255233622e-05,
"loss": 0.4104,
"mean_token_accuracy": 0.8574284976348281,
"num_tokens": 150036189.0,
"step": 349
},
{
"entropy": 0.41180419921875,
"epoch": 1.3888888888888888,
"grad_norm": 0.6478009510663947,
"learning_rate": 1.2084823037448654e-05,
"loss": 0.4027,
"mean_token_accuracy": 0.858050768263638,
"num_tokens": 150484433.0,
"step": 350
},
{
"entropy": 0.411163330078125,
"epoch": 1.3928571428571428,
"grad_norm": 0.6867982734460257,
"learning_rate": 1.2042009906205152e-05,
"loss": 0.4141,
"mean_token_accuracy": 0.8563978290185332,
"num_tokens": 150916869.0,
"step": 351
},
{
"entropy": 0.413421630859375,
"epoch": 1.3968253968253967,
"grad_norm": 0.7528102622705124,
"learning_rate": 1.1999157681150683e-05,
"loss": 0.4231,
"mean_token_accuracy": 0.8521570805460215,
"num_tokens": 151351171.0,
"step": 352
},
{
"entropy": 0.4088134765625,
"epoch": 1.4007936507936507,
"grad_norm": 0.7335651303479648,
"learning_rate": 1.1956267182681265e-05,
"loss": 0.4134,
"mean_token_accuracy": 0.8541133729740977,
"num_tokens": 151779921.0,
"step": 353
},
{
"entropy": 0.408721923828125,
"epoch": 1.4047619047619047,
"grad_norm": 0.7419182720532009,
"learning_rate": 1.1913339231925642e-05,
"loss": 0.4256,
"mean_token_accuracy": 0.850860440172255,
"num_tokens": 152198704.0,
"step": 354
},
{
"entropy": 0.412109375,
"epoch": 1.4087301587301586,
"grad_norm": 0.6775984558339723,
"learning_rate": 1.1870374650729582e-05,
"loss": 0.4096,
"mean_token_accuracy": 0.8563865106552839,
"num_tokens": 152607114.0,
"step": 355
},
{
"entropy": 0.4183349609375,
"epoch": 1.4126984126984126,
"grad_norm": 0.6888120701305268,
"learning_rate": 1.1827374261640128e-05,
"loss": 0.4131,
"mean_token_accuracy": 0.856896661221981,
"num_tokens": 153027562.0,
"step": 356
},
{
"entropy": 0.416961669921875,
"epoch": 1.4166666666666667,
"grad_norm": 0.6818388664948022,
"learning_rate": 1.1784338887889858e-05,
"loss": 0.4057,
"mean_token_accuracy": 0.8576616421341896,
"num_tokens": 153449258.0,
"step": 357
},
{
"entropy": 0.41827392578125,
"epoch": 1.4206349206349207,
"grad_norm": 0.6735012808919367,
"learning_rate": 1.1741269353381128e-05,
"loss": 0.4119,
"mean_token_accuracy": 0.8539746999740601,
"num_tokens": 153863890.0,
"step": 358
},
{
"entropy": 0.417083740234375,
"epoch": 1.4246031746031746,
"grad_norm": 0.7056639670954223,
"learning_rate": 1.1698166482670293e-05,
"loss": 0.4136,
"mean_token_accuracy": 0.8554719127714634,
"num_tokens": 154280797.0,
"step": 359
},
{
"entropy": 0.415863037109375,
"epoch": 1.4285714285714286,
"grad_norm": 0.6586008605306323,
"learning_rate": 1.165503110095191e-05,
"loss": 0.4163,
"mean_token_accuracy": 0.8539502024650574,
"num_tokens": 154707913.0,
"step": 360
},
{
"entropy": 0.41082763671875,
"epoch": 1.4325396825396826,
"grad_norm": 0.72245972178782,
"learning_rate": 1.1611864034042972e-05,
"loss": 0.4094,
"mean_token_accuracy": 0.8547179391607642,
"num_tokens": 155150096.0,
"step": 361
},
{
"entropy": 0.407867431640625,
"epoch": 1.4365079365079365,
"grad_norm": 0.6648391208069424,
"learning_rate": 1.1568666108367066e-05,
"loss": 0.4103,
"mean_token_accuracy": 0.8559131594374776,
"num_tokens": 155590050.0,
"step": 362
},
{
"entropy": 0.41180419921875,
"epoch": 1.4404761904761905,
"grad_norm": 0.6867753073723831,
"learning_rate": 1.1525438150938554e-05,
"loss": 0.4133,
"mean_token_accuracy": 0.8570982730016112,
"num_tokens": 156016093.0,
"step": 363
},
{
"entropy": 0.41387939453125,
"epoch": 1.4444444444444444,
"grad_norm": 0.617533443654073,
"learning_rate": 1.1482180989346771e-05,
"loss": 0.4084,
"mean_token_accuracy": 0.8573052315041423,
"num_tokens": 156449879.0,
"step": 364
},
{
"entropy": 0.42047119140625,
"epoch": 1.4484126984126984,
"grad_norm": 0.702255339155768,
"learning_rate": 1.1438895451740141e-05,
"loss": 0.4021,
"mean_token_accuracy": 0.8589789541438222,
"num_tokens": 156866761.0,
"step": 365
},
{
"entropy": 0.416290283203125,
"epoch": 1.4523809523809523,
"grad_norm": 0.6723521380171534,
"learning_rate": 1.1395582366810348e-05,
"loss": 0.3975,
"mean_token_accuracy": 0.8603022275492549,
"num_tokens": 157304143.0,
"step": 366
},
{
"entropy": 0.414306640625,
"epoch": 1.4563492063492063,
"grad_norm": 0.6310156884325218,
"learning_rate": 1.135224256377646e-05,
"loss": 0.4337,
"mean_token_accuracy": 0.8482790300622582,
"num_tokens": 157758390.0,
"step": 367
},
{
"entropy": 0.416717529296875,
"epoch": 1.4603174603174602,
"grad_norm": 0.6674900430117614,
"learning_rate": 1.1308876872369062e-05,
"loss": 0.4057,
"mean_token_accuracy": 0.8564106421545148,
"num_tokens": 158177988.0,
"step": 368
},
{
"entropy": 0.406097412109375,
"epoch": 1.4642857142857144,
"grad_norm": 0.6565941986269037,
"learning_rate": 1.1265486122814359e-05,
"loss": 0.418,
"mean_token_accuracy": 0.853111038915813,
"num_tokens": 158634349.0,
"step": 369
},
{
"entropy": 0.412628173828125,
"epoch": 1.4682539682539684,
"grad_norm": 0.6632104455005605,
"learning_rate": 1.1222071145818293e-05,
"loss": 0.4122,
"mean_token_accuracy": 0.8553181765601039,
"num_tokens": 159060066.0,
"step": 370
},
{
"entropy": 0.415252685546875,
"epoch": 1.4722222222222223,
"grad_norm": 0.707239372405196,
"learning_rate": 1.1178632772550636e-05,
"loss": 0.4113,
"mean_token_accuracy": 0.8547909967601299,
"num_tokens": 159490031.0,
"step": 371
},
{
"entropy": 0.40692138671875,
"epoch": 1.4761904761904763,
"grad_norm": 0.7069148314328081,
"learning_rate": 1.113517183462907e-05,
"loss": 0.412,
"mean_token_accuracy": 0.8557059289887547,
"num_tokens": 159942986.0,
"step": 372
},
{
"entropy": 0.41400146484375,
"epoch": 1.4801587301587302,
"grad_norm": 0.6897820697933245,
"learning_rate": 1.1091689164103281e-05,
"loss": 0.3906,
"mean_token_accuracy": 0.8620947021991014,
"num_tokens": 160355322.0,
"step": 373
},
{
"entropy": 0.408599853515625,
"epoch": 1.4841269841269842,
"grad_norm": 0.6751095283586555,
"learning_rate": 1.1048185593439014e-05,
"loss": 0.4147,
"mean_token_accuracy": 0.8550657378509641,
"num_tokens": 160782816.0,
"step": 374
},
{
"entropy": 0.410614013671875,
"epoch": 1.4880952380952381,
"grad_norm": 0.6972038572778951,
"learning_rate": 1.1004661955502143e-05,
"loss": 0.4148,
"mean_token_accuracy": 0.8550673946738243,
"num_tokens": 161216771.0,
"step": 375
},
{
"entropy": 0.412353515625,
"epoch": 1.492063492063492,
"grad_norm": 0.6351069017407472,
"learning_rate": 1.0961119083542727e-05,
"loss": 0.3967,
"mean_token_accuracy": 0.8618287779390812,
"num_tokens": 161643512.0,
"step": 376
},
{
"entropy": 0.407440185546875,
"epoch": 1.496031746031746,
"grad_norm": 0.6819391026589559,
"learning_rate": 1.0917557811179057e-05,
"loss": 0.4052,
"mean_token_accuracy": 0.8588421484455466,
"num_tokens": 162077647.0,
"step": 377
},
{
"entropy": 0.4102783203125,
"epoch": 1.5,
"grad_norm": 0.6625502386920177,
"learning_rate": 1.0873978972381692e-05,
"loss": 0.3982,
"mean_token_accuracy": 0.859605161473155,
"num_tokens": 162503001.0,
"step": 378
},
{
"entropy": 0.411865234375,
"epoch": 1.503968253968254,
"grad_norm": 0.651583717791506,
"learning_rate": 1.0830383401457499e-05,
"loss": 0.4195,
"mean_token_accuracy": 0.8546869652345777,
"num_tokens": 162949686.0,
"step": 379
},
{
"entropy": 0.405609130859375,
"epoch": 1.507936507936508,
"grad_norm": 0.6806902067734124,
"learning_rate": 1.0786771933033677e-05,
"loss": 0.4037,
"mean_token_accuracy": 0.8567021545022726,
"num_tokens": 163388010.0,
"step": 380
},
{
"entropy": 0.40545654296875,
"epoch": 1.5119047619047619,
"grad_norm": 0.6568305792093015,
"learning_rate": 1.0743145402041781e-05,
"loss": 0.3984,
"mean_token_accuracy": 0.8587896954268217,
"num_tokens": 163816567.0,
"step": 381
},
{
"entropy": 0.406280517578125,
"epoch": 1.5158730158730158,
"grad_norm": 0.6687688963859889,
"learning_rate": 1.0699504643701732e-05,
"loss": 0.4051,
"mean_token_accuracy": 0.8573078708723187,
"num_tokens": 164270399.0,
"step": 382
},
{
"entropy": 0.402984619140625,
"epoch": 1.5198412698412698,
"grad_norm": 0.6205824056394684,
"learning_rate": 1.0655850493505834e-05,
"loss": 0.3876,
"mean_token_accuracy": 0.8629192840307951,
"num_tokens": 164712402.0,
"step": 383
},
{
"entropy": 0.401519775390625,
"epoch": 1.5238095238095237,
"grad_norm": 0.6946581962210521,
"learning_rate": 1.0612183787202768e-05,
"loss": 0.4147,
"mean_token_accuracy": 0.8557152729481459,
"num_tokens": 165155523.0,
"step": 384
},
{
"entropy": 0.40338134765625,
"epoch": 1.5277777777777777,
"grad_norm": 0.6178420541561561,
"learning_rate": 1.0568505360781606e-05,
"loss": 0.3841,
"mean_token_accuracy": 0.8645893288776278,
"num_tokens": 165575993.0,
"step": 385
},
{
"entropy": 0.40411376953125,
"epoch": 1.5317460317460316,
"grad_norm": 0.6997457239337582,
"learning_rate": 1.0524816050455801e-05,
"loss": 0.4145,
"mean_token_accuracy": 0.8545086095109582,
"num_tokens": 166004219.0,
"step": 386
},
{
"entropy": 0.404937744140625,
"epoch": 1.5357142857142856,
"grad_norm": 0.6344135411115165,
"learning_rate": 1.0481116692647165e-05,
"loss": 0.3977,
"mean_token_accuracy": 0.8587166350334883,
"num_tokens": 166459333.0,
"step": 387
},
{
"entropy": 0.40777587890625,
"epoch": 1.5396825396825395,
"grad_norm": 0.6634593666477472,
"learning_rate": 1.0437408123969877e-05,
"loss": 0.4007,
"mean_token_accuracy": 0.8594609973952174,
"num_tokens": 166887486.0,
"step": 388
},
{
"entropy": 0.4075927734375,
"epoch": 1.5436507936507935,
"grad_norm": 0.6684250544789218,
"learning_rate": 1.039369118121445e-05,
"loss": 0.4136,
"mean_token_accuracy": 0.8562117423862219,
"num_tokens": 167305651.0,
"step": 389
},
{
"entropy": 0.402740478515625,
"epoch": 1.5476190476190477,
"grad_norm": 0.6921394211692032,
"learning_rate": 1.0349966701331721e-05,
"loss": 0.4043,
"mean_token_accuracy": 0.8599712895229459,
"num_tokens": 167743608.0,
"step": 390
},
{
"entropy": 0.409759521484375,
"epoch": 1.5515873015873016,
"grad_norm": 0.6424341617432388,
"learning_rate": 1.0306235521416822e-05,
"loss": 0.4002,
"mean_token_accuracy": 0.860035234130919,
"num_tokens": 168189361.0,
"step": 391
},
{
"entropy": 0.41387939453125,
"epoch": 1.5555555555555556,
"grad_norm": 0.6897265091020467,
"learning_rate": 1.0262498478693148e-05,
"loss": 0.4003,
"mean_token_accuracy": 0.8587908744812012,
"num_tokens": 168602032.0,
"step": 392
},
{
"entropy": 0.40863037109375,
"epoch": 1.5595238095238095,
"grad_norm": 0.6945381133940456,
"learning_rate": 1.0218756410496353e-05,
"loss": 0.4068,
"mean_token_accuracy": 0.8557405965402722,
"num_tokens": 169036397.0,
"step": 393
},
{
"entropy": 0.404693603515625,
"epoch": 1.5634920634920635,
"grad_norm": 0.6424876592982417,
"learning_rate": 1.0175010154258288e-05,
"loss": 0.4059,
"mean_token_accuracy": 0.8577195946127176,
"num_tokens": 169469975.0,
"step": 394
},
{
"entropy": 0.397216796875,
"epoch": 1.5674603174603174,
"grad_norm": 0.6360455374207441,
"learning_rate": 1.013126054749099e-05,
"loss": 0.4075,
"mean_token_accuracy": 0.8579382970929146,
"num_tokens": 169905002.0,
"step": 395
},
{
"entropy": 0.4017333984375,
"epoch": 1.5714285714285714,
"grad_norm": 0.6606478789612956,
"learning_rate": 1.0087508427770639e-05,
"loss": 0.4025,
"mean_token_accuracy": 0.8562004147097468,
"num_tokens": 170343282.0,
"step": 396
},
{
"entropy": 0.408935546875,
"epoch": 1.5753968253968254,
"grad_norm": 0.6987795798888022,
"learning_rate": 1.0043754632721519e-05,
"loss": 0.3966,
"mean_token_accuracy": 0.8587719267234206,
"num_tokens": 170783254.0,
"step": 397
},
{
"entropy": 0.406951904296875,
"epoch": 1.5793650793650795,
"grad_norm": 0.6593916685637742,
"learning_rate": 1e-05,
"loss": 0.414,
"mean_token_accuracy": 0.8565381094813347,
"num_tokens": 171227432.0,
"step": 398
},
{
"entropy": 0.4052734375,
"epoch": 1.5833333333333335,
"grad_norm": 0.6987433063006335,
"learning_rate": 9.956245367278483e-06,
"loss": 0.386,
"mean_token_accuracy": 0.8624997651204467,
"num_tokens": 171673565.0,
"step": 399
},
{
"entropy": 0.4105224609375,
"epoch": 1.5873015873015874,
"grad_norm": 0.672307925393411,
"learning_rate": 9.912491572229366e-06,
"loss": 0.3915,
"mean_token_accuracy": 0.8613409381359816,
"num_tokens": 172096305.0,
"step": 400
},
{
"entropy": 0.409149169921875,
"epoch": 1.5912698412698414,
"grad_norm": 0.7041365935845996,
"learning_rate": 9.868739452509011e-06,
"loss": 0.3931,
"mean_token_accuracy": 0.8597009964287281,
"num_tokens": 172495298.0,
"step": 401
},
{
"entropy": 0.40899658203125,
"epoch": 1.5952380952380953,
"grad_norm": 0.7517004342308762,
"learning_rate": 9.824989845741713e-06,
"loss": 0.3972,
"mean_token_accuracy": 0.8588024405762553,
"num_tokens": 172910673.0,
"step": 402
},
{
"entropy": 0.40240478515625,
"epoch": 1.5992063492063493,
"grad_norm": 0.6198853423830931,
"learning_rate": 9.78124358950365e-06,
"loss": 0.3954,
"mean_token_accuracy": 0.8617026535794139,
"num_tokens": 173326209.0,
"step": 403
},
{
"entropy": 0.403167724609375,
"epoch": 1.6031746031746033,
"grad_norm": 0.684199049613554,
"learning_rate": 9.737501521306855e-06,
"loss": 0.3994,
"mean_token_accuracy": 0.8593406956642866,
"num_tokens": 173775762.0,
"step": 404
},
{
"entropy": 0.410186767578125,
"epoch": 1.6071428571428572,
"grad_norm": 0.7341393365653166,
"learning_rate": 9.693764478583185e-06,
"loss": 0.3975,
"mean_token_accuracy": 0.8609625976532698,
"num_tokens": 174200041.0,
"step": 405
},
{
"entropy": 0.402435302734375,
"epoch": 1.6111111111111112,
"grad_norm": 0.6781454981761531,
"learning_rate": 9.65003329866828e-06,
"loss": 0.4042,
"mean_token_accuracy": 0.8575213002040982,
"num_tokens": 174651858.0,
"step": 406
},
{
"entropy": 0.408233642578125,
"epoch": 1.6150793650793651,
"grad_norm": 0.6317009902021083,
"learning_rate": 9.606308818785552e-06,
"loss": 0.3867,
"mean_token_accuracy": 0.8641856899484992,
"num_tokens": 175081173.0,
"step": 407
},
{
"entropy": 0.402801513671875,
"epoch": 1.619047619047619,
"grad_norm": 0.6752157911204306,
"learning_rate": 9.562591876030127e-06,
"loss": 0.3937,
"mean_token_accuracy": 0.8606289671733975,
"num_tokens": 175519282.0,
"step": 408
},
{
"entropy": 0.406097412109375,
"epoch": 1.623015873015873,
"grad_norm": 0.7258956750619963,
"learning_rate": 9.518883307352839e-06,
"loss": 0.4086,
"mean_token_accuracy": 0.8588223177939653,
"num_tokens": 175965036.0,
"step": 409
},
{
"entropy": 0.405303955078125,
"epoch": 1.626984126984127,
"grad_norm": 0.6365492747842574,
"learning_rate": 9.475183949544204e-06,
"loss": 0.4104,
"mean_token_accuracy": 0.8550521014258265,
"num_tokens": 176387199.0,
"step": 410
},
{
"entropy": 0.40435791015625,
"epoch": 1.630952380952381,
"grad_norm": 0.6803954542416505,
"learning_rate": 9.431494639218397e-06,
"loss": 0.3969,
"mean_token_accuracy": 0.8615404982119799,
"num_tokens": 176823428.0,
"step": 411
},
{
"entropy": 0.39996337890625,
"epoch": 1.6349206349206349,
"grad_norm": 0.6352543229598833,
"learning_rate": 9.387816212797233e-06,
"loss": 0.4023,
"mean_token_accuracy": 0.8599696168676019,
"num_tokens": 177264131.0,
"step": 412
},
{
"entropy": 0.408111572265625,
"epoch": 1.6388888888888888,
"grad_norm": 0.6536962265542489,
"learning_rate": 9.344149506494169e-06,
"loss": 0.3952,
"mean_token_accuracy": 0.8607912426814437,
"num_tokens": 177680911.0,
"step": 413
},
{
"entropy": 0.406005859375,
"epoch": 1.6428571428571428,
"grad_norm": 0.6661123978510443,
"learning_rate": 9.30049535629827e-06,
"loss": 0.4014,
"mean_token_accuracy": 0.859113815240562,
"num_tokens": 178114003.0,
"step": 414
},
{
"entropy": 0.410400390625,
"epoch": 1.6468253968253967,
"grad_norm": 0.6902363753958481,
"learning_rate": 9.256854597958222e-06,
"loss": 0.4146,
"mean_token_accuracy": 0.8541763303801417,
"num_tokens": 178533077.0,
"step": 415
},
{
"entropy": 0.408782958984375,
"epoch": 1.6507936507936507,
"grad_norm": 0.7030439443784253,
"learning_rate": 9.213228066966328e-06,
"loss": 0.3924,
"mean_token_accuracy": 0.8622600650414824,
"num_tokens": 178950487.0,
"step": 416
},
{
"entropy": 0.405303955078125,
"epoch": 1.6547619047619047,
"grad_norm": 0.6475133352507451,
"learning_rate": 9.169616598542503e-06,
"loss": 0.3994,
"mean_token_accuracy": 0.8593355258926749,
"num_tokens": 179384739.0,
"step": 417
},
{
"entropy": 0.399627685546875,
"epoch": 1.6587301587301586,
"grad_norm": 0.6723065907691588,
"learning_rate": 9.126021027618312e-06,
"loss": 0.4085,
"mean_token_accuracy": 0.8555388646200299,
"num_tokens": 179833212.0,
"step": 418
},
{
"entropy": 0.4014892578125,
"epoch": 1.6626984126984126,
"grad_norm": 0.6653095194789432,
"learning_rate": 9.082442188820947e-06,
"loss": 0.391,
"mean_token_accuracy": 0.8626063298434019,
"num_tokens": 180259940.0,
"step": 419
},
{
"entropy": 0.396484375,
"epoch": 1.6666666666666665,
"grad_norm": 0.6734067922751799,
"learning_rate": 9.038880916457276e-06,
"loss": 0.3954,
"mean_token_accuracy": 0.8605092065408826,
"num_tokens": 180712234.0,
"step": 420
},
{
"entropy": 0.40069580078125,
"epoch": 1.6706349206349205,
"grad_norm": 0.7248437901702897,
"learning_rate": 8.995338044497862e-06,
"loss": 0.4161,
"mean_token_accuracy": 0.8539806362241507,
"num_tokens": 181151354.0,
"step": 421
},
{
"entropy": 0.407073974609375,
"epoch": 1.6746031746031746,
"grad_norm": 0.6569712167965962,
"learning_rate": 8.951814406560988e-06,
"loss": 0.397,
"mean_token_accuracy": 0.8575204182416201,
"num_tokens": 181566490.0,
"step": 422
},
{
"entropy": 0.40673828125,
"epoch": 1.6785714285714286,
"grad_norm": 0.6763632648533178,
"learning_rate": 8.90831083589672e-06,
"loss": 0.4021,
"mean_token_accuracy": 0.8591755600646138,
"num_tokens": 181997420.0,
"step": 423
},
{
"entropy": 0.409881591796875,
"epoch": 1.6825396825396826,
"grad_norm": 0.6793544531923543,
"learning_rate": 8.864828165370932e-06,
"loss": 0.396,
"mean_token_accuracy": 0.8604298168793321,
"num_tokens": 182413254.0,
"step": 424
},
{
"entropy": 0.4053955078125,
"epoch": 1.6865079365079365,
"grad_norm": 0.6619199494789456,
"learning_rate": 8.821367227449368e-06,
"loss": 0.3906,
"mean_token_accuracy": 0.862027888186276,
"num_tokens": 182840621.0,
"step": 425
},
{
"entropy": 0.405548095703125,
"epoch": 1.6904761904761905,
"grad_norm": 0.6604274179296861,
"learning_rate": 8.77792885418171e-06,
"loss": 0.3975,
"mean_token_accuracy": 0.8592646988108754,
"num_tokens": 183269538.0,
"step": 426
},
{
"entropy": 0.410430908203125,
"epoch": 1.6944444444444444,
"grad_norm": 0.6568683879895119,
"learning_rate": 8.734513877185644e-06,
"loss": 0.3838,
"mean_token_accuracy": 0.8651055432856083,
"num_tokens": 183684845.0,
"step": 427
},
{
"entropy": 0.4075927734375,
"epoch": 1.6984126984126984,
"grad_norm": 0.6704945921600994,
"learning_rate": 8.691123127630942e-06,
"loss": 0.3902,
"mean_token_accuracy": 0.8633867194876075,
"num_tokens": 184109496.0,
"step": 428
},
{
"entropy": 0.407073974609375,
"epoch": 1.7023809523809523,
"grad_norm": 0.7170767243747824,
"learning_rate": 8.647757436223543e-06,
"loss": 0.4135,
"mean_token_accuracy": 0.8537504924461246,
"num_tokens": 184533568.0,
"step": 429
},
{
"entropy": 0.4056396484375,
"epoch": 1.7063492063492065,
"grad_norm": 0.6949542788037961,
"learning_rate": 8.604417633189658e-06,
"loss": 0.4013,
"mean_token_accuracy": 0.8596660671755672,
"num_tokens": 184968366.0,
"step": 430
},
{
"entropy": 0.406036376953125,
"epoch": 1.7103174603174605,
"grad_norm": 0.6318993348714297,
"learning_rate": 8.561104548259864e-06,
"loss": 0.4005,
"mean_token_accuracy": 0.8592518717050552,
"num_tokens": 185404884.0,
"step": 431
},
{
"entropy": 0.4052734375,
"epoch": 1.7142857142857144,
"grad_norm": 0.7150048622751305,
"learning_rate": 8.517819010653234e-06,
"loss": 0.4082,
"mean_token_accuracy": 0.8565432196483016,
"num_tokens": 185857166.0,
"step": 432
},
{
"entropy": 0.4102783203125,
"epoch": 1.7182539682539684,
"grad_norm": 0.6984571949535301,
"learning_rate": 8.474561849061446e-06,
"loss": 0.385,
"mean_token_accuracy": 0.8628620821982622,
"num_tokens": 186278301.0,
"step": 433
},
{
"entropy": 0.413604736328125,
"epoch": 1.7222222222222223,
"grad_norm": 0.6927790727966183,
"learning_rate": 8.431333891632937e-06,
"loss": 0.3857,
"mean_token_accuracy": 0.8627250017598271,
"num_tokens": 186684767.0,
"step": 434
},
{
"entropy": 0.404876708984375,
"epoch": 1.7261904761904763,
"grad_norm": 0.6958224617466756,
"learning_rate": 8.388135965957031e-06,
"loss": 0.3926,
"mean_token_accuracy": 0.8619824200868607,
"num_tokens": 187107470.0,
"step": 435
},
{
"entropy": 0.399261474609375,
"epoch": 1.7301587301587302,
"grad_norm": 0.6668337287163626,
"learning_rate": 8.344968899048093e-06,
"loss": 0.3852,
"mean_token_accuracy": 0.8640718599781394,
"num_tokens": 187534641.0,
"step": 436
},
{
"entropy": 0.399871826171875,
"epoch": 1.7341269841269842,
"grad_norm": 0.660340307861853,
"learning_rate": 8.301833517329714e-06,
"loss": 0.3941,
"mean_token_accuracy": 0.8606917411088943,
"num_tokens": 187972052.0,
"step": 437
},
{
"entropy": 0.4000244140625,
"epoch": 1.7380952380952381,
"grad_norm": 0.7449016377460825,
"learning_rate": 8.258730646618872e-06,
"loss": 0.3883,
"mean_token_accuracy": 0.8606307609006763,
"num_tokens": 188403747.0,
"step": 438
},
{
"entropy": 0.40106201171875,
"epoch": 1.742063492063492,
"grad_norm": 0.6657481910760707,
"learning_rate": 8.215661112110143e-06,
"loss": 0.3932,
"mean_token_accuracy": 0.8618741119280457,
"num_tokens": 188833376.0,
"step": 439
},
{
"entropy": 0.397735595703125,
"epoch": 1.746031746031746,
"grad_norm": 0.6529962126614087,
"learning_rate": 8.172625738359876e-06,
"loss": 0.3923,
"mean_token_accuracy": 0.8605843409895897,
"num_tokens": 189286051.0,
"step": 440
},
{
"entropy": 0.409576416015625,
"epoch": 1.75,
"grad_norm": 0.6520639060375251,
"learning_rate": 8.12962534927042e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.8636613693088293,
"num_tokens": 189700932.0,
"step": 441
},
{
"entropy": 0.4051513671875,
"epoch": 1.753968253968254,
"grad_norm": 0.6752597049435843,
"learning_rate": 8.08666076807436e-06,
"loss": 0.392,
"mean_token_accuracy": 0.859367199242115,
"num_tokens": 190135846.0,
"step": 442
},
{
"entropy": 0.39495849609375,
"epoch": 1.757936507936508,
"grad_norm": 0.6405008649597489,
"learning_rate": 8.043732817318736e-06,
"loss": 0.3953,
"mean_token_accuracy": 0.8626237865537405,
"num_tokens": 190599539.0,
"step": 443
},
{
"entropy": 0.40020751953125,
"epoch": 1.7619047619047619,
"grad_norm": 0.7045668566293105,
"learning_rate": 8.000842318849317e-06,
"loss": 0.3977,
"mean_token_accuracy": 0.8612882681190968,
"num_tokens": 191023956.0,
"step": 444
},
{
"entropy": 0.412139892578125,
"epoch": 1.7658730158730158,
"grad_norm": 0.6309864360582974,
"learning_rate": 7.95799009379485e-06,
"loss": 0.3714,
"mean_token_accuracy": 0.8665594831109047,
"num_tokens": 191419256.0,
"step": 445
},
{
"entropy": 0.400787353515625,
"epoch": 1.7698412698412698,
"grad_norm": 0.6313143549912045,
"learning_rate": 7.915176962551347e-06,
"loss": 0.3848,
"mean_token_accuracy": 0.8646227335557342,
"num_tokens": 191865715.0,
"step": 446
},
{
"entropy": 0.40869140625,
"epoch": 1.7738095238095237,
"grad_norm": 0.6791552619696869,
"learning_rate": 7.872403744766383e-06,
"loss": 0.403,
"mean_token_accuracy": 0.8603015225380659,
"num_tokens": 192279544.0,
"step": 447
},
{
"entropy": 0.40484619140625,
"epoch": 1.7777777777777777,
"grad_norm": 0.6497182250849004,
"learning_rate": 7.82967125932339e-06,
"loss": 0.3872,
"mean_token_accuracy": 0.8622237564995885,
"num_tokens": 192687536.0,
"step": 448
},
{
"entropy": 0.405914306640625,
"epoch": 1.7817460317460316,
"grad_norm": 0.6473578509768539,
"learning_rate": 7.786980324325994e-06,
"loss": 0.3886,
"mean_token_accuracy": 0.8661008570343256,
"num_tokens": 193092369.0,
"step": 449
},
{
"entropy": 0.408721923828125,
"epoch": 1.7857142857142856,
"grad_norm": 0.6624737663531524,
"learning_rate": 7.74433175708235e-06,
"loss": 0.3794,
"mean_token_accuracy": 0.8674081796780229,
"num_tokens": 193514317.0,
"step": 450
},
{
"entropy": 0.400177001953125,
"epoch": 1.7896825396825395,
"grad_norm": 0.6173586758665184,
"learning_rate": 7.70172637408949e-06,
"loss": 0.3859,
"mean_token_accuracy": 0.8648398378863931,
"num_tokens": 193943892.0,
"step": 451
},
{
"entropy": 0.39544677734375,
"epoch": 1.7936507936507935,
"grad_norm": 0.6882345053798201,
"learning_rate": 7.659164991017689e-06,
"loss": 0.3816,
"mean_token_accuracy": 0.864141782745719,
"num_tokens": 194368425.0,
"step": 452
},
{
"entropy": 0.398651123046875,
"epoch": 1.7976190476190477,
"grad_norm": 0.643498270336236,
"learning_rate": 7.616648422694858e-06,
"loss": 0.3886,
"mean_token_accuracy": 0.8627204354852438,
"num_tokens": 194781122.0,
"step": 453
},
{
"entropy": 0.398651123046875,
"epoch": 1.8015873015873016,
"grad_norm": 0.6388429129984551,
"learning_rate": 7.5741774830909375e-06,
"loss": 0.3865,
"mean_token_accuracy": 0.8640177240595222,
"num_tokens": 195229420.0,
"step": 454
},
{
"entropy": 0.3966064453125,
"epoch": 1.8055555555555556,
"grad_norm": 0.6756670134886724,
"learning_rate": 7.531752985302323e-06,
"loss": 0.3922,
"mean_token_accuracy": 0.8626740667968988,
"num_tokens": 195670858.0,
"step": 455
},
{
"entropy": 0.403045654296875,
"epoch": 1.8095238095238095,
"grad_norm": 0.673993885311234,
"learning_rate": 7.489375741536283e-06,
"loss": 0.3958,
"mean_token_accuracy": 0.8607546780258417,
"num_tokens": 196086060.0,
"step": 456
},
{
"entropy": 0.40087890625,
"epoch": 1.8134920634920635,
"grad_norm": 0.6550524224232206,
"learning_rate": 7.447046563095425e-06,
"loss": 0.3957,
"mean_token_accuracy": 0.8578624930232763,
"num_tokens": 196534067.0,
"step": 457
},
{
"entropy": 0.4056396484375,
"epoch": 1.8174603174603174,
"grad_norm": 0.6241087307689689,
"learning_rate": 7.404766260362153e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.8643308812752366,
"num_tokens": 196949752.0,
"step": 458
},
{
"entropy": 0.40155029296875,
"epoch": 1.8214285714285714,
"grad_norm": 0.6333219557811135,
"learning_rate": 7.362535642783155e-06,
"loss": 0.3816,
"mean_token_accuracy": 0.8649132940918207,
"num_tokens": 197361623.0,
"step": 459
},
{
"entropy": 0.399658203125,
"epoch": 1.8253968253968254,
"grad_norm": 0.6379379135553368,
"learning_rate": 7.320355518853921e-06,
"loss": 0.3859,
"mean_token_accuracy": 0.8645495921373367,
"num_tokens": 197787383.0,
"step": 460
},
{
"entropy": 0.398345947265625,
"epoch": 1.8293650793650795,
"grad_norm": 0.6588997380159366,
"learning_rate": 7.278226696103239e-06,
"loss": 0.3924,
"mean_token_accuracy": 0.8615701934322715,
"num_tokens": 198214788.0,
"step": 461
},
{
"entropy": 0.40325927734375,
"epoch": 1.8333333333333335,
"grad_norm": 0.6430673965041483,
"learning_rate": 7.236149981077746e-06,
"loss": 0.3974,
"mean_token_accuracy": 0.8626204943284392,
"num_tokens": 198640204.0,
"step": 462
},
{
"entropy": 0.3955078125,
"epoch": 1.8373015873015874,
"grad_norm": 0.6487127679328336,
"learning_rate": 7.194126179326497e-06,
"loss": 0.3985,
"mean_token_accuracy": 0.8603583332151175,
"num_tokens": 199086174.0,
"step": 463
},
{
"entropy": 0.395599365234375,
"epoch": 1.8412698412698414,
"grad_norm": 0.6624716705219701,
"learning_rate": 7.1521560953855274e-06,
"loss": 0.3894,
"mean_token_accuracy": 0.8631916120648384,
"num_tokens": 199534945.0,
"step": 464
},
{
"entropy": 0.40576171875,
"epoch": 1.8452380952380953,
"grad_norm": 0.6699916552832386,
"learning_rate": 7.110240532762469e-06,
"loss": 0.3878,
"mean_token_accuracy": 0.862906139343977,
"num_tokens": 199970879.0,
"step": 465
},
{
"entropy": 0.406005859375,
"epoch": 1.8492063492063493,
"grad_norm": 0.6365324815628914,
"learning_rate": 7.068380293921142e-06,
"loss": 0.3794,
"mean_token_accuracy": 0.8647226821631193,
"num_tokens": 200401566.0,
"step": 466
},
{
"entropy": 0.403961181640625,
"epoch": 1.8531746031746033,
"grad_norm": 0.6659380259679744,
"learning_rate": 7.026576180266213e-06,
"loss": 0.3795,
"mean_token_accuracy": 0.8645898820832372,
"num_tokens": 200819281.0,
"step": 467
},
{
"entropy": 0.400146484375,
"epoch": 1.8571428571428572,
"grad_norm": 0.6623575240954482,
"learning_rate": 6.984828992127842e-06,
"loss": 0.3869,
"mean_token_accuracy": 0.8644722169265151,
"num_tokens": 201286569.0,
"step": 468
},
{
"entropy": 0.399322509765625,
"epoch": 1.8611111111111112,
"grad_norm": 0.6428412382384305,
"learning_rate": 6.9431395287463655e-06,
"loss": 0.3817,
"mean_token_accuracy": 0.8629525965079665,
"num_tokens": 201729117.0,
"step": 469
},
{
"entropy": 0.4012451171875,
"epoch": 1.8650793650793651,
"grad_norm": 0.6590064495664711,
"learning_rate": 6.9015085882569866e-06,
"loss": 0.3845,
"mean_token_accuracy": 0.8646084098145366,
"num_tokens": 202148785.0,
"step": 470
},
{
"entropy": 0.401275634765625,
"epoch": 1.869047619047619,
"grad_norm": 0.6644154480782758,
"learning_rate": 6.859936967674509e-06,
"loss": 0.3833,
"mean_token_accuracy": 0.8641043901443481,
"num_tokens": 202562335.0,
"step": 471
},
{
"entropy": 0.408447265625,
"epoch": 1.873015873015873,
"grad_norm": 0.6874013779210217,
"learning_rate": 6.818425462878071e-06,
"loss": 0.3786,
"mean_token_accuracy": 0.8654429130256176,
"num_tokens": 202969412.0,
"step": 472
},
{
"entropy": 0.399871826171875,
"epoch": 1.876984126984127,
"grad_norm": 0.6419195701464625,
"learning_rate": 6.776974868595898e-06,
"loss": 0.3855,
"mean_token_accuracy": 0.8635794082656503,
"num_tokens": 203414579.0,
"step": 473
},
{
"entropy": 0.40521240234375,
"epoch": 1.880952380952381,
"grad_norm": 0.6373613162236946,
"learning_rate": 6.735585978390105e-06,
"loss": 0.3821,
"mean_token_accuracy": 0.8647842686623335,
"num_tokens": 203845826.0,
"step": 474
},
{
"entropy": 0.40625,
"epoch": 1.8849206349206349,
"grad_norm": 0.7484479121963832,
"learning_rate": 6.694259584641496e-06,
"loss": 0.3879,
"mean_token_accuracy": 0.862905758433044,
"num_tokens": 204272914.0,
"step": 475
},
{
"entropy": 0.401580810546875,
"epoch": 1.8888888888888888,
"grad_norm": 0.6661868398653255,
"learning_rate": 6.652996478534395e-06,
"loss": 0.3772,
"mean_token_accuracy": 0.8666380383074284,
"num_tokens": 204713067.0,
"step": 476
},
{
"entropy": 0.40240478515625,
"epoch": 1.8928571428571428,
"grad_norm": 0.6367178541736724,
"learning_rate": 6.611797450041495e-06,
"loss": 0.3824,
"mean_token_accuracy": 0.8640545001253486,
"num_tokens": 205140799.0,
"step": 477
},
{
"entropy": 0.410491943359375,
"epoch": 1.8968253968253967,
"grad_norm": 0.6411794902398776,
"learning_rate": 6.570663287908744e-06,
"loss": 0.3759,
"mean_token_accuracy": 0.8667557742446661,
"num_tokens": 205549482.0,
"step": 478
},
{
"entropy": 0.39971923828125,
"epoch": 1.9007936507936507,
"grad_norm": 0.6657409283144686,
"learning_rate": 6.5295947796402315e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8628489142283797,
"num_tokens": 205957709.0,
"step": 479
},
{
"entropy": 0.399810791015625,
"epoch": 1.9047619047619047,
"grad_norm": 0.6650888096094876,
"learning_rate": 6.488592711483122e-06,
"loss": 0.3813,
"mean_token_accuracy": 0.865462708286941,
"num_tokens": 206394578.0,
"step": 480
},
{
"entropy": 0.404998779296875,
"epoch": 1.9087301587301586,
"grad_norm": 0.6520479299882604,
"learning_rate": 6.447657868412603e-06,
"loss": 0.3832,
"mean_token_accuracy": 0.8648681128397584,
"num_tokens": 206810851.0,
"step": 481
},
{
"entropy": 0.400482177734375,
"epoch": 1.9126984126984126,
"grad_norm": 0.6641867800114467,
"learning_rate": 6.406791034116846e-06,
"loss": 0.3911,
"mean_token_accuracy": 0.8639266528189182,
"num_tokens": 207233636.0,
"step": 482
},
{
"entropy": 0.39788818359375,
"epoch": 1.9166666666666665,
"grad_norm": 0.6616452561654831,
"learning_rate": 6.365992990982015e-06,
"loss": 0.3917,
"mean_token_accuracy": 0.8605171097442508,
"num_tokens": 207671663.0,
"step": 483
},
{
"entropy": 0.413970947265625,
"epoch": 1.9206349206349205,
"grad_norm": 0.6811368094477863,
"learning_rate": 6.3252645200772836e-06,
"loss": 0.3839,
"mean_token_accuracy": 0.8638743665069342,
"num_tokens": 208074376.0,
"step": 484
},
{
"entropy": 0.402008056640625,
"epoch": 1.9246031746031746,
"grad_norm": 0.6474488866243678,
"learning_rate": 6.284606401139875e-06,
"loss": 0.3933,
"mean_token_accuracy": 0.8602316891774535,
"num_tokens": 208524843.0,
"step": 485
},
{
"entropy": 0.407806396484375,
"epoch": 1.9285714285714286,
"grad_norm": 0.6704121804386468,
"learning_rate": 6.244019412560144e-06,
"loss": 0.3848,
"mean_token_accuracy": 0.863696001470089,
"num_tokens": 208947370.0,
"step": 486
},
{
"entropy": 0.401947021484375,
"epoch": 1.9325396825396826,
"grad_norm": 0.6514026741498656,
"learning_rate": 6.203504331366677e-06,
"loss": 0.3738,
"mean_token_accuracy": 0.8671468198299408,
"num_tokens": 209354451.0,
"step": 487
},
{
"entropy": 0.39605712890625,
"epoch": 1.9365079365079365,
"grad_norm": 0.6332801182132646,
"learning_rate": 6.163061933211403e-06,
"loss": 0.3815,
"mean_token_accuracy": 0.8641307642683387,
"num_tokens": 209798547.0,
"step": 488
},
{
"entropy": 0.396148681640625,
"epoch": 1.9404761904761905,
"grad_norm": 0.6747869284841455,
"learning_rate": 6.122692992354748e-06,
"loss": 0.3783,
"mean_token_accuracy": 0.8645921712741256,
"num_tokens": 210233790.0,
"step": 489
},
{
"entropy": 0.40008544921875,
"epoch": 1.9444444444444444,
"grad_norm": 0.6605152850398239,
"learning_rate": 6.082398281650823e-06,
"loss": 0.392,
"mean_token_accuracy": 0.8625458022579551,
"num_tokens": 210661829.0,
"step": 490
},
{
"entropy": 0.39251708984375,
"epoch": 1.9484126984126984,
"grad_norm": 0.6047228067514416,
"learning_rate": 6.0421785725326085e-06,
"loss": 0.3807,
"mean_token_accuracy": 0.8650286197662354,
"num_tokens": 211113597.0,
"step": 491
},
{
"entropy": 0.39459228515625,
"epoch": 1.9523809523809523,
"grad_norm": 0.6770029694746321,
"learning_rate": 6.002034634997214e-06,
"loss": 0.3845,
"mean_token_accuracy": 0.8641347736120224,
"num_tokens": 211549046.0,
"step": 492
},
{
"entropy": 0.393890380859375,
"epoch": 1.9563492063492065,
"grad_norm": 0.6764981819376107,
"learning_rate": 5.9619672375911065e-06,
"loss": 0.3661,
"mean_token_accuracy": 0.8685044087469578,
"num_tokens": 212000519.0,
"step": 493
},
{
"entropy": 0.399566650390625,
"epoch": 1.9603174603174605,
"grad_norm": 0.5976014492249986,
"learning_rate": 5.92197714739541e-06,
"loss": 0.3806,
"mean_token_accuracy": 0.8653174787759781,
"num_tokens": 212447521.0,
"step": 494
},
{
"entropy": 0.401123046875,
"epoch": 1.9642857142857144,
"grad_norm": 0.6419565626843364,
"learning_rate": 5.882065130011226e-06,
"loss": 0.3854,
"mean_token_accuracy": 0.8639181992039084,
"num_tokens": 212865927.0,
"step": 495
},
{
"entropy": 0.398956298828125,
"epoch": 1.9682539682539684,
"grad_norm": 0.649036294431529,
"learning_rate": 5.842231949544963e-06,
"loss": 0.3814,
"mean_token_accuracy": 0.8645293368026614,
"num_tokens": 213310334.0,
"step": 496
},
{
"entropy": 0.396514892578125,
"epoch": 1.9722222222222223,
"grad_norm": 0.6540271641946366,
"learning_rate": 5.80247836859372e-06,
"loss": 0.3819,
"mean_token_accuracy": 0.8648376986384392,
"num_tokens": 213742399.0,
"step": 497
},
{
"entropy": 0.401824951171875,
"epoch": 1.9761904761904763,
"grad_norm": 0.6249976191527038,
"learning_rate": 5.762805148230688e-06,
"loss": 0.3883,
"mean_token_accuracy": 0.8644895693287253,
"num_tokens": 214169043.0,
"step": 498
},
{
"entropy": 0.400848388671875,
"epoch": 1.9801587301587302,
"grad_norm": 0.6226665699147694,
"learning_rate": 5.723213047990553e-06,
"loss": 0.3869,
"mean_token_accuracy": 0.863439017906785,
"num_tokens": 214572957.0,
"step": 499
},
{
"entropy": 0.403656005859375,
"epoch": 1.9841269841269842,
"grad_norm": 0.6446422210225001,
"learning_rate": 5.68370282585499e-06,
"loss": 0.3877,
"mean_token_accuracy": 0.8621528865769506,
"num_tokens": 215005057.0,
"step": 500
},
{
"entropy": 0.392364501953125,
"epoch": 1.9880952380952381,
"grad_norm": 0.6640325469666031,
"learning_rate": 5.64427523823813e-06,
"loss": 0.3714,
"mean_token_accuracy": 0.8665375467389822,
"num_tokens": 215449399.0,
"step": 501
},
{
"entropy": 0.3995361328125,
"epoch": 1.992063492063492,
"grad_norm": 0.6967069661289652,
"learning_rate": 5.604931039972099e-06,
"loss": 0.3723,
"mean_token_accuracy": 0.8670346606522799,
"num_tokens": 215869766.0,
"step": 502
},
{
"entropy": 0.399871826171875,
"epoch": 1.996031746031746,
"grad_norm": 0.6014892141579964,
"learning_rate": 5.5656709842925335e-06,
"loss": 0.3726,
"mean_token_accuracy": 0.8665146352723241,
"num_tokens": 216298988.0,
"step": 503
},
{
"entropy": 0.392730712890625,
"epoch": 2.0,
"grad_norm": 0.6445229203870088,
"learning_rate": 5.5264958228241925e-06,
"loss": 0.3738,
"mean_token_accuracy": 0.8680051285773516,
"num_tokens": 216731206.0,
"step": 504
},
{
"entropy": 0.392303466796875,
"epoch": 2.003968253968254,
"grad_norm": 0.7007642747575338,
"learning_rate": 5.4874063055665495e-06,
"loss": 0.3394,
"mean_token_accuracy": 0.8802464632317424,
"num_tokens": 217159781.0,
"step": 505
},
{
"entropy": 0.394378662109375,
"epoch": 2.007936507936508,
"grad_norm": 0.6915377565113627,
"learning_rate": 5.44840318087944e-06,
"loss": 0.3333,
"mean_token_accuracy": 0.8822930511087179,
"num_tokens": 217589905.0,
"step": 506
},
{
"entropy": 0.389556884765625,
"epoch": 2.011904761904762,
"grad_norm": 0.6607496119019493,
"learning_rate": 5.40948719546873e-06,
"loss": 0.3223,
"mean_token_accuracy": 0.8840867523103952,
"num_tokens": 218017429.0,
"step": 507
},
{
"entropy": 0.38677978515625,
"epoch": 2.015873015873016,
"grad_norm": 0.797575729187655,
"learning_rate": 5.370659094372036e-06,
"loss": 0.3487,
"mean_token_accuracy": 0.8751778230071068,
"num_tokens": 218446876.0,
"step": 508
},
{
"entropy": 0.38916015625,
"epoch": 2.0198412698412698,
"grad_norm": 0.8079376215374279,
"learning_rate": 5.331919620944438e-06,
"loss": 0.3421,
"mean_token_accuracy": 0.8785031987354159,
"num_tokens": 218885713.0,
"step": 509
},
{
"entropy": 0.3873291015625,
"epoch": 2.0238095238095237,
"grad_norm": 0.7130485344720543,
"learning_rate": 5.293269516844263e-06,
"loss": 0.3347,
"mean_token_accuracy": 0.8804626753553748,
"num_tokens": 219322571.0,
"step": 510
},
{
"entropy": 0.394134521484375,
"epoch": 2.0277777777777777,
"grad_norm": 0.6529573527681491,
"learning_rate": 5.2547095220188815e-06,
"loss": 0.3378,
"mean_token_accuracy": 0.8767837462946773,
"num_tokens": 219748508.0,
"step": 511
},
{
"entropy": 0.39422607421875,
"epoch": 2.0317460317460316,
"grad_norm": 0.6836683232299732,
"learning_rate": 5.216240374690546e-06,
"loss": 0.3337,
"mean_token_accuracy": 0.8813108829781413,
"num_tokens": 220180160.0,
"step": 512
},
{
"entropy": 0.3922119140625,
"epoch": 2.0357142857142856,
"grad_norm": 0.6799043393272431,
"learning_rate": 5.177862811342254e-06,
"loss": 0.3295,
"mean_token_accuracy": 0.8823963804170489,
"num_tokens": 220606565.0,
"step": 513
},
{
"entropy": 0.39385986328125,
"epoch": 2.0396825396825395,
"grad_norm": 0.6581193128400078,
"learning_rate": 5.139577566703643e-06,
"loss": 0.3299,
"mean_token_accuracy": 0.881388746201992,
"num_tokens": 221016578.0,
"step": 514
},
{
"entropy": 0.38360595703125,
"epoch": 2.0436507936507935,
"grad_norm": 0.6684855157209281,
"learning_rate": 5.101385373736937e-06,
"loss": 0.3245,
"mean_token_accuracy": 0.8840543190017343,
"num_tokens": 221455851.0,
"step": 515
},
{
"entropy": 0.389862060546875,
"epoch": 2.0476190476190474,
"grad_norm": 0.7158648675214864,
"learning_rate": 5.0632869636229035e-06,
"loss": 0.3372,
"mean_token_accuracy": 0.8804104384034872,
"num_tokens": 221871968.0,
"step": 516
},
{
"entropy": 0.38134765625,
"epoch": 2.0515873015873014,
"grad_norm": 0.6851404252008069,
"learning_rate": 5.025283065746855e-06,
"loss": 0.3204,
"mean_token_accuracy": 0.8836971241980791,
"num_tokens": 222303576.0,
"step": 517
},
{
"entropy": 0.383941650390625,
"epoch": 2.0555555555555554,
"grad_norm": 0.6707049123314238,
"learning_rate": 4.987374407684703e-06,
"loss": 0.3253,
"mean_token_accuracy": 0.8830197919160128,
"num_tokens": 222738323.0,
"step": 518
},
{
"entropy": 0.38531494140625,
"epoch": 2.0595238095238093,
"grad_norm": 0.651061286914035,
"learning_rate": 4.949561715189001e-06,
"loss": 0.3242,
"mean_token_accuracy": 0.8821469666436315,
"num_tokens": 223168261.0,
"step": 519
},
{
"entropy": 0.390960693359375,
"epoch": 2.0634920634920633,
"grad_norm": 0.6476270763528862,
"learning_rate": 4.911845712175067e-06,
"loss": 0.3313,
"mean_token_accuracy": 0.8828292330726981,
"num_tokens": 223584134.0,
"step": 520
},
{
"entropy": 0.386322021484375,
"epoch": 2.0674603174603177,
"grad_norm": 0.6364599620319137,
"learning_rate": 4.8742271207071226e-06,
"loss": 0.3228,
"mean_token_accuracy": 0.8835309613496065,
"num_tokens": 224013215.0,
"step": 521
},
{
"entropy": 0.380828857421875,
"epoch": 2.0714285714285716,
"grad_norm": 0.6564651643829943,
"learning_rate": 4.836706660984467e-06,
"loss": 0.3321,
"mean_token_accuracy": 0.8808335028588772,
"num_tokens": 224461654.0,
"step": 522
},
{
"entropy": 0.38494873046875,
"epoch": 2.0753968253968256,
"grad_norm": 0.6364445517923993,
"learning_rate": 4.799285051327686e-06,
"loss": 0.33,
"mean_token_accuracy": 0.8805900542065501,
"num_tokens": 224885889.0,
"step": 523
},
{
"entropy": 0.38519287109375,
"epoch": 2.0793650793650795,
"grad_norm": 0.6498450049930671,
"learning_rate": 4.761963008164918e-06,
"loss": 0.3366,
"mean_token_accuracy": 0.8799572549760342,
"num_tokens": 225327562.0,
"step": 524
},
{
"entropy": 0.378173828125,
"epoch": 2.0833333333333335,
"grad_norm": 0.644410093004496,
"learning_rate": 4.724741246018103e-06,
"loss": 0.3292,
"mean_token_accuracy": 0.8829803112894297,
"num_tokens": 225767121.0,
"step": 525
},
{
"entropy": 0.3868408203125,
"epoch": 2.0873015873015874,
"grad_norm": 0.6615349299629835,
"learning_rate": 4.687620477489337e-06,
"loss": 0.3184,
"mean_token_accuracy": 0.8841284308582544,
"num_tokens": 226189031.0,
"step": 526
},
{
"entropy": 0.382080078125,
"epoch": 2.0912698412698414,
"grad_norm": 0.681773396883082,
"learning_rate": 4.650601413247214e-06,
"loss": 0.3324,
"mean_token_accuracy": 0.8802290465682745,
"num_tokens": 226627902.0,
"step": 527
},
{
"entropy": 0.3839111328125,
"epoch": 2.0952380952380953,
"grad_norm": 0.6628168297027915,
"learning_rate": 4.613684762013217e-06,
"loss": 0.3426,
"mean_token_accuracy": 0.8767191367223859,
"num_tokens": 227062309.0,
"step": 528
},
{
"entropy": 0.386871337890625,
"epoch": 2.0992063492063493,
"grad_norm": 0.6227370006419931,
"learning_rate": 4.57687123054817e-06,
"loss": 0.3273,
"mean_token_accuracy": 0.8822421031072736,
"num_tokens": 227485113.0,
"step": 529
},
{
"entropy": 0.389251708984375,
"epoch": 2.1031746031746033,
"grad_norm": 0.6402574176585532,
"learning_rate": 4.5401615236386785e-06,
"loss": 0.3309,
"mean_token_accuracy": 0.8817484118044376,
"num_tokens": 227920598.0,
"step": 530
},
{
"entropy": 0.388427734375,
"epoch": 2.107142857142857,
"grad_norm": 0.6406500881050289,
"learning_rate": 4.503556344083656e-06,
"loss": 0.3243,
"mean_token_accuracy": 0.8830273868516088,
"num_tokens": 228346699.0,
"step": 531
},
{
"entropy": 0.3875732421875,
"epoch": 2.111111111111111,
"grad_norm": 0.6789574480582125,
"learning_rate": 4.467056392680863e-06,
"loss": 0.3309,
"mean_token_accuracy": 0.8825666131451726,
"num_tokens": 228773818.0,
"step": 532
},
{
"entropy": 0.388214111328125,
"epoch": 2.115079365079365,
"grad_norm": 0.6510204304692228,
"learning_rate": 4.4306623682134875e-06,
"loss": 0.3244,
"mean_token_accuracy": 0.8822726272046566,
"num_tokens": 229188623.0,
"step": 533
},
{
"entropy": 0.381195068359375,
"epoch": 2.119047619047619,
"grad_norm": 0.6607016173165737,
"learning_rate": 4.394374967436783e-06,
"loss": 0.3153,
"mean_token_accuracy": 0.8861860791221261,
"num_tokens": 229627711.0,
"step": 534
},
{
"entropy": 0.385040283203125,
"epoch": 2.123015873015873,
"grad_norm": 0.6694823178157694,
"learning_rate": 4.358194885064704e-06,
"loss": 0.3384,
"mean_token_accuracy": 0.8798664947971702,
"num_tokens": 230054209.0,
"step": 535
},
{
"entropy": 0.3831787109375,
"epoch": 2.126984126984127,
"grad_norm": 0.6435423039418425,
"learning_rate": 4.3221228137566225e-06,
"loss": 0.3309,
"mean_token_accuracy": 0.8810933278873563,
"num_tokens": 230489032.0,
"step": 536
},
{
"entropy": 0.382110595703125,
"epoch": 2.130952380952381,
"grad_norm": 0.6509585759312683,
"learning_rate": 4.286159444104068e-06,
"loss": 0.3316,
"mean_token_accuracy": 0.8800732661038637,
"num_tokens": 230910745.0,
"step": 537
},
{
"entropy": 0.387664794921875,
"epoch": 2.134920634920635,
"grad_norm": 0.6213215519100801,
"learning_rate": 4.250305464617494e-06,
"loss": 0.3314,
"mean_token_accuracy": 0.8809810969978571,
"num_tokens": 231339019.0,
"step": 538
},
{
"entropy": 0.385498046875,
"epoch": 2.138888888888889,
"grad_norm": 0.7913050493338661,
"learning_rate": 4.2145615617131095e-06,
"loss": 0.3388,
"mean_token_accuracy": 0.8805615156888962,
"num_tokens": 231777523.0,
"step": 539
},
{
"entropy": 0.3892822265625,
"epoch": 2.142857142857143,
"grad_norm": 0.6397574362821433,
"learning_rate": 4.178928419699731e-06,
"loss": 0.3275,
"mean_token_accuracy": 0.8803174262866378,
"num_tokens": 232199672.0,
"step": 540
},
{
"entropy": 0.39251708984375,
"epoch": 2.1468253968253967,
"grad_norm": 0.6561714495314072,
"learning_rate": 4.143406720765687e-06,
"loss": 0.3224,
"mean_token_accuracy": 0.8841910324990749,
"num_tokens": 232633797.0,
"step": 541
},
{
"entropy": 0.386260986328125,
"epoch": 2.1507936507936507,
"grad_norm": 0.676691357314513,
"learning_rate": 4.107997144965747e-06,
"loss": 0.3324,
"mean_token_accuracy": 0.881273141130805,
"num_tokens": 233076007.0,
"step": 542
},
{
"entropy": 0.38470458984375,
"epoch": 2.1547619047619047,
"grad_norm": 0.7241624585208927,
"learning_rate": 4.0727003702081146e-06,
"loss": 0.3172,
"mean_token_accuracy": 0.8856724062934518,
"num_tokens": 233509851.0,
"step": 543
},
{
"entropy": 0.3896484375,
"epoch": 2.1587301587301586,
"grad_norm": 0.78911437248552,
"learning_rate": 4.037517072241435e-06,
"loss": 0.3271,
"mean_token_accuracy": 0.8804695382714272,
"num_tokens": 233942156.0,
"step": 544
},
{
"entropy": 0.386474609375,
"epoch": 2.1626984126984126,
"grad_norm": 0.6722927644193839,
"learning_rate": 4.002447924641882e-06,
"loss": 0.324,
"mean_token_accuracy": 0.8829648504033685,
"num_tokens": 234372963.0,
"step": 545
},
{
"entropy": 0.38409423828125,
"epoch": 2.1666666666666665,
"grad_norm": 0.6466957029468782,
"learning_rate": 3.967493598800233e-06,
"loss": 0.3237,
"mean_token_accuracy": 0.884342834353447,
"num_tokens": 234844668.0,
"step": 546
},
{
"entropy": 0.39801025390625,
"epoch": 2.1706349206349205,
"grad_norm": 0.6412289556826881,
"learning_rate": 3.9326547639090315e-06,
"loss": 0.3504,
"mean_token_accuracy": 0.8763287300243974,
"num_tokens": 235271990.0,
"step": 547
},
{
"entropy": 0.385894775390625,
"epoch": 2.1746031746031744,
"grad_norm": 0.6171782227283126,
"learning_rate": 3.897932086949778e-06,
"loss": 0.3311,
"mean_token_accuracy": 0.8819945016875863,
"num_tokens": 235697877.0,
"step": 548
},
{
"entropy": 0.396209716796875,
"epoch": 2.1785714285714284,
"grad_norm": 0.6361871979559561,
"learning_rate": 3.863326232680148e-06,
"loss": 0.3325,
"mean_token_accuracy": 0.8788015209138393,
"num_tokens": 236134537.0,
"step": 549
},
{
"entropy": 0.379608154296875,
"epoch": 2.1825396825396823,
"grad_norm": 0.6315811408319069,
"learning_rate": 3.828837863621286e-06,
"loss": 0.3138,
"mean_token_accuracy": 0.8871300676837564,
"num_tokens": 236586699.0,
"step": 550
},
{
"entropy": 0.391204833984375,
"epoch": 2.1865079365079367,
"grad_norm": 0.6494424143773995,
"learning_rate": 3.7944676400451017e-06,
"loss": 0.3287,
"mean_token_accuracy": 0.8799652308225632,
"num_tokens": 236995332.0,
"step": 551
},
{
"entropy": 0.3856201171875,
"epoch": 2.1904761904761907,
"grad_norm": 0.6255192304119477,
"learning_rate": 3.76021621996163e-06,
"loss": 0.3207,
"mean_token_accuracy": 0.8843406355008483,
"num_tokens": 237426378.0,
"step": 552
},
{
"entropy": 0.376617431640625,
"epoch": 2.1944444444444446,
"grad_norm": 0.6407771894276859,
"learning_rate": 3.7260842591064504e-06,
"loss": 0.3152,
"mean_token_accuracy": 0.8854828383773565,
"num_tokens": 237866086.0,
"step": 553
},
{
"entropy": 0.382476806640625,
"epoch": 2.1984126984126986,
"grad_norm": 0.6801463416248983,
"learning_rate": 3.6920724109281146e-06,
"loss": 0.329,
"mean_token_accuracy": 0.8822561521083117,
"num_tokens": 238297987.0,
"step": 554
},
{
"entropy": 0.395477294921875,
"epoch": 2.2023809523809526,
"grad_norm": 0.6188509021837985,
"learning_rate": 3.6581813265756595e-06,
"loss": 0.3282,
"mean_token_accuracy": 0.8823494836688042,
"num_tokens": 238729296.0,
"step": 555
},
{
"entropy": 0.388092041015625,
"epoch": 2.2063492063492065,
"grad_norm": 0.6351713659674968,
"learning_rate": 3.6244116548861084e-06,
"loss": 0.336,
"mean_token_accuracy": 0.8807284999638796,
"num_tokens": 239171101.0,
"step": 556
},
{
"entropy": 0.38885498046875,
"epoch": 2.2103174603174605,
"grad_norm": 0.6619090358761864,
"learning_rate": 3.590764042372079e-06,
"loss": 0.3248,
"mean_token_accuracy": 0.8822610294446349,
"num_tokens": 239602299.0,
"step": 557
},
{
"entropy": 0.38800048828125,
"epoch": 2.2142857142857144,
"grad_norm": 0.6012724630133454,
"learning_rate": 3.557239133209387e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.88497896771878,
"num_tokens": 240034337.0,
"step": 558
},
{
"entropy": 0.388671875,
"epoch": 2.2182539682539684,
"grad_norm": 0.6362574759114775,
"learning_rate": 3.523837569224725e-06,
"loss": 0.3203,
"mean_token_accuracy": 0.8840533634647727,
"num_tokens": 240461291.0,
"step": 559
},
{
"entropy": 0.392059326171875,
"epoch": 2.2222222222222223,
"grad_norm": 0.6674103812544709,
"learning_rate": 3.4905599898833665e-06,
"loss": 0.3153,
"mean_token_accuracy": 0.8856786200776696,
"num_tokens": 240860927.0,
"step": 560
},
{
"entropy": 0.383575439453125,
"epoch": 2.2261904761904763,
"grad_norm": 0.6349123290369829,
"learning_rate": 3.4574070322769347e-06,
"loss": 0.3281,
"mean_token_accuracy": 0.8817283101379871,
"num_tokens": 241301179.0,
"step": 561
},
{
"entropy": 0.384368896484375,
"epoch": 2.2301587301587302,
"grad_norm": 0.7198961662563939,
"learning_rate": 3.4243793311111916e-06,
"loss": 0.3277,
"mean_token_accuracy": 0.8824960431084037,
"num_tokens": 241739076.0,
"step": 562
},
{
"entropy": 0.39068603515625,
"epoch": 2.234126984126984,
"grad_norm": 0.683740121166347,
"learning_rate": 3.391477518693894e-06,
"loss": 0.3321,
"mean_token_accuracy": 0.8821808360517025,
"num_tokens": 242150640.0,
"step": 563
},
{
"entropy": 0.3800048828125,
"epoch": 2.238095238095238,
"grad_norm": 0.6437837444588406,
"learning_rate": 3.358702224922691e-06,
"loss": 0.3158,
"mean_token_accuracy": 0.8863429753109813,
"num_tokens": 242574011.0,
"step": 564
},
{
"entropy": 0.39166259765625,
"epoch": 2.242063492063492,
"grad_norm": 0.7506758272688273,
"learning_rate": 3.3260540772730576e-06,
"loss": 0.3276,
"mean_token_accuracy": 0.879186031408608,
"num_tokens": 243005939.0,
"step": 565
},
{
"entropy": 0.387481689453125,
"epoch": 2.246031746031746,
"grad_norm": 0.6315669129852838,
"learning_rate": 3.2935337007862865e-06,
"loss": 0.3262,
"mean_token_accuracy": 0.8840317856520414,
"num_tokens": 243458878.0,
"step": 566
},
{
"entropy": 0.38836669921875,
"epoch": 2.25,
"grad_norm": 0.6389628006202059,
"learning_rate": 3.261141718057523e-06,
"loss": 0.3356,
"mean_token_accuracy": 0.8814380522817373,
"num_tokens": 243877438.0,
"step": 567
},
{
"entropy": 0.38360595703125,
"epoch": 2.253968253968254,
"grad_norm": 0.6259643943263192,
"learning_rate": 3.2288787492238416e-06,
"loss": 0.3263,
"mean_token_accuracy": 0.8834005445241928,
"num_tokens": 244313964.0,
"step": 568
},
{
"entropy": 0.383331298828125,
"epoch": 2.257936507936508,
"grad_norm": 0.6219404801873205,
"learning_rate": 3.1967454119523745e-06,
"loss": 0.324,
"mean_token_accuracy": 0.8827744442969561,
"num_tokens": 244761734.0,
"step": 569
},
{
"entropy": 0.384307861328125,
"epoch": 2.261904761904762,
"grad_norm": 0.5955899515224996,
"learning_rate": 3.1647423214284856e-06,
"loss": 0.3118,
"mean_token_accuracy": 0.887902582064271,
"num_tokens": 245200322.0,
"step": 570
},
{
"entropy": 0.381744384765625,
"epoch": 2.265873015873016,
"grad_norm": 0.646703276648792,
"learning_rate": 3.1328700903440045e-06,
"loss": 0.3206,
"mean_token_accuracy": 0.8841962497681379,
"num_tokens": 245646233.0,
"step": 571
},
{
"entropy": 0.386260986328125,
"epoch": 2.2698412698412698,
"grad_norm": 0.6521102057792992,
"learning_rate": 3.101129328885475e-06,
"loss": 0.3217,
"mean_token_accuracy": 0.8851350508630276,
"num_tokens": 246083539.0,
"step": 572
},
{
"entropy": 0.38726806640625,
"epoch": 2.2738095238095237,
"grad_norm": 0.635188052792932,
"learning_rate": 3.0695206447224923e-06,
"loss": 0.3198,
"mean_token_accuracy": 0.8851980855688453,
"num_tokens": 246515677.0,
"step": 573
},
{
"entropy": 0.38677978515625,
"epoch": 2.2777777777777777,
"grad_norm": 0.6388945711775174,
"learning_rate": 3.0380446429960573e-06,
"loss": 0.3178,
"mean_token_accuracy": 0.8853446776047349,
"num_tokens": 246933619.0,
"step": 574
},
{
"entropy": 0.399322509765625,
"epoch": 2.2817460317460316,
"grad_norm": 0.6434111635560193,
"learning_rate": 3.0067019263069973e-06,
"loss": 0.3261,
"mean_token_accuracy": 0.8847488528117537,
"num_tokens": 247344382.0,
"step": 575
},
{
"entropy": 0.385498046875,
"epoch": 2.2857142857142856,
"grad_norm": 0.6359197299105421,
"learning_rate": 2.9754930947044357e-06,
"loss": 0.3144,
"mean_token_accuracy": 0.8865975281223655,
"num_tokens": 247765672.0,
"step": 576
},
{
"entropy": 0.389495849609375,
"epoch": 2.2896825396825395,
"grad_norm": 0.6154901011869501,
"learning_rate": 2.9444187456742855e-06,
"loss": 0.3172,
"mean_token_accuracy": 0.8845733245834708,
"num_tokens": 248183306.0,
"step": 577
},
{
"entropy": 0.37921142578125,
"epoch": 2.2936507936507935,
"grad_norm": 0.6319087371113467,
"learning_rate": 2.9134794741278317e-06,
"loss": 0.3226,
"mean_token_accuracy": 0.8831391530111432,
"num_tokens": 248628378.0,
"step": 578
},
{
"entropy": 0.386138916015625,
"epoch": 2.2976190476190474,
"grad_norm": 0.617827795169469,
"learning_rate": 2.8826758723903192e-06,
"loss": 0.3289,
"mean_token_accuracy": 0.8825296880677342,
"num_tokens": 249071096.0,
"step": 579
},
{
"entropy": 0.38525390625,
"epoch": 2.3015873015873014,
"grad_norm": 0.6959398807719873,
"learning_rate": 2.8520085301896373e-06,
"loss": 0.3265,
"mean_token_accuracy": 0.8830155087634921,
"num_tokens": 249501143.0,
"step": 580
},
{
"entropy": 0.385986328125,
"epoch": 2.3055555555555554,
"grad_norm": 0.6348696654732209,
"learning_rate": 2.821478034645009e-06,
"loss": 0.323,
"mean_token_accuracy": 0.8836052594706416,
"num_tokens": 249948355.0,
"step": 581
},
{
"entropy": 0.38616943359375,
"epoch": 2.3095238095238093,
"grad_norm": 0.6329011771841873,
"learning_rate": 2.791084970255772e-06,
"loss": 0.3233,
"mean_token_accuracy": 0.8842885615304112,
"num_tokens": 250356099.0,
"step": 582
},
{
"entropy": 0.384613037109375,
"epoch": 2.3134920634920633,
"grad_norm": 0.6456652675035203,
"learning_rate": 2.7608299188901632e-06,
"loss": 0.3144,
"mean_token_accuracy": 0.8848955044522882,
"num_tokens": 250775592.0,
"step": 583
},
{
"entropy": 0.381988525390625,
"epoch": 2.317460317460317,
"grad_norm": 0.6394640721117717,
"learning_rate": 2.730713459774198e-06,
"loss": 0.3277,
"mean_token_accuracy": 0.8828927706927061,
"num_tokens": 251219125.0,
"step": 584
},
{
"entropy": 0.379119873046875,
"epoch": 2.3214285714285716,
"grad_norm": 0.632993088257485,
"learning_rate": 2.7007361694805735e-06,
"loss": 0.32,
"mean_token_accuracy": 0.8848384916782379,
"num_tokens": 251648861.0,
"step": 585
},
{
"entropy": 0.37896728515625,
"epoch": 2.3253968253968256,
"grad_norm": 0.642288723817126,
"learning_rate": 2.670898621917629e-06,
"loss": 0.3212,
"mean_token_accuracy": 0.8847629306837916,
"num_tokens": 252080434.0,
"step": 586
},
{
"entropy": 0.38128662109375,
"epoch": 2.3293650793650795,
"grad_norm": 0.6541101628145074,
"learning_rate": 2.64120138831837e-06,
"loss": 0.3126,
"mean_token_accuracy": 0.8882659897208214,
"num_tokens": 252502018.0,
"step": 587
},
{
"entropy": 0.383026123046875,
"epoch": 2.3333333333333335,
"grad_norm": 0.7162259536705861,
"learning_rate": 2.6116450372295145e-06,
"loss": 0.3191,
"mean_token_accuracy": 0.883970595896244,
"num_tokens": 252923218.0,
"step": 588
},
{
"entropy": 0.384735107421875,
"epoch": 2.3373015873015874,
"grad_norm": 0.6626450382178084,
"learning_rate": 2.5822301345006196e-06,
"loss": 0.3168,
"mean_token_accuracy": 0.8849868765100837,
"num_tokens": 253337148.0,
"step": 589
},
{
"entropy": 0.39013671875,
"epoch": 2.3412698412698414,
"grad_norm": 0.6568665871706648,
"learning_rate": 2.5529572432732473e-06,
"loss": 0.3209,
"mean_token_accuracy": 0.884642724879086,
"num_tokens": 253761289.0,
"step": 590
},
{
"entropy": 0.38323974609375,
"epoch": 2.3452380952380953,
"grad_norm": 0.6598852576655171,
"learning_rate": 2.5238269239701816e-06,
"loss": 0.3161,
"mean_token_accuracy": 0.884860472753644,
"num_tokens": 254195017.0,
"step": 591
},
{
"entropy": 0.38433837890625,
"epoch": 2.3492063492063493,
"grad_norm": 0.637276420544699,
"learning_rate": 2.4948397342846985e-06,
"loss": 0.3328,
"mean_token_accuracy": 0.8818829879164696,
"num_tokens": 254643716.0,
"step": 592
},
{
"entropy": 0.385223388671875,
"epoch": 2.3531746031746033,
"grad_norm": 0.6301366966020561,
"learning_rate": 2.4659962291698936e-06,
"loss": 0.3282,
"mean_token_accuracy": 0.880899085663259,
"num_tokens": 255069339.0,
"step": 593
},
{
"entropy": 0.389404296875,
"epoch": 2.357142857142857,
"grad_norm": 0.638225122674086,
"learning_rate": 2.4372969608280483e-06,
"loss": 0.3203,
"mean_token_accuracy": 0.8841871181502938,
"num_tokens": 255486573.0,
"step": 594
},
{
"entropy": 0.381072998046875,
"epoch": 2.361111111111111,
"grad_norm": 0.6545431908408221,
"learning_rate": 2.408742478700071e-06,
"loss": 0.3076,
"mean_token_accuracy": 0.8880687272176147,
"num_tokens": 255913426.0,
"step": 595
},
{
"entropy": 0.383636474609375,
"epoch": 2.365079365079365,
"grad_norm": 0.645166690257223,
"learning_rate": 2.3803333294549647e-06,
"loss": 0.3124,
"mean_token_accuracy": 0.8868766566738486,
"num_tokens": 256345326.0,
"step": 596
},
{
"entropy": 0.378448486328125,
"epoch": 2.369047619047619,
"grad_norm": 0.646372016881236,
"learning_rate": 2.352070056979375e-06,
"loss": 0.3161,
"mean_token_accuracy": 0.8870938578620553,
"num_tokens": 256765173.0,
"step": 597
},
{
"entropy": 0.377532958984375,
"epoch": 2.373015873015873,
"grad_norm": 0.6693829082891946,
"learning_rate": 2.3239532023671663e-06,
"loss": 0.3087,
"mean_token_accuracy": 0.8909780327230692,
"num_tokens": 257185152.0,
"step": 598
},
{
"entropy": 0.377288818359375,
"epoch": 2.376984126984127,
"grad_norm": 0.6753658239787765,
"learning_rate": 2.295983303909065e-06,
"loss": 0.3163,
"mean_token_accuracy": 0.8857049969956279,
"num_tokens": 257627732.0,
"step": 599
},
{
"entropy": 0.37738037109375,
"epoch": 2.380952380952381,
"grad_norm": 0.637766728659791,
"learning_rate": 2.2681608970823567e-06,
"loss": 0.3121,
"mean_token_accuracy": 0.8857978647574782,
"num_tokens": 258067282.0,
"step": 600
},
{
"entropy": 0.385101318359375,
"epoch": 2.384920634920635,
"grad_norm": 1.319145343989088,
"learning_rate": 2.2404865145406353e-06,
"loss": 0.3237,
"mean_token_accuracy": 0.8839105069637299,
"num_tokens": 258491168.0,
"step": 601
},
{
"entropy": 0.378387451171875,
"epoch": 2.388888888888889,
"grad_norm": 0.6194353316699897,
"learning_rate": 2.2129606861036003e-06,
"loss": 0.3159,
"mean_token_accuracy": 0.8844478046521544,
"num_tokens": 258944016.0,
"step": 602
},
{
"entropy": 0.385406494140625,
"epoch": 2.392857142857143,
"grad_norm": 0.6145111798633861,
"learning_rate": 2.1855839387469237e-06,
"loss": 0.3121,
"mean_token_accuracy": 0.8875940628349781,
"num_tokens": 259379493.0,
"step": 603
},
{
"entropy": 0.388275146484375,
"epoch": 2.3968253968253967,
"grad_norm": 0.6886819461995825,
"learning_rate": 2.158356796592147e-06,
"loss": 0.3041,
"mean_token_accuracy": 0.8887152783572674,
"num_tokens": 259808268.0,
"step": 604
},
{
"entropy": 0.3804931640625,
"epoch": 2.4007936507936507,
"grad_norm": 0.6244956591633763,
"learning_rate": 2.1312797808966625e-06,
"loss": 0.3245,
"mean_token_accuracy": 0.8850710866972804,
"num_tokens": 260247659.0,
"step": 605
},
{
"entropy": 0.3851318359375,
"epoch": 2.4047619047619047,
"grad_norm": 0.6517731852559921,
"learning_rate": 2.1043534100437123e-06,
"loss": 0.3371,
"mean_token_accuracy": 0.8805443737655878,
"num_tokens": 260684292.0,
"step": 606
},
{
"entropy": 0.380767822265625,
"epoch": 2.4087301587301586,
"grad_norm": 0.6377112681635206,
"learning_rate": 2.0775781995324886e-06,
"loss": 0.3219,
"mean_token_accuracy": 0.8848995277658105,
"num_tokens": 261121173.0,
"step": 607
},
{
"entropy": 0.387847900390625,
"epoch": 2.4126984126984126,
"grad_norm": 0.6347004293735145,
"learning_rate": 2.0509546619682553e-06,
"loss": 0.3183,
"mean_token_accuracy": 0.8860855745151639,
"num_tokens": 261555918.0,
"step": 608
},
{
"entropy": 0.38128662109375,
"epoch": 2.4166666666666665,
"grad_norm": 2.3373272895881656,
"learning_rate": 2.024483307052526e-06,
"loss": 0.3236,
"mean_token_accuracy": 0.8850080538541079,
"num_tokens": 262000331.0,
"step": 609
},
{
"entropy": 0.384307861328125,
"epoch": 2.4206349206349205,
"grad_norm": 0.645561304833522,
"learning_rate": 1.9981646415733157e-06,
"loss": 0.3145,
"mean_token_accuracy": 0.88714156486094,
"num_tokens": 262425946.0,
"step": 610
},
{
"entropy": 0.385833740234375,
"epoch": 2.4246031746031744,
"grad_norm": 0.639279216493586,
"learning_rate": 1.971999169395432e-06,
"loss": 0.3141,
"mean_token_accuracy": 0.8860221272334456,
"num_tokens": 262853210.0,
"step": 611
},
{
"entropy": 0.3848876953125,
"epoch": 2.4285714285714284,
"grad_norm": 0.6257683879585533,
"learning_rate": 1.945987391450833e-06,
"loss": 0.3041,
"mean_token_accuracy": 0.8897171234712005,
"num_tokens": 263268966.0,
"step": 612
},
{
"entropy": 0.383392333984375,
"epoch": 2.432539682539683,
"grad_norm": 0.6467526598593158,
"learning_rate": 1.920129805729043e-06,
"loss": 0.3224,
"mean_token_accuracy": 0.8844416281208396,
"num_tokens": 263709583.0,
"step": 613
},
{
"entropy": 0.383514404296875,
"epoch": 2.4365079365079367,
"grad_norm": 0.6584407884920952,
"learning_rate": 1.8944269072676013e-06,
"loss": 0.3024,
"mean_token_accuracy": 0.888917769305408,
"num_tokens": 264137961.0,
"step": 614
},
{
"entropy": 0.378143310546875,
"epoch": 2.4404761904761907,
"grad_norm": 2.358916383104869,
"learning_rate": 1.8688791881426017e-06,
"loss": 0.3235,
"mean_token_accuracy": 0.8838730929419398,
"num_tokens": 264579578.0,
"step": 615
},
{
"entropy": 0.387786865234375,
"epoch": 2.4444444444444446,
"grad_norm": 0.6484092770878318,
"learning_rate": 1.843487137459261e-06,
"loss": 0.3158,
"mean_token_accuracy": 0.8852028921246529,
"num_tokens": 264995910.0,
"step": 616
},
{
"entropy": 0.3782958984375,
"epoch": 2.4484126984126986,
"grad_norm": 0.6283835633567808,
"learning_rate": 1.8182512413425624e-06,
"loss": 0.3221,
"mean_token_accuracy": 0.8847752753645182,
"num_tokens": 265440836.0,
"step": 617
},
{
"entropy": 0.385711669921875,
"epoch": 2.4523809523809526,
"grad_norm": 0.6265536607231427,
"learning_rate": 1.7931719829279448e-06,
"loss": 0.3131,
"mean_token_accuracy": 0.8853830918669701,
"num_tokens": 265867518.0,
"step": 618
},
{
"entropy": 0.385498046875,
"epoch": 2.4563492063492065,
"grad_norm": 0.6536050788460606,
"learning_rate": 1.7682498423520545e-06,
"loss": 0.3276,
"mean_token_accuracy": 0.8838295871391892,
"num_tokens": 266304569.0,
"step": 619
},
{
"entropy": 0.38214111328125,
"epoch": 2.4603174603174605,
"grad_norm": 0.6199559421660695,
"learning_rate": 1.7434852967435523e-06,
"loss": 0.3089,
"mean_token_accuracy": 0.8871648367494345,
"num_tokens": 266730039.0,
"step": 620
},
{
"entropy": 0.38201904296875,
"epoch": 2.4642857142857144,
"grad_norm": 0.6217208841926519,
"learning_rate": 1.7188788202139794e-06,
"loss": 0.3192,
"mean_token_accuracy": 0.8856141036376357,
"num_tokens": 267169815.0,
"step": 621
},
{
"entropy": 0.3819580078125,
"epoch": 2.4682539682539684,
"grad_norm": 0.6118534660743937,
"learning_rate": 1.6944308838486823e-06,
"loss": 0.3198,
"mean_token_accuracy": 0.8841827157884836,
"num_tokens": 267605673.0,
"step": 622
},
{
"entropy": 0.3839111328125,
"epoch": 2.4722222222222223,
"grad_norm": 0.6453367699458878,
"learning_rate": 1.6701419556977882e-06,
"loss": 0.3326,
"mean_token_accuracy": 0.8813635427504778,
"num_tokens": 268044360.0,
"step": 623
},
{
"entropy": 0.38397216796875,
"epoch": 2.4761904761904763,
"grad_norm": 0.6102430733837766,
"learning_rate": 1.6460125007672556e-06,
"loss": 0.3197,
"mean_token_accuracy": 0.8862089207395911,
"num_tokens": 268470812.0,
"step": 624
},
{
"entropy": 0.382415771484375,
"epoch": 2.4801587301587302,
"grad_norm": 0.6312834862639178,
"learning_rate": 1.6220429810099603e-06,
"loss": 0.3197,
"mean_token_accuracy": 0.8827575715258718,
"num_tokens": 268895281.0,
"step": 625
},
{
"entropy": 0.379058837890625,
"epoch": 2.484126984126984,
"grad_norm": 0.7044597247523807,
"learning_rate": 1.5982338553168563e-06,
"loss": 0.3012,
"mean_token_accuracy": 0.8895168509334326,
"num_tokens": 269329768.0,
"step": 626
},
{
"entropy": 0.379669189453125,
"epoch": 2.488095238095238,
"grad_norm": 0.6298185893493836,
"learning_rate": 1.5745855795081889e-06,
"loss": 0.323,
"mean_token_accuracy": 0.8828219333663583,
"num_tokens": 269763538.0,
"step": 627
},
{
"entropy": 0.38275146484375,
"epoch": 2.492063492063492,
"grad_norm": 0.6657396946256616,
"learning_rate": 1.551098606324768e-06,
"loss": 0.3185,
"mean_token_accuracy": 0.8838854227215052,
"num_tokens": 270192672.0,
"step": 628
},
{
"entropy": 0.383880615234375,
"epoch": 2.496031746031746,
"grad_norm": 0.6461801602019108,
"learning_rate": 1.527773385419311e-06,
"loss": 0.3291,
"mean_token_accuracy": 0.8830574974417686,
"num_tokens": 270633240.0,
"step": 629
},
{
"entropy": 0.3831787109375,
"epoch": 2.5,
"grad_norm": 5.02088348983452,
"learning_rate": 1.5046103633478148e-06,
"loss": 0.3115,
"mean_token_accuracy": 0.8865573918446898,
"num_tokens": 271053623.0,
"step": 630
},
{
"entropy": 0.378753662109375,
"epoch": 2.503968253968254,
"grad_norm": 0.5964915600808843,
"learning_rate": 1.4816099835610209e-06,
"loss": 0.312,
"mean_token_accuracy": 0.88751888461411,
"num_tokens": 271492583.0,
"step": 631
},
{
"entropy": 0.381805419921875,
"epoch": 2.507936507936508,
"grad_norm": 0.6381933996091104,
"learning_rate": 1.4587726863959239e-06,
"loss": 0.3269,
"mean_token_accuracy": 0.8842789568006992,
"num_tokens": 271921985.0,
"step": 632
},
{
"entropy": 0.39300537109375,
"epoch": 2.511904761904762,
"grad_norm": 0.6212653644771459,
"learning_rate": 1.4360989090673284e-06,
"loss": 0.3149,
"mean_token_accuracy": 0.8869155524298549,
"num_tokens": 272349367.0,
"step": 633
},
{
"entropy": 0.391815185546875,
"epoch": 2.515873015873016,
"grad_norm": 0.6390041331562423,
"learning_rate": 1.4135890856595047e-06,
"loss": 0.3251,
"mean_token_accuracy": 0.883450117893517,
"num_tokens": 272757706.0,
"step": 634
},
{
"entropy": 0.3900146484375,
"epoch": 2.5198412698412698,
"grad_norm": 0.6415648041434637,
"learning_rate": 1.3912436471178525e-06,
"loss": 0.3137,
"mean_token_accuracy": 0.8865510458126664,
"num_tokens": 273178323.0,
"step": 635
},
{
"entropy": 0.385589599609375,
"epoch": 2.5238095238095237,
"grad_norm": 0.6786208735461828,
"learning_rate": 1.3690630212406653e-06,
"loss": 0.31,
"mean_token_accuracy": 0.886282910592854,
"num_tokens": 273603268.0,
"step": 636
},
{
"entropy": 0.384246826171875,
"epoch": 2.5277777777777777,
"grad_norm": 0.6389187775408092,
"learning_rate": 1.3470476326709337e-06,
"loss": 0.3051,
"mean_token_accuracy": 0.8894489388912916,
"num_tokens": 274036335.0,
"step": 637
},
{
"entropy": 0.384674072265625,
"epoch": 2.5317460317460316,
"grad_norm": 0.6496924237808063,
"learning_rate": 1.3251979028882179e-06,
"loss": 0.3153,
"mean_token_accuracy": 0.8875564280897379,
"num_tokens": 274458735.0,
"step": 638
},
{
"entropy": 0.377685546875,
"epoch": 2.5357142857142856,
"grad_norm": 0.6510962044644899,
"learning_rate": 1.3035142502005792e-06,
"loss": 0.3144,
"mean_token_accuracy": 0.8849059278145432,
"num_tokens": 274909982.0,
"step": 639
},
{
"entropy": 0.38714599609375,
"epoch": 2.5396825396825395,
"grad_norm": 0.6541727348118395,
"learning_rate": 1.281997089736574e-06,
"loss": 0.3178,
"mean_token_accuracy": 0.8841635789722204,
"num_tokens": 275320028.0,
"step": 640
},
{
"entropy": 0.379791259765625,
"epoch": 2.5436507936507935,
"grad_norm": 0.6200761248476931,
"learning_rate": 1.2606468334373e-06,
"loss": 0.3102,
"mean_token_accuracy": 0.8897372307255864,
"num_tokens": 275726848.0,
"step": 641
},
{
"entropy": 0.378173828125,
"epoch": 2.5476190476190474,
"grad_norm": 0.6688554503436679,
"learning_rate": 1.2394638900485124e-06,
"loss": 0.3209,
"mean_token_accuracy": 0.8843832314014435,
"num_tokens": 276151262.0,
"step": 642
},
{
"entropy": 0.380615234375,
"epoch": 2.5515873015873014,
"grad_norm": 0.6660887015596744,
"learning_rate": 1.2184486651128014e-06,
"loss": 0.3254,
"mean_token_accuracy": 0.8823684249073267,
"num_tokens": 276568860.0,
"step": 643
},
{
"entropy": 0.3778076171875,
"epoch": 2.5555555555555554,
"grad_norm": 0.6455282415719623,
"learning_rate": 1.197601560961824e-06,
"loss": 0.3125,
"mean_token_accuracy": 0.886366662569344,
"num_tokens": 276997327.0,
"step": 644
},
{
"entropy": 0.38653564453125,
"epoch": 2.5595238095238093,
"grad_norm": 0.6059096202304458,
"learning_rate": 1.1769229767086053e-06,
"loss": 0.3223,
"mean_token_accuracy": 0.8857936700806022,
"num_tokens": 277426196.0,
"step": 645
},
{
"entropy": 0.3828125,
"epoch": 2.5634920634920633,
"grad_norm": 0.6256296541644343,
"learning_rate": 1.1564133082398942e-06,
"loss": 0.3233,
"mean_token_accuracy": 0.8879679264500737,
"num_tokens": 277849848.0,
"step": 646
},
{
"entropy": 0.37725830078125,
"epoch": 2.567460317460317,
"grad_norm": 0.6068305756135023,
"learning_rate": 1.1360729482085852e-06,
"loss": 0.3111,
"mean_token_accuracy": 0.8888252349570394,
"num_tokens": 278289888.0,
"step": 647
},
{
"entropy": 0.379852294921875,
"epoch": 2.571428571428571,
"grad_norm": 0.6525074013342971,
"learning_rate": 1.1159022860262036e-06,
"loss": 0.3065,
"mean_token_accuracy": 0.8898705607280135,
"num_tokens": 278726761.0,
"step": 648
},
{
"entropy": 0.382720947265625,
"epoch": 2.575396825396825,
"grad_norm": 0.6120138350704707,
"learning_rate": 1.0959017078554458e-06,
"loss": 0.2998,
"mean_token_accuracy": 0.8927411213517189,
"num_tokens": 279136759.0,
"step": 649
},
{
"entropy": 0.382659912109375,
"epoch": 2.5793650793650795,
"grad_norm": 0.6229193839182782,
"learning_rate": 1.0760715966027923e-06,
"loss": 0.3193,
"mean_token_accuracy": 0.8850045939907432,
"num_tokens": 279572082.0,
"step": 650
},
{
"entropy": 0.382904052734375,
"epoch": 2.5833333333333335,
"grad_norm": 0.6342083100796324,
"learning_rate": 1.0564123319111708e-06,
"loss": 0.329,
"mean_token_accuracy": 0.8824840355664492,
"num_tokens": 279994957.0,
"step": 651
},
{
"entropy": 0.38031005859375,
"epoch": 2.5873015873015874,
"grad_norm": 0.6205742323248385,
"learning_rate": 1.036924290152691e-06,
"loss": 0.3075,
"mean_token_accuracy": 0.8880053823813796,
"num_tokens": 280414468.0,
"step": 652
},
{
"entropy": 0.38330078125,
"epoch": 2.5912698412698414,
"grad_norm": 0.6452525781952142,
"learning_rate": 1.017607844421441e-06,
"loss": 0.3234,
"mean_token_accuracy": 0.8852792903780937,
"num_tokens": 280832145.0,
"step": 653
},
{
"entropy": 0.38494873046875,
"epoch": 2.5952380952380953,
"grad_norm": 0.6150246403399638,
"learning_rate": 9.984633645263386e-07,
"loss": 0.3053,
"mean_token_accuracy": 0.8897464731708169,
"num_tokens": 281237288.0,
"step": 654
},
{
"entropy": 0.376922607421875,
"epoch": 2.5992063492063493,
"grad_norm": 0.6204054410973546,
"learning_rate": 9.794912169840564e-07,
"loss": 0.3063,
"mean_token_accuracy": 0.8919531209394336,
"num_tokens": 281692258.0,
"step": 655
},
{
"entropy": 0.38128662109375,
"epoch": 2.6031746031746033,
"grad_norm": 0.6259115787217637,
"learning_rate": 9.606917650120084e-07,
"loss": 0.3019,
"mean_token_accuracy": 0.8896377719938755,
"num_tokens": 282118057.0,
"step": 656
},
{
"entropy": 0.376617431640625,
"epoch": 2.607142857142857,
"grad_norm": 0.6405045817213874,
"learning_rate": 9.420653685213854e-07,
"loss": 0.3207,
"mean_token_accuracy": 0.883592015132308,
"num_tokens": 282582066.0,
"step": 657
},
{
"entropy": 0.371551513671875,
"epoch": 2.611111111111111,
"grad_norm": 0.6258478413075457,
"learning_rate": 9.236123841102762e-07,
"loss": 0.3199,
"mean_token_accuracy": 0.8873294722288847,
"num_tokens": 283028330.0,
"step": 658
},
{
"entropy": 0.390045166015625,
"epoch": 2.615079365079365,
"grad_norm": 0.623636922000303,
"learning_rate": 9.053331650568264e-07,
"loss": 0.322,
"mean_token_accuracy": 0.8835032200440764,
"num_tokens": 283436768.0,
"step": 659
},
{
"entropy": 0.38201904296875,
"epoch": 2.619047619047619,
"grad_norm": 0.6221707456489836,
"learning_rate": 8.872280613124895e-07,
"loss": 0.3076,
"mean_token_accuracy": 0.887150245718658,
"num_tokens": 283866607.0,
"step": 660
},
{
"entropy": 0.38751220703125,
"epoch": 2.623015873015873,
"grad_norm": 0.6328952287494918,
"learning_rate": 8.692974194953263e-07,
"loss": 0.3223,
"mean_token_accuracy": 0.8878462919965386,
"num_tokens": 284281791.0,
"step": 661
},
{
"entropy": 0.383544921875,
"epoch": 2.626984126984127,
"grad_norm": 0.6134165981973436,
"learning_rate": 8.515415828833562e-07,
"loss": 0.3041,
"mean_token_accuracy": 0.8905844045802951,
"num_tokens": 284705319.0,
"step": 662
},
{
"entropy": 0.3831787109375,
"epoch": 2.630952380952381,
"grad_norm": 0.6003677767075459,
"learning_rate": 8.339608914079944e-07,
"loss": 0.3277,
"mean_token_accuracy": 0.8837083810940385,
"num_tokens": 285154313.0,
"step": 663
},
{
"entropy": 0.3780517578125,
"epoch": 2.634920634920635,
"grad_norm": 0.6185564068781259,
"learning_rate": 8.165556816475462e-07,
"loss": 0.3169,
"mean_token_accuracy": 0.8858558805659413,
"num_tokens": 285598883.0,
"step": 664
},
{
"entropy": 0.384521484375,
"epoch": 2.638888888888889,
"grad_norm": 0.615938430119585,
"learning_rate": 7.993262868207552e-07,
"loss": 0.3189,
"mean_token_accuracy": 0.8873115805909038,
"num_tokens": 286037660.0,
"step": 665
},
{
"entropy": 0.3826904296875,
"epoch": 2.642857142857143,
"grad_norm": 0.6120620823511644,
"learning_rate": 7.822730367804332e-07,
"loss": 0.3162,
"mean_token_accuracy": 0.8862875821068883,
"num_tokens": 286461446.0,
"step": 666
},
{
"entropy": 0.3834228515625,
"epoch": 2.6468253968253967,
"grad_norm": 0.5993824978295295,
"learning_rate": 7.653962580071384e-07,
"loss": 0.3063,
"mean_token_accuracy": 0.8879508459940553,
"num_tokens": 286877023.0,
"step": 667
},
{
"entropy": 0.3873291015625,
"epoch": 2.6507936507936507,
"grad_norm": 0.633531914850964,
"learning_rate": 7.486962736029247e-07,
"loss": 0.3123,
"mean_token_accuracy": 0.887955573387444,
"num_tokens": 287308399.0,
"step": 668
},
{
"entropy": 0.379241943359375,
"epoch": 2.6547619047619047,
"grad_norm": 0.6065402358723048,
"learning_rate": 7.321734032851613e-07,
"loss": 0.3179,
"mean_token_accuracy": 0.8874384136870503,
"num_tokens": 287728804.0,
"step": 669
},
{
"entropy": 0.376983642578125,
"epoch": 2.6587301587301586,
"grad_norm": 0.6293972276012216,
"learning_rate": 7.158279633804077e-07,
"loss": 0.3106,
"mean_token_accuracy": 0.8873779606074095,
"num_tokens": 288187832.0,
"step": 670
},
{
"entropy": 0.38238525390625,
"epoch": 2.6626984126984126,
"grad_norm": 0.6195597148292992,
"learning_rate": 6.996602668183605e-07,
"loss": 0.3109,
"mean_token_accuracy": 0.8889595773071051,
"num_tokens": 288600411.0,
"step": 671
},
{
"entropy": 0.37738037109375,
"epoch": 2.6666666666666665,
"grad_norm": 0.6154573646928048,
"learning_rate": 6.836706231258583e-07,
"loss": 0.3192,
"mean_token_accuracy": 0.8858912223950028,
"num_tokens": 289047051.0,
"step": 672
},
{
"entropy": 0.383026123046875,
"epoch": 2.6706349206349205,
"grad_norm": 0.6263655996244245,
"learning_rate": 6.678593384209597e-07,
"loss": 0.3155,
"mean_token_accuracy": 0.8867061976343393,
"num_tokens": 289478039.0,
"step": 673
},
{
"entropy": 0.385162353515625,
"epoch": 2.674603174603175,
"grad_norm": 0.6573248369770568,
"learning_rate": 6.522267154070816e-07,
"loss": 0.3262,
"mean_token_accuracy": 0.8859394267201424,
"num_tokens": 289920851.0,
"step": 674
},
{
"entropy": 0.38201904296875,
"epoch": 2.678571428571429,
"grad_norm": 0.5939088235971999,
"learning_rate": 6.367730533672035e-07,
"loss": 0.3119,
"mean_token_accuracy": 0.8882531467825174,
"num_tokens": 290351325.0,
"step": 675
},
{
"entropy": 0.380096435546875,
"epoch": 2.682539682539683,
"grad_norm": 0.6262374391544449,
"learning_rate": 6.214986481581365e-07,
"loss": 0.3045,
"mean_token_accuracy": 0.8911728356033564,
"num_tokens": 290766931.0,
"step": 676
},
{
"entropy": 0.389251708984375,
"epoch": 2.6865079365079367,
"grad_norm": 0.6147080862748338,
"learning_rate": 6.064037922048661e-07,
"loss": 0.3191,
"mean_token_accuracy": 0.8846084726974368,
"num_tokens": 291185306.0,
"step": 677
},
{
"entropy": 0.386932373046875,
"epoch": 2.6904761904761907,
"grad_norm": 0.6146757669564619,
"learning_rate": 5.914887744949426e-07,
"loss": 0.3072,
"mean_token_accuracy": 0.8904552990570664,
"num_tokens": 291599836.0,
"step": 678
},
{
"entropy": 0.3798828125,
"epoch": 2.6944444444444446,
"grad_norm": 0.6122316865058661,
"learning_rate": 5.767538805729578e-07,
"loss": 0.3233,
"mean_token_accuracy": 0.8849823428317904,
"num_tokens": 292026507.0,
"step": 679
},
{
"entropy": 0.377960205078125,
"epoch": 2.6984126984126986,
"grad_norm": 0.5919864168210549,
"learning_rate": 5.621993925350722e-07,
"loss": 0.3139,
"mean_token_accuracy": 0.8860092582181096,
"num_tokens": 292454972.0,
"step": 680
},
{
"entropy": 0.38397216796875,
"epoch": 2.7023809523809526,
"grad_norm": 0.6003413926603242,
"learning_rate": 5.478255890236184e-07,
"loss": 0.3145,
"mean_token_accuracy": 0.8844663957133889,
"num_tokens": 292885705.0,
"step": 681
},
{
"entropy": 0.384857177734375,
"epoch": 2.7063492063492065,
"grad_norm": 0.6327165590547364,
"learning_rate": 5.336327452217682e-07,
"loss": 0.3009,
"mean_token_accuracy": 0.8899004301056266,
"num_tokens": 293307675.0,
"step": 682
},
{
"entropy": 0.384429931640625,
"epoch": 2.7103174603174605,
"grad_norm": 0.6210543868986266,
"learning_rate": 5.196211328482559e-07,
"loss": 0.3226,
"mean_token_accuracy": 0.8854889068752527,
"num_tokens": 293722148.0,
"step": 683
},
{
"entropy": 0.3853759765625,
"epoch": 2.7142857142857144,
"grad_norm": 0.6057372551677664,
"learning_rate": 5.057910201521876e-07,
"loss": 0.3211,
"mean_token_accuracy": 0.8846705863252282,
"num_tokens": 294149752.0,
"step": 684
},
{
"entropy": 0.387847900390625,
"epoch": 2.7182539682539684,
"grad_norm": 0.6148908097024449,
"learning_rate": 4.921426719078948e-07,
"loss": 0.3049,
"mean_token_accuracy": 0.8889408009126782,
"num_tokens": 294555288.0,
"step": 685
},
{
"entropy": 0.383392333984375,
"epoch": 2.7222222222222223,
"grad_norm": 0.6097068524134369,
"learning_rate": 4.786763494098689e-07,
"loss": 0.3014,
"mean_token_accuracy": 0.8917689863592386,
"num_tokens": 294975614.0,
"step": 686
},
{
"entropy": 0.377655029296875,
"epoch": 2.7261904761904763,
"grad_norm": 0.6094849230504564,
"learning_rate": 4.653923104677671e-07,
"loss": 0.3148,
"mean_token_accuracy": 0.8867970844730735,
"num_tokens": 295422071.0,
"step": 687
},
{
"entropy": 0.380615234375,
"epoch": 2.7301587301587302,
"grad_norm": 0.6178616647341642,
"learning_rate": 4.522908094014655e-07,
"loss": 0.3151,
"mean_token_accuracy": 0.8884375654160976,
"num_tokens": 295846874.0,
"step": 688
},
{
"entropy": 0.38031005859375,
"epoch": 2.734126984126984,
"grad_norm": 0.5894255953320765,
"learning_rate": 4.3937209703619476e-07,
"loss": 0.3011,
"mean_token_accuracy": 0.8908967413008213,
"num_tokens": 296288822.0,
"step": 689
},
{
"entropy": 0.3753662109375,
"epoch": 2.738095238095238,
"grad_norm": 0.6135220448939788,
"learning_rate": 4.2663642069773693e-07,
"loss": 0.3102,
"mean_token_accuracy": 0.8868862120434642,
"num_tokens": 296730922.0,
"step": 690
},
{
"entropy": 0.376129150390625,
"epoch": 2.742063492063492,
"grad_norm": 0.5985603171722215,
"learning_rate": 4.140840242076927e-07,
"loss": 0.3124,
"mean_token_accuracy": 0.8877345686778426,
"num_tokens": 297169723.0,
"step": 691
},
{
"entropy": 0.37847900390625,
"epoch": 2.746031746031746,
"grad_norm": 0.5897101346016688,
"learning_rate": 4.017151478788117e-07,
"loss": 0.2995,
"mean_token_accuracy": 0.8927986742928624,
"num_tokens": 297610941.0,
"step": 692
},
{
"entropy": 0.380523681640625,
"epoch": 2.75,
"grad_norm": 0.6138920707362799,
"learning_rate": 3.895300285103931e-07,
"loss": 0.2959,
"mean_token_accuracy": 0.8884680820629001,
"num_tokens": 298044879.0,
"step": 693
},
{
"entropy": 0.379547119140625,
"epoch": 2.753968253968254,
"grad_norm": 0.6212097605594783,
"learning_rate": 3.7752889938375113e-07,
"loss": 0.3028,
"mean_token_accuracy": 0.8896859297528863,
"num_tokens": 298464464.0,
"step": 694
},
{
"entropy": 0.374053955078125,
"epoch": 2.757936507936508,
"grad_norm": 0.6230588065700267,
"learning_rate": 3.657119902577466e-07,
"loss": 0.3059,
"mean_token_accuracy": 0.8904304560273886,
"num_tokens": 298903233.0,
"step": 695
},
{
"entropy": 0.379852294921875,
"epoch": 2.761904761904762,
"grad_norm": 0.6119585891354671,
"learning_rate": 3.5407952736439266e-07,
"loss": 0.3011,
"mean_token_accuracy": 0.8909467747434974,
"num_tokens": 299315956.0,
"step": 696
},
{
"entropy": 0.384979248046875,
"epoch": 2.765873015873016,
"grad_norm": 0.630765395796783,
"learning_rate": 3.426317334045226e-07,
"loss": 0.3082,
"mean_token_accuracy": 0.8890936635434628,
"num_tokens": 299732237.0,
"step": 697
},
{
"entropy": 0.379730224609375,
"epoch": 2.7698412698412698,
"grad_norm": 0.665272270914942,
"learning_rate": 3.313688275435234e-07,
"loss": 0.3077,
"mean_token_accuracy": 0.889319458976388,
"num_tokens": 300162540.0,
"step": 698
},
{
"entropy": 0.38665771484375,
"epoch": 2.7738095238095237,
"grad_norm": 0.6746597478881748,
"learning_rate": 3.202910254071434e-07,
"loss": 0.3147,
"mean_token_accuracy": 0.88882967364043,
"num_tokens": 300580263.0,
"step": 699
},
{
"entropy": 0.376708984375,
"epoch": 2.7777777777777777,
"grad_norm": 0.6038711621641686,
"learning_rate": 3.0939853907736126e-07,
"loss": 0.2978,
"mean_token_accuracy": 0.8918641023337841,
"num_tokens": 301009855.0,
"step": 700
},
{
"entropy": 0.3848876953125,
"epoch": 2.7817460317460316,
"grad_norm": 0.6176026217150087,
"learning_rate": 2.9869157708832805e-07,
"loss": 0.3019,
"mean_token_accuracy": 0.8898680582642555,
"num_tokens": 301429836.0,
"step": 701
},
{
"entropy": 0.38232421875,
"epoch": 2.7857142857142856,
"grad_norm": 0.6147895745768526,
"learning_rate": 2.881703444223716e-07,
"loss": 0.3053,
"mean_token_accuracy": 0.8896784670650959,
"num_tokens": 301852351.0,
"step": 702
},
{
"entropy": 0.382965087890625,
"epoch": 2.7896825396825395,
"grad_norm": 0.6315651993446116,
"learning_rate": 2.778350425060794e-07,
"loss": 0.3084,
"mean_token_accuracy": 0.8870635628700256,
"num_tokens": 302270241.0,
"step": 703
},
{
"entropy": 0.37701416015625,
"epoch": 2.7936507936507935,
"grad_norm": 0.599519759404208,
"learning_rate": 2.6768586920643324e-07,
"loss": 0.3068,
"mean_token_accuracy": 0.8893379056826234,
"num_tokens": 302702189.0,
"step": 704
},
{
"entropy": 0.380889892578125,
"epoch": 2.7976190476190474,
"grad_norm": 0.6078813973100959,
"learning_rate": 2.5772301882702634e-07,
"loss": 0.2992,
"mean_token_accuracy": 0.8915481101721525,
"num_tokens": 303138996.0,
"step": 705
},
{
"entropy": 0.38671875,
"epoch": 2.8015873015873014,
"grad_norm": 0.6086380862937784,
"learning_rate": 2.4794668210434194e-07,
"loss": 0.3079,
"mean_token_accuracy": 0.887032619677484,
"num_tokens": 303546117.0,
"step": 706
},
{
"entropy": 0.37628173828125,
"epoch": 2.8055555555555554,
"grad_norm": 0.6027408982980551,
"learning_rate": 2.3835704620410294e-07,
"loss": 0.3063,
"mean_token_accuracy": 0.8905981313437223,
"num_tokens": 303985054.0,
"step": 707
},
{
"entropy": 0.37945556640625,
"epoch": 2.8095238095238093,
"grad_norm": 0.5994294775725023,
"learning_rate": 2.2895429471768925e-07,
"loss": 0.3073,
"mean_token_accuracy": 0.888611021451652,
"num_tokens": 304424136.0,
"step": 708
},
{
"entropy": 0.377716064453125,
"epoch": 2.8134920634920633,
"grad_norm": 0.6845179144444798,
"learning_rate": 2.1973860765861831e-07,
"loss": 0.302,
"mean_token_accuracy": 0.8900069631636143,
"num_tokens": 304862181.0,
"step": 709
},
{
"entropy": 0.3812255859375,
"epoch": 2.817460317460317,
"grad_norm": 0.6225625871453486,
"learning_rate": 2.107101614591045e-07,
"loss": 0.322,
"mean_token_accuracy": 0.8866511387750506,
"num_tokens": 305299176.0,
"step": 710
},
{
"entropy": 0.377899169921875,
"epoch": 2.821428571428571,
"grad_norm": 0.6096957767253436,
"learning_rate": 2.0186912896667744e-07,
"loss": 0.3126,
"mean_token_accuracy": 0.8875591978430748,
"num_tokens": 305755604.0,
"step": 711
},
{
"entropy": 0.37921142578125,
"epoch": 2.825396825396825,
"grad_norm": 0.6667361291321218,
"learning_rate": 1.9321567944087573e-07,
"loss": 0.3012,
"mean_token_accuracy": 0.8895198963582516,
"num_tokens": 306180918.0,
"step": 712
},
{
"entropy": 0.382537841796875,
"epoch": 2.8293650793650795,
"grad_norm": 0.6218241109654549,
"learning_rate": 1.8474997855000177e-07,
"loss": 0.3208,
"mean_token_accuracy": 0.885167789645493,
"num_tokens": 306601315.0,
"step": 713
},
{
"entropy": 0.378570556640625,
"epoch": 2.8333333333333335,
"grad_norm": 0.5992276590550127,
"learning_rate": 1.7647218836795878e-07,
"loss": 0.3143,
"mean_token_accuracy": 0.8854878153651953,
"num_tokens": 307053855.0,
"step": 714
},
{
"entropy": 0.384765625,
"epoch": 2.8373015873015874,
"grad_norm": 0.6678241835727501,
"learning_rate": 1.6838246737113983e-07,
"loss": 0.3242,
"mean_token_accuracy": 0.8875371310859919,
"num_tokens": 307466527.0,
"step": 715
},
{
"entropy": 0.382415771484375,
"epoch": 2.8412698412698414,
"grad_norm": 0.6327330692027668,
"learning_rate": 1.604809704353949e-07,
"loss": 0.3148,
"mean_token_accuracy": 0.8862312156707048,
"num_tokens": 307908233.0,
"step": 716
},
{
"entropy": 0.382720947265625,
"epoch": 2.8452380952380953,
"grad_norm": 0.6124198926311829,
"learning_rate": 1.5276784883307084e-07,
"loss": 0.3008,
"mean_token_accuracy": 0.8917096089571714,
"num_tokens": 308331740.0,
"step": 717
},
{
"entropy": 0.38189697265625,
"epoch": 2.8492063492063493,
"grad_norm": 0.595663678571871,
"learning_rate": 1.4524325023010932e-07,
"loss": 0.3004,
"mean_token_accuracy": 0.8921401789411902,
"num_tokens": 308765267.0,
"step": 718
},
{
"entropy": 0.3831787109375,
"epoch": 2.8531746031746033,
"grad_norm": 0.6212901666769164,
"learning_rate": 1.3790731868322472e-07,
"loss": 0.2948,
"mean_token_accuracy": 0.8915543537586927,
"num_tokens": 309177878.0,
"step": 719
},
{
"entropy": 0.390869140625,
"epoch": 2.857142857142857,
"grad_norm": 0.6016366375502071,
"learning_rate": 1.3076019463714173e-07,
"loss": 0.3179,
"mean_token_accuracy": 0.885790922679007,
"num_tokens": 309586897.0,
"step": 720
},
{
"entropy": 0.377838134765625,
"epoch": 2.861111111111111,
"grad_norm": 0.6177685966036272,
"learning_rate": 1.238020149219099e-07,
"loss": 0.2997,
"mean_token_accuracy": 0.8919113390147686,
"num_tokens": 310011953.0,
"step": 721
},
{
"entropy": 0.382354736328125,
"epoch": 2.865079365079365,
"grad_norm": 0.5837465047986848,
"learning_rate": 1.1703291275028227e-07,
"loss": 0.3124,
"mean_token_accuracy": 0.8868957068771124,
"num_tokens": 310440896.0,
"step": 722
},
{
"entropy": 0.38067626953125,
"epoch": 2.869047619047619,
"grad_norm": 0.6367975390082888,
"learning_rate": 1.1045301771516748e-07,
"loss": 0.3049,
"mean_token_accuracy": 0.8887251811102033,
"num_tokens": 310867155.0,
"step": 723
},
{
"entropy": 0.380279541015625,
"epoch": 2.873015873015873,
"grad_norm": 0.5927899623186489,
"learning_rate": 1.0406245578714613e-07,
"loss": 0.3041,
"mean_token_accuracy": 0.8911432735621929,
"num_tokens": 311297562.0,
"step": 724
},
{
"entropy": 0.377532958984375,
"epoch": 2.876984126984127,
"grad_norm": 0.6106617971663426,
"learning_rate": 9.786134931205726e-08,
"loss": 0.3168,
"mean_token_accuracy": 0.888288808055222,
"num_tokens": 311750163.0,
"step": 725
},
{
"entropy": 0.37890625,
"epoch": 2.880952380952381,
"grad_norm": 0.6038481080562059,
"learning_rate": 9.184981700866347e-08,
"loss": 0.3229,
"mean_token_accuracy": 0.887555805966258,
"num_tokens": 312189513.0,
"step": 726
},
{
"entropy": 0.37835693359375,
"epoch": 2.884920634920635,
"grad_norm": 0.5979812639407914,
"learning_rate": 8.602797396636941e-08,
"loss": 0.3062,
"mean_token_accuracy": 0.8877870365977287,
"num_tokens": 312626002.0,
"step": 727
},
{
"entropy": 0.38275146484375,
"epoch": 2.888888888888889,
"grad_norm": 0.6398028531474257,
"learning_rate": 8.039593164302362e-08,
"loss": 0.3138,
"mean_token_accuracy": 0.8863641833886504,
"num_tokens": 313055977.0,
"step": 728
},
{
"entropy": 0.3807373046875,
"epoch": 2.892857142857143,
"grad_norm": 0.616805437056275,
"learning_rate": 7.495379786278456e-08,
"loss": 0.302,
"mean_token_accuracy": 0.8888390958309174,
"num_tokens": 313491837.0,
"step": 729
},
{
"entropy": 0.381591796875,
"epoch": 2.8968253968253967,
"grad_norm": 0.6015916324330552,
"learning_rate": 6.970167681405459e-08,
"loss": 0.3251,
"mean_token_accuracy": 0.8857261892408133,
"num_tokens": 313934488.0,
"step": 730
},
{
"entropy": 0.383575439453125,
"epoch": 2.9007936507936507,
"grad_norm": 0.6021514631896276,
"learning_rate": 6.463966904748487e-08,
"loss": 0.308,
"mean_token_accuracy": 0.8882989063858986,
"num_tokens": 314354323.0,
"step": 731
},
{
"entropy": 0.38092041015625,
"epoch": 2.9047619047619047,
"grad_norm": 0.6035366486394909,
"learning_rate": 5.97678714740535e-08,
"loss": 0.3191,
"mean_token_accuracy": 0.8849895298480988,
"num_tokens": 314782888.0,
"step": 732
},
{
"entropy": 0.380157470703125,
"epoch": 2.9087301587301586,
"grad_norm": 0.6246810880941083,
"learning_rate": 5.508637736320488e-08,
"loss": 0.3026,
"mean_token_accuracy": 0.8905821247026324,
"num_tokens": 315218545.0,
"step": 733
},
{
"entropy": 0.37481689453125,
"epoch": 2.9126984126984126,
"grad_norm": 0.5742999768873462,
"learning_rate": 5.0595276341071084e-08,
"loss": 0.2928,
"mean_token_accuracy": 0.8928123638033867,
"num_tokens": 315667111.0,
"step": 734
},
{
"entropy": 0.387847900390625,
"epoch": 2.9166666666666665,
"grad_norm": 0.61460809122353,
"learning_rate": 4.62946543887488e-08,
"loss": 0.3122,
"mean_token_accuracy": 0.8853531358763576,
"num_tokens": 316078411.0,
"step": 735
},
{
"entropy": 0.378631591796875,
"epoch": 2.9206349206349205,
"grad_norm": 0.6260179342418157,
"learning_rate": 4.218459384065954e-08,
"loss": 0.3127,
"mean_token_accuracy": 0.8865446662530303,
"num_tokens": 316519645.0,
"step": 736
},
{
"entropy": 0.38299560546875,
"epoch": 2.924603174603175,
"grad_norm": 0.6528580905521568,
"learning_rate": 3.826517338296865e-08,
"loss": 0.3214,
"mean_token_accuracy": 0.886158674955368,
"num_tokens": 316948710.0,
"step": 737
},
{
"entropy": 0.380340576171875,
"epoch": 2.928571428571429,
"grad_norm": 0.5932548971126642,
"learning_rate": 3.4536468052082106e-08,
"loss": 0.2991,
"mean_token_accuracy": 0.8918010191991925,
"num_tokens": 317376914.0,
"step": 738
},
{
"entropy": 0.382415771484375,
"epoch": 2.932539682539683,
"grad_norm": 0.6263098885743876,
"learning_rate": 3.0998549233205446e-08,
"loss": 0.3014,
"mean_token_accuracy": 0.8899451838806272,
"num_tokens": 317783718.0,
"step": 739
},
{
"entropy": 0.38323974609375,
"epoch": 2.9365079365079367,
"grad_norm": 0.6280292549269856,
"learning_rate": 2.7651484658984816e-08,
"loss": 0.3263,
"mean_token_accuracy": 0.8853430952876806,
"num_tokens": 318210944.0,
"step": 740
},
{
"entropy": 0.380523681640625,
"epoch": 2.9404761904761907,
"grad_norm": 0.6199875922767187,
"learning_rate": 2.4495338408201397e-08,
"loss": 0.3206,
"mean_token_accuracy": 0.8864352721720934,
"num_tokens": 318636758.0,
"step": 741
},
{
"entropy": 0.37628173828125,
"epoch": 2.9444444444444446,
"grad_norm": 0.5975785528517842,
"learning_rate": 2.153017090455123e-08,
"loss": 0.3055,
"mean_token_accuracy": 0.8887868747115135,
"num_tokens": 319072977.0,
"step": 742
},
{
"entropy": 0.383331298828125,
"epoch": 2.9484126984126986,
"grad_norm": 0.6002719426244223,
"learning_rate": 1.8756038915486165e-08,
"loss": 0.3097,
"mean_token_accuracy": 0.888092122040689,
"num_tokens": 319491098.0,
"step": 743
},
{
"entropy": 0.376800537109375,
"epoch": 2.9523809523809526,
"grad_norm": 0.6400387970423335,
"learning_rate": 1.6172995551125836e-08,
"loss": 0.3248,
"mean_token_accuracy": 0.8845498086884618,
"num_tokens": 319936836.0,
"step": 744
},
{
"entropy": 0.376953125,
"epoch": 2.9563492063492065,
"grad_norm": 0.6271748530387473,
"learning_rate": 1.3781090263242924e-08,
"loss": 0.3167,
"mean_token_accuracy": 0.887404091656208,
"num_tokens": 320379711.0,
"step": 745
},
{
"entropy": 0.379180908203125,
"epoch": 2.9603174603174605,
"grad_norm": 0.6066932052254789,
"learning_rate": 1.1580368844316125e-08,
"loss": 0.2976,
"mean_token_accuracy": 0.8918975051492453,
"num_tokens": 320807541.0,
"step": 746
},
{
"entropy": 0.380401611328125,
"epoch": 2.9642857142857144,
"grad_norm": 0.6315981302947575,
"learning_rate": 9.570873426649752e-09,
"loss": 0.3099,
"mean_token_accuracy": 0.8889196058735251,
"num_tokens": 321249293.0,
"step": 747
},
{
"entropy": 0.381500244140625,
"epoch": 2.9682539682539684,
"grad_norm": 0.5833316927135702,
"learning_rate": 7.752642481573258e-09,
"loss": 0.3042,
"mean_token_accuracy": 0.8893669536337256,
"num_tokens": 321676330.0,
"step": 748
},
{
"entropy": 0.379302978515625,
"epoch": 2.9722222222222223,
"grad_norm": 0.6084320256550227,
"learning_rate": 6.125710818701836e-09,
"loss": 0.3034,
"mean_token_accuracy": 0.8921427316963673,
"num_tokens": 322088977.0,
"step": 749
},
{
"entropy": 0.3780517578125,
"epoch": 2.9761904761904763,
"grad_norm": 0.6035928268017802,
"learning_rate": 4.690109585268054e-09,
"loss": 0.3061,
"mean_token_accuracy": 0.8898161184042692,
"num_tokens": 322526195.0,
"step": 750
},
{
"entropy": 0.38763427734375,
"epoch": 2.9801587301587302,
"grad_norm": 0.6190270535862277,
"learning_rate": 3.445866265526787e-09,
"loss": 0.3177,
"mean_token_accuracy": 0.8873382732272148,
"num_tokens": 322944491.0,
"step": 751
},
{
"entropy": 0.38092041015625,
"epoch": 2.984126984126984,
"grad_norm": 0.6117829785012998,
"learning_rate": 2.3930046802322914e-09,
"loss": 0.3148,
"mean_token_accuracy": 0.885888421908021,
"num_tokens": 323373321.0,
"step": 752
},
{
"entropy": 0.3792724609375,
"epoch": 2.988095238095238,
"grad_norm": 0.588266741951019,
"learning_rate": 1.531544986177469e-09,
"loss": 0.3041,
"mean_token_accuracy": 0.8897914877161384,
"num_tokens": 323803097.0,
"step": 753
},
{
"entropy": 0.37762451171875,
"epoch": 2.992063492063492,
"grad_norm": 0.5894056132295504,
"learning_rate": 8.615036758108375e-10,
"loss": 0.2976,
"mean_token_accuracy": 0.890996178612113,
"num_tokens": 324241487.0,
"step": 754
},
{
"entropy": 0.376739501953125,
"epoch": 2.996031746031746,
"grad_norm": 0.6741605494505846,
"learning_rate": 3.8289357691900785e-10,
"loss": 0.314,
"mean_token_accuracy": 0.8861532881855965,
"num_tokens": 324694260.0,
"step": 755
},
{
"entropy": 0.38372802734375,
"epoch": 3.0,
"grad_norm": 0.6110340309341243,
"learning_rate": 9.572385238243443e-11,
"loss": 0.314,
"mean_token_accuracy": 0.8879904169589281,
"num_tokens": 325114310.0,
"step": 756
},
{
"epoch": 3.0,
"step": 756,
"total_flos": 601237772369920.0,
"train_loss": 0.43782205426346055,
"train_runtime": 58008.3635,
"train_samples_per_second": 1.27,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 756,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 63,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 601237772369920.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}