7604 lines
214 KiB
JSON
7604 lines
214 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 756,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.5635986328125,
|
|
"epoch": 0.003968253968253968,
|
|
"grad_norm": 5.863266347518074,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.3929,
|
|
"mean_token_accuracy": 0.6520986258983612,
|
|
"num_tokens": 436822.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 0.571868896484375,
|
|
"epoch": 0.007936507936507936,
|
|
"grad_norm": 5.943034119425208,
|
|
"learning_rate": 5.263157894736843e-07,
|
|
"loss": 1.3984,
|
|
"mean_token_accuracy": 0.6573778251186013,
|
|
"num_tokens": 849869.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 0.571136474609375,
|
|
"epoch": 0.011904761904761904,
|
|
"grad_norm": 5.9888348315878766,
|
|
"learning_rate": 1.0526315789473685e-06,
|
|
"loss": 1.4019,
|
|
"mean_token_accuracy": 0.6531417248770595,
|
|
"num_tokens": 1257883.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 0.56817626953125,
|
|
"epoch": 0.015873015873015872,
|
|
"grad_norm": 5.819974339041837,
|
|
"learning_rate": 1.5789473684210526e-06,
|
|
"loss": 1.3961,
|
|
"mean_token_accuracy": 0.6506756190210581,
|
|
"num_tokens": 1710146.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 0.563323974609375,
|
|
"epoch": 0.01984126984126984,
|
|
"grad_norm": 5.674858276690005,
|
|
"learning_rate": 2.105263157894737e-06,
|
|
"loss": 1.3737,
|
|
"mean_token_accuracy": 0.6581529462710023,
|
|
"num_tokens": 2138902.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 0.583770751953125,
|
|
"epoch": 0.023809523809523808,
|
|
"grad_norm": 5.287171943303066,
|
|
"learning_rate": 2.631578947368421e-06,
|
|
"loss": 1.3562,
|
|
"mean_token_accuracy": 0.6605429640039802,
|
|
"num_tokens": 2560005.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 0.5577392578125,
|
|
"epoch": 0.027777777777777776,
|
|
"grad_norm": 4.925117375179032,
|
|
"learning_rate": 3.157894736842105e-06,
|
|
"loss": 1.3458,
|
|
"mean_token_accuracy": 0.6580116618424654,
|
|
"num_tokens": 3004121.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 0.57269287109375,
|
|
"epoch": 0.031746031746031744,
|
|
"grad_norm": 4.510966194233729,
|
|
"learning_rate": 3.6842105263157896e-06,
|
|
"loss": 1.303,
|
|
"mean_token_accuracy": 0.6702660601586103,
|
|
"num_tokens": 3457966.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 0.56524658203125,
|
|
"epoch": 0.03571428571428571,
|
|
"grad_norm": 4.257257337401794,
|
|
"learning_rate": 4.210526315789474e-06,
|
|
"loss": 1.2854,
|
|
"mean_token_accuracy": 0.6731634242460132,
|
|
"num_tokens": 3902759.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 0.585052490234375,
|
|
"epoch": 0.03968253968253968,
|
|
"grad_norm": 3.560753211544556,
|
|
"learning_rate": 4.736842105263158e-06,
|
|
"loss": 1.1912,
|
|
"mean_token_accuracy": 0.6834962824359536,
|
|
"num_tokens": 4321827.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.56036376953125,
|
|
"epoch": 0.04365079365079365,
|
|
"grad_norm": 3.440982638815655,
|
|
"learning_rate": 5.263157894736842e-06,
|
|
"loss": 1.129,
|
|
"mean_token_accuracy": 0.7041243137791753,
|
|
"num_tokens": 4748195.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 0.5570068359375,
|
|
"epoch": 0.047619047619047616,
|
|
"grad_norm": 3.196606172568719,
|
|
"learning_rate": 5.789473684210527e-06,
|
|
"loss": 1.128,
|
|
"mean_token_accuracy": 0.7001799792051315,
|
|
"num_tokens": 5188122.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 0.53070068359375,
|
|
"epoch": 0.051587301587301584,
|
|
"grad_norm": 4.635446866713048,
|
|
"learning_rate": 6.31578947368421e-06,
|
|
"loss": 1.0401,
|
|
"mean_token_accuracy": 0.7150256410241127,
|
|
"num_tokens": 5615040.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 0.53387451171875,
|
|
"epoch": 0.05555555555555555,
|
|
"grad_norm": 4.895076624528778,
|
|
"learning_rate": 6.842105263157896e-06,
|
|
"loss": 1.029,
|
|
"mean_token_accuracy": 0.7148290146142244,
|
|
"num_tokens": 6042413.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 0.546905517578125,
|
|
"epoch": 0.05952380952380952,
|
|
"grad_norm": 3.838947346620084,
|
|
"learning_rate": 7.368421052631579e-06,
|
|
"loss": 0.9875,
|
|
"mean_token_accuracy": 0.725364712998271,
|
|
"num_tokens": 6468019.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 0.54766845703125,
|
|
"epoch": 0.06349206349206349,
|
|
"grad_norm": 3.52284752091451,
|
|
"learning_rate": 7.894736842105265e-06,
|
|
"loss": 0.9743,
|
|
"mean_token_accuracy": 0.7267373474314809,
|
|
"num_tokens": 6898441.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 0.5467529296875,
|
|
"epoch": 0.06746031746031746,
|
|
"grad_norm": 3.0694966842969307,
|
|
"learning_rate": 8.421052631578948e-06,
|
|
"loss": 0.918,
|
|
"mean_token_accuracy": 0.7363277673721313,
|
|
"num_tokens": 7333054.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 0.5263671875,
|
|
"epoch": 0.07142857142857142,
|
|
"grad_norm": 3.842330950557049,
|
|
"learning_rate": 8.947368421052632e-06,
|
|
"loss": 0.9026,
|
|
"mean_token_accuracy": 0.7431545937433839,
|
|
"num_tokens": 7794638.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 0.5352783203125,
|
|
"epoch": 0.07539682539682539,
|
|
"grad_norm": 3.361580339127847,
|
|
"learning_rate": 9.473684210526315e-06,
|
|
"loss": 0.9166,
|
|
"mean_token_accuracy": 0.7366319699212909,
|
|
"num_tokens": 8237624.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 0.5377197265625,
|
|
"epoch": 0.07936507936507936,
|
|
"grad_norm": 2.6450589328361254,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.8844,
|
|
"mean_token_accuracy": 0.7442264417186379,
|
|
"num_tokens": 8673402.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.5318603515625,
|
|
"epoch": 0.08333333333333333,
|
|
"grad_norm": 2.5182823829653525,
|
|
"learning_rate": 1.0526315789473684e-05,
|
|
"loss": 0.8645,
|
|
"mean_token_accuracy": 0.74986263923347,
|
|
"num_tokens": 9121387.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 0.547149658203125,
|
|
"epoch": 0.0873015873015873,
|
|
"grad_norm": 2.169285955075468,
|
|
"learning_rate": 1.105263157894737e-05,
|
|
"loss": 0.8049,
|
|
"mean_token_accuracy": 0.7641561925411224,
|
|
"num_tokens": 9525436.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 0.54248046875,
|
|
"epoch": 0.09126984126984126,
|
|
"grad_norm": 2.2658426207555955,
|
|
"learning_rate": 1.1578947368421053e-05,
|
|
"loss": 0.8038,
|
|
"mean_token_accuracy": 0.7617505192756653,
|
|
"num_tokens": 9932011.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 0.53009033203125,
|
|
"epoch": 0.09523809523809523,
|
|
"grad_norm": 2.082747079461424,
|
|
"learning_rate": 1.2105263157894737e-05,
|
|
"loss": 0.797,
|
|
"mean_token_accuracy": 0.7638106672093272,
|
|
"num_tokens": 10358777.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 0.535430908203125,
|
|
"epoch": 0.0992063492063492,
|
|
"grad_norm": 2.1722194195956828,
|
|
"learning_rate": 1.263157894736842e-05,
|
|
"loss": 0.7812,
|
|
"mean_token_accuracy": 0.7707258444279432,
|
|
"num_tokens": 10773051.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 0.521759033203125,
|
|
"epoch": 0.10317460317460317,
|
|
"grad_norm": 2.2209577423566316,
|
|
"learning_rate": 1.3157894736842108e-05,
|
|
"loss": 0.7645,
|
|
"mean_token_accuracy": 0.7706983601674438,
|
|
"num_tokens": 11211677.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 0.52227783203125,
|
|
"epoch": 0.10714285714285714,
|
|
"grad_norm": 1.8746595697038755,
|
|
"learning_rate": 1.3684210526315791e-05,
|
|
"loss": 0.7243,
|
|
"mean_token_accuracy": 0.7795716691762209,
|
|
"num_tokens": 11628779.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 0.512603759765625,
|
|
"epoch": 0.1111111111111111,
|
|
"grad_norm": 1.9181519214362959,
|
|
"learning_rate": 1.4210526315789475e-05,
|
|
"loss": 0.7589,
|
|
"mean_token_accuracy": 0.7727855974808335,
|
|
"num_tokens": 12067363.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 0.499237060546875,
|
|
"epoch": 0.11507936507936507,
|
|
"grad_norm": 1.8295943705658484,
|
|
"learning_rate": 1.4736842105263159e-05,
|
|
"loss": 0.7531,
|
|
"mean_token_accuracy": 0.7741835163906217,
|
|
"num_tokens": 12507159.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 0.499603271484375,
|
|
"epoch": 0.11904761904761904,
|
|
"grad_norm": 1.6179644350101932,
|
|
"learning_rate": 1.5263157894736846e-05,
|
|
"loss": 0.7344,
|
|
"mean_token_accuracy": 0.7773478422313929,
|
|
"num_tokens": 12945458.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.51458740234375,
|
|
"epoch": 0.12301587301587301,
|
|
"grad_norm": 1.8610284952179306,
|
|
"learning_rate": 1.578947368421053e-05,
|
|
"loss": 0.7118,
|
|
"mean_token_accuracy": 0.7809475539252162,
|
|
"num_tokens": 13370514.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 0.497283935546875,
|
|
"epoch": 0.12698412698412698,
|
|
"grad_norm": 1.8670029587267238,
|
|
"learning_rate": 1.6315789473684213e-05,
|
|
"loss": 0.7114,
|
|
"mean_token_accuracy": 0.7820066763088107,
|
|
"num_tokens": 13815066.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 0.503814697265625,
|
|
"epoch": 0.13095238095238096,
|
|
"grad_norm": 1.7232235737579489,
|
|
"learning_rate": 1.6842105263157896e-05,
|
|
"loss": 0.7034,
|
|
"mean_token_accuracy": 0.7832089820876718,
|
|
"num_tokens": 14240312.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 0.49395751953125,
|
|
"epoch": 0.1349206349206349,
|
|
"grad_norm": 1.5626721336124045,
|
|
"learning_rate": 1.736842105263158e-05,
|
|
"loss": 0.6996,
|
|
"mean_token_accuracy": 0.785805162973702,
|
|
"num_tokens": 14685173.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 0.499114990234375,
|
|
"epoch": 0.1388888888888889,
|
|
"grad_norm": 1.5550361833914306,
|
|
"learning_rate": 1.7894736842105264e-05,
|
|
"loss": 0.6808,
|
|
"mean_token_accuracy": 0.7907448643818498,
|
|
"num_tokens": 15099914.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 0.494049072265625,
|
|
"epoch": 0.14285714285714285,
|
|
"grad_norm": 1.6976994127074492,
|
|
"learning_rate": 1.8421052631578947e-05,
|
|
"loss": 0.6869,
|
|
"mean_token_accuracy": 0.7874529659748077,
|
|
"num_tokens": 15522062.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 0.49200439453125,
|
|
"epoch": 0.14682539682539683,
|
|
"grad_norm": 1.6731263798294116,
|
|
"learning_rate": 1.894736842105263e-05,
|
|
"loss": 0.6716,
|
|
"mean_token_accuracy": 0.7911530267447233,
|
|
"num_tokens": 15955138.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 0.494598388671875,
|
|
"epoch": 0.15079365079365079,
|
|
"grad_norm": 1.6340216417965139,
|
|
"learning_rate": 1.9473684210526318e-05,
|
|
"loss": 0.6583,
|
|
"mean_token_accuracy": 0.7931346474215388,
|
|
"num_tokens": 16388252.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 0.4937744140625,
|
|
"epoch": 0.15476190476190477,
|
|
"grad_norm": 1.5462189800304016,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.6748,
|
|
"mean_token_accuracy": 0.7880920702591538,
|
|
"num_tokens": 16821287.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 0.4898681640625,
|
|
"epoch": 0.15873015873015872,
|
|
"grad_norm": 1.5616598366947274,
|
|
"learning_rate": 1.999990427614762e-05,
|
|
"loss": 0.6311,
|
|
"mean_token_accuracy": 0.7991781169548631,
|
|
"num_tokens": 17235205.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.48675537109375,
|
|
"epoch": 0.1626984126984127,
|
|
"grad_norm": 1.646272088858081,
|
|
"learning_rate": 1.999961710642308e-05,
|
|
"loss": 0.65,
|
|
"mean_token_accuracy": 0.7964185178279877,
|
|
"num_tokens": 17657156.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 0.4864501953125,
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 1.7240469107631835,
|
|
"learning_rate": 1.999913849632419e-05,
|
|
"loss": 0.6519,
|
|
"mean_token_accuracy": 0.7951665250584483,
|
|
"num_tokens": 18090069.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 0.480987548828125,
|
|
"epoch": 0.17063492063492064,
|
|
"grad_norm": 1.6378737913794217,
|
|
"learning_rate": 1.9998468455013825e-05,
|
|
"loss": 0.6596,
|
|
"mean_token_accuracy": 0.7942898478358984,
|
|
"num_tokens": 18542578.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 0.494598388671875,
|
|
"epoch": 0.1746031746031746,
|
|
"grad_norm": 1.5436306701146438,
|
|
"learning_rate": 1.999760699531977e-05,
|
|
"loss": 0.6298,
|
|
"mean_token_accuracy": 0.8005202021449804,
|
|
"num_tokens": 18955962.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 0.49298095703125,
|
|
"epoch": 0.17857142857142858,
|
|
"grad_norm": 1.4015532613832098,
|
|
"learning_rate": 1.9996554133734473e-05,
|
|
"loss": 0.6231,
|
|
"mean_token_accuracy": 0.8032774887979031,
|
|
"num_tokens": 19379353.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 0.49383544921875,
|
|
"epoch": 0.18253968253968253,
|
|
"grad_norm": 1.3858501151489995,
|
|
"learning_rate": 1.9995309890414735e-05,
|
|
"loss": 0.6116,
|
|
"mean_token_accuracy": 0.8062875410541892,
|
|
"num_tokens": 19812261.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 0.5113525390625,
|
|
"epoch": 0.1865079365079365,
|
|
"grad_norm": 1.4607281864296495,
|
|
"learning_rate": 1.99938742891813e-05,
|
|
"loss": 0.6135,
|
|
"mean_token_accuracy": 0.8027496039867401,
|
|
"num_tokens": 20210235.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 0.4896240234375,
|
|
"epoch": 0.19047619047619047,
|
|
"grad_norm": 1.532717597565806,
|
|
"learning_rate": 1.9992247357518428e-05,
|
|
"loss": 0.619,
|
|
"mean_token_accuracy": 0.8026178050786257,
|
|
"num_tokens": 20647709.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 0.491119384765625,
|
|
"epoch": 0.19444444444444445,
|
|
"grad_norm": 1.4410103415538014,
|
|
"learning_rate": 1.9990429126573353e-05,
|
|
"loss": 0.6121,
|
|
"mean_token_accuracy": 0.8036608071997762,
|
|
"num_tokens": 21065897.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 0.499908447265625,
|
|
"epoch": 0.1984126984126984,
|
|
"grad_norm": 1.563783097879555,
|
|
"learning_rate": 1.9988419631155686e-05,
|
|
"loss": 0.6192,
|
|
"mean_token_accuracy": 0.8009361904114485,
|
|
"num_tokens": 21486371.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.50067138671875,
|
|
"epoch": 0.20238095238095238,
|
|
"grad_norm": 1.725707634668624,
|
|
"learning_rate": 1.9986218909736758e-05,
|
|
"loss": 0.6139,
|
|
"mean_token_accuracy": 0.8020936474204063,
|
|
"num_tokens": 21909309.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 0.504180908203125,
|
|
"epoch": 0.20634920634920634,
|
|
"grad_norm": 1.3228518418004556,
|
|
"learning_rate": 1.9983827004448875e-05,
|
|
"loss": 0.6003,
|
|
"mean_token_accuracy": 0.8073256686329842,
|
|
"num_tokens": 22318414.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 0.500946044921875,
|
|
"epoch": 0.21031746031746032,
|
|
"grad_norm": 1.720867937204593,
|
|
"learning_rate": 1.9981243961084516e-05,
|
|
"loss": 0.5856,
|
|
"mean_token_accuracy": 0.8099770434200764,
|
|
"num_tokens": 22719471.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 0.49346923828125,
|
|
"epoch": 0.21428571428571427,
|
|
"grad_norm": 1.3398077758817923,
|
|
"learning_rate": 1.997846982909545e-05,
|
|
"loss": 0.591,
|
|
"mean_token_accuracy": 0.8068837188184261,
|
|
"num_tokens": 23134604.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 0.480987548828125,
|
|
"epoch": 0.21825396825396826,
|
|
"grad_norm": 1.4838615278854144,
|
|
"learning_rate": 1.99755046615918e-05,
|
|
"loss": 0.5942,
|
|
"mean_token_accuracy": 0.8071342799812555,
|
|
"num_tokens": 23571548.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 0.4854736328125,
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 1.4492711420273152,
|
|
"learning_rate": 1.9972348515341018e-05,
|
|
"loss": 0.6042,
|
|
"mean_token_accuracy": 0.8067643223330379,
|
|
"num_tokens": 24009770.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 0.484954833984375,
|
|
"epoch": 0.2261904761904762,
|
|
"grad_norm": 1.256987618745555,
|
|
"learning_rate": 1.9969001450766795e-05,
|
|
"loss": 0.6043,
|
|
"mean_token_accuracy": 0.805039519444108,
|
|
"num_tokens": 24447544.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 0.495086669921875,
|
|
"epoch": 0.23015873015873015,
|
|
"grad_norm": 1.3407462818472953,
|
|
"learning_rate": 1.996546353194792e-05,
|
|
"loss": 0.5971,
|
|
"mean_token_accuracy": 0.8080601003021002,
|
|
"num_tokens": 24869806.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 0.473907470703125,
|
|
"epoch": 0.23412698412698413,
|
|
"grad_norm": 1.3879658160131176,
|
|
"learning_rate": 1.9961734826617033e-05,
|
|
"loss": 0.6012,
|
|
"mean_token_accuracy": 0.8073396803811193,
|
|
"num_tokens": 25337885.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 0.47698974609375,
|
|
"epoch": 0.23809523809523808,
|
|
"grad_norm": 1.2797357075630325,
|
|
"learning_rate": 1.9957815406159344e-05,
|
|
"loss": 0.586,
|
|
"mean_token_accuracy": 0.8098131148144603,
|
|
"num_tokens": 25762999.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.480926513671875,
|
|
"epoch": 0.24206349206349206,
|
|
"grad_norm": 1.492286782191842,
|
|
"learning_rate": 1.995370534561125e-05,
|
|
"loss": 0.5819,
|
|
"mean_token_accuracy": 0.8118512397632003,
|
|
"num_tokens": 26204122.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 0.46820068359375,
|
|
"epoch": 0.24603174603174602,
|
|
"grad_norm": 1.2844128245812518,
|
|
"learning_rate": 1.994940472365893e-05,
|
|
"loss": 0.567,
|
|
"mean_token_accuracy": 0.8152421358972788,
|
|
"num_tokens": 26651412.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 0.481475830078125,
|
|
"epoch": 0.25,
|
|
"grad_norm": 1.1956700400619351,
|
|
"learning_rate": 1.9944913622636798e-05,
|
|
"loss": 0.5794,
|
|
"mean_token_accuracy": 0.8128518350422382,
|
|
"num_tokens": 27080137.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 0.470306396484375,
|
|
"epoch": 0.25396825396825395,
|
|
"grad_norm": 1.4972705474023489,
|
|
"learning_rate": 1.994023212852595e-05,
|
|
"loss": 0.5791,
|
|
"mean_token_accuracy": 0.8129686992615461,
|
|
"num_tokens": 27520069.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 0.47222900390625,
|
|
"epoch": 0.25793650793650796,
|
|
"grad_norm": 1.4814711931573392,
|
|
"learning_rate": 1.993536033095252e-05,
|
|
"loss": 0.5814,
|
|
"mean_token_accuracy": 0.8099355883896351,
|
|
"num_tokens": 27954154.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 0.48028564453125,
|
|
"epoch": 0.2619047619047619,
|
|
"grad_norm": 1.546117863417093,
|
|
"learning_rate": 1.9930298323185945e-05,
|
|
"loss": 0.5858,
|
|
"mean_token_accuracy": 0.809964569285512,
|
|
"num_tokens": 28367541.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 0.4788818359375,
|
|
"epoch": 0.26587301587301587,
|
|
"grad_norm": 1.3505593123937785,
|
|
"learning_rate": 1.9925046202137215e-05,
|
|
"loss": 0.5555,
|
|
"mean_token_accuracy": 0.815067121759057,
|
|
"num_tokens": 28779140.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 0.4736328125,
|
|
"epoch": 0.2698412698412698,
|
|
"grad_norm": 1.5267113437685296,
|
|
"learning_rate": 1.9919604068356978e-05,
|
|
"loss": 0.5792,
|
|
"mean_token_accuracy": 0.8099933639168739,
|
|
"num_tokens": 29217570.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 0.4664306640625,
|
|
"epoch": 0.27380952380952384,
|
|
"grad_norm": 1.2597555449754034,
|
|
"learning_rate": 1.991397202603363e-05,
|
|
"loss": 0.5556,
|
|
"mean_token_accuracy": 0.8194932043552399,
|
|
"num_tokens": 29656943.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 0.46923828125,
|
|
"epoch": 0.2777777777777778,
|
|
"grad_norm": 1.4010912673765619,
|
|
"learning_rate": 1.9908150182991338e-05,
|
|
"loss": 0.5801,
|
|
"mean_token_accuracy": 0.8125716717913747,
|
|
"num_tokens": 30088869.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.460540771484375,
|
|
"epoch": 0.28174603174603174,
|
|
"grad_norm": 1.10011210824897,
|
|
"learning_rate": 1.9902138650687943e-05,
|
|
"loss": 0.5787,
|
|
"mean_token_accuracy": 0.8111176574602723,
|
|
"num_tokens": 30550317.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 0.466552734375,
|
|
"epoch": 0.2857142857142857,
|
|
"grad_norm": 1.4150609712955267,
|
|
"learning_rate": 1.9895937544212856e-05,
|
|
"loss": 0.5603,
|
|
"mean_token_accuracy": 0.816674031317234,
|
|
"num_tokens": 30959821.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 0.462890625,
|
|
"epoch": 0.2896825396825397,
|
|
"grad_norm": 1.2951602254642691,
|
|
"learning_rate": 1.9889546982284833e-05,
|
|
"loss": 0.58,
|
|
"mean_token_accuracy": 0.8122155498713255,
|
|
"num_tokens": 31412639.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 0.47247314453125,
|
|
"epoch": 0.29365079365079366,
|
|
"grad_norm": 1.4230925187441965,
|
|
"learning_rate": 1.988296708724972e-05,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8140842840075493,
|
|
"num_tokens": 31830767.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 0.468292236328125,
|
|
"epoch": 0.2976190476190476,
|
|
"grad_norm": 1.346368649645527,
|
|
"learning_rate": 1.987619798507809e-05,
|
|
"loss": 0.5668,
|
|
"mean_token_accuracy": 0.8116888217628002,
|
|
"num_tokens": 32267329.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 0.470458984375,
|
|
"epoch": 0.30158730158730157,
|
|
"grad_norm": 1.2828384645809423,
|
|
"learning_rate": 1.986923980536286e-05,
|
|
"loss": 0.5668,
|
|
"mean_token_accuracy": 0.8176631266251206,
|
|
"num_tokens": 32692726.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 0.47137451171875,
|
|
"epoch": 0.3055555555555556,
|
|
"grad_norm": 1.2453331184538057,
|
|
"learning_rate": 1.9862092681316774e-05,
|
|
"loss": 0.5442,
|
|
"mean_token_accuracy": 0.8217762364074588,
|
|
"num_tokens": 33108936.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 0.453704833984375,
|
|
"epoch": 0.30952380952380953,
|
|
"grad_norm": 1.138281360826344,
|
|
"learning_rate": 1.9854756749769893e-05,
|
|
"loss": 0.5451,
|
|
"mean_token_accuracy": 0.8192528188228607,
|
|
"num_tokens": 33543076.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 0.455780029296875,
|
|
"epoch": 0.3134920634920635,
|
|
"grad_norm": 1.3113762371169908,
|
|
"learning_rate": 1.984723215116693e-05,
|
|
"loss": 0.559,
|
|
"mean_token_accuracy": 0.8173351967707276,
|
|
"num_tokens": 33980897.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 0.461700439453125,
|
|
"epoch": 0.31746031746031744,
|
|
"grad_norm": 1.231273184699265,
|
|
"learning_rate": 1.9839519029564608e-05,
|
|
"loss": 0.5545,
|
|
"mean_token_accuracy": 0.8189000273123384,
|
|
"num_tokens": 34404352.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.476165771484375,
|
|
"epoch": 0.32142857142857145,
|
|
"grad_norm": 1.3255041013441944,
|
|
"learning_rate": 1.983161753262886e-05,
|
|
"loss": 0.5591,
|
|
"mean_token_accuracy": 0.8155939728021622,
|
|
"num_tokens": 34815122.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 0.479095458984375,
|
|
"epoch": 0.3253968253968254,
|
|
"grad_norm": 1.144243415774409,
|
|
"learning_rate": 1.982352781163204e-05,
|
|
"loss": 0.5569,
|
|
"mean_token_accuracy": 0.8167071230709553,
|
|
"num_tokens": 35236933.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 0.468017578125,
|
|
"epoch": 0.32936507936507936,
|
|
"grad_norm": 1.3470659575750075,
|
|
"learning_rate": 1.9815250021449998e-05,
|
|
"loss": 0.5557,
|
|
"mean_token_accuracy": 0.8185685835778713,
|
|
"num_tokens": 35664506.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 0.46673583984375,
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 1.2993064977541513,
|
|
"learning_rate": 1.980678432055913e-05,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8149783732369542,
|
|
"num_tokens": 36088050.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 0.472625732421875,
|
|
"epoch": 0.3373015873015873,
|
|
"grad_norm": 1.260442517304779,
|
|
"learning_rate": 1.9798130871033322e-05,
|
|
"loss": 0.5511,
|
|
"mean_token_accuracy": 0.818051096983254,
|
|
"num_tokens": 36501235.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 0.455963134765625,
|
|
"epoch": 0.3412698412698413,
|
|
"grad_norm": 1.2542494850010633,
|
|
"learning_rate": 1.9789289838540897e-05,
|
|
"loss": 0.554,
|
|
"mean_token_accuracy": 0.8185935420915484,
|
|
"num_tokens": 36942407.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 0.456634521484375,
|
|
"epoch": 0.34523809523809523,
|
|
"grad_norm": 1.3539802412376234,
|
|
"learning_rate": 1.9780261392341383e-05,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8191157821565866,
|
|
"num_tokens": 37381260.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 0.4647216796875,
|
|
"epoch": 0.3492063492063492,
|
|
"grad_norm": 1.2106246481197491,
|
|
"learning_rate": 1.9771045705282313e-05,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.8167914487421513,
|
|
"num_tokens": 37803882.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 0.4688720703125,
|
|
"epoch": 0.3531746031746032,
|
|
"grad_norm": 1.4482065495179155,
|
|
"learning_rate": 1.9761642953795896e-05,
|
|
"loss": 0.549,
|
|
"mean_token_accuracy": 0.8173990342766047,
|
|
"num_tokens": 38227635.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 0.464630126953125,
|
|
"epoch": 0.35714285714285715,
|
|
"grad_norm": 1.483279776234404,
|
|
"learning_rate": 1.975205331789566e-05,
|
|
"loss": 0.5646,
|
|
"mean_token_accuracy": 0.8164498396217823,
|
|
"num_tokens": 38667329.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.4603271484375,
|
|
"epoch": 0.3611111111111111,
|
|
"grad_norm": 1.1991682622509072,
|
|
"learning_rate": 1.9742276981172978e-05,
|
|
"loss": 0.5558,
|
|
"mean_token_accuracy": 0.816204615868628,
|
|
"num_tokens": 39107330.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 0.464508056640625,
|
|
"epoch": 0.36507936507936506,
|
|
"grad_norm": 1.0687403883123787,
|
|
"learning_rate": 1.973231413079357e-05,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.8214370720088482,
|
|
"num_tokens": 39524166.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 0.4556884765625,
|
|
"epoch": 0.36904761904761907,
|
|
"grad_norm": 1.212069109316228,
|
|
"learning_rate": 1.9722164957493925e-05,
|
|
"loss": 0.5358,
|
|
"mean_token_accuracy": 0.823941863141954,
|
|
"num_tokens": 39952174.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 0.45452880859375,
|
|
"epoch": 0.373015873015873,
|
|
"grad_norm": 1.1912124705725184,
|
|
"learning_rate": 1.971182965557763e-05,
|
|
"loss": 0.5462,
|
|
"mean_token_accuracy": 0.8201974583789706,
|
|
"num_tokens": 40389693.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 0.45831298828125,
|
|
"epoch": 0.376984126984127,
|
|
"grad_norm": 1.097985862600317,
|
|
"learning_rate": 1.9701308422911674e-05,
|
|
"loss": 0.5417,
|
|
"mean_token_accuracy": 0.8212789446115494,
|
|
"num_tokens": 40815730.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 0.472564697265625,
|
|
"epoch": 0.38095238095238093,
|
|
"grad_norm": 1.0506974161366276,
|
|
"learning_rate": 1.969060146092264e-05,
|
|
"loss": 0.5464,
|
|
"mean_token_accuracy": 0.821564057841897,
|
|
"num_tokens": 41231841.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 0.477783203125,
|
|
"epoch": 0.38492063492063494,
|
|
"grad_norm": 1.052268920405974,
|
|
"learning_rate": 1.967970897459286e-05,
|
|
"loss": 0.5561,
|
|
"mean_token_accuracy": 0.8177454238757491,
|
|
"num_tokens": 41650119.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 0.462188720703125,
|
|
"epoch": 0.3888888888888889,
|
|
"grad_norm": 1.07343802824294,
|
|
"learning_rate": 1.966863117245648e-05,
|
|
"loss": 0.5351,
|
|
"mean_token_accuracy": 0.8210580609738827,
|
|
"num_tokens": 42082897.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 0.47021484375,
|
|
"epoch": 0.39285714285714285,
|
|
"grad_norm": 0.9505719077699497,
|
|
"learning_rate": 1.9657368266595477e-05,
|
|
"loss": 0.5309,
|
|
"mean_token_accuracy": 0.8261024495586753,
|
|
"num_tokens": 42519361.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 0.468353271484375,
|
|
"epoch": 0.3968253968253968,
|
|
"grad_norm": 1.118827385837718,
|
|
"learning_rate": 1.964592047263561e-05,
|
|
"loss": 0.5289,
|
|
"mean_token_accuracy": 0.8250956162810326,
|
|
"num_tokens": 42941458.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.459075927734375,
|
|
"epoch": 0.4007936507936508,
|
|
"grad_norm": 1.0805587142732163,
|
|
"learning_rate": 1.9634288009742254e-05,
|
|
"loss": 0.5345,
|
|
"mean_token_accuracy": 0.822862328030169,
|
|
"num_tokens": 43370971.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 0.461395263671875,
|
|
"epoch": 0.40476190476190477,
|
|
"grad_norm": 1.11495068252665,
|
|
"learning_rate": 1.9622471100616253e-05,
|
|
"loss": 0.5358,
|
|
"mean_token_accuracy": 0.8217748673632741,
|
|
"num_tokens": 43801380.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 0.45855712890625,
|
|
"epoch": 0.4087301587301587,
|
|
"grad_norm": 1.140917585323956,
|
|
"learning_rate": 1.961046997148961e-05,
|
|
"loss": 0.5482,
|
|
"mean_token_accuracy": 0.8196941930800676,
|
|
"num_tokens": 44245066.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 0.47332763671875,
|
|
"epoch": 0.4126984126984127,
|
|
"grad_norm": 1.2030195624026125,
|
|
"learning_rate": 1.959828485212119e-05,
|
|
"loss": 0.546,
|
|
"mean_token_accuracy": 0.8202573778107762,
|
|
"num_tokens": 44671335.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 0.48388671875,
|
|
"epoch": 0.4166666666666667,
|
|
"grad_norm": 1.2332127945633589,
|
|
"learning_rate": 1.958591597579231e-05,
|
|
"loss": 0.5427,
|
|
"mean_token_accuracy": 0.8179315272718668,
|
|
"num_tokens": 45066930.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 0.463653564453125,
|
|
"epoch": 0.42063492063492064,
|
|
"grad_norm": 1.1113355845745616,
|
|
"learning_rate": 1.957336357930227e-05,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.8212415920570493,
|
|
"num_tokens": 45508166.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 0.458984375,
|
|
"epoch": 0.4246031746031746,
|
|
"grad_norm": 1.0863705725170332,
|
|
"learning_rate": 1.9560627902963808e-05,
|
|
"loss": 0.5484,
|
|
"mean_token_accuracy": 0.8195863580331206,
|
|
"num_tokens": 45960298.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 0.46014404296875,
|
|
"epoch": 0.42857142857142855,
|
|
"grad_norm": 1.0877758605686885,
|
|
"learning_rate": 1.9547709190598538e-05,
|
|
"loss": 0.5359,
|
|
"mean_token_accuracy": 0.8214817009866238,
|
|
"num_tokens": 46398974.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 0.451568603515625,
|
|
"epoch": 0.43253968253968256,
|
|
"grad_norm": 1.0427249445385431,
|
|
"learning_rate": 1.9534607689532236e-05,
|
|
"loss": 0.5438,
|
|
"mean_token_accuracy": 0.8201778931543231,
|
|
"num_tokens": 46837899.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 0.451446533203125,
|
|
"epoch": 0.4365079365079365,
|
|
"grad_norm": 1.175695289729488,
|
|
"learning_rate": 1.9521323650590135e-05,
|
|
"loss": 0.5498,
|
|
"mean_token_accuracy": 0.8169540381059051,
|
|
"num_tokens": 47311746.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.457611083984375,
|
|
"epoch": 0.44047619047619047,
|
|
"grad_norm": 1.1077962748274401,
|
|
"learning_rate": 1.950785732809211e-05,
|
|
"loss": 0.5247,
|
|
"mean_token_accuracy": 0.8267239183187485,
|
|
"num_tokens": 47744340.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"entropy": 0.45452880859375,
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 0.9399177463048327,
|
|
"learning_rate": 1.9494208979847814e-05,
|
|
"loss": 0.5204,
|
|
"mean_token_accuracy": 0.826555940322578,
|
|
"num_tokens": 48177761.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"entropy": 0.464630126953125,
|
|
"epoch": 0.44841269841269843,
|
|
"grad_norm": 1.1098464998441184,
|
|
"learning_rate": 1.9480378867151746e-05,
|
|
"loss": 0.5415,
|
|
"mean_token_accuracy": 0.8196114804595709,
|
|
"num_tokens": 48604700.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"entropy": 0.464874267578125,
|
|
"epoch": 0.4523809523809524,
|
|
"grad_norm": 1.0854538207166011,
|
|
"learning_rate": 1.9466367254778234e-05,
|
|
"loss": 0.5224,
|
|
"mean_token_accuracy": 0.8268638197332621,
|
|
"num_tokens": 49026375.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"entropy": 0.449798583984375,
|
|
"epoch": 0.45634920634920634,
|
|
"grad_norm": 1.17364696539316,
|
|
"learning_rate": 1.9452174410976383e-05,
|
|
"loss": 0.5367,
|
|
"mean_token_accuracy": 0.8208001255989075,
|
|
"num_tokens": 49467267.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 0.461151123046875,
|
|
"epoch": 0.4603174603174603,
|
|
"grad_norm": 0.9873427369389342,
|
|
"learning_rate": 1.943780060746493e-05,
|
|
"loss": 0.5353,
|
|
"mean_token_accuracy": 0.8211417645215988,
|
|
"num_tokens": 49889163.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"entropy": 0.462799072265625,
|
|
"epoch": 0.4642857142857143,
|
|
"grad_norm": 1.0735045578812574,
|
|
"learning_rate": 1.9423246119427044e-05,
|
|
"loss": 0.5143,
|
|
"mean_token_accuracy": 0.8246986707672477,
|
|
"num_tokens": 50300460.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"entropy": 0.454498291015625,
|
|
"epoch": 0.46825396825396826,
|
|
"grad_norm": 1.2028864034535025,
|
|
"learning_rate": 1.940851122550506e-05,
|
|
"loss": 0.536,
|
|
"mean_token_accuracy": 0.8228262066841125,
|
|
"num_tokens": 50735361.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"entropy": 0.458526611328125,
|
|
"epoch": 0.4722222222222222,
|
|
"grad_norm": 1.0533705874526058,
|
|
"learning_rate": 1.9393596207795135e-05,
|
|
"loss": 0.5221,
|
|
"mean_token_accuracy": 0.8254814920946956,
|
|
"num_tokens": 51149122.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"entropy": 0.453338623046875,
|
|
"epoch": 0.47619047619047616,
|
|
"grad_norm": 1.0854374369386737,
|
|
"learning_rate": 1.9378501351841864e-05,
|
|
"loss": 0.5267,
|
|
"mean_token_accuracy": 0.8232422964647412,
|
|
"num_tokens": 51597577.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.442718505859375,
|
|
"epoch": 0.4801587301587302,
|
|
"grad_norm": 0.9631721841884465,
|
|
"learning_rate": 1.93632269466328e-05,
|
|
"loss": 0.53,
|
|
"mean_token_accuracy": 0.8238179190084338,
|
|
"num_tokens": 52043666.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"entropy": 0.4471435546875,
|
|
"epoch": 0.48412698412698413,
|
|
"grad_norm": 1.0190972188396954,
|
|
"learning_rate": 1.934777328459292e-05,
|
|
"loss": 0.5315,
|
|
"mean_token_accuracy": 0.8235808834433556,
|
|
"num_tokens": 52482382.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"entropy": 0.45355224609375,
|
|
"epoch": 0.4880952380952381,
|
|
"grad_norm": 0.9437542356299639,
|
|
"learning_rate": 1.933214066157904e-05,
|
|
"loss": 0.5263,
|
|
"mean_token_accuracy": 0.8229744052514434,
|
|
"num_tokens": 52931115.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"entropy": 0.451507568359375,
|
|
"epoch": 0.49206349206349204,
|
|
"grad_norm": 1.0834515748734725,
|
|
"learning_rate": 1.9316329376874146e-05,
|
|
"loss": 0.53,
|
|
"mean_token_accuracy": 0.8217291543260217,
|
|
"num_tokens": 53370750.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"entropy": 0.45770263671875,
|
|
"epoch": 0.49603174603174605,
|
|
"grad_norm": 1.0001575216738139,
|
|
"learning_rate": 1.930033973318164e-05,
|
|
"loss": 0.5209,
|
|
"mean_token_accuracy": 0.8262900058180094,
|
|
"num_tokens": 53798951.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 0.446929931640625,
|
|
"epoch": 0.5,
|
|
"grad_norm": 0.984702450663036,
|
|
"learning_rate": 1.9284172036619597e-05,
|
|
"loss": 0.5122,
|
|
"mean_token_accuracy": 0.8294054577127099,
|
|
"num_tokens": 54235189.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"entropy": 0.447174072265625,
|
|
"epoch": 0.503968253968254,
|
|
"grad_norm": 0.9634718133480947,
|
|
"learning_rate": 1.926782659671484e-05,
|
|
"loss": 0.5118,
|
|
"mean_token_accuracy": 0.8281444823369384,
|
|
"num_tokens": 54642925.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"entropy": 0.452545166015625,
|
|
"epoch": 0.5079365079365079,
|
|
"grad_norm": 0.9876659774467254,
|
|
"learning_rate": 1.9251303726397076e-05,
|
|
"loss": 0.5216,
|
|
"mean_token_accuracy": 0.8251809384673834,
|
|
"num_tokens": 55066936.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"entropy": 0.4573974609375,
|
|
"epoch": 0.5119047619047619,
|
|
"grad_norm": 0.9632797656676275,
|
|
"learning_rate": 1.9234603741992864e-05,
|
|
"loss": 0.5165,
|
|
"mean_token_accuracy": 0.8273195894435048,
|
|
"num_tokens": 55484500.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"entropy": 0.44158935546875,
|
|
"epoch": 0.5158730158730159,
|
|
"grad_norm": 0.9175209493730221,
|
|
"learning_rate": 1.9217726963219567e-05,
|
|
"loss": 0.5209,
|
|
"mean_token_accuracy": 0.8260001242160797,
|
|
"num_tokens": 55922405.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.4527587890625,
|
|
"epoch": 0.5198412698412699,
|
|
"grad_norm": 1.062087131423626,
|
|
"learning_rate": 1.9200673713179245e-05,
|
|
"loss": 0.5299,
|
|
"mean_token_accuracy": 0.8199161011725664,
|
|
"num_tokens": 56355834.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"entropy": 0.452056884765625,
|
|
"epoch": 0.5238095238095238,
|
|
"grad_norm": 0.964250067282172,
|
|
"learning_rate": 1.9183444318352458e-05,
|
|
"loss": 0.509,
|
|
"mean_token_accuracy": 0.8293298603966832,
|
|
"num_tokens": 56770275.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"entropy": 0.44903564453125,
|
|
"epoch": 0.5277777777777778,
|
|
"grad_norm": 0.9853678148235807,
|
|
"learning_rate": 1.9166039108592008e-05,
|
|
"loss": 0.5289,
|
|
"mean_token_accuracy": 0.8244264824315906,
|
|
"num_tokens": 57203160.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"entropy": 0.44952392578125,
|
|
"epoch": 0.5317460317460317,
|
|
"grad_norm": 1.0255963315554393,
|
|
"learning_rate": 1.9148458417116645e-05,
|
|
"loss": 0.5217,
|
|
"mean_token_accuracy": 0.8221221547573805,
|
|
"num_tokens": 57627870.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"entropy": 0.44769287109375,
|
|
"epoch": 0.5357142857142857,
|
|
"grad_norm": 1.048457109780583,
|
|
"learning_rate": 1.9130702580504678e-05,
|
|
"loss": 0.5239,
|
|
"mean_token_accuracy": 0.8261872082948685,
|
|
"num_tokens": 58059792.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 0.452117919921875,
|
|
"epoch": 0.5396825396825397,
|
|
"grad_norm": 0.9254591817527784,
|
|
"learning_rate": 1.911277193868751e-05,
|
|
"loss": 0.5071,
|
|
"mean_token_accuracy": 0.8290363997220993,
|
|
"num_tokens": 58469884.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"entropy": 0.44451904296875,
|
|
"epoch": 0.5436507936507936,
|
|
"grad_norm": 1.0048803588236515,
|
|
"learning_rate": 1.9094666834943177e-05,
|
|
"loss": 0.5088,
|
|
"mean_token_accuracy": 0.8265325101092458,
|
|
"num_tokens": 58908403.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"entropy": 0.4510498046875,
|
|
"epoch": 0.5476190476190477,
|
|
"grad_norm": 0.8959703325073818,
|
|
"learning_rate": 1.9076387615889728e-05,
|
|
"loss": 0.509,
|
|
"mean_token_accuracy": 0.8270564498379827,
|
|
"num_tokens": 59323796.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"entropy": 0.4468994140625,
|
|
"epoch": 0.5515873015873016,
|
|
"grad_norm": 0.9671186022782653,
|
|
"learning_rate": 1.9057934631478616e-05,
|
|
"loss": 0.5097,
|
|
"mean_token_accuracy": 0.8285402255132794,
|
|
"num_tokens": 59751885.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"entropy": 0.44732666015625,
|
|
"epoch": 0.5555555555555556,
|
|
"grad_norm": 1.0314928737522524,
|
|
"learning_rate": 1.903930823498799e-05,
|
|
"loss": 0.4979,
|
|
"mean_token_accuracy": 0.8310831068083644,
|
|
"num_tokens": 60183841.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 0.447723388671875,
|
|
"epoch": 0.5595238095238095,
|
|
"grad_norm": 0.9892945139524538,
|
|
"learning_rate": 1.9020508783015942e-05,
|
|
"loss": 0.5197,
|
|
"mean_token_accuracy": 0.8258982775732875,
|
|
"num_tokens": 60605801.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"entropy": 0.447662353515625,
|
|
"epoch": 0.5634920634920635,
|
|
"grad_norm": 0.9568747475172027,
|
|
"learning_rate": 1.9001536635473664e-05,
|
|
"loss": 0.5162,
|
|
"mean_token_accuracy": 0.8260063044726849,
|
|
"num_tokens": 61048601.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"entropy": 0.46087646484375,
|
|
"epoch": 0.5674603174603174,
|
|
"grad_norm": 1.0859536458388588,
|
|
"learning_rate": 1.898239215557856e-05,
|
|
"loss": 0.5123,
|
|
"mean_token_accuracy": 0.8280721958726645,
|
|
"num_tokens": 61446873.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"entropy": 0.448699951171875,
|
|
"epoch": 0.5714285714285714,
|
|
"grad_norm": 1.0019498607421429,
|
|
"learning_rate": 1.8963075709847308e-05,
|
|
"loss": 0.5278,
|
|
"mean_token_accuracy": 0.8225747244432569,
|
|
"num_tokens": 61887912.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"entropy": 0.454803466796875,
|
|
"epoch": 0.5753968253968254,
|
|
"grad_norm": 0.9611573189972261,
|
|
"learning_rate": 1.894358766808883e-05,
|
|
"loss": 0.5125,
|
|
"mean_token_accuracy": 0.8298147981986403,
|
|
"num_tokens": 62316051.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 0.440704345703125,
|
|
"epoch": 0.5793650793650794,
|
|
"grad_norm": 1.0453009129466901,
|
|
"learning_rate": 1.892392840339721e-05,
|
|
"loss": 0.5101,
|
|
"mean_token_accuracy": 0.8277928857132792,
|
|
"num_tokens": 62741342.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"entropy": 0.445953369140625,
|
|
"epoch": 0.5833333333333334,
|
|
"grad_norm": 0.9253514323420257,
|
|
"learning_rate": 1.8904098292144556e-05,
|
|
"loss": 0.5111,
|
|
"mean_token_accuracy": 0.8307394320145249,
|
|
"num_tokens": 63164890.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"entropy": 0.4366455078125,
|
|
"epoch": 0.5873015873015873,
|
|
"grad_norm": 1.0178074892348359,
|
|
"learning_rate": 1.8884097713973798e-05,
|
|
"loss": 0.514,
|
|
"mean_token_accuracy": 0.8273154394701123,
|
|
"num_tokens": 63594617.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"entropy": 0.44122314453125,
|
|
"epoch": 0.5912698412698413,
|
|
"grad_norm": 0.9605119211913561,
|
|
"learning_rate": 1.8863927051791418e-05,
|
|
"loss": 0.5227,
|
|
"mean_token_accuracy": 0.8260425301268697,
|
|
"num_tokens": 64050293.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"entropy": 0.440643310546875,
|
|
"epoch": 0.5952380952380952,
|
|
"grad_norm": 0.9479991030852958,
|
|
"learning_rate": 1.884358669176011e-05,
|
|
"loss": 0.4982,
|
|
"mean_token_accuracy": 0.8325884817168117,
|
|
"num_tokens": 64467695.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 0.444976806640625,
|
|
"epoch": 0.5992063492063492,
|
|
"grad_norm": 0.964607540439781,
|
|
"learning_rate": 1.88230770232914e-05,
|
|
"loss": 0.5142,
|
|
"mean_token_accuracy": 0.825270832516253,
|
|
"num_tokens": 64908198.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"entropy": 0.4447021484375,
|
|
"epoch": 0.6031746031746031,
|
|
"grad_norm": 0.9804608822405252,
|
|
"learning_rate": 1.8802398439038175e-05,
|
|
"loss": 0.503,
|
|
"mean_token_accuracy": 0.8305519446730614,
|
|
"num_tokens": 65333788.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"entropy": 0.45428466796875,
|
|
"epoch": 0.6071428571428571,
|
|
"grad_norm": 0.9727870339013585,
|
|
"learning_rate": 1.8781551334887204e-05,
|
|
"loss": 0.494,
|
|
"mean_token_accuracy": 0.833051766268909,
|
|
"num_tokens": 65748782.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"entropy": 0.44195556640625,
|
|
"epoch": 0.6111111111111112,
|
|
"grad_norm": 1.0608760680511125,
|
|
"learning_rate": 1.876053610995149e-05,
|
|
"loss": 0.5093,
|
|
"mean_token_accuracy": 0.8293332532048225,
|
|
"num_tokens": 66178918.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"entropy": 0.44537353515625,
|
|
"epoch": 0.6150793650793651,
|
|
"grad_norm": 1.0235107189518842,
|
|
"learning_rate": 1.87393531665627e-05,
|
|
"loss": 0.5184,
|
|
"mean_token_accuracy": 0.828137094154954,
|
|
"num_tokens": 66603248.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 0.438262939453125,
|
|
"epoch": 0.6190476190476191,
|
|
"grad_norm": 0.9010906323539429,
|
|
"learning_rate": 1.8718002910263426e-05,
|
|
"loss": 0.5053,
|
|
"mean_token_accuracy": 0.8307396033778787,
|
|
"num_tokens": 67052342.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"entropy": 0.43731689453125,
|
|
"epoch": 0.623015873015873,
|
|
"grad_norm": 0.9986738769179647,
|
|
"learning_rate": 1.869648574979942e-05,
|
|
"loss": 0.5065,
|
|
"mean_token_accuracy": 0.8303233273327351,
|
|
"num_tokens": 67476890.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"entropy": 0.440155029296875,
|
|
"epoch": 0.626984126984127,
|
|
"grad_norm": 0.8657633291877197,
|
|
"learning_rate": 1.8674802097111784e-05,
|
|
"loss": 0.5059,
|
|
"mean_token_accuracy": 0.8300606962293386,
|
|
"num_tokens": 67913391.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"entropy": 0.448150634765625,
|
|
"epoch": 0.6309523809523809,
|
|
"grad_norm": 0.9352113389947035,
|
|
"learning_rate": 1.865295236732907e-05,
|
|
"loss": 0.4949,
|
|
"mean_token_accuracy": 0.833710327744484,
|
|
"num_tokens": 68339443.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"entropy": 0.4429931640625,
|
|
"epoch": 0.6349206349206349,
|
|
"grad_norm": 0.8523834865258976,
|
|
"learning_rate": 1.8630936978759337e-05,
|
|
"loss": 0.5092,
|
|
"mean_token_accuracy": 0.8283061692491174,
|
|
"num_tokens": 68772115.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 0.43743896484375,
|
|
"epoch": 0.6388888888888888,
|
|
"grad_norm": 0.9636363540162411,
|
|
"learning_rate": 1.8608756352882152e-05,
|
|
"loss": 0.4984,
|
|
"mean_token_accuracy": 0.8309094673022628,
|
|
"num_tokens": 69198272.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"entropy": 0.43994140625,
|
|
"epoch": 0.6428571428571429,
|
|
"grad_norm": 0.8966734535109344,
|
|
"learning_rate": 1.85864109143405e-05,
|
|
"loss": 0.4921,
|
|
"mean_token_accuracy": 0.8349130833521485,
|
|
"num_tokens": 69611653.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"entropy": 0.448516845703125,
|
|
"epoch": 0.6468253968253969,
|
|
"grad_norm": 0.8821880241673743,
|
|
"learning_rate": 1.8563901090932673e-05,
|
|
"loss": 0.5232,
|
|
"mean_token_accuracy": 0.8246152186766267,
|
|
"num_tokens": 70054917.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"entropy": 0.437896728515625,
|
|
"epoch": 0.6507936507936508,
|
|
"grad_norm": 0.9223123901727143,
|
|
"learning_rate": 1.854122731360408e-05,
|
|
"loss": 0.5027,
|
|
"mean_token_accuracy": 0.8312725247815251,
|
|
"num_tokens": 70496952.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"entropy": 0.43646240234375,
|
|
"epoch": 0.6547619047619048,
|
|
"grad_norm": 0.8939813183437758,
|
|
"learning_rate": 1.851839001643898e-05,
|
|
"loss": 0.4959,
|
|
"mean_token_accuracy": 0.8323455331847072,
|
|
"num_tokens": 70919899.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 0.43939208984375,
|
|
"epoch": 0.6587301587301587,
|
|
"grad_norm": 0.9020946161100143,
|
|
"learning_rate": 1.8495389636652185e-05,
|
|
"loss": 0.4995,
|
|
"mean_token_accuracy": 0.8345548948273063,
|
|
"num_tokens": 71343921.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"entropy": 0.436492919921875,
|
|
"epoch": 0.6626984126984127,
|
|
"grad_norm": 0.907744568728475,
|
|
"learning_rate": 1.847222661458069e-05,
|
|
"loss": 0.5191,
|
|
"mean_token_accuracy": 0.8266001716256142,
|
|
"num_tokens": 71808905.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"entropy": 0.442840576171875,
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 0.9515785083438172,
|
|
"learning_rate": 1.8448901393675233e-05,
|
|
"loss": 0.5001,
|
|
"mean_token_accuracy": 0.8303157305344939,
|
|
"num_tokens": 72240608.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"entropy": 0.44061279296875,
|
|
"epoch": 0.6706349206349206,
|
|
"grad_norm": 1.0721428605965826,
|
|
"learning_rate": 1.8425414420491817e-05,
|
|
"loss": 0.492,
|
|
"mean_token_accuracy": 0.832602908834815,
|
|
"num_tokens": 72668688.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"entropy": 0.439117431640625,
|
|
"epoch": 0.6746031746031746,
|
|
"grad_norm": 0.823986080877254,
|
|
"learning_rate": 1.8401766144683145e-05,
|
|
"loss": 0.5066,
|
|
"mean_token_accuracy": 0.8304582042619586,
|
|
"num_tokens": 73118452.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 0.435211181640625,
|
|
"epoch": 0.6785714285714286,
|
|
"grad_norm": 0.9329819325690418,
|
|
"learning_rate": 1.8377957018990043e-05,
|
|
"loss": 0.5083,
|
|
"mean_token_accuracy": 0.8277195170521736,
|
|
"num_tokens": 73569120.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"entropy": 0.433624267578125,
|
|
"epoch": 0.6825396825396826,
|
|
"grad_norm": 0.9469634105889654,
|
|
"learning_rate": 1.8353987499232747e-05,
|
|
"loss": 0.5,
|
|
"mean_token_accuracy": 0.830720316618681,
|
|
"num_tokens": 73991069.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"entropy": 0.44415283203125,
|
|
"epoch": 0.6865079365079365,
|
|
"grad_norm": 0.8437243008063409,
|
|
"learning_rate": 1.8329858044302212e-05,
|
|
"loss": 0.4914,
|
|
"mean_token_accuracy": 0.8340719323605299,
|
|
"num_tokens": 74406271.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"entropy": 0.442138671875,
|
|
"epoch": 0.6904761904761905,
|
|
"grad_norm": 0.9448325053412396,
|
|
"learning_rate": 1.830556911615132e-05,
|
|
"loss": 0.4984,
|
|
"mean_token_accuracy": 0.8315248852595687,
|
|
"num_tokens": 74839901.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"entropy": 0.435211181640625,
|
|
"epoch": 0.6944444444444444,
|
|
"grad_norm": 0.8082342371498417,
|
|
"learning_rate": 1.8281121179786024e-05,
|
|
"loss": 0.4941,
|
|
"mean_token_accuracy": 0.8327573603019118,
|
|
"num_tokens": 75280578.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 0.436492919921875,
|
|
"epoch": 0.6984126984126984,
|
|
"grad_norm": 0.9182267774958506,
|
|
"learning_rate": 1.825651470325645e-05,
|
|
"loss": 0.5185,
|
|
"mean_token_accuracy": 0.8261669343337417,
|
|
"num_tokens": 75749725.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"entropy": 0.441497802734375,
|
|
"epoch": 0.7023809523809523,
|
|
"grad_norm": 0.8607237067960308,
|
|
"learning_rate": 1.823175015764795e-05,
|
|
"loss": 0.5041,
|
|
"mean_token_accuracy": 0.8318371307104826,
|
|
"num_tokens": 76175668.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"entropy": 0.443450927734375,
|
|
"epoch": 0.7063492063492064,
|
|
"grad_norm": 0.8618633112263342,
|
|
"learning_rate": 1.8206828017072057e-05,
|
|
"loss": 0.4985,
|
|
"mean_token_accuracy": 0.8302229046821594,
|
|
"num_tokens": 76593690.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"entropy": 0.441619873046875,
|
|
"epoch": 0.7103174603174603,
|
|
"grad_norm": 0.9015146702404145,
|
|
"learning_rate": 1.818174875865744e-05,
|
|
"loss": 0.491,
|
|
"mean_token_accuracy": 0.8343524327501655,
|
|
"num_tokens": 77009261.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"entropy": 0.441009521484375,
|
|
"epoch": 0.7142857142857143,
|
|
"grad_norm": 0.9212502815359641,
|
|
"learning_rate": 1.815651286254074e-05,
|
|
"loss": 0.5019,
|
|
"mean_token_accuracy": 0.8306312058120966,
|
|
"num_tokens": 77431030.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 0.435089111328125,
|
|
"epoch": 0.7182539682539683,
|
|
"grad_norm": 0.8731300242197482,
|
|
"learning_rate": 1.8131120811857398e-05,
|
|
"loss": 0.4858,
|
|
"mean_token_accuracy": 0.8352022236213088,
|
|
"num_tokens": 77852655.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"entropy": 0.44256591796875,
|
|
"epoch": 0.7222222222222222,
|
|
"grad_norm": 0.8959355020911784,
|
|
"learning_rate": 1.81055730927324e-05,
|
|
"loss": 0.4998,
|
|
"mean_token_accuracy": 0.8333287099376321,
|
|
"num_tokens": 78278605.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"entropy": 0.43603515625,
|
|
"epoch": 0.7261904761904762,
|
|
"grad_norm": 0.9116496627423212,
|
|
"learning_rate": 1.8079870194270958e-05,
|
|
"loss": 0.4855,
|
|
"mean_token_accuracy": 0.8339688014239073,
|
|
"num_tokens": 78721098.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"entropy": 0.439605712890625,
|
|
"epoch": 0.7301587301587301,
|
|
"grad_norm": 0.9140622005319157,
|
|
"learning_rate": 1.8054012608549167e-05,
|
|
"loss": 0.4963,
|
|
"mean_token_accuracy": 0.8326618708670139,
|
|
"num_tokens": 79154216.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"entropy": 0.433380126953125,
|
|
"epoch": 0.7341269841269841,
|
|
"grad_norm": 0.8229034336908294,
|
|
"learning_rate": 1.802800083060457e-05,
|
|
"loss": 0.4938,
|
|
"mean_token_accuracy": 0.8325842721387744,
|
|
"num_tokens": 79599332.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 0.43524169921875,
|
|
"epoch": 0.7380952380952381,
|
|
"grad_norm": 0.897020159567663,
|
|
"learning_rate": 1.8001835358426688e-05,
|
|
"loss": 0.5046,
|
|
"mean_token_accuracy": 0.8298712829127908,
|
|
"num_tokens": 80039204.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"entropy": 0.4283447265625,
|
|
"epoch": 0.7420634920634921,
|
|
"grad_norm": 0.8527620690748114,
|
|
"learning_rate": 1.7975516692947478e-05,
|
|
"loss": 0.4862,
|
|
"mean_token_accuracy": 0.8363011125475168,
|
|
"num_tokens": 80472128.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"entropy": 0.434478759765625,
|
|
"epoch": 0.746031746031746,
|
|
"grad_norm": 0.8892589600328908,
|
|
"learning_rate": 1.7949045338031744e-05,
|
|
"loss": 0.5016,
|
|
"mean_token_accuracy": 0.830377884209156,
|
|
"num_tokens": 80910348.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"entropy": 0.435333251953125,
|
|
"epoch": 0.75,
|
|
"grad_norm": 0.8402522023302489,
|
|
"learning_rate": 1.7922421800467515e-05,
|
|
"loss": 0.4981,
|
|
"mean_token_accuracy": 0.830297333188355,
|
|
"num_tokens": 81336350.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"entropy": 0.43426513671875,
|
|
"epoch": 0.753968253968254,
|
|
"grad_norm": 0.8661830298081237,
|
|
"learning_rate": 1.7895646589956294e-05,
|
|
"loss": 0.4933,
|
|
"mean_token_accuracy": 0.8319168901070952,
|
|
"num_tokens": 81765325.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 0.45111083984375,
|
|
"epoch": 0.7579365079365079,
|
|
"grad_norm": 0.9513754499577816,
|
|
"learning_rate": 1.7868720219103343e-05,
|
|
"loss": 0.4966,
|
|
"mean_token_accuracy": 0.8327370900660753,
|
|
"num_tokens": 82191876.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"entropy": 0.443572998046875,
|
|
"epoch": 0.7619047619047619,
|
|
"grad_norm": 0.8565906413499814,
|
|
"learning_rate": 1.7841643203407854e-05,
|
|
"loss": 0.4729,
|
|
"mean_token_accuracy": 0.8389169629663229,
|
|
"num_tokens": 82611125.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"entropy": 0.43267822265625,
|
|
"epoch": 0.7658730158730159,
|
|
"grad_norm": 0.9135365351092013,
|
|
"learning_rate": 1.7814416061253076e-05,
|
|
"loss": 0.4998,
|
|
"mean_token_accuracy": 0.8300598934292793,
|
|
"num_tokens": 83051815.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"entropy": 0.431640625,
|
|
"epoch": 0.7698412698412699,
|
|
"grad_norm": 0.8467497209932042,
|
|
"learning_rate": 1.77870393138964e-05,
|
|
"loss": 0.4929,
|
|
"mean_token_accuracy": 0.8316225642338395,
|
|
"num_tokens": 83488021.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"entropy": 0.43865966796875,
|
|
"epoch": 0.7738095238095238,
|
|
"grad_norm": 0.8533008256293745,
|
|
"learning_rate": 1.7759513485459367e-05,
|
|
"loss": 0.4911,
|
|
"mean_token_accuracy": 0.8322532493621111,
|
|
"num_tokens": 83902519.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 0.433837890625,
|
|
"epoch": 0.7777777777777778,
|
|
"grad_norm": 0.9223169889850834,
|
|
"learning_rate": 1.7731839102917646e-05,
|
|
"loss": 0.4917,
|
|
"mean_token_accuracy": 0.8330741114914417,
|
|
"num_tokens": 84321775.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"entropy": 0.431854248046875,
|
|
"epoch": 0.7817460317460317,
|
|
"grad_norm": 0.8693783829283632,
|
|
"learning_rate": 1.7704016696090936e-05,
|
|
"loss": 0.4877,
|
|
"mean_token_accuracy": 0.8349456917494535,
|
|
"num_tokens": 84745911.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"entropy": 0.433502197265625,
|
|
"epoch": 0.7857142857142857,
|
|
"grad_norm": 0.8213507259820778,
|
|
"learning_rate": 1.7676046797632834e-05,
|
|
"loss": 0.4712,
|
|
"mean_token_accuracy": 0.8394572427496314,
|
|
"num_tokens": 85167087.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"entropy": 0.443817138671875,
|
|
"epoch": 0.7896825396825397,
|
|
"grad_norm": 0.8331545963147762,
|
|
"learning_rate": 1.7647929943020625e-05,
|
|
"loss": 0.4933,
|
|
"mean_token_accuracy": 0.833775763399899,
|
|
"num_tokens": 85588482.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"entropy": 0.427947998046875,
|
|
"epoch": 0.7936507936507936,
|
|
"grad_norm": 0.8194052047164025,
|
|
"learning_rate": 1.7619666670545034e-05,
|
|
"loss": 0.4887,
|
|
"mean_token_accuracy": 0.8344959514215589,
|
|
"num_tokens": 86009383.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 0.433013916015625,
|
|
"epoch": 0.7976190476190477,
|
|
"grad_norm": 0.8009988751245896,
|
|
"learning_rate": 1.759125752129993e-05,
|
|
"loss": 0.4961,
|
|
"mean_token_accuracy": 0.8323280559852719,
|
|
"num_tokens": 86449600.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"entropy": 0.433624267578125,
|
|
"epoch": 0.8015873015873016,
|
|
"grad_norm": 0.8209561833032096,
|
|
"learning_rate": 1.7562703039171955e-05,
|
|
"loss": 0.4747,
|
|
"mean_token_accuracy": 0.8379244077950716,
|
|
"num_tokens": 86862628.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"entropy": 0.4326171875,
|
|
"epoch": 0.8055555555555556,
|
|
"grad_norm": 0.8915807469586918,
|
|
"learning_rate": 1.753400377083011e-05,
|
|
"loss": 0.49,
|
|
"mean_token_accuracy": 0.8344454681500793,
|
|
"num_tokens": 87295233.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"entropy": 0.431121826171875,
|
|
"epoch": 0.8095238095238095,
|
|
"grad_norm": 0.8241124494920523,
|
|
"learning_rate": 1.7505160265715303e-05,
|
|
"loss": 0.4813,
|
|
"mean_token_accuracy": 0.8365888074040413,
|
|
"num_tokens": 87713395.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"entropy": 0.432586669921875,
|
|
"epoch": 0.8134920634920635,
|
|
"grad_norm": 0.8140817202619633,
|
|
"learning_rate": 1.747617307602982e-05,
|
|
"loss": 0.5055,
|
|
"mean_token_accuracy": 0.830372148193419,
|
|
"num_tokens": 88158822.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 0.430633544921875,
|
|
"epoch": 0.8174603174603174,
|
|
"grad_norm": 0.9065008583130683,
|
|
"learning_rate": 1.7447042756726756e-05,
|
|
"loss": 0.4892,
|
|
"mean_token_accuracy": 0.8338504349812865,
|
|
"num_tokens": 88602545.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"entropy": 0.435455322265625,
|
|
"epoch": 0.8214285714285714,
|
|
"grad_norm": 0.8128045472312435,
|
|
"learning_rate": 1.741776986549938e-05,
|
|
"loss": 0.4704,
|
|
"mean_token_accuracy": 0.8403382319957018,
|
|
"num_tokens": 89025796.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"entropy": 0.43255615234375,
|
|
"epoch": 0.8253968253968254,
|
|
"grad_norm": 0.8022720959148042,
|
|
"learning_rate": 1.7388354962770488e-05,
|
|
"loss": 0.4908,
|
|
"mean_token_accuracy": 0.8343631466850638,
|
|
"num_tokens": 89444255.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"entropy": 0.43310546875,
|
|
"epoch": 0.8293650793650794,
|
|
"grad_norm": 0.9092929881385042,
|
|
"learning_rate": 1.735879861168163e-05,
|
|
"loss": 0.4971,
|
|
"mean_token_accuracy": 0.8356021726503968,
|
|
"num_tokens": 89894143.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"entropy": 0.4346923828125,
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 0.7549249190997407,
|
|
"learning_rate": 1.7329101378082374e-05,
|
|
"loss": 0.4546,
|
|
"mean_token_accuracy": 0.8430411163717508,
|
|
"num_tokens": 90312774.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 0.4388427734375,
|
|
"epoch": 0.8373015873015873,
|
|
"grad_norm": 0.8727254647412117,
|
|
"learning_rate": 1.729926383051943e-05,
|
|
"loss": 0.4784,
|
|
"mean_token_accuracy": 0.8370981393381953,
|
|
"num_tokens": 90741787.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"entropy": 0.431640625,
|
|
"epoch": 0.8412698412698413,
|
|
"grad_norm": 0.7541804775023891,
|
|
"learning_rate": 1.7269286540225805e-05,
|
|
"loss": 0.4704,
|
|
"mean_token_accuracy": 0.8386409590020776,
|
|
"num_tokens": 91177447.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"entropy": 0.43267822265625,
|
|
"epoch": 0.8452380952380952,
|
|
"grad_norm": 0.7677395619634457,
|
|
"learning_rate": 1.723917008110984e-05,
|
|
"loss": 0.4759,
|
|
"mean_token_accuracy": 0.8381857760250568,
|
|
"num_tokens": 91607165.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"entropy": 0.43829345703125,
|
|
"epoch": 0.8492063492063492,
|
|
"grad_norm": 0.8426223853041325,
|
|
"learning_rate": 1.720891502974423e-05,
|
|
"loss": 0.4774,
|
|
"mean_token_accuracy": 0.8353362819179893,
|
|
"num_tokens": 92026164.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"entropy": 0.432647705078125,
|
|
"epoch": 0.8531746031746031,
|
|
"grad_norm": 0.7956595213704394,
|
|
"learning_rate": 1.7178521965354992e-05,
|
|
"loss": 0.4736,
|
|
"mean_token_accuracy": 0.8389335246756673,
|
|
"num_tokens": 92463989.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 0.430908203125,
|
|
"epoch": 0.8571428571428571,
|
|
"grad_norm": 0.7978027639386832,
|
|
"learning_rate": 1.714799146981037e-05,
|
|
"loss": 0.4745,
|
|
"mean_token_accuracy": 0.8379769828170538,
|
|
"num_tokens": 92891631.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"entropy": 0.43951416015625,
|
|
"epoch": 0.8611111111111112,
|
|
"grad_norm": 0.7515092463448632,
|
|
"learning_rate": 1.7117324127609686e-05,
|
|
"loss": 0.4838,
|
|
"mean_token_accuracy": 0.83737269975245,
|
|
"num_tokens": 93322969.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"entropy": 0.436614990234375,
|
|
"epoch": 0.8650793650793651,
|
|
"grad_norm": 0.7917014918299389,
|
|
"learning_rate": 1.7086520525872173e-05,
|
|
"loss": 0.4737,
|
|
"mean_token_accuracy": 0.8372634230181575,
|
|
"num_tokens": 93760535.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"entropy": 0.431365966796875,
|
|
"epoch": 0.8690476190476191,
|
|
"grad_norm": 0.7226980931603464,
|
|
"learning_rate": 1.7055581254325716e-05,
|
|
"loss": 0.468,
|
|
"mean_token_accuracy": 0.8401279039680958,
|
|
"num_tokens": 94172398.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"entropy": 0.43701171875,
|
|
"epoch": 0.873015873015873,
|
|
"grad_norm": 0.7637809745171817,
|
|
"learning_rate": 1.7024506905295566e-05,
|
|
"loss": 0.4819,
|
|
"mean_token_accuracy": 0.836346473544836,
|
|
"num_tokens": 94599260.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 0.44158935546875,
|
|
"epoch": 0.876984126984127,
|
|
"grad_norm": 0.8053881064314574,
|
|
"learning_rate": 1.6993298073693005e-05,
|
|
"loss": 0.477,
|
|
"mean_token_accuracy": 0.8353390069678426,
|
|
"num_tokens": 95019433.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"entropy": 0.44500732421875,
|
|
"epoch": 0.8809523809523809,
|
|
"grad_norm": 0.7574417737692334,
|
|
"learning_rate": 1.6961955357003948e-05,
|
|
"loss": 0.4732,
|
|
"mean_token_accuracy": 0.8381653232499957,
|
|
"num_tokens": 95425799.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"entropy": 0.436981201171875,
|
|
"epoch": 0.8849206349206349,
|
|
"grad_norm": 0.8024393653209613,
|
|
"learning_rate": 1.693047935527751e-05,
|
|
"loss": 0.4736,
|
|
"mean_token_accuracy": 0.8379595559090376,
|
|
"num_tokens": 95822890.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"entropy": 0.43756103515625,
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 0.8215854969202835,
|
|
"learning_rate": 1.6898870671114527e-05,
|
|
"loss": 0.4883,
|
|
"mean_token_accuracy": 0.8364003216847777,
|
|
"num_tokens": 96260271.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"entropy": 0.4278564453125,
|
|
"epoch": 0.8928571428571429,
|
|
"grad_norm": 0.7668608306492521,
|
|
"learning_rate": 1.6867129909656e-05,
|
|
"loss": 0.4783,
|
|
"mean_token_accuracy": 0.8366113835945725,
|
|
"num_tokens": 96696652.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 0.436859130859375,
|
|
"epoch": 0.8968253968253969,
|
|
"grad_norm": 0.8321480081512098,
|
|
"learning_rate": 1.6835257678571515e-05,
|
|
"loss": 0.4763,
|
|
"mean_token_accuracy": 0.8387005385011435,
|
|
"num_tokens": 97135925.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"entropy": 0.4412841796875,
|
|
"epoch": 0.9007936507936508,
|
|
"grad_norm": 0.8357475643591474,
|
|
"learning_rate": 1.680325458804763e-05,
|
|
"loss": 0.4969,
|
|
"mean_token_accuracy": 0.8320699343457818,
|
|
"num_tokens": 97573810.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"entropy": 0.431304931640625,
|
|
"epoch": 0.9047619047619048,
|
|
"grad_norm": 0.779656796305732,
|
|
"learning_rate": 1.6771121250776163e-05,
|
|
"loss": 0.465,
|
|
"mean_token_accuracy": 0.8408154509961605,
|
|
"num_tokens": 98011108.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"entropy": 0.43701171875,
|
|
"epoch": 0.9087301587301587,
|
|
"grad_norm": 0.8611557820075499,
|
|
"learning_rate": 1.6738858281942477e-05,
|
|
"loss": 0.4637,
|
|
"mean_token_accuracy": 0.8399258134886622,
|
|
"num_tokens": 98441631.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"entropy": 0.440643310546875,
|
|
"epoch": 0.9126984126984127,
|
|
"grad_norm": 0.8547304328052301,
|
|
"learning_rate": 1.6706466299213718e-05,
|
|
"loss": 0.4763,
|
|
"mean_token_accuracy": 0.8364039584994316,
|
|
"num_tokens": 98873029.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 0.437896728515625,
|
|
"epoch": 0.9166666666666666,
|
|
"grad_norm": 0.8763515952964974,
|
|
"learning_rate": 1.6673945922726945e-05,
|
|
"loss": 0.4784,
|
|
"mean_token_accuracy": 0.8354263128712773,
|
|
"num_tokens": 99296106.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"entropy": 0.43878173828125,
|
|
"epoch": 0.9206349206349206,
|
|
"grad_norm": 0.814646978847238,
|
|
"learning_rate": 1.6641297775077313e-05,
|
|
"loss": 0.4772,
|
|
"mean_token_accuracy": 0.8371500456705689,
|
|
"num_tokens": 99734864.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"entropy": 0.43292236328125,
|
|
"epoch": 0.9246031746031746,
|
|
"grad_norm": 0.8062733622515471,
|
|
"learning_rate": 1.660852248130611e-05,
|
|
"loss": 0.4863,
|
|
"mean_token_accuracy": 0.8347219526767731,
|
|
"num_tokens": 100174517.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"entropy": 0.43914794921875,
|
|
"epoch": 0.9285714285714286,
|
|
"grad_norm": 0.8754059145656132,
|
|
"learning_rate": 1.6575620668888812e-05,
|
|
"loss": 0.4732,
|
|
"mean_token_accuracy": 0.837783177383244,
|
|
"num_tokens": 100606926.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"entropy": 0.43682861328125,
|
|
"epoch": 0.9325396825396826,
|
|
"grad_norm": 0.8404842546931222,
|
|
"learning_rate": 1.6542592967723065e-05,
|
|
"loss": 0.4661,
|
|
"mean_token_accuracy": 0.8383018802851439,
|
|
"num_tokens": 101020425.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 0.43560791015625,
|
|
"epoch": 0.9365079365079365,
|
|
"grad_norm": 0.8076832265015242,
|
|
"learning_rate": 1.6509440010116634e-05,
|
|
"loss": 0.4723,
|
|
"mean_token_accuracy": 0.8383181607350707,
|
|
"num_tokens": 101447599.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"entropy": 0.435455322265625,
|
|
"epoch": 0.9404761904761905,
|
|
"grad_norm": 0.8241311788244491,
|
|
"learning_rate": 1.6476162430775278e-05,
|
|
"loss": 0.4663,
|
|
"mean_token_accuracy": 0.8403345802798867,
|
|
"num_tokens": 101851986.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"entropy": 0.42706298828125,
|
|
"epoch": 0.9444444444444444,
|
|
"grad_norm": 0.8537310664275062,
|
|
"learning_rate": 1.6442760866790616e-05,
|
|
"loss": 0.4719,
|
|
"mean_token_accuracy": 0.8388460287824273,
|
|
"num_tokens": 102275358.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"entropy": 0.43780517578125,
|
|
"epoch": 0.9484126984126984,
|
|
"grad_norm": 0.811058555828741,
|
|
"learning_rate": 1.6409235957627926e-05,
|
|
"loss": 0.4673,
|
|
"mean_token_accuracy": 0.8385212691500783,
|
|
"num_tokens": 102688415.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"entropy": 0.43798828125,
|
|
"epoch": 0.9523809523809523,
|
|
"grad_norm": 0.9404107957388771,
|
|
"learning_rate": 1.6375588345113895e-05,
|
|
"loss": 0.4716,
|
|
"mean_token_accuracy": 0.8381083710119128,
|
|
"num_tokens": 103113293.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 0.429534912109375,
|
|
"epoch": 0.9563492063492064,
|
|
"grad_norm": 0.8268811238645964,
|
|
"learning_rate": 1.6341818673424342e-05,
|
|
"loss": 0.4738,
|
|
"mean_token_accuracy": 0.838194428011775,
|
|
"num_tokens": 103566779.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"entropy": 0.4266357421875,
|
|
"epoch": 0.9603174603174603,
|
|
"grad_norm": 0.9050037047902241,
|
|
"learning_rate": 1.630792758907189e-05,
|
|
"loss": 0.4782,
|
|
"mean_token_accuracy": 0.8364842068403959,
|
|
"num_tokens": 104000550.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"entropy": 0.436737060546875,
|
|
"epoch": 0.9642857142857143,
|
|
"grad_norm": 0.8094767952175125,
|
|
"learning_rate": 1.6273915740893557e-05,
|
|
"loss": 0.476,
|
|
"mean_token_accuracy": 0.8376884264871478,
|
|
"num_tokens": 104429372.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"entropy": 0.42962646484375,
|
|
"epoch": 0.9682539682539683,
|
|
"grad_norm": 0.8904444225202598,
|
|
"learning_rate": 1.6239783780038374e-05,
|
|
"loss": 0.4686,
|
|
"mean_token_accuracy": 0.8404411617666483,
|
|
"num_tokens": 104859286.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"entropy": 0.438262939453125,
|
|
"epoch": 0.9722222222222222,
|
|
"grad_norm": 0.8056975230627009,
|
|
"learning_rate": 1.6205532359954905e-05,
|
|
"loss": 0.4771,
|
|
"mean_token_accuracy": 0.8340575834736228,
|
|
"num_tokens": 105281017.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 0.423126220703125,
|
|
"epoch": 0.9761904761904762,
|
|
"grad_norm": 0.7784171562070645,
|
|
"learning_rate": 1.6171162136378716e-05,
|
|
"loss": 0.4669,
|
|
"mean_token_accuracy": 0.8392490344122052,
|
|
"num_tokens": 105729675.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"entropy": 0.42401123046875,
|
|
"epoch": 0.9801587301587301,
|
|
"grad_norm": 0.8062608875027727,
|
|
"learning_rate": 1.6136673767319853e-05,
|
|
"loss": 0.4712,
|
|
"mean_token_accuracy": 0.8378767920657992,
|
|
"num_tokens": 106168119.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"entropy": 0.422515869140625,
|
|
"epoch": 0.9841269841269841,
|
|
"grad_norm": 0.7922278955543418,
|
|
"learning_rate": 1.6102067913050227e-05,
|
|
"loss": 0.4687,
|
|
"mean_token_accuracy": 0.8400682499632239,
|
|
"num_tokens": 106603968.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"entropy": 0.421356201171875,
|
|
"epoch": 0.9880952380952381,
|
|
"grad_norm": 0.7435919501708257,
|
|
"learning_rate": 1.606734523609097e-05,
|
|
"loss": 0.4712,
|
|
"mean_token_accuracy": 0.8397826086729765,
|
|
"num_tokens": 107054152.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"entropy": 0.423004150390625,
|
|
"epoch": 0.9920634920634921,
|
|
"grad_norm": 0.7325977788312235,
|
|
"learning_rate": 1.603250640119977e-05,
|
|
"loss": 0.4609,
|
|
"mean_token_accuracy": 0.8421022659167647,
|
|
"num_tokens": 107495007.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.4324951171875,
|
|
"epoch": 0.996031746031746,
|
|
"grad_norm": 0.8289315109880487,
|
|
"learning_rate": 1.5997552075358122e-05,
|
|
"loss": 0.4783,
|
|
"mean_token_accuracy": 0.8354923082515597,
|
|
"num_tokens": 107928758.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"entropy": 0.427337646484375,
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.7226694679539868,
|
|
"learning_rate": 1.5962482927758568e-05,
|
|
"loss": 0.4732,
|
|
"mean_token_accuracy": 0.8377068918198347,
|
|
"num_tokens": 108364335.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"entropy": 0.443572998046875,
|
|
"epoch": 1.003968253968254,
|
|
"grad_norm": 0.7341598629462971,
|
|
"learning_rate": 1.592729962979189e-05,
|
|
"loss": 0.4341,
|
|
"mean_token_accuracy": 0.8490036567673087,
|
|
"num_tokens": 108775586.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"entropy": 0.433013916015625,
|
|
"epoch": 1.007936507936508,
|
|
"grad_norm": 0.6958749153405379,
|
|
"learning_rate": 1.589200285503426e-05,
|
|
"loss": 0.4311,
|
|
"mean_token_accuracy": 0.850931248627603,
|
|
"num_tokens": 109202665.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"entropy": 0.42047119140625,
|
|
"epoch": 1.0119047619047619,
|
|
"grad_norm": 0.830299124303698,
|
|
"learning_rate": 1.585659327923432e-05,
|
|
"loss": 0.4343,
|
|
"mean_token_accuracy": 0.8494839882478118,
|
|
"num_tokens": 109629975.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 0.41961669921875,
|
|
"epoch": 1.0158730158730158,
|
|
"grad_norm": 0.7427828462108962,
|
|
"learning_rate": 1.582107158030027e-05,
|
|
"loss": 0.4332,
|
|
"mean_token_accuracy": 0.8491029348224401,
|
|
"num_tokens": 110061605.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"entropy": 0.42138671875,
|
|
"epoch": 1.0198412698412698,
|
|
"grad_norm": 0.6632051825171987,
|
|
"learning_rate": 1.5785438438286892e-05,
|
|
"loss": 0.4215,
|
|
"mean_token_accuracy": 0.8531029289588332,
|
|
"num_tokens": 110487591.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"entropy": 0.422271728515625,
|
|
"epoch": 1.0238095238095237,
|
|
"grad_norm": 0.7208685537266245,
|
|
"learning_rate": 1.574969453538251e-05,
|
|
"loss": 0.4367,
|
|
"mean_token_accuracy": 0.8492154879495502,
|
|
"num_tokens": 110924491.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"entropy": 0.422119140625,
|
|
"epoch": 1.0277777777777777,
|
|
"grad_norm": 0.7622696147016514,
|
|
"learning_rate": 1.5713840555895937e-05,
|
|
"loss": 0.4322,
|
|
"mean_token_accuracy": 0.8504317132756114,
|
|
"num_tokens": 111351831.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"entropy": 0.42340087890625,
|
|
"epoch": 1.0317460317460316,
|
|
"grad_norm": 0.7232679832518091,
|
|
"learning_rate": 1.567787718624338e-05,
|
|
"loss": 0.4315,
|
|
"mean_token_accuracy": 0.8494284749031067,
|
|
"num_tokens": 111773832.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.424407958984375,
|
|
"epoch": 1.0357142857142858,
|
|
"grad_norm": 0.7348492552631081,
|
|
"learning_rate": 1.5641805114935297e-05,
|
|
"loss": 0.4229,
|
|
"mean_token_accuracy": 0.8527503348886967,
|
|
"num_tokens": 112210334.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"entropy": 0.424346923828125,
|
|
"epoch": 1.0396825396825398,
|
|
"grad_norm": 0.7224982909642063,
|
|
"learning_rate": 1.560562503256322e-05,
|
|
"loss": 0.4295,
|
|
"mean_token_accuracy": 0.8483862616121769,
|
|
"num_tokens": 112637470.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"entropy": 0.421417236328125,
|
|
"epoch": 1.0436507936507937,
|
|
"grad_norm": 0.7968828029144892,
|
|
"learning_rate": 1.556933763178651e-05,
|
|
"loss": 0.4338,
|
|
"mean_token_accuracy": 0.8488446967676282,
|
|
"num_tokens": 113069179.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"entropy": 0.427215576171875,
|
|
"epoch": 1.0476190476190477,
|
|
"grad_norm": 0.6927882582463685,
|
|
"learning_rate": 1.5532943607319143e-05,
|
|
"loss": 0.4348,
|
|
"mean_token_accuracy": 0.8495698990300298,
|
|
"num_tokens": 113501590.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"entropy": 0.423583984375,
|
|
"epoch": 1.0515873015873016,
|
|
"grad_norm": 0.6855557306353498,
|
|
"learning_rate": 1.5496443655916348e-05,
|
|
"loss": 0.4302,
|
|
"mean_token_accuracy": 0.8517827754840255,
|
|
"num_tokens": 113935337.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 0.424652099609375,
|
|
"epoch": 1.0555555555555556,
|
|
"grad_norm": 0.7439838182634214,
|
|
"learning_rate": 1.5459838476361326e-05,
|
|
"loss": 0.4339,
|
|
"mean_token_accuracy": 0.8486921405419707,
|
|
"num_tokens": 114360533.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"entropy": 0.416717529296875,
|
|
"epoch": 1.0595238095238095,
|
|
"grad_norm": 0.7237479220972823,
|
|
"learning_rate": 1.5423128769451832e-05,
|
|
"loss": 0.4282,
|
|
"mean_token_accuracy": 0.8520865635946393,
|
|
"num_tokens": 114792424.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"entropy": 0.41729736328125,
|
|
"epoch": 1.0634920634920635,
|
|
"grad_norm": 0.7635204529254467,
|
|
"learning_rate": 1.5386315237986785e-05,
|
|
"loss": 0.4366,
|
|
"mean_token_accuracy": 0.8476630486547947,
|
|
"num_tokens": 115231953.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"entropy": 0.420654296875,
|
|
"epoch": 1.0674603174603174,
|
|
"grad_norm": 0.6400753342281554,
|
|
"learning_rate": 1.5349398586752794e-05,
|
|
"loss": 0.4309,
|
|
"mean_token_accuracy": 0.8499054629355669,
|
|
"num_tokens": 115664522.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"entropy": 0.417083740234375,
|
|
"epoch": 1.0714285714285714,
|
|
"grad_norm": 0.8123384521312528,
|
|
"learning_rate": 1.5312379522510666e-05,
|
|
"loss": 0.4301,
|
|
"mean_token_accuracy": 0.8492474015802145,
|
|
"num_tokens": 116092221.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.421173095703125,
|
|
"epoch": 1.0753968253968254,
|
|
"grad_norm": 0.7019628203672783,
|
|
"learning_rate": 1.52752587539819e-05,
|
|
"loss": 0.4274,
|
|
"mean_token_accuracy": 0.8491615150123835,
|
|
"num_tokens": 116534679.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"entropy": 0.4229736328125,
|
|
"epoch": 1.0793650793650793,
|
|
"grad_norm": 0.7506293935142412,
|
|
"learning_rate": 1.5238036991835085e-05,
|
|
"loss": 0.438,
|
|
"mean_token_accuracy": 0.8483763262629509,
|
|
"num_tokens": 116984739.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"entropy": 0.418182373046875,
|
|
"epoch": 1.0833333333333333,
|
|
"grad_norm": 0.81916726768363,
|
|
"learning_rate": 1.5200714948672313e-05,
|
|
"loss": 0.4329,
|
|
"mean_token_accuracy": 0.8497389126569033,
|
|
"num_tokens": 117413323.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"entropy": 0.412933349609375,
|
|
"epoch": 1.0873015873015872,
|
|
"grad_norm": 0.7157560717096457,
|
|
"learning_rate": 1.5163293339015535e-05,
|
|
"loss": 0.4333,
|
|
"mean_token_accuracy": 0.8479102049022913,
|
|
"num_tokens": 117852991.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"entropy": 0.42327880859375,
|
|
"epoch": 1.0912698412698412,
|
|
"grad_norm": 0.7802341946060086,
|
|
"learning_rate": 1.512577287929288e-05,
|
|
"loss": 0.438,
|
|
"mean_token_accuracy": 0.8477771393954754,
|
|
"num_tokens": 118264477.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 0.41497802734375,
|
|
"epoch": 1.0952380952380953,
|
|
"grad_norm": 0.7955621612100239,
|
|
"learning_rate": 1.5088154287824934e-05,
|
|
"loss": 0.4264,
|
|
"mean_token_accuracy": 0.8529663607478142,
|
|
"num_tokens": 118696927.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"entropy": 0.409820556640625,
|
|
"epoch": 1.0992063492063493,
|
|
"grad_norm": 0.7201767225753009,
|
|
"learning_rate": 1.5050438284811001e-05,
|
|
"loss": 0.4352,
|
|
"mean_token_accuracy": 0.847323065623641,
|
|
"num_tokens": 119143061.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"entropy": 0.417938232421875,
|
|
"epoch": 1.1031746031746033,
|
|
"grad_norm": 0.7480446387717942,
|
|
"learning_rate": 1.5012625592315298e-05,
|
|
"loss": 0.4259,
|
|
"mean_token_accuracy": 0.8503220491111279,
|
|
"num_tokens": 119569613.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"entropy": 0.417694091796875,
|
|
"epoch": 1.1071428571428572,
|
|
"grad_norm": 0.8503377095008527,
|
|
"learning_rate": 1.4974716934253146e-05,
|
|
"loss": 0.4295,
|
|
"mean_token_accuracy": 0.8494290672242641,
|
|
"num_tokens": 119990044.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"entropy": 0.405731201171875,
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 0.8068032460856993,
|
|
"learning_rate": 1.4936713036377102e-05,
|
|
"loss": 0.4352,
|
|
"mean_token_accuracy": 0.8492538705468178,
|
|
"num_tokens": 120447089.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.41217041015625,
|
|
"epoch": 1.1150793650793651,
|
|
"grad_norm": 0.789218202429046,
|
|
"learning_rate": 1.4898614626263066e-05,
|
|
"loss": 0.4152,
|
|
"mean_token_accuracy": 0.8548848666250706,
|
|
"num_tokens": 120882118.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"entropy": 0.415618896484375,
|
|
"epoch": 1.119047619047619,
|
|
"grad_norm": 0.7218191501417075,
|
|
"learning_rate": 1.4860422433296363e-05,
|
|
"loss": 0.4317,
|
|
"mean_token_accuracy": 0.8495792560279369,
|
|
"num_tokens": 121314886.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"entropy": 0.4183349609375,
|
|
"epoch": 1.123015873015873,
|
|
"grad_norm": 0.7503416099168865,
|
|
"learning_rate": 1.4822137188657752e-05,
|
|
"loss": 0.4197,
|
|
"mean_token_accuracy": 0.8535007536411285,
|
|
"num_tokens": 121731396.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"entropy": 0.413421630859375,
|
|
"epoch": 1.126984126984127,
|
|
"grad_norm": 0.7881593510234792,
|
|
"learning_rate": 1.4783759625309454e-05,
|
|
"loss": 0.4241,
|
|
"mean_token_accuracy": 0.851870458573103,
|
|
"num_tokens": 122167617.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"entropy": 0.4091796875,
|
|
"epoch": 1.130952380952381,
|
|
"grad_norm": 0.6940859902467295,
|
|
"learning_rate": 1.474529047798112e-05,
|
|
"loss": 0.4272,
|
|
"mean_token_accuracy": 0.8504150630906224,
|
|
"num_tokens": 122593508.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 0.411651611328125,
|
|
"epoch": 1.1349206349206349,
|
|
"grad_norm": 0.7885889108343426,
|
|
"learning_rate": 1.4706730483155738e-05,
|
|
"loss": 0.4288,
|
|
"mean_token_accuracy": 0.8491430478170514,
|
|
"num_tokens": 123013840.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"entropy": 0.4080810546875,
|
|
"epoch": 1.1388888888888888,
|
|
"grad_norm": 0.7779729557331533,
|
|
"learning_rate": 1.4668080379055563e-05,
|
|
"loss": 0.4192,
|
|
"mean_token_accuracy": 0.853282954543829,
|
|
"num_tokens": 123444043.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"entropy": 0.410888671875,
|
|
"epoch": 1.1428571428571428,
|
|
"grad_norm": 0.7541972725693249,
|
|
"learning_rate": 1.4629340905627964e-05,
|
|
"loss": 0.4172,
|
|
"mean_token_accuracy": 0.8533556731417775,
|
|
"num_tokens": 123876490.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"entropy": 0.414825439453125,
|
|
"epoch": 1.1468253968253967,
|
|
"grad_norm": 0.7466215675753664,
|
|
"learning_rate": 1.4590512804531272e-05,
|
|
"loss": 0.4226,
|
|
"mean_token_accuracy": 0.8501791479066014,
|
|
"num_tokens": 124297019.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"entropy": 0.41717529296875,
|
|
"epoch": 1.1507936507936507,
|
|
"grad_norm": 0.7693301960113025,
|
|
"learning_rate": 1.4551596819120564e-05,
|
|
"loss": 0.4292,
|
|
"mean_token_accuracy": 0.8512964397668839,
|
|
"num_tokens": 124713314.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.415802001953125,
|
|
"epoch": 1.1547619047619047,
|
|
"grad_norm": 0.7246395498891883,
|
|
"learning_rate": 1.4512593694433455e-05,
|
|
"loss": 0.4277,
|
|
"mean_token_accuracy": 0.8493410600349307,
|
|
"num_tokens": 125125061.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"entropy": 0.4139404296875,
|
|
"epoch": 1.1587301587301586,
|
|
"grad_norm": 0.7560965179980317,
|
|
"learning_rate": 1.447350417717581e-05,
|
|
"loss": 0.4436,
|
|
"mean_token_accuracy": 0.8454991178587079,
|
|
"num_tokens": 125564746.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"entropy": 0.40960693359375,
|
|
"epoch": 1.1626984126984128,
|
|
"grad_norm": 0.7077121780226429,
|
|
"learning_rate": 1.4434329015707468e-05,
|
|
"loss": 0.418,
|
|
"mean_token_accuracy": 0.8543984591960907,
|
|
"num_tokens": 125992351.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"entropy": 0.41363525390625,
|
|
"epoch": 1.1666666666666667,
|
|
"grad_norm": 0.7114122184270399,
|
|
"learning_rate": 1.4395068960027903e-05,
|
|
"loss": 0.4184,
|
|
"mean_token_accuracy": 0.852596671320498,
|
|
"num_tokens": 126415997.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"entropy": 0.418243408203125,
|
|
"epoch": 1.1706349206349207,
|
|
"grad_norm": 0.7588297506038938,
|
|
"learning_rate": 1.435572476176187e-05,
|
|
"loss": 0.4344,
|
|
"mean_token_accuracy": 0.8491576574742794,
|
|
"num_tokens": 126851930.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 0.41839599609375,
|
|
"epoch": 1.1746031746031746,
|
|
"grad_norm": 0.7000410652898528,
|
|
"learning_rate": 1.4316297174145018e-05,
|
|
"loss": 0.4359,
|
|
"mean_token_accuracy": 0.847908278927207,
|
|
"num_tokens": 127285760.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"entropy": 0.426849365234375,
|
|
"epoch": 1.1785714285714286,
|
|
"grad_norm": 0.7000110970011031,
|
|
"learning_rate": 1.427678695200945e-05,
|
|
"loss": 0.447,
|
|
"mean_token_accuracy": 0.8466757563874125,
|
|
"num_tokens": 127724762.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"entropy": 0.412445068359375,
|
|
"epoch": 1.1825396825396826,
|
|
"grad_norm": 0.7729410408403453,
|
|
"learning_rate": 1.4237194851769318e-05,
|
|
"loss": 0.4245,
|
|
"mean_token_accuracy": 0.853807931765914,
|
|
"num_tokens": 128153685.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"entropy": 0.416259765625,
|
|
"epoch": 1.1865079365079365,
|
|
"grad_norm": 0.7363431945766964,
|
|
"learning_rate": 1.4197521631406279e-05,
|
|
"loss": 0.4234,
|
|
"mean_token_accuracy": 0.8529269192367792,
|
|
"num_tokens": 128574985.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"entropy": 0.4239501953125,
|
|
"epoch": 1.1904761904761905,
|
|
"grad_norm": 0.7757786686903285,
|
|
"learning_rate": 1.4157768050455038e-05,
|
|
"loss": 0.4238,
|
|
"mean_token_accuracy": 0.8496858030557632,
|
|
"num_tokens": 128990738.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.419342041015625,
|
|
"epoch": 1.1944444444444444,
|
|
"grad_norm": 0.7832399458969364,
|
|
"learning_rate": 1.4117934869988776e-05,
|
|
"loss": 0.4214,
|
|
"mean_token_accuracy": 0.8544543124735355,
|
|
"num_tokens": 129413253.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"entropy": 0.417449951171875,
|
|
"epoch": 1.1984126984126984,
|
|
"grad_norm": 0.7164361411308464,
|
|
"learning_rate": 1.4078022852604591e-05,
|
|
"loss": 0.448,
|
|
"mean_token_accuracy": 0.8454335303977132,
|
|
"num_tokens": 129848900.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"entropy": 0.419342041015625,
|
|
"epoch": 1.2023809523809523,
|
|
"grad_norm": 0.741247005990124,
|
|
"learning_rate": 1.4038032762408897e-05,
|
|
"loss": 0.4252,
|
|
"mean_token_accuracy": 0.8511590985581279,
|
|
"num_tokens": 130278951.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"entropy": 0.42132568359375,
|
|
"epoch": 1.2063492063492063,
|
|
"grad_norm": 0.8114658165541835,
|
|
"learning_rate": 1.3997965365002789e-05,
|
|
"loss": 0.4318,
|
|
"mean_token_accuracy": 0.8505065925419331,
|
|
"num_tokens": 130724709.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"entropy": 0.421905517578125,
|
|
"epoch": 1.2103174603174602,
|
|
"grad_norm": 0.6859694143798629,
|
|
"learning_rate": 1.3957821427467392e-05,
|
|
"loss": 0.4091,
|
|
"mean_token_accuracy": 0.8572418540716171,
|
|
"num_tokens": 131151665.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 0.41455078125,
|
|
"epoch": 1.2142857142857142,
|
|
"grad_norm": 0.7754845945066426,
|
|
"learning_rate": 1.3917601718349183e-05,
|
|
"loss": 0.4175,
|
|
"mean_token_accuracy": 0.8536632917821407,
|
|
"num_tokens": 131582811.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"entropy": 0.416259765625,
|
|
"epoch": 1.2182539682539684,
|
|
"grad_norm": 0.7186546230914753,
|
|
"learning_rate": 1.3877307007645256e-05,
|
|
"loss": 0.427,
|
|
"mean_token_accuracy": 0.8525044862180948,
|
|
"num_tokens": 131998529.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"entropy": 0.4093017578125,
|
|
"epoch": 1.2222222222222223,
|
|
"grad_norm": 0.7164454164150232,
|
|
"learning_rate": 1.3836938066788599e-05,
|
|
"loss": 0.4198,
|
|
"mean_token_accuracy": 0.8528330260887742,
|
|
"num_tokens": 132429743.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"entropy": 0.41455078125,
|
|
"epoch": 1.2261904761904763,
|
|
"grad_norm": 0.7241248979958975,
|
|
"learning_rate": 1.3796495668633325e-05,
|
|
"loss": 0.4265,
|
|
"mean_token_accuracy": 0.850708675570786,
|
|
"num_tokens": 132864811.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"entropy": 0.4132080078125,
|
|
"epoch": 1.2301587301587302,
|
|
"grad_norm": 0.7207326099891552,
|
|
"learning_rate": 1.3755980587439857e-05,
|
|
"loss": 0.4318,
|
|
"mean_token_accuracy": 0.8507328238338232,
|
|
"num_tokens": 133291943.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.41790771484375,
|
|
"epoch": 1.2341269841269842,
|
|
"grad_norm": 0.7273508243242796,
|
|
"learning_rate": 1.3715393598860129e-05,
|
|
"loss": 0.4246,
|
|
"mean_token_accuracy": 0.8514404995366931,
|
|
"num_tokens": 133728664.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"entropy": 0.4178466796875,
|
|
"epoch": 1.2380952380952381,
|
|
"grad_norm": 0.70087947356439,
|
|
"learning_rate": 1.367473547992272e-05,
|
|
"loss": 0.4159,
|
|
"mean_token_accuracy": 0.8550357017666101,
|
|
"num_tokens": 134149814.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"entropy": 0.419586181640625,
|
|
"epoch": 1.242063492063492,
|
|
"grad_norm": 0.6639795743514464,
|
|
"learning_rate": 1.3634007009017986e-05,
|
|
"loss": 0.4151,
|
|
"mean_token_accuracy": 0.853931562975049,
|
|
"num_tokens": 134567037.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"entropy": 0.416595458984375,
|
|
"epoch": 1.246031746031746,
|
|
"grad_norm": 0.7100431406730954,
|
|
"learning_rate": 1.3593208965883156e-05,
|
|
"loss": 0.4137,
|
|
"mean_token_accuracy": 0.8554408671334386,
|
|
"num_tokens": 134989406.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"entropy": 0.4085693359375,
|
|
"epoch": 1.25,
|
|
"grad_norm": 0.7105214659480911,
|
|
"learning_rate": 1.3552342131587399e-05,
|
|
"loss": 0.4025,
|
|
"mean_token_accuracy": 0.8589053172618151,
|
|
"num_tokens": 135405949.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 0.408843994140625,
|
|
"epoch": 1.253968253968254,
|
|
"grad_norm": 0.7899492973598554,
|
|
"learning_rate": 1.351140728851688e-05,
|
|
"loss": 0.4253,
|
|
"mean_token_accuracy": 0.8534569833427668,
|
|
"num_tokens": 135832642.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"entropy": 0.4046630859375,
|
|
"epoch": 1.257936507936508,
|
|
"grad_norm": 0.6960132729213381,
|
|
"learning_rate": 1.3470405220359773e-05,
|
|
"loss": 0.4211,
|
|
"mean_token_accuracy": 0.8528056116774678,
|
|
"num_tokens": 136281104.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"entropy": 0.41253662109375,
|
|
"epoch": 1.2619047619047619,
|
|
"grad_norm": 0.7421504637929449,
|
|
"learning_rate": 1.3429336712091258e-05,
|
|
"loss": 0.4113,
|
|
"mean_token_accuracy": 0.8566197585314512,
|
|
"num_tokens": 136724748.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"entropy": 0.410736083984375,
|
|
"epoch": 1.2658730158730158,
|
|
"grad_norm": 0.75910615139755,
|
|
"learning_rate": 1.3388202549958507e-05,
|
|
"loss": 0.4167,
|
|
"mean_token_accuracy": 0.8541540773585439,
|
|
"num_tokens": 137144383.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"entropy": 0.4176025390625,
|
|
"epoch": 1.2698412698412698,
|
|
"grad_norm": 0.7176046910830354,
|
|
"learning_rate": 1.334700352146561e-05,
|
|
"loss": 0.4221,
|
|
"mean_token_accuracy": 0.8523535262793303,
|
|
"num_tokens": 137570382.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.4130859375,
|
|
"epoch": 1.2738095238095237,
|
|
"grad_norm": 0.7147216223748529,
|
|
"learning_rate": 1.3305740415358506e-05,
|
|
"loss": 0.4255,
|
|
"mean_token_accuracy": 0.8524302830919623,
|
|
"num_tokens": 138002323.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"entropy": 0.412811279296875,
|
|
"epoch": 1.2777777777777777,
|
|
"grad_norm": 0.7746193751855323,
|
|
"learning_rate": 1.3264414021609899e-05,
|
|
"loss": 0.4271,
|
|
"mean_token_accuracy": 0.8531954158097506,
|
|
"num_tokens": 138431194.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"entropy": 0.41534423828125,
|
|
"epoch": 1.2817460317460316,
|
|
"grad_norm": 0.6884329097906199,
|
|
"learning_rate": 1.3223025131404106e-05,
|
|
"loss": 0.4116,
|
|
"mean_token_accuracy": 0.8547803815454245,
|
|
"num_tokens": 138863136.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"entropy": 0.410064697265625,
|
|
"epoch": 1.2857142857142856,
|
|
"grad_norm": 0.7440216530883886,
|
|
"learning_rate": 1.3181574537121933e-05,
|
|
"loss": 0.4058,
|
|
"mean_token_accuracy": 0.8586990479379892,
|
|
"num_tokens": 139287890.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"entropy": 0.412506103515625,
|
|
"epoch": 1.2896825396825398,
|
|
"grad_norm": 0.7042933191716969,
|
|
"learning_rate": 1.3140063032325491e-05,
|
|
"loss": 0.4269,
|
|
"mean_token_accuracy": 0.852539798244834,
|
|
"num_tokens": 139730663.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 0.40789794921875,
|
|
"epoch": 1.2936507936507937,
|
|
"grad_norm": 0.7053569226738333,
|
|
"learning_rate": 1.3098491411743014e-05,
|
|
"loss": 0.4203,
|
|
"mean_token_accuracy": 0.8511422863230109,
|
|
"num_tokens": 140160179.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"entropy": 0.41094970703125,
|
|
"epoch": 1.2976190476190477,
|
|
"grad_norm": 0.7260221861550915,
|
|
"learning_rate": 1.3056860471253639e-05,
|
|
"loss": 0.4148,
|
|
"mean_token_accuracy": 0.8542684894055128,
|
|
"num_tokens": 140577958.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"entropy": 0.40771484375,
|
|
"epoch": 1.3015873015873016,
|
|
"grad_norm": 0.6945091615927053,
|
|
"learning_rate": 1.3015171007872161e-05,
|
|
"loss": 0.4327,
|
|
"mean_token_accuracy": 0.8508248487487435,
|
|
"num_tokens": 141002875.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"entropy": 0.410186767578125,
|
|
"epoch": 1.3055555555555556,
|
|
"grad_norm": 0.7190051589443747,
|
|
"learning_rate": 1.297342381973379e-05,
|
|
"loss": 0.4144,
|
|
"mean_token_accuracy": 0.855740231461823,
|
|
"num_tokens": 141425392.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"entropy": 0.412933349609375,
|
|
"epoch": 1.3095238095238095,
|
|
"grad_norm": 0.7230421897639797,
|
|
"learning_rate": 1.2931619706078862e-05,
|
|
"loss": 0.4101,
|
|
"mean_token_accuracy": 0.8563672862946987,
|
|
"num_tokens": 141858286.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.416717529296875,
|
|
"epoch": 1.3134920634920635,
|
|
"grad_norm": 0.7164485626421909,
|
|
"learning_rate": 1.2889759467237532e-05,
|
|
"loss": 0.4104,
|
|
"mean_token_accuracy": 0.8578307218849659,
|
|
"num_tokens": 142279417.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"entropy": 0.4114990234375,
|
|
"epoch": 1.3174603174603174,
|
|
"grad_norm": 0.6758312572105224,
|
|
"learning_rate": 1.2847843904614474e-05,
|
|
"loss": 0.4122,
|
|
"mean_token_accuracy": 0.8550651278346777,
|
|
"num_tokens": 142698339.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"entropy": 0.409271240234375,
|
|
"epoch": 1.3214285714285714,
|
|
"grad_norm": 0.8043515777393827,
|
|
"learning_rate": 1.2805873820673509e-05,
|
|
"loss": 0.4097,
|
|
"mean_token_accuracy": 0.8561445344239473,
|
|
"num_tokens": 143128013.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"entropy": 0.4112548828125,
|
|
"epoch": 1.3253968253968254,
|
|
"grad_norm": 0.7169697385515053,
|
|
"learning_rate": 1.2763850018922257e-05,
|
|
"loss": 0.4106,
|
|
"mean_token_accuracy": 0.8560521546751261,
|
|
"num_tokens": 143561112.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"entropy": 0.412353515625,
|
|
"epoch": 1.3293650793650793,
|
|
"grad_norm": 0.7437379976550189,
|
|
"learning_rate": 1.2721773303896765e-05,
|
|
"loss": 0.4195,
|
|
"mean_token_accuracy": 0.8526777876541018,
|
|
"num_tokens": 143970890.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 0.408599853515625,
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 0.8092416715710883,
|
|
"learning_rate": 1.2679644481146081e-05,
|
|
"loss": 0.4168,
|
|
"mean_token_accuracy": 0.8542767520993948,
|
|
"num_tokens": 144390223.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"entropy": 0.407928466796875,
|
|
"epoch": 1.3373015873015874,
|
|
"grad_norm": 0.7460569230441075,
|
|
"learning_rate": 1.2637464357216847e-05,
|
|
"loss": 0.4298,
|
|
"mean_token_accuracy": 0.8512799562886357,
|
|
"num_tokens": 144839957.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"entropy": 0.412689208984375,
|
|
"epoch": 1.3412698412698414,
|
|
"grad_norm": 0.7141238772525242,
|
|
"learning_rate": 1.2595233739637851e-05,
|
|
"loss": 0.4296,
|
|
"mean_token_accuracy": 0.8526173504069448,
|
|
"num_tokens": 145276276.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"entropy": 0.41314697265625,
|
|
"epoch": 1.3452380952380953,
|
|
"grad_norm": 0.7625177516979067,
|
|
"learning_rate": 1.2552953436904578e-05,
|
|
"loss": 0.4318,
|
|
"mean_token_accuracy": 0.8507062029093504,
|
|
"num_tokens": 145722320.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"entropy": 0.416168212890625,
|
|
"epoch": 1.3492063492063493,
|
|
"grad_norm": 0.7323762900468244,
|
|
"learning_rate": 1.2510624258463719e-05,
|
|
"loss": 0.4102,
|
|
"mean_token_accuracy": 0.8566265497356653,
|
|
"num_tokens": 146148957.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.419464111328125,
|
|
"epoch": 1.3531746031746033,
|
|
"grad_norm": 0.720246750942426,
|
|
"learning_rate": 1.246824701469768e-05,
|
|
"loss": 0.4241,
|
|
"mean_token_accuracy": 0.8515134025365114,
|
|
"num_tokens": 146580318.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"entropy": 0.413848876953125,
|
|
"epoch": 1.3571428571428572,
|
|
"grad_norm": 0.7263345728177462,
|
|
"learning_rate": 1.2425822516909065e-05,
|
|
"loss": 0.4106,
|
|
"mean_token_accuracy": 0.8557441309094429,
|
|
"num_tokens": 146999892.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"entropy": 0.411956787109375,
|
|
"epoch": 1.3611111111111112,
|
|
"grad_norm": 0.6886575308922511,
|
|
"learning_rate": 1.2383351577305148e-05,
|
|
"loss": 0.4141,
|
|
"mean_token_accuracy": 0.8551986450329423,
|
|
"num_tokens": 147436184.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"entropy": 0.401458740234375,
|
|
"epoch": 1.3650793650793651,
|
|
"grad_norm": 0.7573997962027809,
|
|
"learning_rate": 1.2340835008982315e-05,
|
|
"loss": 0.4188,
|
|
"mean_token_accuracy": 0.8537988383322954,
|
|
"num_tokens": 147888947.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"entropy": 0.412353515625,
|
|
"epoch": 1.369047619047619,
|
|
"grad_norm": 0.702714358199938,
|
|
"learning_rate": 1.2298273625910512e-05,
|
|
"loss": 0.4268,
|
|
"mean_token_accuracy": 0.8513675974681973,
|
|
"num_tokens": 148330624.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 0.41815185546875,
|
|
"epoch": 1.373015873015873,
|
|
"grad_norm": 0.7125776426540186,
|
|
"learning_rate": 1.2255668242917651e-05,
|
|
"loss": 0.431,
|
|
"mean_token_accuracy": 0.8508031954988837,
|
|
"num_tokens": 148771994.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"entropy": 0.41412353515625,
|
|
"epoch": 1.376984126984127,
|
|
"grad_norm": 0.7116129020149227,
|
|
"learning_rate": 1.2213019675674008e-05,
|
|
"loss": 0.4131,
|
|
"mean_token_accuracy": 0.8551193429157138,
|
|
"num_tokens": 149203608.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"entropy": 0.4124755859375,
|
|
"epoch": 1.380952380952381,
|
|
"grad_norm": 0.6802674847268556,
|
|
"learning_rate": 1.2170328740676613e-05,
|
|
"loss": 0.4155,
|
|
"mean_token_accuracy": 0.8542332891374826,
|
|
"num_tokens": 149626353.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"entropy": 0.412689208984375,
|
|
"epoch": 1.3849206349206349,
|
|
"grad_norm": 0.7065345082327148,
|
|
"learning_rate": 1.2127596255233622e-05,
|
|
"loss": 0.4104,
|
|
"mean_token_accuracy": 0.8574284976348281,
|
|
"num_tokens": 150036189.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"entropy": 0.41180419921875,
|
|
"epoch": 1.3888888888888888,
|
|
"grad_norm": 0.6478009510663947,
|
|
"learning_rate": 1.2084823037448654e-05,
|
|
"loss": 0.4027,
|
|
"mean_token_accuracy": 0.858050768263638,
|
|
"num_tokens": 150484433.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.411163330078125,
|
|
"epoch": 1.3928571428571428,
|
|
"grad_norm": 0.6867982734460257,
|
|
"learning_rate": 1.2042009906205152e-05,
|
|
"loss": 0.4141,
|
|
"mean_token_accuracy": 0.8563978290185332,
|
|
"num_tokens": 150916869.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"entropy": 0.413421630859375,
|
|
"epoch": 1.3968253968253967,
|
|
"grad_norm": 0.7528102622705124,
|
|
"learning_rate": 1.1999157681150683e-05,
|
|
"loss": 0.4231,
|
|
"mean_token_accuracy": 0.8521570805460215,
|
|
"num_tokens": 151351171.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"entropy": 0.4088134765625,
|
|
"epoch": 1.4007936507936507,
|
|
"grad_norm": 0.7335651303479648,
|
|
"learning_rate": 1.1956267182681265e-05,
|
|
"loss": 0.4134,
|
|
"mean_token_accuracy": 0.8541133729740977,
|
|
"num_tokens": 151779921.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"entropy": 0.408721923828125,
|
|
"epoch": 1.4047619047619047,
|
|
"grad_norm": 0.7419182720532009,
|
|
"learning_rate": 1.1913339231925642e-05,
|
|
"loss": 0.4256,
|
|
"mean_token_accuracy": 0.850860440172255,
|
|
"num_tokens": 152198704.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"entropy": 0.412109375,
|
|
"epoch": 1.4087301587301586,
|
|
"grad_norm": 0.6775984558339723,
|
|
"learning_rate": 1.1870374650729582e-05,
|
|
"loss": 0.4096,
|
|
"mean_token_accuracy": 0.8563865106552839,
|
|
"num_tokens": 152607114.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 0.4183349609375,
|
|
"epoch": 1.4126984126984126,
|
|
"grad_norm": 0.6888120701305268,
|
|
"learning_rate": 1.1827374261640128e-05,
|
|
"loss": 0.4131,
|
|
"mean_token_accuracy": 0.856896661221981,
|
|
"num_tokens": 153027562.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"entropy": 0.416961669921875,
|
|
"epoch": 1.4166666666666667,
|
|
"grad_norm": 0.6818388664948022,
|
|
"learning_rate": 1.1784338887889858e-05,
|
|
"loss": 0.4057,
|
|
"mean_token_accuracy": 0.8576616421341896,
|
|
"num_tokens": 153449258.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"entropy": 0.41827392578125,
|
|
"epoch": 1.4206349206349207,
|
|
"grad_norm": 0.6735012808919367,
|
|
"learning_rate": 1.1741269353381128e-05,
|
|
"loss": 0.4119,
|
|
"mean_token_accuracy": 0.8539746999740601,
|
|
"num_tokens": 153863890.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"entropy": 0.417083740234375,
|
|
"epoch": 1.4246031746031746,
|
|
"grad_norm": 0.7056639670954223,
|
|
"learning_rate": 1.1698166482670293e-05,
|
|
"loss": 0.4136,
|
|
"mean_token_accuracy": 0.8554719127714634,
|
|
"num_tokens": 154280797.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"entropy": 0.415863037109375,
|
|
"epoch": 1.4285714285714286,
|
|
"grad_norm": 0.6586008605306323,
|
|
"learning_rate": 1.165503110095191e-05,
|
|
"loss": 0.4163,
|
|
"mean_token_accuracy": 0.8539502024650574,
|
|
"num_tokens": 154707913.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.41082763671875,
|
|
"epoch": 1.4325396825396826,
|
|
"grad_norm": 0.72245972178782,
|
|
"learning_rate": 1.1611864034042972e-05,
|
|
"loss": 0.4094,
|
|
"mean_token_accuracy": 0.8547179391607642,
|
|
"num_tokens": 155150096.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"entropy": 0.407867431640625,
|
|
"epoch": 1.4365079365079365,
|
|
"grad_norm": 0.6648391208069424,
|
|
"learning_rate": 1.1568666108367066e-05,
|
|
"loss": 0.4103,
|
|
"mean_token_accuracy": 0.8559131594374776,
|
|
"num_tokens": 155590050.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"entropy": 0.41180419921875,
|
|
"epoch": 1.4404761904761905,
|
|
"grad_norm": 0.6867753073723831,
|
|
"learning_rate": 1.1525438150938554e-05,
|
|
"loss": 0.4133,
|
|
"mean_token_accuracy": 0.8570982730016112,
|
|
"num_tokens": 156016093.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"entropy": 0.41387939453125,
|
|
"epoch": 1.4444444444444444,
|
|
"grad_norm": 0.617533443654073,
|
|
"learning_rate": 1.1482180989346771e-05,
|
|
"loss": 0.4084,
|
|
"mean_token_accuracy": 0.8573052315041423,
|
|
"num_tokens": 156449879.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"entropy": 0.42047119140625,
|
|
"epoch": 1.4484126984126984,
|
|
"grad_norm": 0.702255339155768,
|
|
"learning_rate": 1.1438895451740141e-05,
|
|
"loss": 0.4021,
|
|
"mean_token_accuracy": 0.8589789541438222,
|
|
"num_tokens": 156866761.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 0.416290283203125,
|
|
"epoch": 1.4523809523809523,
|
|
"grad_norm": 0.6723521380171534,
|
|
"learning_rate": 1.1395582366810348e-05,
|
|
"loss": 0.3975,
|
|
"mean_token_accuracy": 0.8603022275492549,
|
|
"num_tokens": 157304143.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"entropy": 0.414306640625,
|
|
"epoch": 1.4563492063492063,
|
|
"grad_norm": 0.6310156884325218,
|
|
"learning_rate": 1.135224256377646e-05,
|
|
"loss": 0.4337,
|
|
"mean_token_accuracy": 0.8482790300622582,
|
|
"num_tokens": 157758390.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"entropy": 0.416717529296875,
|
|
"epoch": 1.4603174603174602,
|
|
"grad_norm": 0.6674900430117614,
|
|
"learning_rate": 1.1308876872369062e-05,
|
|
"loss": 0.4057,
|
|
"mean_token_accuracy": 0.8564106421545148,
|
|
"num_tokens": 158177988.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"entropy": 0.406097412109375,
|
|
"epoch": 1.4642857142857144,
|
|
"grad_norm": 0.6565941986269037,
|
|
"learning_rate": 1.1265486122814359e-05,
|
|
"loss": 0.418,
|
|
"mean_token_accuracy": 0.853111038915813,
|
|
"num_tokens": 158634349.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"entropy": 0.412628173828125,
|
|
"epoch": 1.4682539682539684,
|
|
"grad_norm": 0.6632104455005605,
|
|
"learning_rate": 1.1222071145818293e-05,
|
|
"loss": 0.4122,
|
|
"mean_token_accuracy": 0.8553181765601039,
|
|
"num_tokens": 159060066.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.415252685546875,
|
|
"epoch": 1.4722222222222223,
|
|
"grad_norm": 0.707239372405196,
|
|
"learning_rate": 1.1178632772550636e-05,
|
|
"loss": 0.4113,
|
|
"mean_token_accuracy": 0.8547909967601299,
|
|
"num_tokens": 159490031.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"entropy": 0.40692138671875,
|
|
"epoch": 1.4761904761904763,
|
|
"grad_norm": 0.7069148314328081,
|
|
"learning_rate": 1.113517183462907e-05,
|
|
"loss": 0.412,
|
|
"mean_token_accuracy": 0.8557059289887547,
|
|
"num_tokens": 159942986.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"entropy": 0.41400146484375,
|
|
"epoch": 1.4801587301587302,
|
|
"grad_norm": 0.6897820697933245,
|
|
"learning_rate": 1.1091689164103281e-05,
|
|
"loss": 0.3906,
|
|
"mean_token_accuracy": 0.8620947021991014,
|
|
"num_tokens": 160355322.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"entropy": 0.408599853515625,
|
|
"epoch": 1.4841269841269842,
|
|
"grad_norm": 0.6751095283586555,
|
|
"learning_rate": 1.1048185593439014e-05,
|
|
"loss": 0.4147,
|
|
"mean_token_accuracy": 0.8550657378509641,
|
|
"num_tokens": 160782816.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"entropy": 0.410614013671875,
|
|
"epoch": 1.4880952380952381,
|
|
"grad_norm": 0.6972038572778951,
|
|
"learning_rate": 1.1004661955502143e-05,
|
|
"loss": 0.4148,
|
|
"mean_token_accuracy": 0.8550673946738243,
|
|
"num_tokens": 161216771.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 0.412353515625,
|
|
"epoch": 1.492063492063492,
|
|
"grad_norm": 0.6351069017407472,
|
|
"learning_rate": 1.0961119083542727e-05,
|
|
"loss": 0.3967,
|
|
"mean_token_accuracy": 0.8618287779390812,
|
|
"num_tokens": 161643512.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"entropy": 0.407440185546875,
|
|
"epoch": 1.496031746031746,
|
|
"grad_norm": 0.6819391026589559,
|
|
"learning_rate": 1.0917557811179057e-05,
|
|
"loss": 0.4052,
|
|
"mean_token_accuracy": 0.8588421484455466,
|
|
"num_tokens": 162077647.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"entropy": 0.4102783203125,
|
|
"epoch": 1.5,
|
|
"grad_norm": 0.6625502386920177,
|
|
"learning_rate": 1.0873978972381692e-05,
|
|
"loss": 0.3982,
|
|
"mean_token_accuracy": 0.859605161473155,
|
|
"num_tokens": 162503001.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"entropy": 0.411865234375,
|
|
"epoch": 1.503968253968254,
|
|
"grad_norm": 0.651583717791506,
|
|
"learning_rate": 1.0830383401457499e-05,
|
|
"loss": 0.4195,
|
|
"mean_token_accuracy": 0.8546869652345777,
|
|
"num_tokens": 162949686.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"entropy": 0.405609130859375,
|
|
"epoch": 1.507936507936508,
|
|
"grad_norm": 0.6806902067734124,
|
|
"learning_rate": 1.0786771933033677e-05,
|
|
"loss": 0.4037,
|
|
"mean_token_accuracy": 0.8567021545022726,
|
|
"num_tokens": 163388010.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.40545654296875,
|
|
"epoch": 1.5119047619047619,
|
|
"grad_norm": 0.6568305792093015,
|
|
"learning_rate": 1.0743145402041781e-05,
|
|
"loss": 0.3984,
|
|
"mean_token_accuracy": 0.8587896954268217,
|
|
"num_tokens": 163816567.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"entropy": 0.406280517578125,
|
|
"epoch": 1.5158730158730158,
|
|
"grad_norm": 0.6687688963859889,
|
|
"learning_rate": 1.0699504643701732e-05,
|
|
"loss": 0.4051,
|
|
"mean_token_accuracy": 0.8573078708723187,
|
|
"num_tokens": 164270399.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"entropy": 0.402984619140625,
|
|
"epoch": 1.5198412698412698,
|
|
"grad_norm": 0.6205824056394684,
|
|
"learning_rate": 1.0655850493505834e-05,
|
|
"loss": 0.3876,
|
|
"mean_token_accuracy": 0.8629192840307951,
|
|
"num_tokens": 164712402.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"entropy": 0.401519775390625,
|
|
"epoch": 1.5238095238095237,
|
|
"grad_norm": 0.6946581962210521,
|
|
"learning_rate": 1.0612183787202768e-05,
|
|
"loss": 0.4147,
|
|
"mean_token_accuracy": 0.8557152729481459,
|
|
"num_tokens": 165155523.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"entropy": 0.40338134765625,
|
|
"epoch": 1.5277777777777777,
|
|
"grad_norm": 0.6178420541561561,
|
|
"learning_rate": 1.0568505360781606e-05,
|
|
"loss": 0.3841,
|
|
"mean_token_accuracy": 0.8645893288776278,
|
|
"num_tokens": 165575993.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 0.40411376953125,
|
|
"epoch": 1.5317460317460316,
|
|
"grad_norm": 0.6997457239337582,
|
|
"learning_rate": 1.0524816050455801e-05,
|
|
"loss": 0.4145,
|
|
"mean_token_accuracy": 0.8545086095109582,
|
|
"num_tokens": 166004219.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"entropy": 0.404937744140625,
|
|
"epoch": 1.5357142857142856,
|
|
"grad_norm": 0.6344135411115165,
|
|
"learning_rate": 1.0481116692647165e-05,
|
|
"loss": 0.3977,
|
|
"mean_token_accuracy": 0.8587166350334883,
|
|
"num_tokens": 166459333.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"entropy": 0.40777587890625,
|
|
"epoch": 1.5396825396825395,
|
|
"grad_norm": 0.6634593666477472,
|
|
"learning_rate": 1.0437408123969877e-05,
|
|
"loss": 0.4007,
|
|
"mean_token_accuracy": 0.8594609973952174,
|
|
"num_tokens": 166887486.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"entropy": 0.4075927734375,
|
|
"epoch": 1.5436507936507935,
|
|
"grad_norm": 0.6684250544789218,
|
|
"learning_rate": 1.039369118121445e-05,
|
|
"loss": 0.4136,
|
|
"mean_token_accuracy": 0.8562117423862219,
|
|
"num_tokens": 167305651.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"entropy": 0.402740478515625,
|
|
"epoch": 1.5476190476190477,
|
|
"grad_norm": 0.6921394211692032,
|
|
"learning_rate": 1.0349966701331721e-05,
|
|
"loss": 0.4043,
|
|
"mean_token_accuracy": 0.8599712895229459,
|
|
"num_tokens": 167743608.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.409759521484375,
|
|
"epoch": 1.5515873015873016,
|
|
"grad_norm": 0.6424341617432388,
|
|
"learning_rate": 1.0306235521416822e-05,
|
|
"loss": 0.4002,
|
|
"mean_token_accuracy": 0.860035234130919,
|
|
"num_tokens": 168189361.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"entropy": 0.41387939453125,
|
|
"epoch": 1.5555555555555556,
|
|
"grad_norm": 0.6897265091020467,
|
|
"learning_rate": 1.0262498478693148e-05,
|
|
"loss": 0.4003,
|
|
"mean_token_accuracy": 0.8587908744812012,
|
|
"num_tokens": 168602032.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"entropy": 0.40863037109375,
|
|
"epoch": 1.5595238095238095,
|
|
"grad_norm": 0.6945381133940456,
|
|
"learning_rate": 1.0218756410496353e-05,
|
|
"loss": 0.4068,
|
|
"mean_token_accuracy": 0.8557405965402722,
|
|
"num_tokens": 169036397.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"entropy": 0.404693603515625,
|
|
"epoch": 1.5634920634920635,
|
|
"grad_norm": 0.6424876592982417,
|
|
"learning_rate": 1.0175010154258288e-05,
|
|
"loss": 0.4059,
|
|
"mean_token_accuracy": 0.8577195946127176,
|
|
"num_tokens": 169469975.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"entropy": 0.397216796875,
|
|
"epoch": 1.5674603174603174,
|
|
"grad_norm": 0.6360455374207441,
|
|
"learning_rate": 1.013126054749099e-05,
|
|
"loss": 0.4075,
|
|
"mean_token_accuracy": 0.8579382970929146,
|
|
"num_tokens": 169905002.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 0.4017333984375,
|
|
"epoch": 1.5714285714285714,
|
|
"grad_norm": 0.6606478789612956,
|
|
"learning_rate": 1.0087508427770639e-05,
|
|
"loss": 0.4025,
|
|
"mean_token_accuracy": 0.8562004147097468,
|
|
"num_tokens": 170343282.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"entropy": 0.408935546875,
|
|
"epoch": 1.5753968253968254,
|
|
"grad_norm": 0.6987795798888022,
|
|
"learning_rate": 1.0043754632721519e-05,
|
|
"loss": 0.3966,
|
|
"mean_token_accuracy": 0.8587719267234206,
|
|
"num_tokens": 170783254.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"entropy": 0.406951904296875,
|
|
"epoch": 1.5793650793650795,
|
|
"grad_norm": 0.6593916685637742,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.414,
|
|
"mean_token_accuracy": 0.8565381094813347,
|
|
"num_tokens": 171227432.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"entropy": 0.4052734375,
|
|
"epoch": 1.5833333333333335,
|
|
"grad_norm": 0.6987433063006335,
|
|
"learning_rate": 9.956245367278483e-06,
|
|
"loss": 0.386,
|
|
"mean_token_accuracy": 0.8624997651204467,
|
|
"num_tokens": 171673565.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"entropy": 0.4105224609375,
|
|
"epoch": 1.5873015873015874,
|
|
"grad_norm": 0.672307925393411,
|
|
"learning_rate": 9.912491572229366e-06,
|
|
"loss": 0.3915,
|
|
"mean_token_accuracy": 0.8613409381359816,
|
|
"num_tokens": 172096305.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.409149169921875,
|
|
"epoch": 1.5912698412698414,
|
|
"grad_norm": 0.7041365935845996,
|
|
"learning_rate": 9.868739452509011e-06,
|
|
"loss": 0.3931,
|
|
"mean_token_accuracy": 0.8597009964287281,
|
|
"num_tokens": 172495298.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"entropy": 0.40899658203125,
|
|
"epoch": 1.5952380952380953,
|
|
"grad_norm": 0.7517004342308762,
|
|
"learning_rate": 9.824989845741713e-06,
|
|
"loss": 0.3972,
|
|
"mean_token_accuracy": 0.8588024405762553,
|
|
"num_tokens": 172910673.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"entropy": 0.40240478515625,
|
|
"epoch": 1.5992063492063493,
|
|
"grad_norm": 0.6198853423830931,
|
|
"learning_rate": 9.78124358950365e-06,
|
|
"loss": 0.3954,
|
|
"mean_token_accuracy": 0.8617026535794139,
|
|
"num_tokens": 173326209.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"entropy": 0.403167724609375,
|
|
"epoch": 1.6031746031746033,
|
|
"grad_norm": 0.684199049613554,
|
|
"learning_rate": 9.737501521306855e-06,
|
|
"loss": 0.3994,
|
|
"mean_token_accuracy": 0.8593406956642866,
|
|
"num_tokens": 173775762.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"entropy": 0.410186767578125,
|
|
"epoch": 1.6071428571428572,
|
|
"grad_norm": 0.7341393365653166,
|
|
"learning_rate": 9.693764478583185e-06,
|
|
"loss": 0.3975,
|
|
"mean_token_accuracy": 0.8609625976532698,
|
|
"num_tokens": 174200041.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 0.402435302734375,
|
|
"epoch": 1.6111111111111112,
|
|
"grad_norm": 0.6781454981761531,
|
|
"learning_rate": 9.65003329866828e-06,
|
|
"loss": 0.4042,
|
|
"mean_token_accuracy": 0.8575213002040982,
|
|
"num_tokens": 174651858.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"entropy": 0.408233642578125,
|
|
"epoch": 1.6150793650793651,
|
|
"grad_norm": 0.6317009902021083,
|
|
"learning_rate": 9.606308818785552e-06,
|
|
"loss": 0.3867,
|
|
"mean_token_accuracy": 0.8641856899484992,
|
|
"num_tokens": 175081173.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"entropy": 0.402801513671875,
|
|
"epoch": 1.619047619047619,
|
|
"grad_norm": 0.6752157911204306,
|
|
"learning_rate": 9.562591876030127e-06,
|
|
"loss": 0.3937,
|
|
"mean_token_accuracy": 0.8606289671733975,
|
|
"num_tokens": 175519282.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"entropy": 0.406097412109375,
|
|
"epoch": 1.623015873015873,
|
|
"grad_norm": 0.7258956750619963,
|
|
"learning_rate": 9.518883307352839e-06,
|
|
"loss": 0.4086,
|
|
"mean_token_accuracy": 0.8588223177939653,
|
|
"num_tokens": 175965036.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"entropy": 0.405303955078125,
|
|
"epoch": 1.626984126984127,
|
|
"grad_norm": 0.6365492747842574,
|
|
"learning_rate": 9.475183949544204e-06,
|
|
"loss": 0.4104,
|
|
"mean_token_accuracy": 0.8550521014258265,
|
|
"num_tokens": 176387199.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.40435791015625,
|
|
"epoch": 1.630952380952381,
|
|
"grad_norm": 0.6803954542416505,
|
|
"learning_rate": 9.431494639218397e-06,
|
|
"loss": 0.3969,
|
|
"mean_token_accuracy": 0.8615404982119799,
|
|
"num_tokens": 176823428.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"entropy": 0.39996337890625,
|
|
"epoch": 1.6349206349206349,
|
|
"grad_norm": 0.6352543229598833,
|
|
"learning_rate": 9.387816212797233e-06,
|
|
"loss": 0.4023,
|
|
"mean_token_accuracy": 0.8599696168676019,
|
|
"num_tokens": 177264131.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"entropy": 0.408111572265625,
|
|
"epoch": 1.6388888888888888,
|
|
"grad_norm": 0.6536962265542489,
|
|
"learning_rate": 9.344149506494169e-06,
|
|
"loss": 0.3952,
|
|
"mean_token_accuracy": 0.8607912426814437,
|
|
"num_tokens": 177680911.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"entropy": 0.406005859375,
|
|
"epoch": 1.6428571428571428,
|
|
"grad_norm": 0.6661123978510443,
|
|
"learning_rate": 9.30049535629827e-06,
|
|
"loss": 0.4014,
|
|
"mean_token_accuracy": 0.859113815240562,
|
|
"num_tokens": 178114003.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"entropy": 0.410400390625,
|
|
"epoch": 1.6468253968253967,
|
|
"grad_norm": 0.6902363753958481,
|
|
"learning_rate": 9.256854597958222e-06,
|
|
"loss": 0.4146,
|
|
"mean_token_accuracy": 0.8541763303801417,
|
|
"num_tokens": 178533077.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 0.408782958984375,
|
|
"epoch": 1.6507936507936507,
|
|
"grad_norm": 0.7030439443784253,
|
|
"learning_rate": 9.213228066966328e-06,
|
|
"loss": 0.3924,
|
|
"mean_token_accuracy": 0.8622600650414824,
|
|
"num_tokens": 178950487.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"entropy": 0.405303955078125,
|
|
"epoch": 1.6547619047619047,
|
|
"grad_norm": 0.6475133352507451,
|
|
"learning_rate": 9.169616598542503e-06,
|
|
"loss": 0.3994,
|
|
"mean_token_accuracy": 0.8593355258926749,
|
|
"num_tokens": 179384739.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"entropy": 0.399627685546875,
|
|
"epoch": 1.6587301587301586,
|
|
"grad_norm": 0.6723065907691588,
|
|
"learning_rate": 9.126021027618312e-06,
|
|
"loss": 0.4085,
|
|
"mean_token_accuracy": 0.8555388646200299,
|
|
"num_tokens": 179833212.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"entropy": 0.4014892578125,
|
|
"epoch": 1.6626984126984126,
|
|
"grad_norm": 0.6653095194789432,
|
|
"learning_rate": 9.082442188820947e-06,
|
|
"loss": 0.391,
|
|
"mean_token_accuracy": 0.8626063298434019,
|
|
"num_tokens": 180259940.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"entropy": 0.396484375,
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 0.6734067922751799,
|
|
"learning_rate": 9.038880916457276e-06,
|
|
"loss": 0.3954,
|
|
"mean_token_accuracy": 0.8605092065408826,
|
|
"num_tokens": 180712234.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.40069580078125,
|
|
"epoch": 1.6706349206349205,
|
|
"grad_norm": 0.7248437901702897,
|
|
"learning_rate": 8.995338044497862e-06,
|
|
"loss": 0.4161,
|
|
"mean_token_accuracy": 0.8539806362241507,
|
|
"num_tokens": 181151354.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"entropy": 0.407073974609375,
|
|
"epoch": 1.6746031746031746,
|
|
"grad_norm": 0.6569712167965962,
|
|
"learning_rate": 8.951814406560988e-06,
|
|
"loss": 0.397,
|
|
"mean_token_accuracy": 0.8575204182416201,
|
|
"num_tokens": 181566490.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"entropy": 0.40673828125,
|
|
"epoch": 1.6785714285714286,
|
|
"grad_norm": 0.6763632648533178,
|
|
"learning_rate": 8.90831083589672e-06,
|
|
"loss": 0.4021,
|
|
"mean_token_accuracy": 0.8591755600646138,
|
|
"num_tokens": 181997420.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"entropy": 0.409881591796875,
|
|
"epoch": 1.6825396825396826,
|
|
"grad_norm": 0.6793544531923543,
|
|
"learning_rate": 8.864828165370932e-06,
|
|
"loss": 0.396,
|
|
"mean_token_accuracy": 0.8604298168793321,
|
|
"num_tokens": 182413254.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"entropy": 0.4053955078125,
|
|
"epoch": 1.6865079365079365,
|
|
"grad_norm": 0.6619199494789456,
|
|
"learning_rate": 8.821367227449368e-06,
|
|
"loss": 0.3906,
|
|
"mean_token_accuracy": 0.862027888186276,
|
|
"num_tokens": 182840621.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 0.405548095703125,
|
|
"epoch": 1.6904761904761905,
|
|
"grad_norm": 0.6604274179296861,
|
|
"learning_rate": 8.77792885418171e-06,
|
|
"loss": 0.3975,
|
|
"mean_token_accuracy": 0.8592646988108754,
|
|
"num_tokens": 183269538.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"entropy": 0.410430908203125,
|
|
"epoch": 1.6944444444444444,
|
|
"grad_norm": 0.6568683879895119,
|
|
"learning_rate": 8.734513877185644e-06,
|
|
"loss": 0.3838,
|
|
"mean_token_accuracy": 0.8651055432856083,
|
|
"num_tokens": 183684845.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"entropy": 0.4075927734375,
|
|
"epoch": 1.6984126984126984,
|
|
"grad_norm": 0.6704945921600994,
|
|
"learning_rate": 8.691123127630942e-06,
|
|
"loss": 0.3902,
|
|
"mean_token_accuracy": 0.8633867194876075,
|
|
"num_tokens": 184109496.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"entropy": 0.407073974609375,
|
|
"epoch": 1.7023809523809523,
|
|
"grad_norm": 0.7170767243747824,
|
|
"learning_rate": 8.647757436223543e-06,
|
|
"loss": 0.4135,
|
|
"mean_token_accuracy": 0.8537504924461246,
|
|
"num_tokens": 184533568.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"entropy": 0.4056396484375,
|
|
"epoch": 1.7063492063492065,
|
|
"grad_norm": 0.6949542788037961,
|
|
"learning_rate": 8.604417633189658e-06,
|
|
"loss": 0.4013,
|
|
"mean_token_accuracy": 0.8596660671755672,
|
|
"num_tokens": 184968366.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.406036376953125,
|
|
"epoch": 1.7103174603174605,
|
|
"grad_norm": 0.6318993348714297,
|
|
"learning_rate": 8.561104548259864e-06,
|
|
"loss": 0.4005,
|
|
"mean_token_accuracy": 0.8592518717050552,
|
|
"num_tokens": 185404884.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"entropy": 0.4052734375,
|
|
"epoch": 1.7142857142857144,
|
|
"grad_norm": 0.7150048622751305,
|
|
"learning_rate": 8.517819010653234e-06,
|
|
"loss": 0.4082,
|
|
"mean_token_accuracy": 0.8565432196483016,
|
|
"num_tokens": 185857166.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"entropy": 0.4102783203125,
|
|
"epoch": 1.7182539682539684,
|
|
"grad_norm": 0.6984571949535301,
|
|
"learning_rate": 8.474561849061446e-06,
|
|
"loss": 0.385,
|
|
"mean_token_accuracy": 0.8628620821982622,
|
|
"num_tokens": 186278301.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"entropy": 0.413604736328125,
|
|
"epoch": 1.7222222222222223,
|
|
"grad_norm": 0.6927790727966183,
|
|
"learning_rate": 8.431333891632937e-06,
|
|
"loss": 0.3857,
|
|
"mean_token_accuracy": 0.8627250017598271,
|
|
"num_tokens": 186684767.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"entropy": 0.404876708984375,
|
|
"epoch": 1.7261904761904763,
|
|
"grad_norm": 0.6958224617466756,
|
|
"learning_rate": 8.388135965957031e-06,
|
|
"loss": 0.3926,
|
|
"mean_token_accuracy": 0.8619824200868607,
|
|
"num_tokens": 187107470.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 0.399261474609375,
|
|
"epoch": 1.7301587301587302,
|
|
"grad_norm": 0.6668337287163626,
|
|
"learning_rate": 8.344968899048093e-06,
|
|
"loss": 0.3852,
|
|
"mean_token_accuracy": 0.8640718599781394,
|
|
"num_tokens": 187534641.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"entropy": 0.399871826171875,
|
|
"epoch": 1.7341269841269842,
|
|
"grad_norm": 0.660340307861853,
|
|
"learning_rate": 8.301833517329714e-06,
|
|
"loss": 0.3941,
|
|
"mean_token_accuracy": 0.8606917411088943,
|
|
"num_tokens": 187972052.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"entropy": 0.4000244140625,
|
|
"epoch": 1.7380952380952381,
|
|
"grad_norm": 0.7449016377460825,
|
|
"learning_rate": 8.258730646618872e-06,
|
|
"loss": 0.3883,
|
|
"mean_token_accuracy": 0.8606307609006763,
|
|
"num_tokens": 188403747.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"entropy": 0.40106201171875,
|
|
"epoch": 1.742063492063492,
|
|
"grad_norm": 0.6657481910760707,
|
|
"learning_rate": 8.215661112110143e-06,
|
|
"loss": 0.3932,
|
|
"mean_token_accuracy": 0.8618741119280457,
|
|
"num_tokens": 188833376.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"entropy": 0.397735595703125,
|
|
"epoch": 1.746031746031746,
|
|
"grad_norm": 0.6529962126614087,
|
|
"learning_rate": 8.172625738359876e-06,
|
|
"loss": 0.3923,
|
|
"mean_token_accuracy": 0.8605843409895897,
|
|
"num_tokens": 189286051.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.409576416015625,
|
|
"epoch": 1.75,
|
|
"grad_norm": 0.6520639060375251,
|
|
"learning_rate": 8.12962534927042e-06,
|
|
"loss": 0.3842,
|
|
"mean_token_accuracy": 0.8636613693088293,
|
|
"num_tokens": 189700932.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"entropy": 0.4051513671875,
|
|
"epoch": 1.753968253968254,
|
|
"grad_norm": 0.6752597049435843,
|
|
"learning_rate": 8.08666076807436e-06,
|
|
"loss": 0.392,
|
|
"mean_token_accuracy": 0.859367199242115,
|
|
"num_tokens": 190135846.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"entropy": 0.39495849609375,
|
|
"epoch": 1.757936507936508,
|
|
"grad_norm": 0.6405008649597489,
|
|
"learning_rate": 8.043732817318736e-06,
|
|
"loss": 0.3953,
|
|
"mean_token_accuracy": 0.8626237865537405,
|
|
"num_tokens": 190599539.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"entropy": 0.40020751953125,
|
|
"epoch": 1.7619047619047619,
|
|
"grad_norm": 0.7045668566293105,
|
|
"learning_rate": 8.000842318849317e-06,
|
|
"loss": 0.3977,
|
|
"mean_token_accuracy": 0.8612882681190968,
|
|
"num_tokens": 191023956.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"entropy": 0.412139892578125,
|
|
"epoch": 1.7658730158730158,
|
|
"grad_norm": 0.6309864360582974,
|
|
"learning_rate": 7.95799009379485e-06,
|
|
"loss": 0.3714,
|
|
"mean_token_accuracy": 0.8665594831109047,
|
|
"num_tokens": 191419256.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 0.400787353515625,
|
|
"epoch": 1.7698412698412698,
|
|
"grad_norm": 0.6313143549912045,
|
|
"learning_rate": 7.915176962551347e-06,
|
|
"loss": 0.3848,
|
|
"mean_token_accuracy": 0.8646227335557342,
|
|
"num_tokens": 191865715.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"entropy": 0.40869140625,
|
|
"epoch": 1.7738095238095237,
|
|
"grad_norm": 0.6791552619696869,
|
|
"learning_rate": 7.872403744766383e-06,
|
|
"loss": 0.403,
|
|
"mean_token_accuracy": 0.8603015225380659,
|
|
"num_tokens": 192279544.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"entropy": 0.40484619140625,
|
|
"epoch": 1.7777777777777777,
|
|
"grad_norm": 0.6497182250849004,
|
|
"learning_rate": 7.82967125932339e-06,
|
|
"loss": 0.3872,
|
|
"mean_token_accuracy": 0.8622237564995885,
|
|
"num_tokens": 192687536.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"entropy": 0.405914306640625,
|
|
"epoch": 1.7817460317460316,
|
|
"grad_norm": 0.6473578509768539,
|
|
"learning_rate": 7.786980324325994e-06,
|
|
"loss": 0.3886,
|
|
"mean_token_accuracy": 0.8661008570343256,
|
|
"num_tokens": 193092369.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"entropy": 0.408721923828125,
|
|
"epoch": 1.7857142857142856,
|
|
"grad_norm": 0.6624737663531524,
|
|
"learning_rate": 7.74433175708235e-06,
|
|
"loss": 0.3794,
|
|
"mean_token_accuracy": 0.8674081796780229,
|
|
"num_tokens": 193514317.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.400177001953125,
|
|
"epoch": 1.7896825396825395,
|
|
"grad_norm": 0.6173586758665184,
|
|
"learning_rate": 7.70172637408949e-06,
|
|
"loss": 0.3859,
|
|
"mean_token_accuracy": 0.8648398378863931,
|
|
"num_tokens": 193943892.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"entropy": 0.39544677734375,
|
|
"epoch": 1.7936507936507935,
|
|
"grad_norm": 0.6882345053798201,
|
|
"learning_rate": 7.659164991017689e-06,
|
|
"loss": 0.3816,
|
|
"mean_token_accuracy": 0.864141782745719,
|
|
"num_tokens": 194368425.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"entropy": 0.398651123046875,
|
|
"epoch": 1.7976190476190477,
|
|
"grad_norm": 0.643498270336236,
|
|
"learning_rate": 7.616648422694858e-06,
|
|
"loss": 0.3886,
|
|
"mean_token_accuracy": 0.8627204354852438,
|
|
"num_tokens": 194781122.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"entropy": 0.398651123046875,
|
|
"epoch": 1.8015873015873016,
|
|
"grad_norm": 0.6388429129984551,
|
|
"learning_rate": 7.5741774830909375e-06,
|
|
"loss": 0.3865,
|
|
"mean_token_accuracy": 0.8640177240595222,
|
|
"num_tokens": 195229420.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"entropy": 0.3966064453125,
|
|
"epoch": 1.8055555555555556,
|
|
"grad_norm": 0.6756670134886724,
|
|
"learning_rate": 7.531752985302323e-06,
|
|
"loss": 0.3922,
|
|
"mean_token_accuracy": 0.8626740667968988,
|
|
"num_tokens": 195670858.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 0.403045654296875,
|
|
"epoch": 1.8095238095238095,
|
|
"grad_norm": 0.673993885311234,
|
|
"learning_rate": 7.489375741536283e-06,
|
|
"loss": 0.3958,
|
|
"mean_token_accuracy": 0.8607546780258417,
|
|
"num_tokens": 196086060.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"entropy": 0.40087890625,
|
|
"epoch": 1.8134920634920635,
|
|
"grad_norm": 0.6550524224232206,
|
|
"learning_rate": 7.447046563095425e-06,
|
|
"loss": 0.3957,
|
|
"mean_token_accuracy": 0.8578624930232763,
|
|
"num_tokens": 196534067.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"entropy": 0.4056396484375,
|
|
"epoch": 1.8174603174603174,
|
|
"grad_norm": 0.6241087307689689,
|
|
"learning_rate": 7.404766260362153e-06,
|
|
"loss": 0.3842,
|
|
"mean_token_accuracy": 0.8643308812752366,
|
|
"num_tokens": 196949752.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"entropy": 0.40155029296875,
|
|
"epoch": 1.8214285714285714,
|
|
"grad_norm": 0.6333219557811135,
|
|
"learning_rate": 7.362535642783155e-06,
|
|
"loss": 0.3816,
|
|
"mean_token_accuracy": 0.8649132940918207,
|
|
"num_tokens": 197361623.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"entropy": 0.399658203125,
|
|
"epoch": 1.8253968253968254,
|
|
"grad_norm": 0.6379379135553368,
|
|
"learning_rate": 7.320355518853921e-06,
|
|
"loss": 0.3859,
|
|
"mean_token_accuracy": 0.8645495921373367,
|
|
"num_tokens": 197787383.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.398345947265625,
|
|
"epoch": 1.8293650793650795,
|
|
"grad_norm": 0.6588997380159366,
|
|
"learning_rate": 7.278226696103239e-06,
|
|
"loss": 0.3924,
|
|
"mean_token_accuracy": 0.8615701934322715,
|
|
"num_tokens": 198214788.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"entropy": 0.40325927734375,
|
|
"epoch": 1.8333333333333335,
|
|
"grad_norm": 0.6430673965041483,
|
|
"learning_rate": 7.236149981077746e-06,
|
|
"loss": 0.3974,
|
|
"mean_token_accuracy": 0.8626204943284392,
|
|
"num_tokens": 198640204.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"entropy": 0.3955078125,
|
|
"epoch": 1.8373015873015874,
|
|
"grad_norm": 0.6487127679328336,
|
|
"learning_rate": 7.194126179326497e-06,
|
|
"loss": 0.3985,
|
|
"mean_token_accuracy": 0.8603583332151175,
|
|
"num_tokens": 199086174.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"entropy": 0.395599365234375,
|
|
"epoch": 1.8412698412698414,
|
|
"grad_norm": 0.6624716705219701,
|
|
"learning_rate": 7.1521560953855274e-06,
|
|
"loss": 0.3894,
|
|
"mean_token_accuracy": 0.8631916120648384,
|
|
"num_tokens": 199534945.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"entropy": 0.40576171875,
|
|
"epoch": 1.8452380952380953,
|
|
"grad_norm": 0.6699916552832386,
|
|
"learning_rate": 7.110240532762469e-06,
|
|
"loss": 0.3878,
|
|
"mean_token_accuracy": 0.862906139343977,
|
|
"num_tokens": 199970879.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 0.406005859375,
|
|
"epoch": 1.8492063492063493,
|
|
"grad_norm": 0.6365324815628914,
|
|
"learning_rate": 7.068380293921142e-06,
|
|
"loss": 0.3794,
|
|
"mean_token_accuracy": 0.8647226821631193,
|
|
"num_tokens": 200401566.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"entropy": 0.403961181640625,
|
|
"epoch": 1.8531746031746033,
|
|
"grad_norm": 0.6659380259679744,
|
|
"learning_rate": 7.026576180266213e-06,
|
|
"loss": 0.3795,
|
|
"mean_token_accuracy": 0.8645898820832372,
|
|
"num_tokens": 200819281.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"entropy": 0.400146484375,
|
|
"epoch": 1.8571428571428572,
|
|
"grad_norm": 0.6623575240954482,
|
|
"learning_rate": 6.984828992127842e-06,
|
|
"loss": 0.3869,
|
|
"mean_token_accuracy": 0.8644722169265151,
|
|
"num_tokens": 201286569.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"entropy": 0.399322509765625,
|
|
"epoch": 1.8611111111111112,
|
|
"grad_norm": 0.6428412382384305,
|
|
"learning_rate": 6.9431395287463655e-06,
|
|
"loss": 0.3817,
|
|
"mean_token_accuracy": 0.8629525965079665,
|
|
"num_tokens": 201729117.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"entropy": 0.4012451171875,
|
|
"epoch": 1.8650793650793651,
|
|
"grad_norm": 0.6590064495664711,
|
|
"learning_rate": 6.9015085882569866e-06,
|
|
"loss": 0.3845,
|
|
"mean_token_accuracy": 0.8646084098145366,
|
|
"num_tokens": 202148785.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.401275634765625,
|
|
"epoch": 1.869047619047619,
|
|
"grad_norm": 0.6644154480782758,
|
|
"learning_rate": 6.859936967674509e-06,
|
|
"loss": 0.3833,
|
|
"mean_token_accuracy": 0.8641043901443481,
|
|
"num_tokens": 202562335.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"entropy": 0.408447265625,
|
|
"epoch": 1.873015873015873,
|
|
"grad_norm": 0.6874013779210217,
|
|
"learning_rate": 6.818425462878071e-06,
|
|
"loss": 0.3786,
|
|
"mean_token_accuracy": 0.8654429130256176,
|
|
"num_tokens": 202969412.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"entropy": 0.399871826171875,
|
|
"epoch": 1.876984126984127,
|
|
"grad_norm": 0.6419195701464625,
|
|
"learning_rate": 6.776974868595898e-06,
|
|
"loss": 0.3855,
|
|
"mean_token_accuracy": 0.8635794082656503,
|
|
"num_tokens": 203414579.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"entropy": 0.40521240234375,
|
|
"epoch": 1.880952380952381,
|
|
"grad_norm": 0.6373613162236946,
|
|
"learning_rate": 6.735585978390105e-06,
|
|
"loss": 0.3821,
|
|
"mean_token_accuracy": 0.8647842686623335,
|
|
"num_tokens": 203845826.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"entropy": 0.40625,
|
|
"epoch": 1.8849206349206349,
|
|
"grad_norm": 0.7484479121963832,
|
|
"learning_rate": 6.694259584641496e-06,
|
|
"loss": 0.3879,
|
|
"mean_token_accuracy": 0.862905758433044,
|
|
"num_tokens": 204272914.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 0.401580810546875,
|
|
"epoch": 1.8888888888888888,
|
|
"grad_norm": 0.6661868398653255,
|
|
"learning_rate": 6.652996478534395e-06,
|
|
"loss": 0.3772,
|
|
"mean_token_accuracy": 0.8666380383074284,
|
|
"num_tokens": 204713067.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"entropy": 0.40240478515625,
|
|
"epoch": 1.8928571428571428,
|
|
"grad_norm": 0.6367178541736724,
|
|
"learning_rate": 6.611797450041495e-06,
|
|
"loss": 0.3824,
|
|
"mean_token_accuracy": 0.8640545001253486,
|
|
"num_tokens": 205140799.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"entropy": 0.410491943359375,
|
|
"epoch": 1.8968253968253967,
|
|
"grad_norm": 0.6411794902398776,
|
|
"learning_rate": 6.570663287908744e-06,
|
|
"loss": 0.3759,
|
|
"mean_token_accuracy": 0.8667557742446661,
|
|
"num_tokens": 205549482.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"entropy": 0.39971923828125,
|
|
"epoch": 1.9007936507936507,
|
|
"grad_norm": 0.6657409283144686,
|
|
"learning_rate": 6.5295947796402315e-06,
|
|
"loss": 0.3871,
|
|
"mean_token_accuracy": 0.8628489142283797,
|
|
"num_tokens": 205957709.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"entropy": 0.399810791015625,
|
|
"epoch": 1.9047619047619047,
|
|
"grad_norm": 0.6650888096094876,
|
|
"learning_rate": 6.488592711483122e-06,
|
|
"loss": 0.3813,
|
|
"mean_token_accuracy": 0.865462708286941,
|
|
"num_tokens": 206394578.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.404998779296875,
|
|
"epoch": 1.9087301587301586,
|
|
"grad_norm": 0.6520479299882604,
|
|
"learning_rate": 6.447657868412603e-06,
|
|
"loss": 0.3832,
|
|
"mean_token_accuracy": 0.8648681128397584,
|
|
"num_tokens": 206810851.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"entropy": 0.400482177734375,
|
|
"epoch": 1.9126984126984126,
|
|
"grad_norm": 0.6641867800114467,
|
|
"learning_rate": 6.406791034116846e-06,
|
|
"loss": 0.3911,
|
|
"mean_token_accuracy": 0.8639266528189182,
|
|
"num_tokens": 207233636.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"entropy": 0.39788818359375,
|
|
"epoch": 1.9166666666666665,
|
|
"grad_norm": 0.6616452561654831,
|
|
"learning_rate": 6.365992990982015e-06,
|
|
"loss": 0.3917,
|
|
"mean_token_accuracy": 0.8605171097442508,
|
|
"num_tokens": 207671663.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"entropy": 0.413970947265625,
|
|
"epoch": 1.9206349206349205,
|
|
"grad_norm": 0.6811368094477863,
|
|
"learning_rate": 6.3252645200772836e-06,
|
|
"loss": 0.3839,
|
|
"mean_token_accuracy": 0.8638743665069342,
|
|
"num_tokens": 208074376.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"entropy": 0.402008056640625,
|
|
"epoch": 1.9246031746031746,
|
|
"grad_norm": 0.6474488866243678,
|
|
"learning_rate": 6.284606401139875e-06,
|
|
"loss": 0.3933,
|
|
"mean_token_accuracy": 0.8602316891774535,
|
|
"num_tokens": 208524843.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 0.407806396484375,
|
|
"epoch": 1.9285714285714286,
|
|
"grad_norm": 0.6704121804386468,
|
|
"learning_rate": 6.244019412560144e-06,
|
|
"loss": 0.3848,
|
|
"mean_token_accuracy": 0.863696001470089,
|
|
"num_tokens": 208947370.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"entropy": 0.401947021484375,
|
|
"epoch": 1.9325396825396826,
|
|
"grad_norm": 0.6514026741498656,
|
|
"learning_rate": 6.203504331366677e-06,
|
|
"loss": 0.3738,
|
|
"mean_token_accuracy": 0.8671468198299408,
|
|
"num_tokens": 209354451.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"entropy": 0.39605712890625,
|
|
"epoch": 1.9365079365079365,
|
|
"grad_norm": 0.6332801182132646,
|
|
"learning_rate": 6.163061933211403e-06,
|
|
"loss": 0.3815,
|
|
"mean_token_accuracy": 0.8641307642683387,
|
|
"num_tokens": 209798547.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"entropy": 0.396148681640625,
|
|
"epoch": 1.9404761904761905,
|
|
"grad_norm": 0.6747869284841455,
|
|
"learning_rate": 6.122692992354748e-06,
|
|
"loss": 0.3783,
|
|
"mean_token_accuracy": 0.8645921712741256,
|
|
"num_tokens": 210233790.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"entropy": 0.40008544921875,
|
|
"epoch": 1.9444444444444444,
|
|
"grad_norm": 0.6605152850398239,
|
|
"learning_rate": 6.082398281650823e-06,
|
|
"loss": 0.392,
|
|
"mean_token_accuracy": 0.8625458022579551,
|
|
"num_tokens": 210661829.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.39251708984375,
|
|
"epoch": 1.9484126984126984,
|
|
"grad_norm": 0.6047228067514416,
|
|
"learning_rate": 6.0421785725326085e-06,
|
|
"loss": 0.3807,
|
|
"mean_token_accuracy": 0.8650286197662354,
|
|
"num_tokens": 211113597.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"entropy": 0.39459228515625,
|
|
"epoch": 1.9523809523809523,
|
|
"grad_norm": 0.6770029694746321,
|
|
"learning_rate": 6.002034634997214e-06,
|
|
"loss": 0.3845,
|
|
"mean_token_accuracy": 0.8641347736120224,
|
|
"num_tokens": 211549046.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"entropy": 0.393890380859375,
|
|
"epoch": 1.9563492063492065,
|
|
"grad_norm": 0.6764981819376107,
|
|
"learning_rate": 5.9619672375911065e-06,
|
|
"loss": 0.3661,
|
|
"mean_token_accuracy": 0.8685044087469578,
|
|
"num_tokens": 212000519.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"entropy": 0.399566650390625,
|
|
"epoch": 1.9603174603174605,
|
|
"grad_norm": 0.5976014492249986,
|
|
"learning_rate": 5.92197714739541e-06,
|
|
"loss": 0.3806,
|
|
"mean_token_accuracy": 0.8653174787759781,
|
|
"num_tokens": 212447521.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"entropy": 0.401123046875,
|
|
"epoch": 1.9642857142857144,
|
|
"grad_norm": 0.6419565626843364,
|
|
"learning_rate": 5.882065130011226e-06,
|
|
"loss": 0.3854,
|
|
"mean_token_accuracy": 0.8639181992039084,
|
|
"num_tokens": 212865927.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 0.398956298828125,
|
|
"epoch": 1.9682539682539684,
|
|
"grad_norm": 0.649036294431529,
|
|
"learning_rate": 5.842231949544963e-06,
|
|
"loss": 0.3814,
|
|
"mean_token_accuracy": 0.8645293368026614,
|
|
"num_tokens": 213310334.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"entropy": 0.396514892578125,
|
|
"epoch": 1.9722222222222223,
|
|
"grad_norm": 0.6540271641946366,
|
|
"learning_rate": 5.80247836859372e-06,
|
|
"loss": 0.3819,
|
|
"mean_token_accuracy": 0.8648376986384392,
|
|
"num_tokens": 213742399.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"entropy": 0.401824951171875,
|
|
"epoch": 1.9761904761904763,
|
|
"grad_norm": 0.6249976191527038,
|
|
"learning_rate": 5.762805148230688e-06,
|
|
"loss": 0.3883,
|
|
"mean_token_accuracy": 0.8644895693287253,
|
|
"num_tokens": 214169043.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"entropy": 0.400848388671875,
|
|
"epoch": 1.9801587301587302,
|
|
"grad_norm": 0.6226665699147694,
|
|
"learning_rate": 5.723213047990553e-06,
|
|
"loss": 0.3869,
|
|
"mean_token_accuracy": 0.863439017906785,
|
|
"num_tokens": 214572957.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"entropy": 0.403656005859375,
|
|
"epoch": 1.9841269841269842,
|
|
"grad_norm": 0.6446422210225001,
|
|
"learning_rate": 5.68370282585499e-06,
|
|
"loss": 0.3877,
|
|
"mean_token_accuracy": 0.8621528865769506,
|
|
"num_tokens": 215005057.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.392364501953125,
|
|
"epoch": 1.9880952380952381,
|
|
"grad_norm": 0.6640325469666031,
|
|
"learning_rate": 5.64427523823813e-06,
|
|
"loss": 0.3714,
|
|
"mean_token_accuracy": 0.8665375467389822,
|
|
"num_tokens": 215449399.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"entropy": 0.3995361328125,
|
|
"epoch": 1.992063492063492,
|
|
"grad_norm": 0.6967069661289652,
|
|
"learning_rate": 5.604931039972099e-06,
|
|
"loss": 0.3723,
|
|
"mean_token_accuracy": 0.8670346606522799,
|
|
"num_tokens": 215869766.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"entropy": 0.399871826171875,
|
|
"epoch": 1.996031746031746,
|
|
"grad_norm": 0.6014892141579964,
|
|
"learning_rate": 5.5656709842925335e-06,
|
|
"loss": 0.3726,
|
|
"mean_token_accuracy": 0.8665146352723241,
|
|
"num_tokens": 216298988.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"entropy": 0.392730712890625,
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.6445229203870088,
|
|
"learning_rate": 5.5264958228241925e-06,
|
|
"loss": 0.3738,
|
|
"mean_token_accuracy": 0.8680051285773516,
|
|
"num_tokens": 216731206.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"entropy": 0.392303466796875,
|
|
"epoch": 2.003968253968254,
|
|
"grad_norm": 0.7007642747575338,
|
|
"learning_rate": 5.4874063055665495e-06,
|
|
"loss": 0.3394,
|
|
"mean_token_accuracy": 0.8802464632317424,
|
|
"num_tokens": 217159781.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 0.394378662109375,
|
|
"epoch": 2.007936507936508,
|
|
"grad_norm": 0.6915377565113627,
|
|
"learning_rate": 5.44840318087944e-06,
|
|
"loss": 0.3333,
|
|
"mean_token_accuracy": 0.8822930511087179,
|
|
"num_tokens": 217589905.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"entropy": 0.389556884765625,
|
|
"epoch": 2.011904761904762,
|
|
"grad_norm": 0.6607496119019493,
|
|
"learning_rate": 5.40948719546873e-06,
|
|
"loss": 0.3223,
|
|
"mean_token_accuracy": 0.8840867523103952,
|
|
"num_tokens": 218017429.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"entropy": 0.38677978515625,
|
|
"epoch": 2.015873015873016,
|
|
"grad_norm": 0.797575729187655,
|
|
"learning_rate": 5.370659094372036e-06,
|
|
"loss": 0.3487,
|
|
"mean_token_accuracy": 0.8751778230071068,
|
|
"num_tokens": 218446876.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"entropy": 0.38916015625,
|
|
"epoch": 2.0198412698412698,
|
|
"grad_norm": 0.8079376215374279,
|
|
"learning_rate": 5.331919620944438e-06,
|
|
"loss": 0.3421,
|
|
"mean_token_accuracy": 0.8785031987354159,
|
|
"num_tokens": 218885713.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"entropy": 0.3873291015625,
|
|
"epoch": 2.0238095238095237,
|
|
"grad_norm": 0.7130485344720543,
|
|
"learning_rate": 5.293269516844263e-06,
|
|
"loss": 0.3347,
|
|
"mean_token_accuracy": 0.8804626753553748,
|
|
"num_tokens": 219322571.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.394134521484375,
|
|
"epoch": 2.0277777777777777,
|
|
"grad_norm": 0.6529573527681491,
|
|
"learning_rate": 5.2547095220188815e-06,
|
|
"loss": 0.3378,
|
|
"mean_token_accuracy": 0.8767837462946773,
|
|
"num_tokens": 219748508.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"entropy": 0.39422607421875,
|
|
"epoch": 2.0317460317460316,
|
|
"grad_norm": 0.6836683232299732,
|
|
"learning_rate": 5.216240374690546e-06,
|
|
"loss": 0.3337,
|
|
"mean_token_accuracy": 0.8813108829781413,
|
|
"num_tokens": 220180160.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"entropy": 0.3922119140625,
|
|
"epoch": 2.0357142857142856,
|
|
"grad_norm": 0.6799043393272431,
|
|
"learning_rate": 5.177862811342254e-06,
|
|
"loss": 0.3295,
|
|
"mean_token_accuracy": 0.8823963804170489,
|
|
"num_tokens": 220606565.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"entropy": 0.39385986328125,
|
|
"epoch": 2.0396825396825395,
|
|
"grad_norm": 0.6581193128400078,
|
|
"learning_rate": 5.139577566703643e-06,
|
|
"loss": 0.3299,
|
|
"mean_token_accuracy": 0.881388746201992,
|
|
"num_tokens": 221016578.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"entropy": 0.38360595703125,
|
|
"epoch": 2.0436507936507935,
|
|
"grad_norm": 0.6684855157209281,
|
|
"learning_rate": 5.101385373736937e-06,
|
|
"loss": 0.3245,
|
|
"mean_token_accuracy": 0.8840543190017343,
|
|
"num_tokens": 221455851.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 0.389862060546875,
|
|
"epoch": 2.0476190476190474,
|
|
"grad_norm": 0.7158648675214864,
|
|
"learning_rate": 5.0632869636229035e-06,
|
|
"loss": 0.3372,
|
|
"mean_token_accuracy": 0.8804104384034872,
|
|
"num_tokens": 221871968.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"entropy": 0.38134765625,
|
|
"epoch": 2.0515873015873014,
|
|
"grad_norm": 0.6851404252008069,
|
|
"learning_rate": 5.025283065746855e-06,
|
|
"loss": 0.3204,
|
|
"mean_token_accuracy": 0.8836971241980791,
|
|
"num_tokens": 222303576.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"entropy": 0.383941650390625,
|
|
"epoch": 2.0555555555555554,
|
|
"grad_norm": 0.6707049123314238,
|
|
"learning_rate": 4.987374407684703e-06,
|
|
"loss": 0.3253,
|
|
"mean_token_accuracy": 0.8830197919160128,
|
|
"num_tokens": 222738323.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"entropy": 0.38531494140625,
|
|
"epoch": 2.0595238095238093,
|
|
"grad_norm": 0.651061286914035,
|
|
"learning_rate": 4.949561715189001e-06,
|
|
"loss": 0.3242,
|
|
"mean_token_accuracy": 0.8821469666436315,
|
|
"num_tokens": 223168261.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"entropy": 0.390960693359375,
|
|
"epoch": 2.0634920634920633,
|
|
"grad_norm": 0.6476270763528862,
|
|
"learning_rate": 4.911845712175067e-06,
|
|
"loss": 0.3313,
|
|
"mean_token_accuracy": 0.8828292330726981,
|
|
"num_tokens": 223584134.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.386322021484375,
|
|
"epoch": 2.0674603174603177,
|
|
"grad_norm": 0.6364599620319137,
|
|
"learning_rate": 4.8742271207071226e-06,
|
|
"loss": 0.3228,
|
|
"mean_token_accuracy": 0.8835309613496065,
|
|
"num_tokens": 224013215.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"entropy": 0.380828857421875,
|
|
"epoch": 2.0714285714285716,
|
|
"grad_norm": 0.6564651643829943,
|
|
"learning_rate": 4.836706660984467e-06,
|
|
"loss": 0.3321,
|
|
"mean_token_accuracy": 0.8808335028588772,
|
|
"num_tokens": 224461654.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"entropy": 0.38494873046875,
|
|
"epoch": 2.0753968253968256,
|
|
"grad_norm": 0.6364445517923993,
|
|
"learning_rate": 4.799285051327686e-06,
|
|
"loss": 0.33,
|
|
"mean_token_accuracy": 0.8805900542065501,
|
|
"num_tokens": 224885889.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"entropy": 0.38519287109375,
|
|
"epoch": 2.0793650793650795,
|
|
"grad_norm": 0.6498450049930671,
|
|
"learning_rate": 4.761963008164918e-06,
|
|
"loss": 0.3366,
|
|
"mean_token_accuracy": 0.8799572549760342,
|
|
"num_tokens": 225327562.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"entropy": 0.378173828125,
|
|
"epoch": 2.0833333333333335,
|
|
"grad_norm": 0.644410093004496,
|
|
"learning_rate": 4.724741246018103e-06,
|
|
"loss": 0.3292,
|
|
"mean_token_accuracy": 0.8829803112894297,
|
|
"num_tokens": 225767121.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 0.3868408203125,
|
|
"epoch": 2.0873015873015874,
|
|
"grad_norm": 0.6615349299629835,
|
|
"learning_rate": 4.687620477489337e-06,
|
|
"loss": 0.3184,
|
|
"mean_token_accuracy": 0.8841284308582544,
|
|
"num_tokens": 226189031.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"entropy": 0.382080078125,
|
|
"epoch": 2.0912698412698414,
|
|
"grad_norm": 0.681773396883082,
|
|
"learning_rate": 4.650601413247214e-06,
|
|
"loss": 0.3324,
|
|
"mean_token_accuracy": 0.8802290465682745,
|
|
"num_tokens": 226627902.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"entropy": 0.3839111328125,
|
|
"epoch": 2.0952380952380953,
|
|
"grad_norm": 0.6628168297027915,
|
|
"learning_rate": 4.613684762013217e-06,
|
|
"loss": 0.3426,
|
|
"mean_token_accuracy": 0.8767191367223859,
|
|
"num_tokens": 227062309.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"entropy": 0.386871337890625,
|
|
"epoch": 2.0992063492063493,
|
|
"grad_norm": 0.6227370006419931,
|
|
"learning_rate": 4.57687123054817e-06,
|
|
"loss": 0.3273,
|
|
"mean_token_accuracy": 0.8822421031072736,
|
|
"num_tokens": 227485113.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"entropy": 0.389251708984375,
|
|
"epoch": 2.1031746031746033,
|
|
"grad_norm": 0.6402574176585532,
|
|
"learning_rate": 4.5401615236386785e-06,
|
|
"loss": 0.3309,
|
|
"mean_token_accuracy": 0.8817484118044376,
|
|
"num_tokens": 227920598.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 0.388427734375,
|
|
"epoch": 2.107142857142857,
|
|
"grad_norm": 0.6406500881050289,
|
|
"learning_rate": 4.503556344083656e-06,
|
|
"loss": 0.3243,
|
|
"mean_token_accuracy": 0.8830273868516088,
|
|
"num_tokens": 228346699.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"entropy": 0.3875732421875,
|
|
"epoch": 2.111111111111111,
|
|
"grad_norm": 0.6789574480582125,
|
|
"learning_rate": 4.467056392680863e-06,
|
|
"loss": 0.3309,
|
|
"mean_token_accuracy": 0.8825666131451726,
|
|
"num_tokens": 228773818.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"entropy": 0.388214111328125,
|
|
"epoch": 2.115079365079365,
|
|
"grad_norm": 0.6510204304692228,
|
|
"learning_rate": 4.4306623682134875e-06,
|
|
"loss": 0.3244,
|
|
"mean_token_accuracy": 0.8822726272046566,
|
|
"num_tokens": 229188623.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"entropy": 0.381195068359375,
|
|
"epoch": 2.119047619047619,
|
|
"grad_norm": 0.6607016173165737,
|
|
"learning_rate": 4.394374967436783e-06,
|
|
"loss": 0.3153,
|
|
"mean_token_accuracy": 0.8861860791221261,
|
|
"num_tokens": 229627711.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"entropy": 0.385040283203125,
|
|
"epoch": 2.123015873015873,
|
|
"grad_norm": 0.6694823178157694,
|
|
"learning_rate": 4.358194885064704e-06,
|
|
"loss": 0.3384,
|
|
"mean_token_accuracy": 0.8798664947971702,
|
|
"num_tokens": 230054209.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 0.3831787109375,
|
|
"epoch": 2.126984126984127,
|
|
"grad_norm": 0.6435423039418425,
|
|
"learning_rate": 4.3221228137566225e-06,
|
|
"loss": 0.3309,
|
|
"mean_token_accuracy": 0.8810933278873563,
|
|
"num_tokens": 230489032.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"entropy": 0.382110595703125,
|
|
"epoch": 2.130952380952381,
|
|
"grad_norm": 0.6509585759312683,
|
|
"learning_rate": 4.286159444104068e-06,
|
|
"loss": 0.3316,
|
|
"mean_token_accuracy": 0.8800732661038637,
|
|
"num_tokens": 230910745.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"entropy": 0.387664794921875,
|
|
"epoch": 2.134920634920635,
|
|
"grad_norm": 0.6213215519100801,
|
|
"learning_rate": 4.250305464617494e-06,
|
|
"loss": 0.3314,
|
|
"mean_token_accuracy": 0.8809810969978571,
|
|
"num_tokens": 231339019.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"entropy": 0.385498046875,
|
|
"epoch": 2.138888888888889,
|
|
"grad_norm": 0.7913050493338661,
|
|
"learning_rate": 4.2145615617131095e-06,
|
|
"loss": 0.3388,
|
|
"mean_token_accuracy": 0.8805615156888962,
|
|
"num_tokens": 231777523.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"entropy": 0.3892822265625,
|
|
"epoch": 2.142857142857143,
|
|
"grad_norm": 0.6397574362821433,
|
|
"learning_rate": 4.178928419699731e-06,
|
|
"loss": 0.3275,
|
|
"mean_token_accuracy": 0.8803174262866378,
|
|
"num_tokens": 232199672.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 0.39251708984375,
|
|
"epoch": 2.1468253968253967,
|
|
"grad_norm": 0.6561714495314072,
|
|
"learning_rate": 4.143406720765687e-06,
|
|
"loss": 0.3224,
|
|
"mean_token_accuracy": 0.8841910324990749,
|
|
"num_tokens": 232633797.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"entropy": 0.386260986328125,
|
|
"epoch": 2.1507936507936507,
|
|
"grad_norm": 0.676691357314513,
|
|
"learning_rate": 4.107997144965747e-06,
|
|
"loss": 0.3324,
|
|
"mean_token_accuracy": 0.881273141130805,
|
|
"num_tokens": 233076007.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"entropy": 0.38470458984375,
|
|
"epoch": 2.1547619047619047,
|
|
"grad_norm": 0.7241624585208927,
|
|
"learning_rate": 4.0727003702081146e-06,
|
|
"loss": 0.3172,
|
|
"mean_token_accuracy": 0.8856724062934518,
|
|
"num_tokens": 233509851.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"entropy": 0.3896484375,
|
|
"epoch": 2.1587301587301586,
|
|
"grad_norm": 0.78911437248552,
|
|
"learning_rate": 4.037517072241435e-06,
|
|
"loss": 0.3271,
|
|
"mean_token_accuracy": 0.8804695382714272,
|
|
"num_tokens": 233942156.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"entropy": 0.386474609375,
|
|
"epoch": 2.1626984126984126,
|
|
"grad_norm": 0.6722927644193839,
|
|
"learning_rate": 4.002447924641882e-06,
|
|
"loss": 0.324,
|
|
"mean_token_accuracy": 0.8829648504033685,
|
|
"num_tokens": 234372963.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 0.38409423828125,
|
|
"epoch": 2.1666666666666665,
|
|
"grad_norm": 0.6466957029468782,
|
|
"learning_rate": 3.967493598800233e-06,
|
|
"loss": 0.3237,
|
|
"mean_token_accuracy": 0.884342834353447,
|
|
"num_tokens": 234844668.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"entropy": 0.39801025390625,
|
|
"epoch": 2.1706349206349205,
|
|
"grad_norm": 0.6412289556826881,
|
|
"learning_rate": 3.9326547639090315e-06,
|
|
"loss": 0.3504,
|
|
"mean_token_accuracy": 0.8763287300243974,
|
|
"num_tokens": 235271990.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"entropy": 0.385894775390625,
|
|
"epoch": 2.1746031746031744,
|
|
"grad_norm": 0.6171782227283126,
|
|
"learning_rate": 3.897932086949778e-06,
|
|
"loss": 0.3311,
|
|
"mean_token_accuracy": 0.8819945016875863,
|
|
"num_tokens": 235697877.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"entropy": 0.396209716796875,
|
|
"epoch": 2.1785714285714284,
|
|
"grad_norm": 0.6361871979559561,
|
|
"learning_rate": 3.863326232680148e-06,
|
|
"loss": 0.3325,
|
|
"mean_token_accuracy": 0.8788015209138393,
|
|
"num_tokens": 236134537.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"entropy": 0.379608154296875,
|
|
"epoch": 2.1825396825396823,
|
|
"grad_norm": 0.6315811408319069,
|
|
"learning_rate": 3.828837863621286e-06,
|
|
"loss": 0.3138,
|
|
"mean_token_accuracy": 0.8871300676837564,
|
|
"num_tokens": 236586699.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 0.391204833984375,
|
|
"epoch": 2.1865079365079367,
|
|
"grad_norm": 0.6494424143773995,
|
|
"learning_rate": 3.7944676400451017e-06,
|
|
"loss": 0.3287,
|
|
"mean_token_accuracy": 0.8799652308225632,
|
|
"num_tokens": 236995332.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"entropy": 0.3856201171875,
|
|
"epoch": 2.1904761904761907,
|
|
"grad_norm": 0.6255192304119477,
|
|
"learning_rate": 3.76021621996163e-06,
|
|
"loss": 0.3207,
|
|
"mean_token_accuracy": 0.8843406355008483,
|
|
"num_tokens": 237426378.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"entropy": 0.376617431640625,
|
|
"epoch": 2.1944444444444446,
|
|
"grad_norm": 0.6407771894276859,
|
|
"learning_rate": 3.7260842591064504e-06,
|
|
"loss": 0.3152,
|
|
"mean_token_accuracy": 0.8854828383773565,
|
|
"num_tokens": 237866086.0,
|
|
"step": 553
|
|
},
|
|
{
|
|
"entropy": 0.382476806640625,
|
|
"epoch": 2.1984126984126986,
|
|
"grad_norm": 0.6801463416248983,
|
|
"learning_rate": 3.6920724109281146e-06,
|
|
"loss": 0.329,
|
|
"mean_token_accuracy": 0.8822561521083117,
|
|
"num_tokens": 238297987.0,
|
|
"step": 554
|
|
},
|
|
{
|
|
"entropy": 0.395477294921875,
|
|
"epoch": 2.2023809523809526,
|
|
"grad_norm": 0.6188509021837985,
|
|
"learning_rate": 3.6581813265756595e-06,
|
|
"loss": 0.3282,
|
|
"mean_token_accuracy": 0.8823494836688042,
|
|
"num_tokens": 238729296.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 0.388092041015625,
|
|
"epoch": 2.2063492063492065,
|
|
"grad_norm": 0.6351713659674968,
|
|
"learning_rate": 3.6244116548861084e-06,
|
|
"loss": 0.336,
|
|
"mean_token_accuracy": 0.8807284999638796,
|
|
"num_tokens": 239171101.0,
|
|
"step": 556
|
|
},
|
|
{
|
|
"entropy": 0.38885498046875,
|
|
"epoch": 2.2103174603174605,
|
|
"grad_norm": 0.6619090358761864,
|
|
"learning_rate": 3.590764042372079e-06,
|
|
"loss": 0.3248,
|
|
"mean_token_accuracy": 0.8822610294446349,
|
|
"num_tokens": 239602299.0,
|
|
"step": 557
|
|
},
|
|
{
|
|
"entropy": 0.38800048828125,
|
|
"epoch": 2.2142857142857144,
|
|
"grad_norm": 0.6012724630133454,
|
|
"learning_rate": 3.557239133209387e-06,
|
|
"loss": 0.3182,
|
|
"mean_token_accuracy": 0.88497896771878,
|
|
"num_tokens": 240034337.0,
|
|
"step": 558
|
|
},
|
|
{
|
|
"entropy": 0.388671875,
|
|
"epoch": 2.2182539682539684,
|
|
"grad_norm": 0.6362574759114775,
|
|
"learning_rate": 3.523837569224725e-06,
|
|
"loss": 0.3203,
|
|
"mean_token_accuracy": 0.8840533634647727,
|
|
"num_tokens": 240461291.0,
|
|
"step": 559
|
|
},
|
|
{
|
|
"entropy": 0.392059326171875,
|
|
"epoch": 2.2222222222222223,
|
|
"grad_norm": 0.6674103812544709,
|
|
"learning_rate": 3.4905599898833665e-06,
|
|
"loss": 0.3153,
|
|
"mean_token_accuracy": 0.8856786200776696,
|
|
"num_tokens": 240860927.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 0.383575439453125,
|
|
"epoch": 2.2261904761904763,
|
|
"grad_norm": 0.6349123290369829,
|
|
"learning_rate": 3.4574070322769347e-06,
|
|
"loss": 0.3281,
|
|
"mean_token_accuracy": 0.8817283101379871,
|
|
"num_tokens": 241301179.0,
|
|
"step": 561
|
|
},
|
|
{
|
|
"entropy": 0.384368896484375,
|
|
"epoch": 2.2301587301587302,
|
|
"grad_norm": 0.7198961662563939,
|
|
"learning_rate": 3.4243793311111916e-06,
|
|
"loss": 0.3277,
|
|
"mean_token_accuracy": 0.8824960431084037,
|
|
"num_tokens": 241739076.0,
|
|
"step": 562
|
|
},
|
|
{
|
|
"entropy": 0.39068603515625,
|
|
"epoch": 2.234126984126984,
|
|
"grad_norm": 0.683740121166347,
|
|
"learning_rate": 3.391477518693894e-06,
|
|
"loss": 0.3321,
|
|
"mean_token_accuracy": 0.8821808360517025,
|
|
"num_tokens": 242150640.0,
|
|
"step": 563
|
|
},
|
|
{
|
|
"entropy": 0.3800048828125,
|
|
"epoch": 2.238095238095238,
|
|
"grad_norm": 0.6437837444588406,
|
|
"learning_rate": 3.358702224922691e-06,
|
|
"loss": 0.3158,
|
|
"mean_token_accuracy": 0.8863429753109813,
|
|
"num_tokens": 242574011.0,
|
|
"step": 564
|
|
},
|
|
{
|
|
"entropy": 0.39166259765625,
|
|
"epoch": 2.242063492063492,
|
|
"grad_norm": 0.7506758272688273,
|
|
"learning_rate": 3.3260540772730576e-06,
|
|
"loss": 0.3276,
|
|
"mean_token_accuracy": 0.879186031408608,
|
|
"num_tokens": 243005939.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 0.387481689453125,
|
|
"epoch": 2.246031746031746,
|
|
"grad_norm": 0.6315669129852838,
|
|
"learning_rate": 3.2935337007862865e-06,
|
|
"loss": 0.3262,
|
|
"mean_token_accuracy": 0.8840317856520414,
|
|
"num_tokens": 243458878.0,
|
|
"step": 566
|
|
},
|
|
{
|
|
"entropy": 0.38836669921875,
|
|
"epoch": 2.25,
|
|
"grad_norm": 0.6389628006202059,
|
|
"learning_rate": 3.261141718057523e-06,
|
|
"loss": 0.3356,
|
|
"mean_token_accuracy": 0.8814380522817373,
|
|
"num_tokens": 243877438.0,
|
|
"step": 567
|
|
},
|
|
{
|
|
"entropy": 0.38360595703125,
|
|
"epoch": 2.253968253968254,
|
|
"grad_norm": 0.6259643943263192,
|
|
"learning_rate": 3.2288787492238416e-06,
|
|
"loss": 0.3263,
|
|
"mean_token_accuracy": 0.8834005445241928,
|
|
"num_tokens": 244313964.0,
|
|
"step": 568
|
|
},
|
|
{
|
|
"entropy": 0.383331298828125,
|
|
"epoch": 2.257936507936508,
|
|
"grad_norm": 0.6219404801873205,
|
|
"learning_rate": 3.1967454119523745e-06,
|
|
"loss": 0.324,
|
|
"mean_token_accuracy": 0.8827744442969561,
|
|
"num_tokens": 244761734.0,
|
|
"step": 569
|
|
},
|
|
{
|
|
"entropy": 0.384307861328125,
|
|
"epoch": 2.261904761904762,
|
|
"grad_norm": 0.5955899515224996,
|
|
"learning_rate": 3.1647423214284856e-06,
|
|
"loss": 0.3118,
|
|
"mean_token_accuracy": 0.887902582064271,
|
|
"num_tokens": 245200322.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 0.381744384765625,
|
|
"epoch": 2.265873015873016,
|
|
"grad_norm": 0.646703276648792,
|
|
"learning_rate": 3.1328700903440045e-06,
|
|
"loss": 0.3206,
|
|
"mean_token_accuracy": 0.8841962497681379,
|
|
"num_tokens": 245646233.0,
|
|
"step": 571
|
|
},
|
|
{
|
|
"entropy": 0.386260986328125,
|
|
"epoch": 2.2698412698412698,
|
|
"grad_norm": 0.6521102057792992,
|
|
"learning_rate": 3.101129328885475e-06,
|
|
"loss": 0.3217,
|
|
"mean_token_accuracy": 0.8851350508630276,
|
|
"num_tokens": 246083539.0,
|
|
"step": 572
|
|
},
|
|
{
|
|
"entropy": 0.38726806640625,
|
|
"epoch": 2.2738095238095237,
|
|
"grad_norm": 0.635188052792932,
|
|
"learning_rate": 3.0695206447224923e-06,
|
|
"loss": 0.3198,
|
|
"mean_token_accuracy": 0.8851980855688453,
|
|
"num_tokens": 246515677.0,
|
|
"step": 573
|
|
},
|
|
{
|
|
"entropy": 0.38677978515625,
|
|
"epoch": 2.2777777777777777,
|
|
"grad_norm": 0.6388945711775174,
|
|
"learning_rate": 3.0380446429960573e-06,
|
|
"loss": 0.3178,
|
|
"mean_token_accuracy": 0.8853446776047349,
|
|
"num_tokens": 246933619.0,
|
|
"step": 574
|
|
},
|
|
{
|
|
"entropy": 0.399322509765625,
|
|
"epoch": 2.2817460317460316,
|
|
"grad_norm": 0.6434111635560193,
|
|
"learning_rate": 3.0067019263069973e-06,
|
|
"loss": 0.3261,
|
|
"mean_token_accuracy": 0.8847488528117537,
|
|
"num_tokens": 247344382.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 0.385498046875,
|
|
"epoch": 2.2857142857142856,
|
|
"grad_norm": 0.6359197299105421,
|
|
"learning_rate": 2.9754930947044357e-06,
|
|
"loss": 0.3144,
|
|
"mean_token_accuracy": 0.8865975281223655,
|
|
"num_tokens": 247765672.0,
|
|
"step": 576
|
|
},
|
|
{
|
|
"entropy": 0.389495849609375,
|
|
"epoch": 2.2896825396825395,
|
|
"grad_norm": 0.6154901011869501,
|
|
"learning_rate": 2.9444187456742855e-06,
|
|
"loss": 0.3172,
|
|
"mean_token_accuracy": 0.8845733245834708,
|
|
"num_tokens": 248183306.0,
|
|
"step": 577
|
|
},
|
|
{
|
|
"entropy": 0.37921142578125,
|
|
"epoch": 2.2936507936507935,
|
|
"grad_norm": 0.6319087371113467,
|
|
"learning_rate": 2.9134794741278317e-06,
|
|
"loss": 0.3226,
|
|
"mean_token_accuracy": 0.8831391530111432,
|
|
"num_tokens": 248628378.0,
|
|
"step": 578
|
|
},
|
|
{
|
|
"entropy": 0.386138916015625,
|
|
"epoch": 2.2976190476190474,
|
|
"grad_norm": 0.617827795169469,
|
|
"learning_rate": 2.8826758723903192e-06,
|
|
"loss": 0.3289,
|
|
"mean_token_accuracy": 0.8825296880677342,
|
|
"num_tokens": 249071096.0,
|
|
"step": 579
|
|
},
|
|
{
|
|
"entropy": 0.38525390625,
|
|
"epoch": 2.3015873015873014,
|
|
"grad_norm": 0.6959398807719873,
|
|
"learning_rate": 2.8520085301896373e-06,
|
|
"loss": 0.3265,
|
|
"mean_token_accuracy": 0.8830155087634921,
|
|
"num_tokens": 249501143.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 0.385986328125,
|
|
"epoch": 2.3055555555555554,
|
|
"grad_norm": 0.6348696654732209,
|
|
"learning_rate": 2.821478034645009e-06,
|
|
"loss": 0.323,
|
|
"mean_token_accuracy": 0.8836052594706416,
|
|
"num_tokens": 249948355.0,
|
|
"step": 581
|
|
},
|
|
{
|
|
"entropy": 0.38616943359375,
|
|
"epoch": 2.3095238095238093,
|
|
"grad_norm": 0.6329011771841873,
|
|
"learning_rate": 2.791084970255772e-06,
|
|
"loss": 0.3233,
|
|
"mean_token_accuracy": 0.8842885615304112,
|
|
"num_tokens": 250356099.0,
|
|
"step": 582
|
|
},
|
|
{
|
|
"entropy": 0.384613037109375,
|
|
"epoch": 2.3134920634920633,
|
|
"grad_norm": 0.6456652675035203,
|
|
"learning_rate": 2.7608299188901632e-06,
|
|
"loss": 0.3144,
|
|
"mean_token_accuracy": 0.8848955044522882,
|
|
"num_tokens": 250775592.0,
|
|
"step": 583
|
|
},
|
|
{
|
|
"entropy": 0.381988525390625,
|
|
"epoch": 2.317460317460317,
|
|
"grad_norm": 0.6394640721117717,
|
|
"learning_rate": 2.730713459774198e-06,
|
|
"loss": 0.3277,
|
|
"mean_token_accuracy": 0.8828927706927061,
|
|
"num_tokens": 251219125.0,
|
|
"step": 584
|
|
},
|
|
{
|
|
"entropy": 0.379119873046875,
|
|
"epoch": 2.3214285714285716,
|
|
"grad_norm": 0.632993088257485,
|
|
"learning_rate": 2.7007361694805735e-06,
|
|
"loss": 0.32,
|
|
"mean_token_accuracy": 0.8848384916782379,
|
|
"num_tokens": 251648861.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 0.37896728515625,
|
|
"epoch": 2.3253968253968256,
|
|
"grad_norm": 0.642288723817126,
|
|
"learning_rate": 2.670898621917629e-06,
|
|
"loss": 0.3212,
|
|
"mean_token_accuracy": 0.8847629306837916,
|
|
"num_tokens": 252080434.0,
|
|
"step": 586
|
|
},
|
|
{
|
|
"entropy": 0.38128662109375,
|
|
"epoch": 2.3293650793650795,
|
|
"grad_norm": 0.6541101628145074,
|
|
"learning_rate": 2.64120138831837e-06,
|
|
"loss": 0.3126,
|
|
"mean_token_accuracy": 0.8882659897208214,
|
|
"num_tokens": 252502018.0,
|
|
"step": 587
|
|
},
|
|
{
|
|
"entropy": 0.383026123046875,
|
|
"epoch": 2.3333333333333335,
|
|
"grad_norm": 0.7162259536705861,
|
|
"learning_rate": 2.6116450372295145e-06,
|
|
"loss": 0.3191,
|
|
"mean_token_accuracy": 0.883970595896244,
|
|
"num_tokens": 252923218.0,
|
|
"step": 588
|
|
},
|
|
{
|
|
"entropy": 0.384735107421875,
|
|
"epoch": 2.3373015873015874,
|
|
"grad_norm": 0.6626450382178084,
|
|
"learning_rate": 2.5822301345006196e-06,
|
|
"loss": 0.3168,
|
|
"mean_token_accuracy": 0.8849868765100837,
|
|
"num_tokens": 253337148.0,
|
|
"step": 589
|
|
},
|
|
{
|
|
"entropy": 0.39013671875,
|
|
"epoch": 2.3412698412698414,
|
|
"grad_norm": 0.6568665871706648,
|
|
"learning_rate": 2.5529572432732473e-06,
|
|
"loss": 0.3209,
|
|
"mean_token_accuracy": 0.884642724879086,
|
|
"num_tokens": 253761289.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 0.38323974609375,
|
|
"epoch": 2.3452380952380953,
|
|
"grad_norm": 0.6598852576655171,
|
|
"learning_rate": 2.5238269239701816e-06,
|
|
"loss": 0.3161,
|
|
"mean_token_accuracy": 0.884860472753644,
|
|
"num_tokens": 254195017.0,
|
|
"step": 591
|
|
},
|
|
{
|
|
"entropy": 0.38433837890625,
|
|
"epoch": 2.3492063492063493,
|
|
"grad_norm": 0.637276420544699,
|
|
"learning_rate": 2.4948397342846985e-06,
|
|
"loss": 0.3328,
|
|
"mean_token_accuracy": 0.8818829879164696,
|
|
"num_tokens": 254643716.0,
|
|
"step": 592
|
|
},
|
|
{
|
|
"entropy": 0.385223388671875,
|
|
"epoch": 2.3531746031746033,
|
|
"grad_norm": 0.6301366966020561,
|
|
"learning_rate": 2.4659962291698936e-06,
|
|
"loss": 0.3282,
|
|
"mean_token_accuracy": 0.880899085663259,
|
|
"num_tokens": 255069339.0,
|
|
"step": 593
|
|
},
|
|
{
|
|
"entropy": 0.389404296875,
|
|
"epoch": 2.357142857142857,
|
|
"grad_norm": 0.638225122674086,
|
|
"learning_rate": 2.4372969608280483e-06,
|
|
"loss": 0.3203,
|
|
"mean_token_accuracy": 0.8841871181502938,
|
|
"num_tokens": 255486573.0,
|
|
"step": 594
|
|
},
|
|
{
|
|
"entropy": 0.381072998046875,
|
|
"epoch": 2.361111111111111,
|
|
"grad_norm": 0.6545431908408221,
|
|
"learning_rate": 2.408742478700071e-06,
|
|
"loss": 0.3076,
|
|
"mean_token_accuracy": 0.8880687272176147,
|
|
"num_tokens": 255913426.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 0.383636474609375,
|
|
"epoch": 2.365079365079365,
|
|
"grad_norm": 0.645166690257223,
|
|
"learning_rate": 2.3803333294549647e-06,
|
|
"loss": 0.3124,
|
|
"mean_token_accuracy": 0.8868766566738486,
|
|
"num_tokens": 256345326.0,
|
|
"step": 596
|
|
},
|
|
{
|
|
"entropy": 0.378448486328125,
|
|
"epoch": 2.369047619047619,
|
|
"grad_norm": 0.646372016881236,
|
|
"learning_rate": 2.352070056979375e-06,
|
|
"loss": 0.3161,
|
|
"mean_token_accuracy": 0.8870938578620553,
|
|
"num_tokens": 256765173.0,
|
|
"step": 597
|
|
},
|
|
{
|
|
"entropy": 0.377532958984375,
|
|
"epoch": 2.373015873015873,
|
|
"grad_norm": 0.6693829082891946,
|
|
"learning_rate": 2.3239532023671663e-06,
|
|
"loss": 0.3087,
|
|
"mean_token_accuracy": 0.8909780327230692,
|
|
"num_tokens": 257185152.0,
|
|
"step": 598
|
|
},
|
|
{
|
|
"entropy": 0.377288818359375,
|
|
"epoch": 2.376984126984127,
|
|
"grad_norm": 0.6753658239787765,
|
|
"learning_rate": 2.295983303909065e-06,
|
|
"loss": 0.3163,
|
|
"mean_token_accuracy": 0.8857049969956279,
|
|
"num_tokens": 257627732.0,
|
|
"step": 599
|
|
},
|
|
{
|
|
"entropy": 0.37738037109375,
|
|
"epoch": 2.380952380952381,
|
|
"grad_norm": 0.637766728659791,
|
|
"learning_rate": 2.2681608970823567e-06,
|
|
"loss": 0.3121,
|
|
"mean_token_accuracy": 0.8857978647574782,
|
|
"num_tokens": 258067282.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 0.385101318359375,
|
|
"epoch": 2.384920634920635,
|
|
"grad_norm": 1.319145343989088,
|
|
"learning_rate": 2.2404865145406353e-06,
|
|
"loss": 0.3237,
|
|
"mean_token_accuracy": 0.8839105069637299,
|
|
"num_tokens": 258491168.0,
|
|
"step": 601
|
|
},
|
|
{
|
|
"entropy": 0.378387451171875,
|
|
"epoch": 2.388888888888889,
|
|
"grad_norm": 0.6194353316699897,
|
|
"learning_rate": 2.2129606861036003e-06,
|
|
"loss": 0.3159,
|
|
"mean_token_accuracy": 0.8844478046521544,
|
|
"num_tokens": 258944016.0,
|
|
"step": 602
|
|
},
|
|
{
|
|
"entropy": 0.385406494140625,
|
|
"epoch": 2.392857142857143,
|
|
"grad_norm": 0.6145111798633861,
|
|
"learning_rate": 2.1855839387469237e-06,
|
|
"loss": 0.3121,
|
|
"mean_token_accuracy": 0.8875940628349781,
|
|
"num_tokens": 259379493.0,
|
|
"step": 603
|
|
},
|
|
{
|
|
"entropy": 0.388275146484375,
|
|
"epoch": 2.3968253968253967,
|
|
"grad_norm": 0.6886819461995825,
|
|
"learning_rate": 2.158356796592147e-06,
|
|
"loss": 0.3041,
|
|
"mean_token_accuracy": 0.8887152783572674,
|
|
"num_tokens": 259808268.0,
|
|
"step": 604
|
|
},
|
|
{
|
|
"entropy": 0.3804931640625,
|
|
"epoch": 2.4007936507936507,
|
|
"grad_norm": 0.6244956591633763,
|
|
"learning_rate": 2.1312797808966625e-06,
|
|
"loss": 0.3245,
|
|
"mean_token_accuracy": 0.8850710866972804,
|
|
"num_tokens": 260247659.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 0.3851318359375,
|
|
"epoch": 2.4047619047619047,
|
|
"grad_norm": 0.6517731852559921,
|
|
"learning_rate": 2.1043534100437123e-06,
|
|
"loss": 0.3371,
|
|
"mean_token_accuracy": 0.8805443737655878,
|
|
"num_tokens": 260684292.0,
|
|
"step": 606
|
|
},
|
|
{
|
|
"entropy": 0.380767822265625,
|
|
"epoch": 2.4087301587301586,
|
|
"grad_norm": 0.6377112681635206,
|
|
"learning_rate": 2.0775781995324886e-06,
|
|
"loss": 0.3219,
|
|
"mean_token_accuracy": 0.8848995277658105,
|
|
"num_tokens": 261121173.0,
|
|
"step": 607
|
|
},
|
|
{
|
|
"entropy": 0.387847900390625,
|
|
"epoch": 2.4126984126984126,
|
|
"grad_norm": 0.6347004293735145,
|
|
"learning_rate": 2.0509546619682553e-06,
|
|
"loss": 0.3183,
|
|
"mean_token_accuracy": 0.8860855745151639,
|
|
"num_tokens": 261555918.0,
|
|
"step": 608
|
|
},
|
|
{
|
|
"entropy": 0.38128662109375,
|
|
"epoch": 2.4166666666666665,
|
|
"grad_norm": 2.3373272895881656,
|
|
"learning_rate": 2.024483307052526e-06,
|
|
"loss": 0.3236,
|
|
"mean_token_accuracy": 0.8850080538541079,
|
|
"num_tokens": 262000331.0,
|
|
"step": 609
|
|
},
|
|
{
|
|
"entropy": 0.384307861328125,
|
|
"epoch": 2.4206349206349205,
|
|
"grad_norm": 0.645561304833522,
|
|
"learning_rate": 1.9981646415733157e-06,
|
|
"loss": 0.3145,
|
|
"mean_token_accuracy": 0.88714156486094,
|
|
"num_tokens": 262425946.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 0.385833740234375,
|
|
"epoch": 2.4246031746031744,
|
|
"grad_norm": 0.639279216493586,
|
|
"learning_rate": 1.971999169395432e-06,
|
|
"loss": 0.3141,
|
|
"mean_token_accuracy": 0.8860221272334456,
|
|
"num_tokens": 262853210.0,
|
|
"step": 611
|
|
},
|
|
{
|
|
"entropy": 0.3848876953125,
|
|
"epoch": 2.4285714285714284,
|
|
"grad_norm": 0.6257683879585533,
|
|
"learning_rate": 1.945987391450833e-06,
|
|
"loss": 0.3041,
|
|
"mean_token_accuracy": 0.8897171234712005,
|
|
"num_tokens": 263268966.0,
|
|
"step": 612
|
|
},
|
|
{
|
|
"entropy": 0.383392333984375,
|
|
"epoch": 2.432539682539683,
|
|
"grad_norm": 0.6467526598593158,
|
|
"learning_rate": 1.920129805729043e-06,
|
|
"loss": 0.3224,
|
|
"mean_token_accuracy": 0.8844416281208396,
|
|
"num_tokens": 263709583.0,
|
|
"step": 613
|
|
},
|
|
{
|
|
"entropy": 0.383514404296875,
|
|
"epoch": 2.4365079365079367,
|
|
"grad_norm": 0.6584407884920952,
|
|
"learning_rate": 1.8944269072676013e-06,
|
|
"loss": 0.3024,
|
|
"mean_token_accuracy": 0.888917769305408,
|
|
"num_tokens": 264137961.0,
|
|
"step": 614
|
|
},
|
|
{
|
|
"entropy": 0.378143310546875,
|
|
"epoch": 2.4404761904761907,
|
|
"grad_norm": 2.358916383104869,
|
|
"learning_rate": 1.8688791881426017e-06,
|
|
"loss": 0.3235,
|
|
"mean_token_accuracy": 0.8838730929419398,
|
|
"num_tokens": 264579578.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 0.387786865234375,
|
|
"epoch": 2.4444444444444446,
|
|
"grad_norm": 0.6484092770878318,
|
|
"learning_rate": 1.843487137459261e-06,
|
|
"loss": 0.3158,
|
|
"mean_token_accuracy": 0.8852028921246529,
|
|
"num_tokens": 264995910.0,
|
|
"step": 616
|
|
},
|
|
{
|
|
"entropy": 0.3782958984375,
|
|
"epoch": 2.4484126984126986,
|
|
"grad_norm": 0.6283835633567808,
|
|
"learning_rate": 1.8182512413425624e-06,
|
|
"loss": 0.3221,
|
|
"mean_token_accuracy": 0.8847752753645182,
|
|
"num_tokens": 265440836.0,
|
|
"step": 617
|
|
},
|
|
{
|
|
"entropy": 0.385711669921875,
|
|
"epoch": 2.4523809523809526,
|
|
"grad_norm": 0.6265536607231427,
|
|
"learning_rate": 1.7931719829279448e-06,
|
|
"loss": 0.3131,
|
|
"mean_token_accuracy": 0.8853830918669701,
|
|
"num_tokens": 265867518.0,
|
|
"step": 618
|
|
},
|
|
{
|
|
"entropy": 0.385498046875,
|
|
"epoch": 2.4563492063492065,
|
|
"grad_norm": 0.6536050788460606,
|
|
"learning_rate": 1.7682498423520545e-06,
|
|
"loss": 0.3276,
|
|
"mean_token_accuracy": 0.8838295871391892,
|
|
"num_tokens": 266304569.0,
|
|
"step": 619
|
|
},
|
|
{
|
|
"entropy": 0.38214111328125,
|
|
"epoch": 2.4603174603174605,
|
|
"grad_norm": 0.6199559421660695,
|
|
"learning_rate": 1.7434852967435523e-06,
|
|
"loss": 0.3089,
|
|
"mean_token_accuracy": 0.8871648367494345,
|
|
"num_tokens": 266730039.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 0.38201904296875,
|
|
"epoch": 2.4642857142857144,
|
|
"grad_norm": 0.6217208841926519,
|
|
"learning_rate": 1.7188788202139794e-06,
|
|
"loss": 0.3192,
|
|
"mean_token_accuracy": 0.8856141036376357,
|
|
"num_tokens": 267169815.0,
|
|
"step": 621
|
|
},
|
|
{
|
|
"entropy": 0.3819580078125,
|
|
"epoch": 2.4682539682539684,
|
|
"grad_norm": 0.6118534660743937,
|
|
"learning_rate": 1.6944308838486823e-06,
|
|
"loss": 0.3198,
|
|
"mean_token_accuracy": 0.8841827157884836,
|
|
"num_tokens": 267605673.0,
|
|
"step": 622
|
|
},
|
|
{
|
|
"entropy": 0.3839111328125,
|
|
"epoch": 2.4722222222222223,
|
|
"grad_norm": 0.6453367699458878,
|
|
"learning_rate": 1.6701419556977882e-06,
|
|
"loss": 0.3326,
|
|
"mean_token_accuracy": 0.8813635427504778,
|
|
"num_tokens": 268044360.0,
|
|
"step": 623
|
|
},
|
|
{
|
|
"entropy": 0.38397216796875,
|
|
"epoch": 2.4761904761904763,
|
|
"grad_norm": 0.6102430733837766,
|
|
"learning_rate": 1.6460125007672556e-06,
|
|
"loss": 0.3197,
|
|
"mean_token_accuracy": 0.8862089207395911,
|
|
"num_tokens": 268470812.0,
|
|
"step": 624
|
|
},
|
|
{
|
|
"entropy": 0.382415771484375,
|
|
"epoch": 2.4801587301587302,
|
|
"grad_norm": 0.6312834862639178,
|
|
"learning_rate": 1.6220429810099603e-06,
|
|
"loss": 0.3197,
|
|
"mean_token_accuracy": 0.8827575715258718,
|
|
"num_tokens": 268895281.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 0.379058837890625,
|
|
"epoch": 2.484126984126984,
|
|
"grad_norm": 0.7044597247523807,
|
|
"learning_rate": 1.5982338553168563e-06,
|
|
"loss": 0.3012,
|
|
"mean_token_accuracy": 0.8895168509334326,
|
|
"num_tokens": 269329768.0,
|
|
"step": 626
|
|
},
|
|
{
|
|
"entropy": 0.379669189453125,
|
|
"epoch": 2.488095238095238,
|
|
"grad_norm": 0.6298185893493836,
|
|
"learning_rate": 1.5745855795081889e-06,
|
|
"loss": 0.323,
|
|
"mean_token_accuracy": 0.8828219333663583,
|
|
"num_tokens": 269763538.0,
|
|
"step": 627
|
|
},
|
|
{
|
|
"entropy": 0.38275146484375,
|
|
"epoch": 2.492063492063492,
|
|
"grad_norm": 0.6657396946256616,
|
|
"learning_rate": 1.551098606324768e-06,
|
|
"loss": 0.3185,
|
|
"mean_token_accuracy": 0.8838854227215052,
|
|
"num_tokens": 270192672.0,
|
|
"step": 628
|
|
},
|
|
{
|
|
"entropy": 0.383880615234375,
|
|
"epoch": 2.496031746031746,
|
|
"grad_norm": 0.6461801602019108,
|
|
"learning_rate": 1.527773385419311e-06,
|
|
"loss": 0.3291,
|
|
"mean_token_accuracy": 0.8830574974417686,
|
|
"num_tokens": 270633240.0,
|
|
"step": 629
|
|
},
|
|
{
|
|
"entropy": 0.3831787109375,
|
|
"epoch": 2.5,
|
|
"grad_norm": 5.02088348983452,
|
|
"learning_rate": 1.5046103633478148e-06,
|
|
"loss": 0.3115,
|
|
"mean_token_accuracy": 0.8865573918446898,
|
|
"num_tokens": 271053623.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 0.378753662109375,
|
|
"epoch": 2.503968253968254,
|
|
"grad_norm": 0.5964915600808843,
|
|
"learning_rate": 1.4816099835610209e-06,
|
|
"loss": 0.312,
|
|
"mean_token_accuracy": 0.88751888461411,
|
|
"num_tokens": 271492583.0,
|
|
"step": 631
|
|
},
|
|
{
|
|
"entropy": 0.381805419921875,
|
|
"epoch": 2.507936507936508,
|
|
"grad_norm": 0.6381933996091104,
|
|
"learning_rate": 1.4587726863959239e-06,
|
|
"loss": 0.3269,
|
|
"mean_token_accuracy": 0.8842789568006992,
|
|
"num_tokens": 271921985.0,
|
|
"step": 632
|
|
},
|
|
{
|
|
"entropy": 0.39300537109375,
|
|
"epoch": 2.511904761904762,
|
|
"grad_norm": 0.6212653644771459,
|
|
"learning_rate": 1.4360989090673284e-06,
|
|
"loss": 0.3149,
|
|
"mean_token_accuracy": 0.8869155524298549,
|
|
"num_tokens": 272349367.0,
|
|
"step": 633
|
|
},
|
|
{
|
|
"entropy": 0.391815185546875,
|
|
"epoch": 2.515873015873016,
|
|
"grad_norm": 0.6390041331562423,
|
|
"learning_rate": 1.4135890856595047e-06,
|
|
"loss": 0.3251,
|
|
"mean_token_accuracy": 0.883450117893517,
|
|
"num_tokens": 272757706.0,
|
|
"step": 634
|
|
},
|
|
{
|
|
"entropy": 0.3900146484375,
|
|
"epoch": 2.5198412698412698,
|
|
"grad_norm": 0.6415648041434637,
|
|
"learning_rate": 1.3912436471178525e-06,
|
|
"loss": 0.3137,
|
|
"mean_token_accuracy": 0.8865510458126664,
|
|
"num_tokens": 273178323.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 0.385589599609375,
|
|
"epoch": 2.5238095238095237,
|
|
"grad_norm": 0.6786208735461828,
|
|
"learning_rate": 1.3690630212406653e-06,
|
|
"loss": 0.31,
|
|
"mean_token_accuracy": 0.886282910592854,
|
|
"num_tokens": 273603268.0,
|
|
"step": 636
|
|
},
|
|
{
|
|
"entropy": 0.384246826171875,
|
|
"epoch": 2.5277777777777777,
|
|
"grad_norm": 0.6389187775408092,
|
|
"learning_rate": 1.3470476326709337e-06,
|
|
"loss": 0.3051,
|
|
"mean_token_accuracy": 0.8894489388912916,
|
|
"num_tokens": 274036335.0,
|
|
"step": 637
|
|
},
|
|
{
|
|
"entropy": 0.384674072265625,
|
|
"epoch": 2.5317460317460316,
|
|
"grad_norm": 0.6496924237808063,
|
|
"learning_rate": 1.3251979028882179e-06,
|
|
"loss": 0.3153,
|
|
"mean_token_accuracy": 0.8875564280897379,
|
|
"num_tokens": 274458735.0,
|
|
"step": 638
|
|
},
|
|
{
|
|
"entropy": 0.377685546875,
|
|
"epoch": 2.5357142857142856,
|
|
"grad_norm": 0.6510962044644899,
|
|
"learning_rate": 1.3035142502005792e-06,
|
|
"loss": 0.3144,
|
|
"mean_token_accuracy": 0.8849059278145432,
|
|
"num_tokens": 274909982.0,
|
|
"step": 639
|
|
},
|
|
{
|
|
"entropy": 0.38714599609375,
|
|
"epoch": 2.5396825396825395,
|
|
"grad_norm": 0.6541727348118395,
|
|
"learning_rate": 1.281997089736574e-06,
|
|
"loss": 0.3178,
|
|
"mean_token_accuracy": 0.8841635789722204,
|
|
"num_tokens": 275320028.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 0.379791259765625,
|
|
"epoch": 2.5436507936507935,
|
|
"grad_norm": 0.6200761248476931,
|
|
"learning_rate": 1.2606468334373e-06,
|
|
"loss": 0.3102,
|
|
"mean_token_accuracy": 0.8897372307255864,
|
|
"num_tokens": 275726848.0,
|
|
"step": 641
|
|
},
|
|
{
|
|
"entropy": 0.378173828125,
|
|
"epoch": 2.5476190476190474,
|
|
"grad_norm": 0.6688554503436679,
|
|
"learning_rate": 1.2394638900485124e-06,
|
|
"loss": 0.3209,
|
|
"mean_token_accuracy": 0.8843832314014435,
|
|
"num_tokens": 276151262.0,
|
|
"step": 642
|
|
},
|
|
{
|
|
"entropy": 0.380615234375,
|
|
"epoch": 2.5515873015873014,
|
|
"grad_norm": 0.6660887015596744,
|
|
"learning_rate": 1.2184486651128014e-06,
|
|
"loss": 0.3254,
|
|
"mean_token_accuracy": 0.8823684249073267,
|
|
"num_tokens": 276568860.0,
|
|
"step": 643
|
|
},
|
|
{
|
|
"entropy": 0.3778076171875,
|
|
"epoch": 2.5555555555555554,
|
|
"grad_norm": 0.6455282415719623,
|
|
"learning_rate": 1.197601560961824e-06,
|
|
"loss": 0.3125,
|
|
"mean_token_accuracy": 0.886366662569344,
|
|
"num_tokens": 276997327.0,
|
|
"step": 644
|
|
},
|
|
{
|
|
"entropy": 0.38653564453125,
|
|
"epoch": 2.5595238095238093,
|
|
"grad_norm": 0.6059096202304458,
|
|
"learning_rate": 1.1769229767086053e-06,
|
|
"loss": 0.3223,
|
|
"mean_token_accuracy": 0.8857936700806022,
|
|
"num_tokens": 277426196.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 0.3828125,
|
|
"epoch": 2.5634920634920633,
|
|
"grad_norm": 0.6256296541644343,
|
|
"learning_rate": 1.1564133082398942e-06,
|
|
"loss": 0.3233,
|
|
"mean_token_accuracy": 0.8879679264500737,
|
|
"num_tokens": 277849848.0,
|
|
"step": 646
|
|
},
|
|
{
|
|
"entropy": 0.37725830078125,
|
|
"epoch": 2.567460317460317,
|
|
"grad_norm": 0.6068305756135023,
|
|
"learning_rate": 1.1360729482085852e-06,
|
|
"loss": 0.3111,
|
|
"mean_token_accuracy": 0.8888252349570394,
|
|
"num_tokens": 278289888.0,
|
|
"step": 647
|
|
},
|
|
{
|
|
"entropy": 0.379852294921875,
|
|
"epoch": 2.571428571428571,
|
|
"grad_norm": 0.6525074013342971,
|
|
"learning_rate": 1.1159022860262036e-06,
|
|
"loss": 0.3065,
|
|
"mean_token_accuracy": 0.8898705607280135,
|
|
"num_tokens": 278726761.0,
|
|
"step": 648
|
|
},
|
|
{
|
|
"entropy": 0.382720947265625,
|
|
"epoch": 2.575396825396825,
|
|
"grad_norm": 0.6120138350704707,
|
|
"learning_rate": 1.0959017078554458e-06,
|
|
"loss": 0.2998,
|
|
"mean_token_accuracy": 0.8927411213517189,
|
|
"num_tokens": 279136759.0,
|
|
"step": 649
|
|
},
|
|
{
|
|
"entropy": 0.382659912109375,
|
|
"epoch": 2.5793650793650795,
|
|
"grad_norm": 0.6229193839182782,
|
|
"learning_rate": 1.0760715966027923e-06,
|
|
"loss": 0.3193,
|
|
"mean_token_accuracy": 0.8850045939907432,
|
|
"num_tokens": 279572082.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 0.382904052734375,
|
|
"epoch": 2.5833333333333335,
|
|
"grad_norm": 0.6342083100796324,
|
|
"learning_rate": 1.0564123319111708e-06,
|
|
"loss": 0.329,
|
|
"mean_token_accuracy": 0.8824840355664492,
|
|
"num_tokens": 279994957.0,
|
|
"step": 651
|
|
},
|
|
{
|
|
"entropy": 0.38031005859375,
|
|
"epoch": 2.5873015873015874,
|
|
"grad_norm": 0.6205742323248385,
|
|
"learning_rate": 1.036924290152691e-06,
|
|
"loss": 0.3075,
|
|
"mean_token_accuracy": 0.8880053823813796,
|
|
"num_tokens": 280414468.0,
|
|
"step": 652
|
|
},
|
|
{
|
|
"entropy": 0.38330078125,
|
|
"epoch": 2.5912698412698414,
|
|
"grad_norm": 0.6452525781952142,
|
|
"learning_rate": 1.017607844421441e-06,
|
|
"loss": 0.3234,
|
|
"mean_token_accuracy": 0.8852792903780937,
|
|
"num_tokens": 280832145.0,
|
|
"step": 653
|
|
},
|
|
{
|
|
"entropy": 0.38494873046875,
|
|
"epoch": 2.5952380952380953,
|
|
"grad_norm": 0.6150246403399638,
|
|
"learning_rate": 9.984633645263386e-07,
|
|
"loss": 0.3053,
|
|
"mean_token_accuracy": 0.8897464731708169,
|
|
"num_tokens": 281237288.0,
|
|
"step": 654
|
|
},
|
|
{
|
|
"entropy": 0.376922607421875,
|
|
"epoch": 2.5992063492063493,
|
|
"grad_norm": 0.6204054410973546,
|
|
"learning_rate": 9.794912169840564e-07,
|
|
"loss": 0.3063,
|
|
"mean_token_accuracy": 0.8919531209394336,
|
|
"num_tokens": 281692258.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 0.38128662109375,
|
|
"epoch": 2.6031746031746033,
|
|
"grad_norm": 0.6259115787217637,
|
|
"learning_rate": 9.606917650120084e-07,
|
|
"loss": 0.3019,
|
|
"mean_token_accuracy": 0.8896377719938755,
|
|
"num_tokens": 282118057.0,
|
|
"step": 656
|
|
},
|
|
{
|
|
"entropy": 0.376617431640625,
|
|
"epoch": 2.607142857142857,
|
|
"grad_norm": 0.6405045817213874,
|
|
"learning_rate": 9.420653685213854e-07,
|
|
"loss": 0.3207,
|
|
"mean_token_accuracy": 0.883592015132308,
|
|
"num_tokens": 282582066.0,
|
|
"step": 657
|
|
},
|
|
{
|
|
"entropy": 0.371551513671875,
|
|
"epoch": 2.611111111111111,
|
|
"grad_norm": 0.6258478413075457,
|
|
"learning_rate": 9.236123841102762e-07,
|
|
"loss": 0.3199,
|
|
"mean_token_accuracy": 0.8873294722288847,
|
|
"num_tokens": 283028330.0,
|
|
"step": 658
|
|
},
|
|
{
|
|
"entropy": 0.390045166015625,
|
|
"epoch": 2.615079365079365,
|
|
"grad_norm": 0.623636922000303,
|
|
"learning_rate": 9.053331650568264e-07,
|
|
"loss": 0.322,
|
|
"mean_token_accuracy": 0.8835032200440764,
|
|
"num_tokens": 283436768.0,
|
|
"step": 659
|
|
},
|
|
{
|
|
"entropy": 0.38201904296875,
|
|
"epoch": 2.619047619047619,
|
|
"grad_norm": 0.6221707456489836,
|
|
"learning_rate": 8.872280613124895e-07,
|
|
"loss": 0.3076,
|
|
"mean_token_accuracy": 0.887150245718658,
|
|
"num_tokens": 283866607.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 0.38751220703125,
|
|
"epoch": 2.623015873015873,
|
|
"grad_norm": 0.6328952287494918,
|
|
"learning_rate": 8.692974194953263e-07,
|
|
"loss": 0.3223,
|
|
"mean_token_accuracy": 0.8878462919965386,
|
|
"num_tokens": 284281791.0,
|
|
"step": 661
|
|
},
|
|
{
|
|
"entropy": 0.383544921875,
|
|
"epoch": 2.626984126984127,
|
|
"grad_norm": 0.6134165981973436,
|
|
"learning_rate": 8.515415828833562e-07,
|
|
"loss": 0.3041,
|
|
"mean_token_accuracy": 0.8905844045802951,
|
|
"num_tokens": 284705319.0,
|
|
"step": 662
|
|
},
|
|
{
|
|
"entropy": 0.3831787109375,
|
|
"epoch": 2.630952380952381,
|
|
"grad_norm": 0.6003677767075459,
|
|
"learning_rate": 8.339608914079944e-07,
|
|
"loss": 0.3277,
|
|
"mean_token_accuracy": 0.8837083810940385,
|
|
"num_tokens": 285154313.0,
|
|
"step": 663
|
|
},
|
|
{
|
|
"entropy": 0.3780517578125,
|
|
"epoch": 2.634920634920635,
|
|
"grad_norm": 0.6185564068781259,
|
|
"learning_rate": 8.165556816475462e-07,
|
|
"loss": 0.3169,
|
|
"mean_token_accuracy": 0.8858558805659413,
|
|
"num_tokens": 285598883.0,
|
|
"step": 664
|
|
},
|
|
{
|
|
"entropy": 0.384521484375,
|
|
"epoch": 2.638888888888889,
|
|
"grad_norm": 0.615938430119585,
|
|
"learning_rate": 7.993262868207552e-07,
|
|
"loss": 0.3189,
|
|
"mean_token_accuracy": 0.8873115805909038,
|
|
"num_tokens": 286037660.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 0.3826904296875,
|
|
"epoch": 2.642857142857143,
|
|
"grad_norm": 0.6120620823511644,
|
|
"learning_rate": 7.822730367804332e-07,
|
|
"loss": 0.3162,
|
|
"mean_token_accuracy": 0.8862875821068883,
|
|
"num_tokens": 286461446.0,
|
|
"step": 666
|
|
},
|
|
{
|
|
"entropy": 0.3834228515625,
|
|
"epoch": 2.6468253968253967,
|
|
"grad_norm": 0.5993824978295295,
|
|
"learning_rate": 7.653962580071384e-07,
|
|
"loss": 0.3063,
|
|
"mean_token_accuracy": 0.8879508459940553,
|
|
"num_tokens": 286877023.0,
|
|
"step": 667
|
|
},
|
|
{
|
|
"entropy": 0.3873291015625,
|
|
"epoch": 2.6507936507936507,
|
|
"grad_norm": 0.633531914850964,
|
|
"learning_rate": 7.486962736029247e-07,
|
|
"loss": 0.3123,
|
|
"mean_token_accuracy": 0.887955573387444,
|
|
"num_tokens": 287308399.0,
|
|
"step": 668
|
|
},
|
|
{
|
|
"entropy": 0.379241943359375,
|
|
"epoch": 2.6547619047619047,
|
|
"grad_norm": 0.6065402358723048,
|
|
"learning_rate": 7.321734032851613e-07,
|
|
"loss": 0.3179,
|
|
"mean_token_accuracy": 0.8874384136870503,
|
|
"num_tokens": 287728804.0,
|
|
"step": 669
|
|
},
|
|
{
|
|
"entropy": 0.376983642578125,
|
|
"epoch": 2.6587301587301586,
|
|
"grad_norm": 0.6293972276012216,
|
|
"learning_rate": 7.158279633804077e-07,
|
|
"loss": 0.3106,
|
|
"mean_token_accuracy": 0.8873779606074095,
|
|
"num_tokens": 288187832.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 0.38238525390625,
|
|
"epoch": 2.6626984126984126,
|
|
"grad_norm": 0.6195597148292992,
|
|
"learning_rate": 6.996602668183605e-07,
|
|
"loss": 0.3109,
|
|
"mean_token_accuracy": 0.8889595773071051,
|
|
"num_tokens": 288600411.0,
|
|
"step": 671
|
|
},
|
|
{
|
|
"entropy": 0.37738037109375,
|
|
"epoch": 2.6666666666666665,
|
|
"grad_norm": 0.6154573646928048,
|
|
"learning_rate": 6.836706231258583e-07,
|
|
"loss": 0.3192,
|
|
"mean_token_accuracy": 0.8858912223950028,
|
|
"num_tokens": 289047051.0,
|
|
"step": 672
|
|
},
|
|
{
|
|
"entropy": 0.383026123046875,
|
|
"epoch": 2.6706349206349205,
|
|
"grad_norm": 0.6263655996244245,
|
|
"learning_rate": 6.678593384209597e-07,
|
|
"loss": 0.3155,
|
|
"mean_token_accuracy": 0.8867061976343393,
|
|
"num_tokens": 289478039.0,
|
|
"step": 673
|
|
},
|
|
{
|
|
"entropy": 0.385162353515625,
|
|
"epoch": 2.674603174603175,
|
|
"grad_norm": 0.6573248369770568,
|
|
"learning_rate": 6.522267154070816e-07,
|
|
"loss": 0.3262,
|
|
"mean_token_accuracy": 0.8859394267201424,
|
|
"num_tokens": 289920851.0,
|
|
"step": 674
|
|
},
|
|
{
|
|
"entropy": 0.38201904296875,
|
|
"epoch": 2.678571428571429,
|
|
"grad_norm": 0.5939088235971999,
|
|
"learning_rate": 6.367730533672035e-07,
|
|
"loss": 0.3119,
|
|
"mean_token_accuracy": 0.8882531467825174,
|
|
"num_tokens": 290351325.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 0.380096435546875,
|
|
"epoch": 2.682539682539683,
|
|
"grad_norm": 0.6262374391544449,
|
|
"learning_rate": 6.214986481581365e-07,
|
|
"loss": 0.3045,
|
|
"mean_token_accuracy": 0.8911728356033564,
|
|
"num_tokens": 290766931.0,
|
|
"step": 676
|
|
},
|
|
{
|
|
"entropy": 0.389251708984375,
|
|
"epoch": 2.6865079365079367,
|
|
"grad_norm": 0.6147080862748338,
|
|
"learning_rate": 6.064037922048661e-07,
|
|
"loss": 0.3191,
|
|
"mean_token_accuracy": 0.8846084726974368,
|
|
"num_tokens": 291185306.0,
|
|
"step": 677
|
|
},
|
|
{
|
|
"entropy": 0.386932373046875,
|
|
"epoch": 2.6904761904761907,
|
|
"grad_norm": 0.6146757669564619,
|
|
"learning_rate": 5.914887744949426e-07,
|
|
"loss": 0.3072,
|
|
"mean_token_accuracy": 0.8904552990570664,
|
|
"num_tokens": 291599836.0,
|
|
"step": 678
|
|
},
|
|
{
|
|
"entropy": 0.3798828125,
|
|
"epoch": 2.6944444444444446,
|
|
"grad_norm": 0.6122316865058661,
|
|
"learning_rate": 5.767538805729578e-07,
|
|
"loss": 0.3233,
|
|
"mean_token_accuracy": 0.8849823428317904,
|
|
"num_tokens": 292026507.0,
|
|
"step": 679
|
|
},
|
|
{
|
|
"entropy": 0.377960205078125,
|
|
"epoch": 2.6984126984126986,
|
|
"grad_norm": 0.5919864168210549,
|
|
"learning_rate": 5.621993925350722e-07,
|
|
"loss": 0.3139,
|
|
"mean_token_accuracy": 0.8860092582181096,
|
|
"num_tokens": 292454972.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 0.38397216796875,
|
|
"epoch": 2.7023809523809526,
|
|
"grad_norm": 0.6003413926603242,
|
|
"learning_rate": 5.478255890236184e-07,
|
|
"loss": 0.3145,
|
|
"mean_token_accuracy": 0.8844663957133889,
|
|
"num_tokens": 292885705.0,
|
|
"step": 681
|
|
},
|
|
{
|
|
"entropy": 0.384857177734375,
|
|
"epoch": 2.7063492063492065,
|
|
"grad_norm": 0.6327165590547364,
|
|
"learning_rate": 5.336327452217682e-07,
|
|
"loss": 0.3009,
|
|
"mean_token_accuracy": 0.8899004301056266,
|
|
"num_tokens": 293307675.0,
|
|
"step": 682
|
|
},
|
|
{
|
|
"entropy": 0.384429931640625,
|
|
"epoch": 2.7103174603174605,
|
|
"grad_norm": 0.6210543868986266,
|
|
"learning_rate": 5.196211328482559e-07,
|
|
"loss": 0.3226,
|
|
"mean_token_accuracy": 0.8854889068752527,
|
|
"num_tokens": 293722148.0,
|
|
"step": 683
|
|
},
|
|
{
|
|
"entropy": 0.3853759765625,
|
|
"epoch": 2.7142857142857144,
|
|
"grad_norm": 0.6057372551677664,
|
|
"learning_rate": 5.057910201521876e-07,
|
|
"loss": 0.3211,
|
|
"mean_token_accuracy": 0.8846705863252282,
|
|
"num_tokens": 294149752.0,
|
|
"step": 684
|
|
},
|
|
{
|
|
"entropy": 0.387847900390625,
|
|
"epoch": 2.7182539682539684,
|
|
"grad_norm": 0.6148908097024449,
|
|
"learning_rate": 4.921426719078948e-07,
|
|
"loss": 0.3049,
|
|
"mean_token_accuracy": 0.8889408009126782,
|
|
"num_tokens": 294555288.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 0.383392333984375,
|
|
"epoch": 2.7222222222222223,
|
|
"grad_norm": 0.6097068524134369,
|
|
"learning_rate": 4.786763494098689e-07,
|
|
"loss": 0.3014,
|
|
"mean_token_accuracy": 0.8917689863592386,
|
|
"num_tokens": 294975614.0,
|
|
"step": 686
|
|
},
|
|
{
|
|
"entropy": 0.377655029296875,
|
|
"epoch": 2.7261904761904763,
|
|
"grad_norm": 0.6094849230504564,
|
|
"learning_rate": 4.653923104677671e-07,
|
|
"loss": 0.3148,
|
|
"mean_token_accuracy": 0.8867970844730735,
|
|
"num_tokens": 295422071.0,
|
|
"step": 687
|
|
},
|
|
{
|
|
"entropy": 0.380615234375,
|
|
"epoch": 2.7301587301587302,
|
|
"grad_norm": 0.6178616647341642,
|
|
"learning_rate": 4.522908094014655e-07,
|
|
"loss": 0.3151,
|
|
"mean_token_accuracy": 0.8884375654160976,
|
|
"num_tokens": 295846874.0,
|
|
"step": 688
|
|
},
|
|
{
|
|
"entropy": 0.38031005859375,
|
|
"epoch": 2.734126984126984,
|
|
"grad_norm": 0.5894255953320765,
|
|
"learning_rate": 4.3937209703619476e-07,
|
|
"loss": 0.3011,
|
|
"mean_token_accuracy": 0.8908967413008213,
|
|
"num_tokens": 296288822.0,
|
|
"step": 689
|
|
},
|
|
{
|
|
"entropy": 0.3753662109375,
|
|
"epoch": 2.738095238095238,
|
|
"grad_norm": 0.6135220448939788,
|
|
"learning_rate": 4.2663642069773693e-07,
|
|
"loss": 0.3102,
|
|
"mean_token_accuracy": 0.8868862120434642,
|
|
"num_tokens": 296730922.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 0.376129150390625,
|
|
"epoch": 2.742063492063492,
|
|
"grad_norm": 0.5985603171722215,
|
|
"learning_rate": 4.140840242076927e-07,
|
|
"loss": 0.3124,
|
|
"mean_token_accuracy": 0.8877345686778426,
|
|
"num_tokens": 297169723.0,
|
|
"step": 691
|
|
},
|
|
{
|
|
"entropy": 0.37847900390625,
|
|
"epoch": 2.746031746031746,
|
|
"grad_norm": 0.5897101346016688,
|
|
"learning_rate": 4.017151478788117e-07,
|
|
"loss": 0.2995,
|
|
"mean_token_accuracy": 0.8927986742928624,
|
|
"num_tokens": 297610941.0,
|
|
"step": 692
|
|
},
|
|
{
|
|
"entropy": 0.380523681640625,
|
|
"epoch": 2.75,
|
|
"grad_norm": 0.6138920707362799,
|
|
"learning_rate": 3.895300285103931e-07,
|
|
"loss": 0.2959,
|
|
"mean_token_accuracy": 0.8884680820629001,
|
|
"num_tokens": 298044879.0,
|
|
"step": 693
|
|
},
|
|
{
|
|
"entropy": 0.379547119140625,
|
|
"epoch": 2.753968253968254,
|
|
"grad_norm": 0.6212097605594783,
|
|
"learning_rate": 3.7752889938375113e-07,
|
|
"loss": 0.3028,
|
|
"mean_token_accuracy": 0.8896859297528863,
|
|
"num_tokens": 298464464.0,
|
|
"step": 694
|
|
},
|
|
{
|
|
"entropy": 0.374053955078125,
|
|
"epoch": 2.757936507936508,
|
|
"grad_norm": 0.6230588065700267,
|
|
"learning_rate": 3.657119902577466e-07,
|
|
"loss": 0.3059,
|
|
"mean_token_accuracy": 0.8904304560273886,
|
|
"num_tokens": 298903233.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 0.379852294921875,
|
|
"epoch": 2.761904761904762,
|
|
"grad_norm": 0.6119585891354671,
|
|
"learning_rate": 3.5407952736439266e-07,
|
|
"loss": 0.3011,
|
|
"mean_token_accuracy": 0.8909467747434974,
|
|
"num_tokens": 299315956.0,
|
|
"step": 696
|
|
},
|
|
{
|
|
"entropy": 0.384979248046875,
|
|
"epoch": 2.765873015873016,
|
|
"grad_norm": 0.630765395796783,
|
|
"learning_rate": 3.426317334045226e-07,
|
|
"loss": 0.3082,
|
|
"mean_token_accuracy": 0.8890936635434628,
|
|
"num_tokens": 299732237.0,
|
|
"step": 697
|
|
},
|
|
{
|
|
"entropy": 0.379730224609375,
|
|
"epoch": 2.7698412698412698,
|
|
"grad_norm": 0.665272270914942,
|
|
"learning_rate": 3.313688275435234e-07,
|
|
"loss": 0.3077,
|
|
"mean_token_accuracy": 0.889319458976388,
|
|
"num_tokens": 300162540.0,
|
|
"step": 698
|
|
},
|
|
{
|
|
"entropy": 0.38665771484375,
|
|
"epoch": 2.7738095238095237,
|
|
"grad_norm": 0.6746597478881748,
|
|
"learning_rate": 3.202910254071434e-07,
|
|
"loss": 0.3147,
|
|
"mean_token_accuracy": 0.88882967364043,
|
|
"num_tokens": 300580263.0,
|
|
"step": 699
|
|
},
|
|
{
|
|
"entropy": 0.376708984375,
|
|
"epoch": 2.7777777777777777,
|
|
"grad_norm": 0.6038711621641686,
|
|
"learning_rate": 3.0939853907736126e-07,
|
|
"loss": 0.2978,
|
|
"mean_token_accuracy": 0.8918641023337841,
|
|
"num_tokens": 301009855.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 0.3848876953125,
|
|
"epoch": 2.7817460317460316,
|
|
"grad_norm": 0.6176026217150087,
|
|
"learning_rate": 2.9869157708832805e-07,
|
|
"loss": 0.3019,
|
|
"mean_token_accuracy": 0.8898680582642555,
|
|
"num_tokens": 301429836.0,
|
|
"step": 701
|
|
},
|
|
{
|
|
"entropy": 0.38232421875,
|
|
"epoch": 2.7857142857142856,
|
|
"grad_norm": 0.6147895745768526,
|
|
"learning_rate": 2.881703444223716e-07,
|
|
"loss": 0.3053,
|
|
"mean_token_accuracy": 0.8896784670650959,
|
|
"num_tokens": 301852351.0,
|
|
"step": 702
|
|
},
|
|
{
|
|
"entropy": 0.382965087890625,
|
|
"epoch": 2.7896825396825395,
|
|
"grad_norm": 0.6315651993446116,
|
|
"learning_rate": 2.778350425060794e-07,
|
|
"loss": 0.3084,
|
|
"mean_token_accuracy": 0.8870635628700256,
|
|
"num_tokens": 302270241.0,
|
|
"step": 703
|
|
},
|
|
{
|
|
"entropy": 0.37701416015625,
|
|
"epoch": 2.7936507936507935,
|
|
"grad_norm": 0.599519759404208,
|
|
"learning_rate": 2.6768586920643324e-07,
|
|
"loss": 0.3068,
|
|
"mean_token_accuracy": 0.8893379056826234,
|
|
"num_tokens": 302702189.0,
|
|
"step": 704
|
|
},
|
|
{
|
|
"entropy": 0.380889892578125,
|
|
"epoch": 2.7976190476190474,
|
|
"grad_norm": 0.6078813973100959,
|
|
"learning_rate": 2.5772301882702634e-07,
|
|
"loss": 0.2992,
|
|
"mean_token_accuracy": 0.8915481101721525,
|
|
"num_tokens": 303138996.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 0.38671875,
|
|
"epoch": 2.8015873015873014,
|
|
"grad_norm": 0.6086380862937784,
|
|
"learning_rate": 2.4794668210434194e-07,
|
|
"loss": 0.3079,
|
|
"mean_token_accuracy": 0.887032619677484,
|
|
"num_tokens": 303546117.0,
|
|
"step": 706
|
|
},
|
|
{
|
|
"entropy": 0.37628173828125,
|
|
"epoch": 2.8055555555555554,
|
|
"grad_norm": 0.6027408982980551,
|
|
"learning_rate": 2.3835704620410294e-07,
|
|
"loss": 0.3063,
|
|
"mean_token_accuracy": 0.8905981313437223,
|
|
"num_tokens": 303985054.0,
|
|
"step": 707
|
|
},
|
|
{
|
|
"entropy": 0.37945556640625,
|
|
"epoch": 2.8095238095238093,
|
|
"grad_norm": 0.5994294775725023,
|
|
"learning_rate": 2.2895429471768925e-07,
|
|
"loss": 0.3073,
|
|
"mean_token_accuracy": 0.888611021451652,
|
|
"num_tokens": 304424136.0,
|
|
"step": 708
|
|
},
|
|
{
|
|
"entropy": 0.377716064453125,
|
|
"epoch": 2.8134920634920633,
|
|
"grad_norm": 0.6845179144444798,
|
|
"learning_rate": 2.1973860765861831e-07,
|
|
"loss": 0.302,
|
|
"mean_token_accuracy": 0.8900069631636143,
|
|
"num_tokens": 304862181.0,
|
|
"step": 709
|
|
},
|
|
{
|
|
"entropy": 0.3812255859375,
|
|
"epoch": 2.817460317460317,
|
|
"grad_norm": 0.6225625871453486,
|
|
"learning_rate": 2.107101614591045e-07,
|
|
"loss": 0.322,
|
|
"mean_token_accuracy": 0.8866511387750506,
|
|
"num_tokens": 305299176.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 0.377899169921875,
|
|
"epoch": 2.821428571428571,
|
|
"grad_norm": 0.6096957767253436,
|
|
"learning_rate": 2.0186912896667744e-07,
|
|
"loss": 0.3126,
|
|
"mean_token_accuracy": 0.8875591978430748,
|
|
"num_tokens": 305755604.0,
|
|
"step": 711
|
|
},
|
|
{
|
|
"entropy": 0.37921142578125,
|
|
"epoch": 2.825396825396825,
|
|
"grad_norm": 0.6667361291321218,
|
|
"learning_rate": 1.9321567944087573e-07,
|
|
"loss": 0.3012,
|
|
"mean_token_accuracy": 0.8895198963582516,
|
|
"num_tokens": 306180918.0,
|
|
"step": 712
|
|
},
|
|
{
|
|
"entropy": 0.382537841796875,
|
|
"epoch": 2.8293650793650795,
|
|
"grad_norm": 0.6218241109654549,
|
|
"learning_rate": 1.8474997855000177e-07,
|
|
"loss": 0.3208,
|
|
"mean_token_accuracy": 0.885167789645493,
|
|
"num_tokens": 306601315.0,
|
|
"step": 713
|
|
},
|
|
{
|
|
"entropy": 0.378570556640625,
|
|
"epoch": 2.8333333333333335,
|
|
"grad_norm": 0.5992276590550127,
|
|
"learning_rate": 1.7647218836795878e-07,
|
|
"loss": 0.3143,
|
|
"mean_token_accuracy": 0.8854878153651953,
|
|
"num_tokens": 307053855.0,
|
|
"step": 714
|
|
},
|
|
{
|
|
"entropy": 0.384765625,
|
|
"epoch": 2.8373015873015874,
|
|
"grad_norm": 0.6678241835727501,
|
|
"learning_rate": 1.6838246737113983e-07,
|
|
"loss": 0.3242,
|
|
"mean_token_accuracy": 0.8875371310859919,
|
|
"num_tokens": 307466527.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 0.382415771484375,
|
|
"epoch": 2.8412698412698414,
|
|
"grad_norm": 0.6327330692027668,
|
|
"learning_rate": 1.604809704353949e-07,
|
|
"loss": 0.3148,
|
|
"mean_token_accuracy": 0.8862312156707048,
|
|
"num_tokens": 307908233.0,
|
|
"step": 716
|
|
},
|
|
{
|
|
"entropy": 0.382720947265625,
|
|
"epoch": 2.8452380952380953,
|
|
"grad_norm": 0.6124198926311829,
|
|
"learning_rate": 1.5276784883307084e-07,
|
|
"loss": 0.3008,
|
|
"mean_token_accuracy": 0.8917096089571714,
|
|
"num_tokens": 308331740.0,
|
|
"step": 717
|
|
},
|
|
{
|
|
"entropy": 0.38189697265625,
|
|
"epoch": 2.8492063492063493,
|
|
"grad_norm": 0.595663678571871,
|
|
"learning_rate": 1.4524325023010932e-07,
|
|
"loss": 0.3004,
|
|
"mean_token_accuracy": 0.8921401789411902,
|
|
"num_tokens": 308765267.0,
|
|
"step": 718
|
|
},
|
|
{
|
|
"entropy": 0.3831787109375,
|
|
"epoch": 2.8531746031746033,
|
|
"grad_norm": 0.6212901666769164,
|
|
"learning_rate": 1.3790731868322472e-07,
|
|
"loss": 0.2948,
|
|
"mean_token_accuracy": 0.8915543537586927,
|
|
"num_tokens": 309177878.0,
|
|
"step": 719
|
|
},
|
|
{
|
|
"entropy": 0.390869140625,
|
|
"epoch": 2.857142857142857,
|
|
"grad_norm": 0.6016366375502071,
|
|
"learning_rate": 1.3076019463714173e-07,
|
|
"loss": 0.3179,
|
|
"mean_token_accuracy": 0.885790922679007,
|
|
"num_tokens": 309586897.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 0.377838134765625,
|
|
"epoch": 2.861111111111111,
|
|
"grad_norm": 0.6177685966036272,
|
|
"learning_rate": 1.238020149219099e-07,
|
|
"loss": 0.2997,
|
|
"mean_token_accuracy": 0.8919113390147686,
|
|
"num_tokens": 310011953.0,
|
|
"step": 721
|
|
},
|
|
{
|
|
"entropy": 0.382354736328125,
|
|
"epoch": 2.865079365079365,
|
|
"grad_norm": 0.5837465047986848,
|
|
"learning_rate": 1.1703291275028227e-07,
|
|
"loss": 0.3124,
|
|
"mean_token_accuracy": 0.8868957068771124,
|
|
"num_tokens": 310440896.0,
|
|
"step": 722
|
|
},
|
|
{
|
|
"entropy": 0.38067626953125,
|
|
"epoch": 2.869047619047619,
|
|
"grad_norm": 0.6367975390082888,
|
|
"learning_rate": 1.1045301771516748e-07,
|
|
"loss": 0.3049,
|
|
"mean_token_accuracy": 0.8887251811102033,
|
|
"num_tokens": 310867155.0,
|
|
"step": 723
|
|
},
|
|
{
|
|
"entropy": 0.380279541015625,
|
|
"epoch": 2.873015873015873,
|
|
"grad_norm": 0.5927899623186489,
|
|
"learning_rate": 1.0406245578714613e-07,
|
|
"loss": 0.3041,
|
|
"mean_token_accuracy": 0.8911432735621929,
|
|
"num_tokens": 311297562.0,
|
|
"step": 724
|
|
},
|
|
{
|
|
"entropy": 0.377532958984375,
|
|
"epoch": 2.876984126984127,
|
|
"grad_norm": 0.6106617971663426,
|
|
"learning_rate": 9.786134931205726e-08,
|
|
"loss": 0.3168,
|
|
"mean_token_accuracy": 0.888288808055222,
|
|
"num_tokens": 311750163.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 0.37890625,
|
|
"epoch": 2.880952380952381,
|
|
"grad_norm": 0.6038481080562059,
|
|
"learning_rate": 9.184981700866347e-08,
|
|
"loss": 0.3229,
|
|
"mean_token_accuracy": 0.887555805966258,
|
|
"num_tokens": 312189513.0,
|
|
"step": 726
|
|
},
|
|
{
|
|
"entropy": 0.37835693359375,
|
|
"epoch": 2.884920634920635,
|
|
"grad_norm": 0.5979812639407914,
|
|
"learning_rate": 8.602797396636941e-08,
|
|
"loss": 0.3062,
|
|
"mean_token_accuracy": 0.8877870365977287,
|
|
"num_tokens": 312626002.0,
|
|
"step": 727
|
|
},
|
|
{
|
|
"entropy": 0.38275146484375,
|
|
"epoch": 2.888888888888889,
|
|
"grad_norm": 0.6398028531474257,
|
|
"learning_rate": 8.039593164302362e-08,
|
|
"loss": 0.3138,
|
|
"mean_token_accuracy": 0.8863641833886504,
|
|
"num_tokens": 313055977.0,
|
|
"step": 728
|
|
},
|
|
{
|
|
"entropy": 0.3807373046875,
|
|
"epoch": 2.892857142857143,
|
|
"grad_norm": 0.616805437056275,
|
|
"learning_rate": 7.495379786278456e-08,
|
|
"loss": 0.302,
|
|
"mean_token_accuracy": 0.8888390958309174,
|
|
"num_tokens": 313491837.0,
|
|
"step": 729
|
|
},
|
|
{
|
|
"entropy": 0.381591796875,
|
|
"epoch": 2.8968253968253967,
|
|
"grad_norm": 0.6015916324330552,
|
|
"learning_rate": 6.970167681405459e-08,
|
|
"loss": 0.3251,
|
|
"mean_token_accuracy": 0.8857261892408133,
|
|
"num_tokens": 313934488.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 0.383575439453125,
|
|
"epoch": 2.9007936507936507,
|
|
"grad_norm": 0.6021514631896276,
|
|
"learning_rate": 6.463966904748487e-08,
|
|
"loss": 0.308,
|
|
"mean_token_accuracy": 0.8882989063858986,
|
|
"num_tokens": 314354323.0,
|
|
"step": 731
|
|
},
|
|
{
|
|
"entropy": 0.38092041015625,
|
|
"epoch": 2.9047619047619047,
|
|
"grad_norm": 0.6035366486394909,
|
|
"learning_rate": 5.97678714740535e-08,
|
|
"loss": 0.3191,
|
|
"mean_token_accuracy": 0.8849895298480988,
|
|
"num_tokens": 314782888.0,
|
|
"step": 732
|
|
},
|
|
{
|
|
"entropy": 0.380157470703125,
|
|
"epoch": 2.9087301587301586,
|
|
"grad_norm": 0.6246810880941083,
|
|
"learning_rate": 5.508637736320488e-08,
|
|
"loss": 0.3026,
|
|
"mean_token_accuracy": 0.8905821247026324,
|
|
"num_tokens": 315218545.0,
|
|
"step": 733
|
|
},
|
|
{
|
|
"entropy": 0.37481689453125,
|
|
"epoch": 2.9126984126984126,
|
|
"grad_norm": 0.5742999768873462,
|
|
"learning_rate": 5.0595276341071084e-08,
|
|
"loss": 0.2928,
|
|
"mean_token_accuracy": 0.8928123638033867,
|
|
"num_tokens": 315667111.0,
|
|
"step": 734
|
|
},
|
|
{
|
|
"entropy": 0.387847900390625,
|
|
"epoch": 2.9166666666666665,
|
|
"grad_norm": 0.61460809122353,
|
|
"learning_rate": 4.62946543887488e-08,
|
|
"loss": 0.3122,
|
|
"mean_token_accuracy": 0.8853531358763576,
|
|
"num_tokens": 316078411.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 0.378631591796875,
|
|
"epoch": 2.9206349206349205,
|
|
"grad_norm": 0.6260179342418157,
|
|
"learning_rate": 4.218459384065954e-08,
|
|
"loss": 0.3127,
|
|
"mean_token_accuracy": 0.8865446662530303,
|
|
"num_tokens": 316519645.0,
|
|
"step": 736
|
|
},
|
|
{
|
|
"entropy": 0.38299560546875,
|
|
"epoch": 2.924603174603175,
|
|
"grad_norm": 0.6528580905521568,
|
|
"learning_rate": 3.826517338296865e-08,
|
|
"loss": 0.3214,
|
|
"mean_token_accuracy": 0.886158674955368,
|
|
"num_tokens": 316948710.0,
|
|
"step": 737
|
|
},
|
|
{
|
|
"entropy": 0.380340576171875,
|
|
"epoch": 2.928571428571429,
|
|
"grad_norm": 0.5932548971126642,
|
|
"learning_rate": 3.4536468052082106e-08,
|
|
"loss": 0.2991,
|
|
"mean_token_accuracy": 0.8918010191991925,
|
|
"num_tokens": 317376914.0,
|
|
"step": 738
|
|
},
|
|
{
|
|
"entropy": 0.382415771484375,
|
|
"epoch": 2.932539682539683,
|
|
"grad_norm": 0.6263098885743876,
|
|
"learning_rate": 3.0998549233205446e-08,
|
|
"loss": 0.3014,
|
|
"mean_token_accuracy": 0.8899451838806272,
|
|
"num_tokens": 317783718.0,
|
|
"step": 739
|
|
},
|
|
{
|
|
"entropy": 0.38323974609375,
|
|
"epoch": 2.9365079365079367,
|
|
"grad_norm": 0.6280292549269856,
|
|
"learning_rate": 2.7651484658984816e-08,
|
|
"loss": 0.3263,
|
|
"mean_token_accuracy": 0.8853430952876806,
|
|
"num_tokens": 318210944.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 0.380523681640625,
|
|
"epoch": 2.9404761904761907,
|
|
"grad_norm": 0.6199875922767187,
|
|
"learning_rate": 2.4495338408201397e-08,
|
|
"loss": 0.3206,
|
|
"mean_token_accuracy": 0.8864352721720934,
|
|
"num_tokens": 318636758.0,
|
|
"step": 741
|
|
},
|
|
{
|
|
"entropy": 0.37628173828125,
|
|
"epoch": 2.9444444444444446,
|
|
"grad_norm": 0.5975785528517842,
|
|
"learning_rate": 2.153017090455123e-08,
|
|
"loss": 0.3055,
|
|
"mean_token_accuracy": 0.8887868747115135,
|
|
"num_tokens": 319072977.0,
|
|
"step": 742
|
|
},
|
|
{
|
|
"entropy": 0.383331298828125,
|
|
"epoch": 2.9484126984126986,
|
|
"grad_norm": 0.6002719426244223,
|
|
"learning_rate": 1.8756038915486165e-08,
|
|
"loss": 0.3097,
|
|
"mean_token_accuracy": 0.888092122040689,
|
|
"num_tokens": 319491098.0,
|
|
"step": 743
|
|
},
|
|
{
|
|
"entropy": 0.376800537109375,
|
|
"epoch": 2.9523809523809526,
|
|
"grad_norm": 0.6400387970423335,
|
|
"learning_rate": 1.6172995551125836e-08,
|
|
"loss": 0.3248,
|
|
"mean_token_accuracy": 0.8845498086884618,
|
|
"num_tokens": 319936836.0,
|
|
"step": 744
|
|
},
|
|
{
|
|
"entropy": 0.376953125,
|
|
"epoch": 2.9563492063492065,
|
|
"grad_norm": 0.6271748530387473,
|
|
"learning_rate": 1.3781090263242924e-08,
|
|
"loss": 0.3167,
|
|
"mean_token_accuracy": 0.887404091656208,
|
|
"num_tokens": 320379711.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 0.379180908203125,
|
|
"epoch": 2.9603174603174605,
|
|
"grad_norm": 0.6066932052254789,
|
|
"learning_rate": 1.1580368844316125e-08,
|
|
"loss": 0.2976,
|
|
"mean_token_accuracy": 0.8918975051492453,
|
|
"num_tokens": 320807541.0,
|
|
"step": 746
|
|
},
|
|
{
|
|
"entropy": 0.380401611328125,
|
|
"epoch": 2.9642857142857144,
|
|
"grad_norm": 0.6315981302947575,
|
|
"learning_rate": 9.570873426649752e-09,
|
|
"loss": 0.3099,
|
|
"mean_token_accuracy": 0.8889196058735251,
|
|
"num_tokens": 321249293.0,
|
|
"step": 747
|
|
},
|
|
{
|
|
"entropy": 0.381500244140625,
|
|
"epoch": 2.9682539682539684,
|
|
"grad_norm": 0.5833316927135702,
|
|
"learning_rate": 7.752642481573258e-09,
|
|
"loss": 0.3042,
|
|
"mean_token_accuracy": 0.8893669536337256,
|
|
"num_tokens": 321676330.0,
|
|
"step": 748
|
|
},
|
|
{
|
|
"entropy": 0.379302978515625,
|
|
"epoch": 2.9722222222222223,
|
|
"grad_norm": 0.6084320256550227,
|
|
"learning_rate": 6.125710818701836e-09,
|
|
"loss": 0.3034,
|
|
"mean_token_accuracy": 0.8921427316963673,
|
|
"num_tokens": 322088977.0,
|
|
"step": 749
|
|
},
|
|
{
|
|
"entropy": 0.3780517578125,
|
|
"epoch": 2.9761904761904763,
|
|
"grad_norm": 0.6035928268017802,
|
|
"learning_rate": 4.690109585268054e-09,
|
|
"loss": 0.3061,
|
|
"mean_token_accuracy": 0.8898161184042692,
|
|
"num_tokens": 322526195.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 0.38763427734375,
|
|
"epoch": 2.9801587301587302,
|
|
"grad_norm": 0.6190270535862277,
|
|
"learning_rate": 3.445866265526787e-09,
|
|
"loss": 0.3177,
|
|
"mean_token_accuracy": 0.8873382732272148,
|
|
"num_tokens": 322944491.0,
|
|
"step": 751
|
|
},
|
|
{
|
|
"entropy": 0.38092041015625,
|
|
"epoch": 2.984126984126984,
|
|
"grad_norm": 0.6117829785012998,
|
|
"learning_rate": 2.3930046802322914e-09,
|
|
"loss": 0.3148,
|
|
"mean_token_accuracy": 0.885888421908021,
|
|
"num_tokens": 323373321.0,
|
|
"step": 752
|
|
},
|
|
{
|
|
"entropy": 0.3792724609375,
|
|
"epoch": 2.988095238095238,
|
|
"grad_norm": 0.588266741951019,
|
|
"learning_rate": 1.531544986177469e-09,
|
|
"loss": 0.3041,
|
|
"mean_token_accuracy": 0.8897914877161384,
|
|
"num_tokens": 323803097.0,
|
|
"step": 753
|
|
},
|
|
{
|
|
"entropy": 0.37762451171875,
|
|
"epoch": 2.992063492063492,
|
|
"grad_norm": 0.5894056132295504,
|
|
"learning_rate": 8.615036758108375e-10,
|
|
"loss": 0.2976,
|
|
"mean_token_accuracy": 0.890996178612113,
|
|
"num_tokens": 324241487.0,
|
|
"step": 754
|
|
},
|
|
{
|
|
"entropy": 0.376739501953125,
|
|
"epoch": 2.996031746031746,
|
|
"grad_norm": 0.6741605494505846,
|
|
"learning_rate": 3.8289357691900785e-10,
|
|
"loss": 0.314,
|
|
"mean_token_accuracy": 0.8861532881855965,
|
|
"num_tokens": 324694260.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 0.38372802734375,
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.6110340309341243,
|
|
"learning_rate": 9.572385238243443e-11,
|
|
"loss": 0.314,
|
|
"mean_token_accuracy": 0.8879904169589281,
|
|
"num_tokens": 325114310.0,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 756,
|
|
"total_flos": 601237772369920.0,
|
|
"train_loss": 0.43782205426346055,
|
|
"train_runtime": 58008.3635,
|
|
"train_samples_per_second": 1.27,
|
|
"train_steps_per_second": 0.013
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 756,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 63,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 601237772369920.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|