2035 lines
54 KiB
JSON
2035 lines
54 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.05811081732864573,
|
|
"eval_steps": 500,
|
|
"global_step": 1000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 10.742584228515625,
|
|
"epoch": 0.0002905540866432286,
|
|
"grad_norm": 4.90625,
|
|
"learning_rate": 2e-06,
|
|
"loss": 10.7837,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 10156.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 10.742587471008301,
|
|
"epoch": 0.0005811081732864572,
|
|
"grad_norm": 4.8125,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 10.7753,
|
|
"mean_token_accuracy": 9.267840650863945e-05,
|
|
"num_tokens": 20933.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 10.74257869720459,
|
|
"epoch": 0.0008716622599296859,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 7e-06,
|
|
"loss": 10.7508,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 31298.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 10.742635726928711,
|
|
"epoch": 0.0011622163465729145,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 10.697,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 40913.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 10.742652702331544,
|
|
"epoch": 0.0014527704332161432,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 10.5798,
|
|
"mean_token_accuracy": 0.0007269373920280487,
|
|
"num_tokens": 49901.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.742454719543456,
|
|
"epoch": 0.0017433245198593718,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.4688,
|
|
"mean_token_accuracy": 0.01560134175233543,
|
|
"num_tokens": 59328.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.741775226593017,
|
|
"epoch": 0.0020338786065026006,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 10.3287,
|
|
"mean_token_accuracy": 0.037073963694274424,
|
|
"num_tokens": 68405.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.740037631988525,
|
|
"epoch": 0.002324432693145829,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 10.2203,
|
|
"mean_token_accuracy": 0.037133642472326756,
|
|
"num_tokens": 77591.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.73731575012207,
|
|
"epoch": 0.0026149867797890577,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 10.1202,
|
|
"mean_token_accuracy": 0.03901108838617802,
|
|
"num_tokens": 88186.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.734606838226318,
|
|
"epoch": 0.0029055408664322865,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 10.0211,
|
|
"mean_token_accuracy": 0.04241710864007473,
|
|
"num_tokens": 97594.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.73211612701416,
|
|
"epoch": 0.003196094953075515,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.9871,
|
|
"mean_token_accuracy": 0.03826836366206408,
|
|
"num_tokens": 107386.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.73102445602417,
|
|
"epoch": 0.0034866490397187436,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.9132,
|
|
"mean_token_accuracy": 0.03943221494555473,
|
|
"num_tokens": 116742.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.729645252227783,
|
|
"epoch": 0.0037772031263619723,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.8519,
|
|
"mean_token_accuracy": 0.03962419871240854,
|
|
"num_tokens": 126520.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.727834033966065,
|
|
"epoch": 0.004067757213005201,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.7907,
|
|
"mean_token_accuracy": 0.03989919070154428,
|
|
"num_tokens": 136382.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.724947452545166,
|
|
"epoch": 0.0043583112996484295,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.7212,
|
|
"mean_token_accuracy": 0.03671109899878502,
|
|
"num_tokens": 146435.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.72182493209839,
|
|
"epoch": 0.004648865386291658,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.6591,
|
|
"mean_token_accuracy": 0.037667426839470865,
|
|
"num_tokens": 156174.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.71723222732544,
|
|
"epoch": 0.004939419472934887,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.5783,
|
|
"mean_token_accuracy": 0.04142397493124008,
|
|
"num_tokens": 165118.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.708585739135742,
|
|
"epoch": 0.005229973559578115,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.5252,
|
|
"mean_token_accuracy": 0.04036426953971386,
|
|
"num_tokens": 174401.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.697160243988037,
|
|
"epoch": 0.005520527646221344,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 9.443,
|
|
"mean_token_accuracy": 0.04118307866156101,
|
|
"num_tokens": 183533.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.683875274658202,
|
|
"epoch": 0.005811081732864573,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 9.3596,
|
|
"mean_token_accuracy": 0.045602331310510634,
|
|
"num_tokens": 193296.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.665638542175293,
|
|
"epoch": 0.006101635819507801,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 9.2242,
|
|
"mean_token_accuracy": 0.055989645794034,
|
|
"num_tokens": 202741.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.637047958374023,
|
|
"epoch": 0.00639218990615103,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 9.1359,
|
|
"mean_token_accuracy": 0.05134495124220848,
|
|
"num_tokens": 212441.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 10.61000461578369,
|
|
"epoch": 0.006682743992794259,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.9868,
|
|
"mean_token_accuracy": 0.04918566383421421,
|
|
"num_tokens": 220671.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 10.56931962966919,
|
|
"epoch": 0.006973298079437487,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.9878,
|
|
"mean_token_accuracy": 0.04560479037463665,
|
|
"num_tokens": 231390.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 10.515452098846435,
|
|
"epoch": 0.007263852166080716,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.8241,
|
|
"mean_token_accuracy": 0.05023673102259636,
|
|
"num_tokens": 241137.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 10.430156230926514,
|
|
"epoch": 0.007554406252723945,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.6778,
|
|
"mean_token_accuracy": 0.05138532817363739,
|
|
"num_tokens": 250627.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 10.353140926361084,
|
|
"epoch": 0.007844960339367173,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.5255,
|
|
"mean_token_accuracy": 0.05529710613191128,
|
|
"num_tokens": 259564.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 10.262280082702636,
|
|
"epoch": 0.008135514426010402,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 8.4168,
|
|
"mean_token_accuracy": 0.05102897398173809,
|
|
"num_tokens": 268997.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 10.17268762588501,
|
|
"epoch": 0.00842606851265363,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 8.402,
|
|
"mean_token_accuracy": 0.04707291163504124,
|
|
"num_tokens": 278989.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 10.068906784057617,
|
|
"epoch": 0.008716622599296859,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 8.2195,
|
|
"mean_token_accuracy": 0.04823922924697399,
|
|
"num_tokens": 288770.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 9.884156227111816,
|
|
"epoch": 0.009007176685940088,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 8.1604,
|
|
"mean_token_accuracy": 0.05296766012907028,
|
|
"num_tokens": 298368.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 9.749515438079834,
|
|
"epoch": 0.009297730772583316,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.9887,
|
|
"mean_token_accuracy": 0.054083061218261716,
|
|
"num_tokens": 307437.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 9.539670753479005,
|
|
"epoch": 0.009588284859226545,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.931,
|
|
"mean_token_accuracy": 0.05368399284780025,
|
|
"num_tokens": 317842.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 9.367785167694091,
|
|
"epoch": 0.009878838945869774,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.7746,
|
|
"mean_token_accuracy": 0.056211471930146216,
|
|
"num_tokens": 327455.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 9.106531143188477,
|
|
"epoch": 0.010169393032513002,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.7023,
|
|
"mean_token_accuracy": 0.059121083468198776,
|
|
"num_tokens": 338593.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.891216564178468,
|
|
"epoch": 0.01045994711915623,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.6717,
|
|
"mean_token_accuracy": 0.060001150518655774,
|
|
"num_tokens": 348278.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.690237522125244,
|
|
"epoch": 0.01075050120579946,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.5848,
|
|
"mean_token_accuracy": 0.060652027279138564,
|
|
"num_tokens": 358293.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 8.500970458984375,
|
|
"epoch": 0.011041055292442687,
|
|
"grad_norm": 0.70703125,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.6462,
|
|
"mean_token_accuracy": 0.06345079019665718,
|
|
"num_tokens": 368177.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 8.432841682434082,
|
|
"epoch": 0.011331609379085917,
|
|
"grad_norm": 0.87890625,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.5041,
|
|
"mean_token_accuracy": 0.06438801400363445,
|
|
"num_tokens": 377258.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 8.328762531280518,
|
|
"epoch": 0.011622163465729146,
|
|
"grad_norm": 0.79296875,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.515,
|
|
"mean_token_accuracy": 0.06462946832180023,
|
|
"num_tokens": 385931.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 8.228355598449706,
|
|
"epoch": 0.011912717552372373,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.4262,
|
|
"mean_token_accuracy": 0.06731356121599674,
|
|
"num_tokens": 394370.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 8.163572025299072,
|
|
"epoch": 0.012203271639015602,
|
|
"grad_norm": 0.765625,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.5127,
|
|
"mean_token_accuracy": 0.06187250129878521,
|
|
"num_tokens": 405167.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 8.144425964355468,
|
|
"epoch": 0.012493825725658832,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.4823,
|
|
"mean_token_accuracy": 0.06424942426383495,
|
|
"num_tokens": 414954.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 8.074434852600097,
|
|
"epoch": 0.01278437981230206,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.4379,
|
|
"mean_token_accuracy": 0.07021872885525227,
|
|
"num_tokens": 423806.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 8.100719451904297,
|
|
"epoch": 0.013074933898945288,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.4049,
|
|
"mean_token_accuracy": 0.07006631046533585,
|
|
"num_tokens": 433416.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 8.068440341949463,
|
|
"epoch": 0.013365487985588518,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.4086,
|
|
"mean_token_accuracy": 0.0656484205275774,
|
|
"num_tokens": 443237.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 8.008077144622803,
|
|
"epoch": 0.013656042072231747,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.3811,
|
|
"mean_token_accuracy": 0.07138268202543259,
|
|
"num_tokens": 452334.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.95733003616333,
|
|
"epoch": 0.013946596158874974,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.451,
|
|
"mean_token_accuracy": 0.07069577798247337,
|
|
"num_tokens": 462604.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.985943031311035,
|
|
"epoch": 0.014237150245518203,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.3342,
|
|
"mean_token_accuracy": 0.07418472990393639,
|
|
"num_tokens": 472105.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.985353708267212,
|
|
"epoch": 0.014527704332161433,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.3573,
|
|
"mean_token_accuracy": 0.07192124761641025,
|
|
"num_tokens": 481873.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.852858924865723,
|
|
"epoch": 0.01481825841880466,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.3134,
|
|
"mean_token_accuracy": 0.07094009146094322,
|
|
"num_tokens": 490776.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.97090711593628,
|
|
"epoch": 0.01510881250544789,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.3459,
|
|
"mean_token_accuracy": 0.06945950090885163,
|
|
"num_tokens": 500237.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.988322401046753,
|
|
"epoch": 0.015399366592091119,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.3569,
|
|
"mean_token_accuracy": 0.0719369538128376,
|
|
"num_tokens": 509449.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.863973140716553,
|
|
"epoch": 0.015689920678734346,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.3463,
|
|
"mean_token_accuracy": 0.07629362866282463,
|
|
"num_tokens": 519335.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.850080347061157,
|
|
"epoch": 0.015980474765377575,
|
|
"grad_norm": 0.828125,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.269,
|
|
"mean_token_accuracy": 0.07348301075398922,
|
|
"num_tokens": 529108.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.803001642227173,
|
|
"epoch": 0.016271028852020804,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.3421,
|
|
"mean_token_accuracy": 0.07442944496870041,
|
|
"num_tokens": 539409.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.8401947021484375,
|
|
"epoch": 0.016561582938664034,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.269,
|
|
"mean_token_accuracy": 0.07476447969675064,
|
|
"num_tokens": 549790.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.773062610626221,
|
|
"epoch": 0.01685213702530726,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.2597,
|
|
"mean_token_accuracy": 0.07743276208639145,
|
|
"num_tokens": 559343.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.833370351791382,
|
|
"epoch": 0.01714269111195049,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.306,
|
|
"mean_token_accuracy": 0.07507650516927242,
|
|
"num_tokens": 568806.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.692620134353637,
|
|
"epoch": 0.017433245198593718,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.1532,
|
|
"mean_token_accuracy": 0.07671754881739616,
|
|
"num_tokens": 578988.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.840510559082031,
|
|
"epoch": 0.017723799285236947,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.258,
|
|
"mean_token_accuracy": 0.0767325557768345,
|
|
"num_tokens": 588588.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.740892934799194,
|
|
"epoch": 0.018014353371880176,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.2385,
|
|
"mean_token_accuracy": 0.07767370343208313,
|
|
"num_tokens": 597957.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.761815309524536,
|
|
"epoch": 0.018304907458523405,
|
|
"grad_norm": 0.8671875,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.2168,
|
|
"mean_token_accuracy": 0.07732245922088624,
|
|
"num_tokens": 607446.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.723113679885865,
|
|
"epoch": 0.01859546154516663,
|
|
"grad_norm": 0.8359375,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.1559,
|
|
"mean_token_accuracy": 0.07753840312361718,
|
|
"num_tokens": 617064.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.695508337020874,
|
|
"epoch": 0.01888601563180986,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.2008,
|
|
"mean_token_accuracy": 0.08057244047522545,
|
|
"num_tokens": 625927.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.717827177047729,
|
|
"epoch": 0.01917656971845309,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.1152,
|
|
"mean_token_accuracy": 0.07994545996189117,
|
|
"num_tokens": 635341.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.675025224685669,
|
|
"epoch": 0.01946712380509632,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.1106,
|
|
"mean_token_accuracy": 0.08988085016608238,
|
|
"num_tokens": 645095.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.714554166793823,
|
|
"epoch": 0.019757677891739548,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.1558,
|
|
"mean_token_accuracy": 0.0730321068316698,
|
|
"num_tokens": 654754.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.60111026763916,
|
|
"epoch": 0.020048231978382777,
|
|
"grad_norm": 0.8671875,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.1266,
|
|
"mean_token_accuracy": 0.07690966166555882,
|
|
"num_tokens": 664589.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.6628223896026615,
|
|
"epoch": 0.020338786065026003,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.1425,
|
|
"mean_token_accuracy": 0.07918459475040436,
|
|
"num_tokens": 673870.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.577814197540283,
|
|
"epoch": 0.020629340151669232,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.1137,
|
|
"mean_token_accuracy": 0.07997918874025345,
|
|
"num_tokens": 684309.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.6769345760345455,
|
|
"epoch": 0.02091989423831246,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 7.1629,
|
|
"mean_token_accuracy": 0.07469077445566655,
|
|
"num_tokens": 693702.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.534895896911621,
|
|
"epoch": 0.02121044832495569,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.0599,
|
|
"mean_token_accuracy": 0.07970957532525062,
|
|
"num_tokens": 702951.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.588031339645386,
|
|
"epoch": 0.02150100241159892,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.0677,
|
|
"mean_token_accuracy": 0.08218754455447197,
|
|
"num_tokens": 712481.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.600922870635986,
|
|
"epoch": 0.02179155649824215,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.0683,
|
|
"mean_token_accuracy": 0.08380770459771156,
|
|
"num_tokens": 721579.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.572713327407837,
|
|
"epoch": 0.022082110584885375,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.0774,
|
|
"mean_token_accuracy": 0.07982454895973205,
|
|
"num_tokens": 731404.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.548839807510376,
|
|
"epoch": 0.022372664671528604,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.0556,
|
|
"mean_token_accuracy": 0.07496214136481286,
|
|
"num_tokens": 740751.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.523876476287842,
|
|
"epoch": 0.022663218758171833,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.0247,
|
|
"mean_token_accuracy": 0.08082472011446953,
|
|
"num_tokens": 751171.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.552808237075806,
|
|
"epoch": 0.022953772844815062,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.0823,
|
|
"mean_token_accuracy": 0.07615064568817616,
|
|
"num_tokens": 760874.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.583486127853393,
|
|
"epoch": 0.02324432693145829,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.0585,
|
|
"mean_token_accuracy": 0.08326990716159344,
|
|
"num_tokens": 769652.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.488273334503174,
|
|
"epoch": 0.02353488101810152,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.0421,
|
|
"mean_token_accuracy": 0.07620194889605045,
|
|
"num_tokens": 779591.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.564187002182007,
|
|
"epoch": 0.023825435104744747,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.156,
|
|
"mean_token_accuracy": 0.08098742663860321,
|
|
"num_tokens": 789582.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.506245565414429,
|
|
"epoch": 0.024115989191387976,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000207,
|
|
"loss": 7.0546,
|
|
"mean_token_accuracy": 0.07765479311347008,
|
|
"num_tokens": 799146.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.4926127910614015,
|
|
"epoch": 0.024406543278031205,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 7.0246,
|
|
"mean_token_accuracy": 0.0782523088157177,
|
|
"num_tokens": 808934.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.532363748550415,
|
|
"epoch": 0.024697097364674434,
|
|
"grad_norm": 0.87109375,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.0821,
|
|
"mean_token_accuracy": 0.07597277015447616,
|
|
"num_tokens": 819280.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.457432746887207,
|
|
"epoch": 0.024987651451317663,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.9892,
|
|
"mean_token_accuracy": 0.0840725652873516,
|
|
"num_tokens": 828818.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.463752698898316,
|
|
"epoch": 0.025278205537960893,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.9816,
|
|
"mean_token_accuracy": 0.08661384396255016,
|
|
"num_tokens": 839175.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.5449175357818605,
|
|
"epoch": 0.02556875962460412,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 7.0777,
|
|
"mean_token_accuracy": 0.07947314418852329,
|
|
"num_tokens": 849965.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.392349624633789,
|
|
"epoch": 0.025859313711247348,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.9968,
|
|
"mean_token_accuracy": 0.08229465186595916,
|
|
"num_tokens": 859229.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.4397971630096436,
|
|
"epoch": 0.026149867797890577,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.9708,
|
|
"mean_token_accuracy": 0.0816520519554615,
|
|
"num_tokens": 869199.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.399962043762207,
|
|
"epoch": 0.026440421884533806,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.9666,
|
|
"mean_token_accuracy": 0.09285714998841285,
|
|
"num_tokens": 879470.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.4366514682769775,
|
|
"epoch": 0.026730975971177035,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.9274,
|
|
"mean_token_accuracy": 0.0792453158646822,
|
|
"num_tokens": 888397.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.370485734939575,
|
|
"epoch": 0.027021530057820264,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.8202,
|
|
"mean_token_accuracy": 0.0855403620749712,
|
|
"num_tokens": 898321.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.4845947265625,
|
|
"epoch": 0.027312084144463494,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.0856,
|
|
"mean_token_accuracy": 0.0808610200881958,
|
|
"num_tokens": 907947.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.327203702926636,
|
|
"epoch": 0.02760263823110672,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.9103,
|
|
"mean_token_accuracy": 0.0954340323805809,
|
|
"num_tokens": 916842.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.380954456329346,
|
|
"epoch": 0.02789319231774995,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 7.0098,
|
|
"mean_token_accuracy": 0.08165798112750053,
|
|
"num_tokens": 926431.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.412681722640992,
|
|
"epoch": 0.028183746404393178,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.9162,
|
|
"mean_token_accuracy": 0.08133741281926632,
|
|
"num_tokens": 935819.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.44426212310791,
|
|
"epoch": 0.028474300491036407,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.9418,
|
|
"mean_token_accuracy": 0.08402741849422454,
|
|
"num_tokens": 944198.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.264917373657227,
|
|
"epoch": 0.028764854577679636,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.9628,
|
|
"mean_token_accuracy": 0.08352083042263984,
|
|
"num_tokens": 954972.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.385922384262085,
|
|
"epoch": 0.029055408664322865,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.9018,
|
|
"mean_token_accuracy": 0.08520250022411346,
|
|
"num_tokens": 964532.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.475071048736572,
|
|
"epoch": 0.02934596275096609,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000252,
|
|
"loss": 6.9955,
|
|
"mean_token_accuracy": 0.07958225682377815,
|
|
"num_tokens": 974547.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.299204540252686,
|
|
"epoch": 0.02963651683760932,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 6.935,
|
|
"mean_token_accuracy": 0.08022963926196099,
|
|
"num_tokens": 984245.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.318370199203491,
|
|
"epoch": 0.02992707092425255,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.000257,
|
|
"loss": 6.7766,
|
|
"mean_token_accuracy": 0.08500204458832741,
|
|
"num_tokens": 994400.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.352757215499878,
|
|
"epoch": 0.03021762501089578,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 7.0024,
|
|
"mean_token_accuracy": 0.07765024341642857,
|
|
"num_tokens": 1005775.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.312537145614624,
|
|
"epoch": 0.030508179097539008,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000262,
|
|
"loss": 6.9055,
|
|
"mean_token_accuracy": 0.08693855553865433,
|
|
"num_tokens": 1015386.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.383286190032959,
|
|
"epoch": 0.030798733184182237,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 6.8994,
|
|
"mean_token_accuracy": 0.09188547134399414,
|
|
"num_tokens": 1024963.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.249363946914673,
|
|
"epoch": 0.031089287270825463,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.8996,
|
|
"mean_token_accuracy": 0.08531768508255481,
|
|
"num_tokens": 1034667.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.265355777740479,
|
|
"epoch": 0.03137984135746869,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.8796,
|
|
"mean_token_accuracy": 0.08795020580291749,
|
|
"num_tokens": 1044171.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.295146417617798,
|
|
"epoch": 0.031670395444111925,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.8538,
|
|
"mean_token_accuracy": 0.08691519349813462,
|
|
"num_tokens": 1053585.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 7.237406063079834,
|
|
"epoch": 0.03196094953075515,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 6.7515,
|
|
"mean_token_accuracy": 0.09050033241510391,
|
|
"num_tokens": 1063310.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.263738679885864,
|
|
"epoch": 0.032251503617398376,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.000277,
|
|
"loss": 6.8651,
|
|
"mean_token_accuracy": 0.08824861124157905,
|
|
"num_tokens": 1073529.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.175330972671508,
|
|
"epoch": 0.03254205770404161,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.8319,
|
|
"mean_token_accuracy": 0.08951647505164147,
|
|
"num_tokens": 1083432.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.184946346282959,
|
|
"epoch": 0.032832611790684835,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.8004,
|
|
"mean_token_accuracy": 0.09656240493059158,
|
|
"num_tokens": 1092453.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.274725437164307,
|
|
"epoch": 0.03312316587732807,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.8865,
|
|
"mean_token_accuracy": 0.08661114051938057,
|
|
"num_tokens": 1102402.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.303795433044433,
|
|
"epoch": 0.03341371996397129,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.8928,
|
|
"mean_token_accuracy": 0.09610759019851685,
|
|
"num_tokens": 1111907.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.228280067443848,
|
|
"epoch": 0.03370427405061452,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.7846,
|
|
"mean_token_accuracy": 0.09133462607860565,
|
|
"num_tokens": 1120712.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.0720751762390135,
|
|
"epoch": 0.03399482813725775,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.6691,
|
|
"mean_token_accuracy": 0.0894063800573349,
|
|
"num_tokens": 1131165.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.229758644104004,
|
|
"epoch": 0.03428538222390098,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.8337,
|
|
"mean_token_accuracy": 0.08700250834226608,
|
|
"num_tokens": 1140527.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 7.137591791152954,
|
|
"epoch": 0.03457593631054421,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.792,
|
|
"mean_token_accuracy": 0.08842456936836243,
|
|
"num_tokens": 1149977.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.240325021743774,
|
|
"epoch": 0.034866490397187436,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 6.8153,
|
|
"mean_token_accuracy": 0.08972005397081376,
|
|
"num_tokens": 1159918.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.116828918457031,
|
|
"epoch": 0.03515704448383067,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.000302,
|
|
"loss": 6.7965,
|
|
"mean_token_accuracy": 0.08587550893425941,
|
|
"num_tokens": 1169218.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.1641600131988525,
|
|
"epoch": 0.035447598570473894,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.8058,
|
|
"mean_token_accuracy": 0.09056585654616356,
|
|
"num_tokens": 1179429.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.0538177490234375,
|
|
"epoch": 0.03573815265711712,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.7051,
|
|
"mean_token_accuracy": 0.0951805867254734,
|
|
"num_tokens": 1189379.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 7.165834856033325,
|
|
"epoch": 0.03602870674376035,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.6834,
|
|
"mean_token_accuracy": 0.09452618882060052,
|
|
"num_tokens": 1198643.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 7.1435986995697025,
|
|
"epoch": 0.03631926083040358,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.8985,
|
|
"mean_token_accuracy": 0.08901753202080727,
|
|
"num_tokens": 1207933.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 7.125590705871582,
|
|
"epoch": 0.03660981491704681,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.7771,
|
|
"mean_token_accuracy": 0.09473630785942078,
|
|
"num_tokens": 1217000.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.342123746871948,
|
|
"epoch": 0.03690036900369004,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.8715,
|
|
"mean_token_accuracy": 0.08738602064549923,
|
|
"num_tokens": 1227054.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.0751423835754395,
|
|
"epoch": 0.03719092309033326,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.8639,
|
|
"mean_token_accuracy": 0.08903967961668968,
|
|
"num_tokens": 1237126.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 7.132748985290528,
|
|
"epoch": 0.037481477176976495,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.7309,
|
|
"mean_token_accuracy": 0.09907565861940384,
|
|
"num_tokens": 1247404.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 7.105540752410889,
|
|
"epoch": 0.03777203126361972,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.6672,
|
|
"mean_token_accuracy": 0.08641588017344475,
|
|
"num_tokens": 1257130.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 7.073269605636597,
|
|
"epoch": 0.038062585350262954,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.7423,
|
|
"mean_token_accuracy": 0.09811322540044784,
|
|
"num_tokens": 1266931.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 7.157707405090332,
|
|
"epoch": 0.03835313943690618,
|
|
"grad_norm": 0.8828125,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.7753,
|
|
"mean_token_accuracy": 0.08842945359647274,
|
|
"num_tokens": 1277770.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 7.074891519546509,
|
|
"epoch": 0.03864369352354941,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.6966,
|
|
"mean_token_accuracy": 0.09733218997716904,
|
|
"num_tokens": 1287188.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 7.035866546630859,
|
|
"epoch": 0.03893424761019264,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.7408,
|
|
"mean_token_accuracy": 0.09134816229343415,
|
|
"num_tokens": 1297038.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.091120624542237,
|
|
"epoch": 0.03922480169683586,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.6964,
|
|
"mean_token_accuracy": 0.09473009631037713,
|
|
"num_tokens": 1306860.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 7.030598735809326,
|
|
"epoch": 0.039515355783479096,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.6668,
|
|
"mean_token_accuracy": 0.09435953348875045,
|
|
"num_tokens": 1316585.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 7.1326805591583256,
|
|
"epoch": 0.03980590987012232,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.7282,
|
|
"mean_token_accuracy": 0.09551571607589722,
|
|
"num_tokens": 1325601.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 7.101321458816528,
|
|
"epoch": 0.040096463956765555,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.7604,
|
|
"mean_token_accuracy": 0.09247554913163185,
|
|
"num_tokens": 1336305.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 7.1049731254577635,
|
|
"epoch": 0.04038701804340878,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.6507,
|
|
"mean_token_accuracy": 0.09341847449541092,
|
|
"num_tokens": 1344820.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.997063255310058,
|
|
"epoch": 0.040677572130052006,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.6331,
|
|
"mean_token_accuracy": 0.09355669766664505,
|
|
"num_tokens": 1353950.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 7.01454758644104,
|
|
"epoch": 0.04096812621669524,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.7545,
|
|
"mean_token_accuracy": 0.09254956245422363,
|
|
"num_tokens": 1364881.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 7.0095212936401365,
|
|
"epoch": 0.041258680303338464,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.7061,
|
|
"mean_token_accuracy": 0.09260506108403206,
|
|
"num_tokens": 1374018.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 7.11537013053894,
|
|
"epoch": 0.0415492343899817,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.6946,
|
|
"mean_token_accuracy": 0.08821133449673653,
|
|
"num_tokens": 1384319.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.958690166473389,
|
|
"epoch": 0.04183978847662492,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.5713,
|
|
"mean_token_accuracy": 0.09440450817346573,
|
|
"num_tokens": 1393753.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 6.922836446762085,
|
|
"epoch": 0.042130342563268156,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.6616,
|
|
"mean_token_accuracy": 0.09427325800061226,
|
|
"num_tokens": 1403599.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 7.020907402038574,
|
|
"epoch": 0.04242089664991138,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.6611,
|
|
"mean_token_accuracy": 0.10043973848223686,
|
|
"num_tokens": 1412508.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 7.071925306320191,
|
|
"epoch": 0.04271145073655461,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.8015,
|
|
"mean_token_accuracy": 0.0910523734986782,
|
|
"num_tokens": 1422776.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.998428392410278,
|
|
"epoch": 0.04300200482319784,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.6414,
|
|
"mean_token_accuracy": 0.09633751660585403,
|
|
"num_tokens": 1432901.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 7.035877513885498,
|
|
"epoch": 0.043292558909841065,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.677,
|
|
"mean_token_accuracy": 0.09542910531163215,
|
|
"num_tokens": 1442916.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.878139925003052,
|
|
"epoch": 0.0435831129964843,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.5395,
|
|
"mean_token_accuracy": 0.09616116657853127,
|
|
"num_tokens": 1453037.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.96289029121399,
|
|
"epoch": 0.043873667083127524,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.6196,
|
|
"mean_token_accuracy": 0.10786209627985954,
|
|
"num_tokens": 1461963.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 7.00122447013855,
|
|
"epoch": 0.04416422116977075,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.7012,
|
|
"mean_token_accuracy": 0.09169812574982643,
|
|
"num_tokens": 1471521.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.930304098129272,
|
|
"epoch": 0.04445477525641398,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.5366,
|
|
"mean_token_accuracy": 0.0987947553396225,
|
|
"num_tokens": 1481438.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.89730920791626,
|
|
"epoch": 0.04474532934305721,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.5654,
|
|
"mean_token_accuracy": 0.09912522435188294,
|
|
"num_tokens": 1490522.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.994078540802002,
|
|
"epoch": 0.04503588342970044,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.7343,
|
|
"mean_token_accuracy": 0.09250347167253495,
|
|
"num_tokens": 1501034.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.894172525405883,
|
|
"epoch": 0.045326437516343666,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.5391,
|
|
"mean_token_accuracy": 0.10528326034545898,
|
|
"num_tokens": 1510390.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.992980337142944,
|
|
"epoch": 0.0456169916029869,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.6468,
|
|
"mean_token_accuracy": 0.09232402816414834,
|
|
"num_tokens": 1520048.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.977211618423462,
|
|
"epoch": 0.045907545689630125,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.5275,
|
|
"mean_token_accuracy": 0.10221462920308114,
|
|
"num_tokens": 1529113.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.760094785690308,
|
|
"epoch": 0.04619809977627335,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.6057,
|
|
"mean_token_accuracy": 0.09887640923261642,
|
|
"num_tokens": 1538573.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.975562715530396,
|
|
"epoch": 0.04648865386291658,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.6064,
|
|
"mean_token_accuracy": 0.10373581051826478,
|
|
"num_tokens": 1547471.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.8805656909942625,
|
|
"epoch": 0.04677920794955981,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.5641,
|
|
"mean_token_accuracy": 0.10285315811634063,
|
|
"num_tokens": 1557259.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 7.063277673721314,
|
|
"epoch": 0.04706976203620304,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.7921,
|
|
"mean_token_accuracy": 0.09200607016682624,
|
|
"num_tokens": 1567383.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.87684121131897,
|
|
"epoch": 0.04736031612284627,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.4826,
|
|
"mean_token_accuracy": 0.11064840331673623,
|
|
"num_tokens": 1577106.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.807673025131225,
|
|
"epoch": 0.04765087020948949,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.5393,
|
|
"mean_token_accuracy": 0.10080247670412064,
|
|
"num_tokens": 1586100.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.877712535858154,
|
|
"epoch": 0.047941424296132726,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.6279,
|
|
"mean_token_accuracy": 0.09564873427152634,
|
|
"num_tokens": 1596950.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.891899585723877,
|
|
"epoch": 0.04823197838277595,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.5837,
|
|
"mean_token_accuracy": 0.09832958057522774,
|
|
"num_tokens": 1606001.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.978082180023193,
|
|
"epoch": 0.048522532469419184,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.6825,
|
|
"mean_token_accuracy": 0.0975476372987032,
|
|
"num_tokens": 1616498.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.831979036331177,
|
|
"epoch": 0.04881308655606241,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.5199,
|
|
"mean_token_accuracy": 0.10347988307476044,
|
|
"num_tokens": 1625195.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.784482002258301,
|
|
"epoch": 0.04910364064270564,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.4476,
|
|
"mean_token_accuracy": 0.10162880271673203,
|
|
"num_tokens": 1635176.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.806185960769653,
|
|
"epoch": 0.04939419472934887,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.553,
|
|
"mean_token_accuracy": 0.1015662670135498,
|
|
"num_tokens": 1645183.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.801709985733032,
|
|
"epoch": 0.049684748815992094,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.5479,
|
|
"mean_token_accuracy": 0.10148834735155106,
|
|
"num_tokens": 1654226.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.834500074386597,
|
|
"epoch": 0.04997530290263533,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.5426,
|
|
"mean_token_accuracy": 0.10362305790185929,
|
|
"num_tokens": 1664572.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.950858306884766,
|
|
"epoch": 0.05026585698927855,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.6472,
|
|
"mean_token_accuracy": 0.09981537386775016,
|
|
"num_tokens": 1674070.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.791647720336914,
|
|
"epoch": 0.050556411075921785,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.4773,
|
|
"mean_token_accuracy": 0.09943379536271095,
|
|
"num_tokens": 1683473.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.777591514587402,
|
|
"epoch": 0.05084696516256501,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.4869,
|
|
"mean_token_accuracy": 0.10118941962718964,
|
|
"num_tokens": 1693171.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.898639726638794,
|
|
"epoch": 0.05113751924920824,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.606,
|
|
"mean_token_accuracy": 0.09705074802041054,
|
|
"num_tokens": 1703023.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.73418025970459,
|
|
"epoch": 0.05142807333585147,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.4984,
|
|
"mean_token_accuracy": 0.1019330695271492,
|
|
"num_tokens": 1712698.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.906363248825073,
|
|
"epoch": 0.051718627422494695,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.6098,
|
|
"mean_token_accuracy": 0.09838435426354408,
|
|
"num_tokens": 1721502.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.7474723815917965,
|
|
"epoch": 0.05200918150913793,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.4942,
|
|
"mean_token_accuracy": 0.10594057068228721,
|
|
"num_tokens": 1730551.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.808920383453369,
|
|
"epoch": 0.052299735595781154,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.5645,
|
|
"mean_token_accuracy": 0.10622440055012702,
|
|
"num_tokens": 1739368.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.827513933181763,
|
|
"epoch": 0.052590289682424386,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.4176,
|
|
"mean_token_accuracy": 0.11146403327584267,
|
|
"num_tokens": 1748528.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.713736248016358,
|
|
"epoch": 0.05288084376906761,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.5739,
|
|
"mean_token_accuracy": 0.09899114519357681,
|
|
"num_tokens": 1759569.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.80773286819458,
|
|
"epoch": 0.05317139785571084,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.5099,
|
|
"mean_token_accuracy": 0.10788461863994599,
|
|
"num_tokens": 1769366.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.76817569732666,
|
|
"epoch": 0.05346195194235407,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.6024,
|
|
"mean_token_accuracy": 0.09936894476413727,
|
|
"num_tokens": 1780155.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.755830335617065,
|
|
"epoch": 0.053752506028997296,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.4233,
|
|
"mean_token_accuracy": 0.10512633025646209,
|
|
"num_tokens": 1789436.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.823408889770508,
|
|
"epoch": 0.05404306011564053,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.5652,
|
|
"mean_token_accuracy": 0.0998048096895218,
|
|
"num_tokens": 1798836.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.751146364212036,
|
|
"epoch": 0.054333614202283755,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.444,
|
|
"mean_token_accuracy": 0.10532717406749725,
|
|
"num_tokens": 1808666.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.8108867645263675,
|
|
"epoch": 0.05462416828892699,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.5972,
|
|
"mean_token_accuracy": 0.09496863186359406,
|
|
"num_tokens": 1820001.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.751294231414795,
|
|
"epoch": 0.05491472237557021,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.4693,
|
|
"mean_token_accuracy": 0.10566612035036087,
|
|
"num_tokens": 1830284.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.820448493957519,
|
|
"epoch": 0.05520527646221344,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.4794,
|
|
"mean_token_accuracy": 0.10577797368168831,
|
|
"num_tokens": 1839930.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.629036235809326,
|
|
"epoch": 0.05549583054885667,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.5675,
|
|
"mean_token_accuracy": 0.10090194195508957,
|
|
"num_tokens": 1850697.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.817226839065552,
|
|
"epoch": 0.0557863846354999,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.497,
|
|
"mean_token_accuracy": 0.10740380734205246,
|
|
"num_tokens": 1860196.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.774875259399414,
|
|
"epoch": 0.05607693872214313,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.47,
|
|
"mean_token_accuracy": 0.1075842596590519,
|
|
"num_tokens": 1869000.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.722468996047974,
|
|
"epoch": 0.056367492808786356,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.469,
|
|
"mean_token_accuracy": 0.10600791200995445,
|
|
"num_tokens": 1878687.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.728367662429809,
|
|
"epoch": 0.05665804689542958,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.3467,
|
|
"mean_token_accuracy": 0.10569515079259872,
|
|
"num_tokens": 1886914.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.671978425979614,
|
|
"epoch": 0.056948600982072814,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.5321,
|
|
"mean_token_accuracy": 0.10422437414526939,
|
|
"num_tokens": 1897392.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.805356025695801,
|
|
"epoch": 0.05723915506871604,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.488,
|
|
"mean_token_accuracy": 0.10600305423140526,
|
|
"num_tokens": 1906215.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.8313037872314455,
|
|
"epoch": 0.05752970915535927,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.5017,
|
|
"mean_token_accuracy": 0.10730748698115349,
|
|
"num_tokens": 1915376.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.659111022949219,
|
|
"epoch": 0.0578202632420025,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.465,
|
|
"mean_token_accuracy": 0.10440039038658142,
|
|
"num_tokens": 1925558.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.676358318328857,
|
|
"epoch": 0.05811081732864573,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.4301,
|
|
"mean_token_accuracy": 0.10430914014577866,
|
|
"num_tokens": 1935176.0,
|
|
"step": 1000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 4000,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 423927884021760.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|