Files
Cthulhu-8B-v1.4/LoRA/trainer_state.json

2375 lines
68 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.8042294681072235,
"epoch": 0.025806451612903226,
"grad_norm": 1.166382074356079,
"learning_rate": 0.0,
"loss": 2.5975,
"mean_token_accuracy": 0.4834420457482338,
"num_tokens": 1533.0,
"step": 1
},
{
"entropy": 1.8224012553691864,
"epoch": 0.05161290322580645,
"grad_norm": 1.568097472190857,
"learning_rate": 8.333333333333334e-06,
"loss": 2.6194,
"mean_token_accuracy": 0.5228946506977081,
"num_tokens": 2447.0,
"step": 2
},
{
"entropy": 2.1347350478172302,
"epoch": 0.07741935483870968,
"grad_norm": 1.6636226177215576,
"learning_rate": 1.6666666666666667e-05,
"loss": 3.1216,
"mean_token_accuracy": 0.4500608742237091,
"num_tokens": 3252.0,
"step": 3
},
{
"entropy": 2.042035460472107,
"epoch": 0.1032258064516129,
"grad_norm": 1.8585174083709717,
"learning_rate": 2.5e-05,
"loss": 3.0927,
"mean_token_accuracy": 0.434286504983902,
"num_tokens": 3990.0,
"step": 4
},
{
"entropy": 2.0793383419513702,
"epoch": 0.12903225806451613,
"grad_norm": 2.271517753601074,
"learning_rate": 3.3333333333333335e-05,
"loss": 3.1323,
"mean_token_accuracy": 0.44490282237529755,
"num_tokens": 4623.0,
"step": 5
},
{
"entropy": 2.078058958053589,
"epoch": 0.15483870967741936,
"grad_norm": 2.0911874771118164,
"learning_rate": 4.166666666666667e-05,
"loss": 3.0791,
"mean_token_accuracy": 0.4434494748711586,
"num_tokens": 5202.0,
"step": 6
},
{
"entropy": 1.9296036958694458,
"epoch": 0.18064516129032257,
"grad_norm": 2.447918176651001,
"learning_rate": 5e-05,
"loss": 2.9283,
"mean_token_accuracy": 0.5010824277997017,
"num_tokens": 5738.0,
"step": 7
},
{
"entropy": 2.1553411781787872,
"epoch": 0.2064516129032258,
"grad_norm": 2.70611572265625,
"learning_rate": 5.833333333333334e-05,
"loss": 2.8435,
"mean_token_accuracy": 0.498832605779171,
"num_tokens": 6235.0,
"step": 8
},
{
"entropy": 2.148306369781494,
"epoch": 0.23225806451612904,
"grad_norm": 2.3149070739746094,
"learning_rate": 6.666666666666667e-05,
"loss": 2.8677,
"mean_token_accuracy": 0.46573129296302795,
"num_tokens": 6703.0,
"step": 9
},
{
"entropy": 1.9346267580986023,
"epoch": 0.25806451612903225,
"grad_norm": 1.3574178218841553,
"learning_rate": 7.500000000000001e-05,
"loss": 2.4543,
"mean_token_accuracy": 0.5017582848668098,
"num_tokens": 8003.0,
"step": 10
},
{
"entropy": 2.2560064792633057,
"epoch": 0.2838709677419355,
"grad_norm": 1.4286997318267822,
"learning_rate": 8.333333333333334e-05,
"loss": 2.4076,
"mean_token_accuracy": 0.516123816370964,
"num_tokens": 8830.0,
"step": 11
},
{
"entropy": 2.271284520626068,
"epoch": 0.3096774193548387,
"grad_norm": 1.289847493171692,
"learning_rate": 9.166666666666667e-05,
"loss": 2.2502,
"mean_token_accuracy": 0.581367239356041,
"num_tokens": 9586.0,
"step": 12
},
{
"entropy": 2.506469488143921,
"epoch": 0.33548387096774196,
"grad_norm": 1.698026418685913,
"learning_rate": 0.0001,
"loss": 2.5559,
"mean_token_accuracy": 0.5279825925827026,
"num_tokens": 10255.0,
"step": 13
},
{
"entropy": 2.488889992237091,
"epoch": 0.36129032258064514,
"grad_norm": 2.1104917526245117,
"learning_rate": 9.999827315381885e-05,
"loss": 2.3051,
"mean_token_accuracy": 0.5456234812736511,
"num_tokens": 10842.0,
"step": 14
},
{
"entropy": 2.494838774204254,
"epoch": 0.3870967741935484,
"grad_norm": 1.7446825504302979,
"learning_rate": 9.999309273455528e-05,
"loss": 2.1948,
"mean_token_accuracy": 0.5685414522886276,
"num_tokens": 11363.0,
"step": 15
},
{
"entropy": 2.623446822166443,
"epoch": 0.4129032258064516,
"grad_norm": 1.934134840965271,
"learning_rate": 9.998445910004082e-05,
"loss": 2.2624,
"mean_token_accuracy": 0.5481147766113281,
"num_tokens": 11819.0,
"step": 16
},
{
"entropy": 2.3205150961875916,
"epoch": 0.43870967741935485,
"grad_norm": 1.6750158071517944,
"learning_rate": 9.997237284663379e-05,
"loss": 1.8547,
"mean_token_accuracy": 0.6086297482252121,
"num_tokens": 12247.0,
"step": 17
},
{
"entropy": 2.435093104839325,
"epoch": 0.4645161290322581,
"grad_norm": 1.8602609634399414,
"learning_rate": 9.995683480917821e-05,
"loss": 2.1032,
"mean_token_accuracy": 0.5650125294923782,
"num_tokens": 12646.0,
"step": 18
},
{
"entropy": 2.1141549050807953,
"epoch": 0.49032258064516127,
"grad_norm": 0.9358610510826111,
"learning_rate": 9.993784606094612e-05,
"loss": 1.9903,
"mean_token_accuracy": 0.5407712012529373,
"num_tokens": 14509.0,
"step": 19
},
{
"entropy": 2.083885967731476,
"epoch": 0.5161290322580645,
"grad_norm": 1.1308526992797852,
"learning_rate": 9.991540791356342e-05,
"loss": 1.8726,
"mean_token_accuracy": 0.5599013864994049,
"num_tokens": 15617.0,
"step": 20
},
{
"entropy": 2.3853049874305725,
"epoch": 0.5419354838709678,
"grad_norm": 1.350138545036316,
"learning_rate": 9.988952191691925e-05,
"loss": 2.251,
"mean_token_accuracy": 0.5332682132720947,
"num_tokens": 16449.0,
"step": 21
},
{
"entropy": 2.1798684000968933,
"epoch": 0.567741935483871,
"grad_norm": 1.3853743076324463,
"learning_rate": 9.986018985905901e-05,
"loss": 1.9656,
"mean_token_accuracy": 0.5732992142438889,
"num_tokens": 17216.0,
"step": 22
},
{
"entropy": 2.2904029488563538,
"epoch": 0.5935483870967742,
"grad_norm": 2.5513713359832764,
"learning_rate": 9.982741376606078e-05,
"loss": 2.1948,
"mean_token_accuracy": 0.5600379034876823,
"num_tokens": 17868.0,
"step": 23
},
{
"entropy": 1.961841881275177,
"epoch": 0.6193548387096774,
"grad_norm": 1.9767720699310303,
"learning_rate": 9.97911959018954e-05,
"loss": 1.9528,
"mean_token_accuracy": 0.5889081507921219,
"num_tokens": 18439.0,
"step": 24
},
{
"entropy": 2.061126083135605,
"epoch": 0.6451612903225806,
"grad_norm": 1.8903456926345825,
"learning_rate": 9.975153876827008e-05,
"loss": 1.9973,
"mean_token_accuracy": 0.5782413184642792,
"num_tokens": 18947.0,
"step": 25
},
{
"entropy": 1.953830897808075,
"epoch": 0.6709677419354839,
"grad_norm": 2.247823715209961,
"learning_rate": 9.97084451044556e-05,
"loss": 1.8999,
"mean_token_accuracy": 0.5786410048604012,
"num_tokens": 19410.0,
"step": 26
},
{
"entropy": 1.8129592537879944,
"epoch": 0.6967741935483871,
"grad_norm": 2.3078598976135254,
"learning_rate": 9.966191788709716e-05,
"loss": 1.6035,
"mean_token_accuracy": 0.6230615079402924,
"num_tokens": 19831.0,
"step": 27
},
{
"entropy": 1.9399387836456299,
"epoch": 0.7225806451612903,
"grad_norm": 1.3792117834091187,
"learning_rate": 9.961196033000861e-05,
"loss": 1.9753,
"mean_token_accuracy": 0.5892214328050613,
"num_tokens": 20970.0,
"step": 28
},
{
"entropy": 1.8130147755146027,
"epoch": 0.7483870967741936,
"grad_norm": 1.5490132570266724,
"learning_rate": 9.955857588395065e-05,
"loss": 1.7023,
"mean_token_accuracy": 0.6110316589474678,
"num_tokens": 21755.0,
"step": 29
},
{
"entropy": 2.077410489320755,
"epoch": 0.7741935483870968,
"grad_norm": 1.8052752017974854,
"learning_rate": 9.950176823639233e-05,
"loss": 1.9752,
"mean_token_accuracy": 0.6064967960119247,
"num_tokens": 22504.0,
"step": 30
},
{
"entropy": 1.9274516999721527,
"epoch": 0.8,
"grad_norm": 1.9139018058776855,
"learning_rate": 9.944154131125642e-05,
"loss": 2.0548,
"mean_token_accuracy": 0.5636427998542786,
"num_tokens": 23183.0,
"step": 31
},
{
"entropy": 1.9550862610340118,
"epoch": 0.8258064516129032,
"grad_norm": 1.9849357604980469,
"learning_rate": 9.937789926864838e-05,
"loss": 1.8553,
"mean_token_accuracy": 0.5807601362466812,
"num_tokens": 23774.0,
"step": 32
},
{
"entropy": 1.8781771957874298,
"epoch": 0.8516129032258064,
"grad_norm": 2.0134923458099365,
"learning_rate": 9.931084650456892e-05,
"loss": 1.7917,
"mean_token_accuracy": 0.6070037335157394,
"num_tokens": 24313.0,
"step": 33
},
{
"entropy": 1.897193729877472,
"epoch": 0.8774193548387097,
"grad_norm": 2.607464551925659,
"learning_rate": 9.924038765061042e-05,
"loss": 1.7723,
"mean_token_accuracy": 0.6191761344671249,
"num_tokens": 24779.0,
"step": 34
},
{
"entropy": 1.7519680559635162,
"epoch": 0.9032258064516129,
"grad_norm": 2.4835267066955566,
"learning_rate": 9.916652757363698e-05,
"loss": 1.5883,
"mean_token_accuracy": 0.6609883904457092,
"num_tokens": 25211.0,
"step": 35
},
{
"entropy": 1.9292193055152893,
"epoch": 0.9290322580645162,
"grad_norm": 2.3735604286193848,
"learning_rate": 9.90892713754483e-05,
"loss": 1.8049,
"mean_token_accuracy": 0.5980570763349533,
"num_tokens": 25599.0,
"step": 36
},
{
"entropy": 1.881245195865631,
"epoch": 0.9548387096774194,
"grad_norm": 1.9849742650985718,
"learning_rate": 9.900862439242719e-05,
"loss": 1.7902,
"mean_token_accuracy": 0.5820632129907608,
"num_tokens": 26408.0,
"step": 37
},
{
"entropy": 2.113930821418762,
"epoch": 0.9806451612903225,
"grad_norm": 3.527271270751953,
"learning_rate": 9.892459219517108e-05,
"loss": 2.2025,
"mean_token_accuracy": 0.5260728523135185,
"num_tokens": 27021.0,
"step": 38
},
{
"entropy": 1.7831549247105916,
"epoch": 1.0,
"grad_norm": 2.5327165126800537,
"learning_rate": 9.883718058810707e-05,
"loss": 1.4478,
"mean_token_accuracy": 0.6935366789499918,
"num_tokens": 27353.0,
"step": 39
},
{
"entropy": 1.797234058380127,
"epoch": 1.0258064516129033,
"grad_norm": 1.3197723627090454,
"learning_rate": 9.874639560909117e-05,
"loss": 1.8934,
"mean_token_accuracy": 0.5857948064804077,
"num_tokens": 28829.0,
"step": 40
},
{
"entropy": 1.9161739647388458,
"epoch": 1.0516129032258064,
"grad_norm": 1.5616050958633423,
"learning_rate": 9.865224352899119e-05,
"loss": 1.7257,
"mean_token_accuracy": 0.6109496206045151,
"num_tokens": 29650.0,
"step": 41
},
{
"entropy": 1.8019072711467743,
"epoch": 1.0774193548387097,
"grad_norm": 1.8876160383224487,
"learning_rate": 9.85547308512535e-05,
"loss": 1.8085,
"mean_token_accuracy": 0.5969990640878677,
"num_tokens": 30359.0,
"step": 42
},
{
"entropy": 1.7833741307258606,
"epoch": 1.103225806451613,
"grad_norm": 2.0070252418518066,
"learning_rate": 9.84538643114539e-05,
"loss": 1.6704,
"mean_token_accuracy": 0.5969647467136383,
"num_tokens": 30961.0,
"step": 43
},
{
"entropy": 1.7372365295886993,
"epoch": 1.129032258064516,
"grad_norm": 1.8577375411987305,
"learning_rate": 9.834965087683236e-05,
"loss": 1.6159,
"mean_token_accuracy": 0.6475881487131119,
"num_tokens": 31527.0,
"step": 44
},
{
"entropy": 1.636292964220047,
"epoch": 1.1548387096774193,
"grad_norm": 1.8432772159576416,
"learning_rate": 9.824209774581174e-05,
"loss": 1.5197,
"mean_token_accuracy": 0.6530560553073883,
"num_tokens": 32050.0,
"step": 45
},
{
"entropy": 1.6075344681739807,
"epoch": 1.1806451612903226,
"grad_norm": 1.869754672050476,
"learning_rate": 9.81312123475006e-05,
"loss": 1.3557,
"mean_token_accuracy": 0.6470372080802917,
"num_tokens": 32532.0,
"step": 46
},
{
"entropy": 1.6146334111690521,
"epoch": 1.206451612903226,
"grad_norm": 2.099989175796509,
"learning_rate": 9.801700234117999e-05,
"loss": 1.2998,
"mean_token_accuracy": 0.6936827301979065,
"num_tokens": 32967.0,
"step": 47
},
{
"entropy": 1.7050741314888,
"epoch": 1.232258064516129,
"grad_norm": 2.504159688949585,
"learning_rate": 9.789947561577445e-05,
"loss": 1.5017,
"mean_token_accuracy": 0.622559979557991,
"num_tokens": 33363.0,
"step": 48
},
{
"entropy": 1.6869353950023651,
"epoch": 1.2580645161290323,
"grad_norm": 1.2886877059936523,
"learning_rate": 9.777864028930705e-05,
"loss": 1.6731,
"mean_token_accuracy": 0.6039082556962967,
"num_tokens": 35015.0,
"step": 49
},
{
"entropy": 1.6093480288982391,
"epoch": 1.2838709677419355,
"grad_norm": 1.6378092765808105,
"learning_rate": 9.765450470833865e-05,
"loss": 1.4894,
"mean_token_accuracy": 0.6367563456296921,
"num_tokens": 35999.0,
"step": 50
},
{
"entropy": 1.6687067151069641,
"epoch": 1.3096774193548386,
"grad_norm": 1.8195027112960815,
"learning_rate": 9.752707744739145e-05,
"loss": 1.5385,
"mean_token_accuracy": 0.6437539905309677,
"num_tokens": 36850.0,
"step": 51
},
{
"entropy": 1.4987359642982483,
"epoch": 1.335483870967742,
"grad_norm": 1.8060271739959717,
"learning_rate": 9.73963673083566e-05,
"loss": 1.3978,
"mean_token_accuracy": 0.661731407046318,
"num_tokens": 37604.0,
"step": 52
},
{
"entropy": 1.5831853449344635,
"epoch": 1.3612903225806452,
"grad_norm": 2.213078260421753,
"learning_rate": 9.726238331988624e-05,
"loss": 1.7863,
"mean_token_accuracy": 0.6147271245718002,
"num_tokens": 38314.0,
"step": 53
},
{
"entropy": 1.5708496272563934,
"epoch": 1.3870967741935485,
"grad_norm": 3.098945140838623,
"learning_rate": 9.712513473676996e-05,
"loss": 1.6752,
"mean_token_accuracy": 0.6371889561414719,
"num_tokens": 38941.0,
"step": 54
},
{
"entropy": 1.4293319284915924,
"epoch": 1.4129032258064516,
"grad_norm": 2.6225318908691406,
"learning_rate": 9.698463103929542e-05,
"loss": 1.5132,
"mean_token_accuracy": 0.6733423620462418,
"num_tokens": 39485.0,
"step": 55
},
{
"entropy": 1.4221723973751068,
"epoch": 1.4387096774193548,
"grad_norm": 2.834839105606079,
"learning_rate": 9.684088193259355e-05,
"loss": 1.4956,
"mean_token_accuracy": 0.6675658673048019,
"num_tokens": 39954.0,
"step": 56
},
{
"entropy": 1.3391860723495483,
"epoch": 1.4645161290322581,
"grad_norm": 2.185546398162842,
"learning_rate": 9.669389734596819e-05,
"loss": 1.1981,
"mean_token_accuracy": 0.7050470858812332,
"num_tokens": 40374.0,
"step": 57
},
{
"entropy": 1.6070669293403625,
"epoch": 1.4903225806451612,
"grad_norm": 1.3461191654205322,
"learning_rate": 9.654368743221022e-05,
"loss": 1.6617,
"mean_token_accuracy": 0.5980251729488373,
"num_tokens": 42027.0,
"step": 58
},
{
"entropy": 1.6520465910434723,
"epoch": 1.5161290322580645,
"grad_norm": 1.6961472034454346,
"learning_rate": 9.639026256689628e-05,
"loss": 1.577,
"mean_token_accuracy": 0.6316726058721542,
"num_tokens": 42916.0,
"step": 59
},
{
"entropy": 1.7670880556106567,
"epoch": 1.5419354838709678,
"grad_norm": 2.0527658462524414,
"learning_rate": 9.623363334767208e-05,
"loss": 1.7517,
"mean_token_accuracy": 0.6005731225013733,
"num_tokens": 43719.0,
"step": 60
},
{
"entropy": 1.5744120478630066,
"epoch": 1.567741935483871,
"grad_norm": 2.1162519454956055,
"learning_rate": 9.607381059352038e-05,
"loss": 1.5544,
"mean_token_accuracy": 0.6523573398590088,
"num_tokens": 44493.0,
"step": 61
},
{
"entropy": 1.728984385728836,
"epoch": 1.5935483870967742,
"grad_norm": 2.0401268005371094,
"learning_rate": 9.591080534401371e-05,
"loss": 1.699,
"mean_token_accuracy": 0.6030448973178864,
"num_tokens": 45170.0,
"step": 62
},
{
"entropy": 1.5222464203834534,
"epoch": 1.6193548387096774,
"grad_norm": 2.430859327316284,
"learning_rate": 9.574462885855174e-05,
"loss": 1.2944,
"mean_token_accuracy": 0.6946325898170471,
"num_tokens": 45755.0,
"step": 63
},
{
"entropy": 1.528793841600418,
"epoch": 1.6451612903225805,
"grad_norm": 2.3277854919433594,
"learning_rate": 9.557529261558367e-05,
"loss": 1.3969,
"mean_token_accuracy": 0.6722464263439178,
"num_tokens": 46268.0,
"step": 64
},
{
"entropy": 1.6062091886997223,
"epoch": 1.6709677419354838,
"grad_norm": 2.8640811443328857,
"learning_rate": 9.540280831181525e-05,
"loss": 1.3636,
"mean_token_accuracy": 0.6864263862371445,
"num_tokens": 46737.0,
"step": 65
},
{
"entropy": 1.336740493774414,
"epoch": 1.696774193548387,
"grad_norm": 2.5550613403320312,
"learning_rate": 9.522718786140097e-05,
"loss": 1.0106,
"mean_token_accuracy": 0.7365925908088684,
"num_tokens": 47163.0,
"step": 66
},
{
"entropy": 1.789841502904892,
"epoch": 1.7225806451612904,
"grad_norm": 1.9967743158340454,
"learning_rate": 9.504844339512095e-05,
"loss": 1.715,
"mean_token_accuracy": 0.614040270447731,
"num_tokens": 48108.0,
"step": 67
},
{
"entropy": 1.5481957495212555,
"epoch": 1.7483870967741937,
"grad_norm": 1.912815809249878,
"learning_rate": 9.486658725954321e-05,
"loss": 1.3063,
"mean_token_accuracy": 0.6685247123241425,
"num_tokens": 48901.0,
"step": 68
},
{
"entropy": 1.618812471628189,
"epoch": 1.7741935483870968,
"grad_norm": 2.1326448917388916,
"learning_rate": 9.468163201617062e-05,
"loss": 1.4826,
"mean_token_accuracy": 0.6648016273975372,
"num_tokens": 49668.0,
"step": 69
},
{
"entropy": 1.4738461375236511,
"epoch": 1.8,
"grad_norm": 2.2856757640838623,
"learning_rate": 9.449359044057345e-05,
"loss": 1.5099,
"mean_token_accuracy": 0.6307590007781982,
"num_tokens": 50353.0,
"step": 70
},
{
"entropy": 1.42239710688591,
"epoch": 1.8258064516129031,
"grad_norm": 2.272261381149292,
"learning_rate": 9.430247552150673e-05,
"loss": 1.4451,
"mean_token_accuracy": 0.6698804646730423,
"num_tokens": 50954.0,
"step": 71
},
{
"entropy": 1.5603100061416626,
"epoch": 1.8516129032258064,
"grad_norm": 2.444957971572876,
"learning_rate": 9.410830046001321e-05,
"loss": 1.5631,
"mean_token_accuracy": 0.6537315994501114,
"num_tokens": 51493.0,
"step": 72
},
{
"entropy": 1.421448290348053,
"epoch": 1.8774193548387097,
"grad_norm": 2.62430477142334,
"learning_rate": 9.391107866851143e-05,
"loss": 1.442,
"mean_token_accuracy": 0.6888918429613113,
"num_tokens": 51976.0,
"step": 73
},
{
"entropy": 1.3042734861373901,
"epoch": 1.903225806451613,
"grad_norm": 2.522318124771118,
"learning_rate": 9.371082376986928e-05,
"loss": 1.2438,
"mean_token_accuracy": 0.6721822023391724,
"num_tokens": 52413.0,
"step": 74
},
{
"entropy": 1.0973184555768967,
"epoch": 1.9290322580645163,
"grad_norm": 2.2152483463287354,
"learning_rate": 9.350754959646306e-05,
"loss": 0.9649,
"mean_token_accuracy": 0.7464027404785156,
"num_tokens": 52812.0,
"step": 75
},
{
"entropy": 1.4785442054271698,
"epoch": 1.9548387096774194,
"grad_norm": 1.778226613998413,
"learning_rate": 9.330127018922194e-05,
"loss": 1.5472,
"mean_token_accuracy": 0.6413073837757111,
"num_tokens": 53810.0,
"step": 76
},
{
"entropy": 1.4850931763648987,
"epoch": 1.9806451612903224,
"grad_norm": 2.324070453643799,
"learning_rate": 9.30919997966582e-05,
"loss": 1.4766,
"mean_token_accuracy": 0.6507462114095688,
"num_tokens": 54370.0,
"step": 77
},
{
"entropy": 1.5041760206222534,
"epoch": 2.0,
"grad_norm": 2.711214542388916,
"learning_rate": 9.287975287388298e-05,
"loss": 1.3224,
"mean_token_accuracy": 0.6853142380714417,
"num_tokens": 54706.0,
"step": 78
},
{
"entropy": 1.561076819896698,
"epoch": 2.0258064516129033,
"grad_norm": 1.4298901557922363,
"learning_rate": 9.266454408160779e-05,
"loss": 1.5017,
"mean_token_accuracy": 0.6616432368755341,
"num_tokens": 56147.0,
"step": 79
},
{
"entropy": 1.4342933893203735,
"epoch": 2.0516129032258066,
"grad_norm": 1.9477201700210571,
"learning_rate": 9.244638828513187e-05,
"loss": 1.0989,
"mean_token_accuracy": 0.7380426079034805,
"num_tokens": 56998.0,
"step": 80
},
{
"entropy": 1.3799369037151337,
"epoch": 2.07741935483871,
"grad_norm": 1.899839162826538,
"learning_rate": 9.22253005533154e-05,
"loss": 1.0685,
"mean_token_accuracy": 0.7503155916929245,
"num_tokens": 57799.0,
"step": 81
},
{
"entropy": 1.2785212695598602,
"epoch": 2.1032258064516127,
"grad_norm": 2.1526200771331787,
"learning_rate": 9.200129615753859e-05,
"loss": 1.0346,
"mean_token_accuracy": 0.7295394539833069,
"num_tokens": 58548.0,
"step": 82
},
{
"entropy": 1.1957830488681793,
"epoch": 2.129032258064516,
"grad_norm": 2.5215909481048584,
"learning_rate": 9.177439057064683e-05,
"loss": 1.0066,
"mean_token_accuracy": 0.7433657646179199,
"num_tokens": 59174.0,
"step": 83
},
{
"entropy": 1.3421072363853455,
"epoch": 2.1548387096774193,
"grad_norm": 2.606336832046509,
"learning_rate": 9.154459946588198e-05,
"loss": 1.1666,
"mean_token_accuracy": 0.7091180384159088,
"num_tokens": 59769.0,
"step": 84
},
{
"entropy": 1.032430723309517,
"epoch": 2.1806451612903226,
"grad_norm": 2.835961103439331,
"learning_rate": 9.131193871579975e-05,
"loss": 0.9103,
"mean_token_accuracy": 0.7784561067819595,
"num_tokens": 60295.0,
"step": 85
},
{
"entropy": 1.0069421231746674,
"epoch": 2.206451612903226,
"grad_norm": 3.632134437561035,
"learning_rate": 9.107642439117321e-05,
"loss": 0.7677,
"mean_token_accuracy": 0.7896548062562943,
"num_tokens": 60744.0,
"step": 86
},
{
"entropy": 0.784252293407917,
"epoch": 2.232258064516129,
"grad_norm": 3.14766526222229,
"learning_rate": 9.083807275988284e-05,
"loss": 0.6092,
"mean_token_accuracy": 0.8186918497085571,
"num_tokens": 61151.0,
"step": 87
},
{
"entropy": 1.1425200402736664,
"epoch": 2.258064516129032,
"grad_norm": 2.9548776149749756,
"learning_rate": 9.059690028579283e-05,
"loss": 1.2423,
"mean_token_accuracy": 0.67966029047966,
"num_tokens": 62417.0,
"step": 88
},
{
"entropy": 1.075703114271164,
"epoch": 2.2838709677419353,
"grad_norm": 2.6472651958465576,
"learning_rate": 9.035292362761381e-05,
"loss": 1.1406,
"mean_token_accuracy": 0.7184228450059891,
"num_tokens": 63270.0,
"step": 89
},
{
"entropy": 0.9899384379386902,
"epoch": 2.3096774193548386,
"grad_norm": 2.6800777912139893,
"learning_rate": 9.01061596377522e-05,
"loss": 0.9555,
"mean_token_accuracy": 0.759021058678627,
"num_tokens": 64027.0,
"step": 90
},
{
"entropy": 1.2101148664951324,
"epoch": 2.335483870967742,
"grad_norm": 3.1797468662261963,
"learning_rate": 8.985662536114613e-05,
"loss": 1.2574,
"mean_token_accuracy": 0.707681193947792,
"num_tokens": 64701.0,
"step": 91
},
{
"entropy": 0.9667136818170547,
"epoch": 2.361290322580645,
"grad_norm": 2.6233391761779785,
"learning_rate": 8.960433803408813e-05,
"loss": 0.7913,
"mean_token_accuracy": 0.7882635146379471,
"num_tokens": 65308.0,
"step": 92
},
{
"entropy": 0.9306632727384567,
"epoch": 2.3870967741935485,
"grad_norm": 2.395880699157715,
"learning_rate": 8.934931508303445e-05,
"loss": 0.7301,
"mean_token_accuracy": 0.7955707758665085,
"num_tokens": 65878.0,
"step": 93
},
{
"entropy": 1.0530627965927124,
"epoch": 2.412903225806452,
"grad_norm": 2.9347379207611084,
"learning_rate": 8.90915741234015e-05,
"loss": 0.8363,
"mean_token_accuracy": 0.775736004114151,
"num_tokens": 66364.0,
"step": 94
},
{
"entropy": 1.0531336814165115,
"epoch": 2.4387096774193546,
"grad_norm": 3.1018309593200684,
"learning_rate": 8.883113295834892e-05,
"loss": 0.8268,
"mean_token_accuracy": 0.7704032361507416,
"num_tokens": 66820.0,
"step": 95
},
{
"entropy": 1.0696537494659424,
"epoch": 2.464516129032258,
"grad_norm": 3.423306941986084,
"learning_rate": 8.856800957755e-05,
"loss": 0.7847,
"mean_token_accuracy": 0.7773692905902863,
"num_tokens": 67214.0,
"step": 96
},
{
"entropy": 1.306801289319992,
"epoch": 2.490322580645161,
"grad_norm": 1.6437768936157227,
"learning_rate": 8.83022221559489e-05,
"loss": 1.2357,
"mean_token_accuracy": 0.669854074716568,
"num_tokens": 68733.0,
"step": 97
},
{
"entropy": 1.1772551238536835,
"epoch": 2.5161290322580645,
"grad_norm": 2.4962806701660156,
"learning_rate": 8.803378905250544e-05,
"loss": 1.0752,
"mean_token_accuracy": 0.711113303899765,
"num_tokens": 69580.0,
"step": 98
},
{
"entropy": 1.166929692029953,
"epoch": 2.541935483870968,
"grad_norm": 2.8279449939727783,
"learning_rate": 8.776272880892675e-05,
"loss": 1.0135,
"mean_token_accuracy": 0.7302903383970261,
"num_tokens": 70359.0,
"step": 99
},
{
"entropy": 1.2368881702423096,
"epoch": 2.567741935483871,
"grad_norm": 2.812784194946289,
"learning_rate": 8.748906014838672e-05,
"loss": 1.0997,
"mean_token_accuracy": 0.7428575754165649,
"num_tokens": 71051.0,
"step": 100
},
{
"entropy": 1.0734427571296692,
"epoch": 2.5935483870967744,
"grad_norm": 3.168055772781372,
"learning_rate": 8.721280197423258e-05,
"loss": 0.9557,
"mean_token_accuracy": 0.7500255256891251,
"num_tokens": 71653.0,
"step": 101
},
{
"entropy": 1.0182117372751236,
"epoch": 2.6193548387096772,
"grad_norm": 2.928173065185547,
"learning_rate": 8.69339733686793e-05,
"loss": 0.7934,
"mean_token_accuracy": 0.7967472970485687,
"num_tokens": 72206.0,
"step": 102
},
{
"entropy": 0.9825232028961182,
"epoch": 2.6451612903225805,
"grad_norm": 3.5911121368408203,
"learning_rate": 8.665259359149132e-05,
"loss": 0.7435,
"mean_token_accuracy": 0.7856406420469284,
"num_tokens": 72709.0,
"step": 103
},
{
"entropy": 0.8393460661172867,
"epoch": 2.670967741935484,
"grad_norm": 3.1751551628112793,
"learning_rate": 8.636868207865244e-05,
"loss": 0.5727,
"mean_token_accuracy": 0.8536647707223892,
"num_tokens": 73172.0,
"step": 104
},
{
"entropy": 0.733843207359314,
"epoch": 2.696774193548387,
"grad_norm": 3.002105951309204,
"learning_rate": 8.60822584410231e-05,
"loss": 0.4306,
"mean_token_accuracy": 0.9011064171791077,
"num_tokens": 73601.0,
"step": 105
},
{
"entropy": 1.09127739071846,
"epoch": 2.7225806451612904,
"grad_norm": 2.7801899909973145,
"learning_rate": 8.579334246298593e-05,
"loss": 1.3229,
"mean_token_accuracy": 0.6847837716341019,
"num_tokens": 75066.0,
"step": 106
},
{
"entropy": 1.1003702282905579,
"epoch": 2.7483870967741937,
"grad_norm": 2.8465728759765625,
"learning_rate": 8.550195410107902e-05,
"loss": 1.026,
"mean_token_accuracy": 0.7287466824054718,
"num_tokens": 75935.0,
"step": 107
},
{
"entropy": 1.0054174661636353,
"epoch": 2.774193548387097,
"grad_norm": 2.6831374168395996,
"learning_rate": 8.520811348261759e-05,
"loss": 0.8887,
"mean_token_accuracy": 0.7784150391817093,
"num_tokens": 76730.0,
"step": 108
},
{
"entropy": 1.1102914214134216,
"epoch": 2.8,
"grad_norm": 3.408310651779175,
"learning_rate": 8.491184090430364e-05,
"loss": 1.0831,
"mean_token_accuracy": 0.7278113067150116,
"num_tokens": 77474.0,
"step": 109
},
{
"entropy": 0.999423012137413,
"epoch": 2.825806451612903,
"grad_norm": 3.7338831424713135,
"learning_rate": 8.461315683082399e-05,
"loss": 1.0257,
"mean_token_accuracy": 0.7361829876899719,
"num_tokens": 78068.0,
"step": 110
},
{
"entropy": 0.9764816612005234,
"epoch": 2.8516129032258064,
"grad_norm": 3.499826192855835,
"learning_rate": 8.43120818934367e-05,
"loss": 0.8335,
"mean_token_accuracy": 0.764112114906311,
"num_tokens": 78589.0,
"step": 111
},
{
"entropy": 0.9866785109043121,
"epoch": 2.8774193548387097,
"grad_norm": 3.31439471244812,
"learning_rate": 8.400863688854597e-05,
"loss": 0.9472,
"mean_token_accuracy": 0.7592662870883942,
"num_tokens": 79080.0,
"step": 112
},
{
"entropy": 0.8102796524763107,
"epoch": 2.903225806451613,
"grad_norm": 3.768465757369995,
"learning_rate": 8.370284277626577e-05,
"loss": 0.6879,
"mean_token_accuracy": 0.7918446511030197,
"num_tokens": 79518.0,
"step": 113
},
{
"entropy": 0.7523371577262878,
"epoch": 2.9290322580645163,
"grad_norm": 3.107103109359741,
"learning_rate": 8.339472067897187e-05,
"loss": 0.5142,
"mean_token_accuracy": 0.8337104171514511,
"num_tokens": 79925.0,
"step": 114
},
{
"entropy": 1.2405670583248138,
"epoch": 2.9548387096774196,
"grad_norm": 2.0415544509887695,
"learning_rate": 8.308429187984297e-05,
"loss": 1.2469,
"mean_token_accuracy": 0.6947166323661804,
"num_tokens": 81111.0,
"step": 115
},
{
"entropy": 1.0534760355949402,
"epoch": 2.9806451612903224,
"grad_norm": 3.243969440460205,
"learning_rate": 8.27715778213905e-05,
"loss": 1.0014,
"mean_token_accuracy": 0.752901017665863,
"num_tokens": 81717.0,
"step": 116
},
{
"entropy": 0.8150668541590372,
"epoch": 3.0,
"grad_norm": 3.7620151042938232,
"learning_rate": 8.24566001039776e-05,
"loss": 0.6544,
"mean_token_accuracy": 0.8201234340667725,
"num_tokens": 82059.0,
"step": 117
},
{
"entropy": 1.2218182981014252,
"epoch": 3.0258064516129033,
"grad_norm": 2.0384461879730225,
"learning_rate": 8.213938048432697e-05,
"loss": 0.9903,
"mean_token_accuracy": 0.7458517551422119,
"num_tokens": 83704.0,
"step": 118
},
{
"entropy": 1.0654624998569489,
"epoch": 3.0516129032258066,
"grad_norm": 2.7097387313842773,
"learning_rate": 8.181994087401819e-05,
"loss": 0.6589,
"mean_token_accuracy": 0.8282175809144974,
"num_tokens": 84564.0,
"step": 119
},
{
"entropy": 0.9389624744653702,
"epoch": 3.07741935483871,
"grad_norm": 3.422351360321045,
"learning_rate": 8.149830333797407e-05,
"loss": 0.6736,
"mean_token_accuracy": 0.8170457482337952,
"num_tokens": 85305.0,
"step": 120
},
{
"entropy": 0.891632542014122,
"epoch": 3.1032258064516127,
"grad_norm": 2.9999988079071045,
"learning_rate": 8.117449009293668e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8579341620206833,
"num_tokens": 85927.0,
"step": 121
},
{
"entropy": 0.7080177962779999,
"epoch": 3.129032258064516,
"grad_norm": 2.7167727947235107,
"learning_rate": 8.084852350593264e-05,
"loss": 0.386,
"mean_token_accuracy": 0.9050543904304504,
"num_tokens": 86500.0,
"step": 122
},
{
"entropy": 0.5361127704381943,
"epoch": 3.1548387096774193,
"grad_norm": 3.051241874694824,
"learning_rate": 8.052042609272817e-05,
"loss": 0.314,
"mean_token_accuracy": 0.9146886169910431,
"num_tokens": 87009.0,
"step": 123
},
{
"entropy": 0.5324621573090553,
"epoch": 3.1806451612903226,
"grad_norm": 3.0022952556610107,
"learning_rate": 8.019022051627388e-05,
"loss": 0.3141,
"mean_token_accuracy": 0.9247495979070663,
"num_tokens": 87467.0,
"step": 124
},
{
"entropy": 0.40624529123306274,
"epoch": 3.206451612903226,
"grad_norm": 3.094412326812744,
"learning_rate": 7.985792958513931e-05,
"loss": 0.26,
"mean_token_accuracy": 0.9299735277891159,
"num_tokens": 87885.0,
"step": 125
},
{
"entropy": 0.3672215938568115,
"epoch": 3.232258064516129,
"grad_norm": 3.4929354190826416,
"learning_rate": 7.952357625193749e-05,
"loss": 0.2392,
"mean_token_accuracy": 0.9306517392396927,
"num_tokens": 88260.0,
"step": 126
},
{
"entropy": 0.8298548460006714,
"epoch": 3.258064516129032,
"grad_norm": 2.836134672164917,
"learning_rate": 7.91871836117395e-05,
"loss": 0.7053,
"mean_token_accuracy": 0.8246497809886932,
"num_tokens": 89262.0,
"step": 127
},
{
"entropy": 0.5190232917666435,
"epoch": 3.2838709677419353,
"grad_norm": 5.216272830963135,
"learning_rate": 7.884877490047915e-05,
"loss": 0.565,
"mean_token_accuracy": 0.8471736311912537,
"num_tokens": 90062.0,
"step": 128
},
{
"entropy": 0.4947461038827896,
"epoch": 3.3096774193548386,
"grad_norm": 4.143370628356934,
"learning_rate": 7.85083734933481e-05,
"loss": 0.5013,
"mean_token_accuracy": 0.8697308301925659,
"num_tokens": 90841.0,
"step": 129
},
{
"entropy": 0.5702934339642525,
"epoch": 3.335483870967742,
"grad_norm": 5.3610520362854,
"learning_rate": 7.81660029031811e-05,
"loss": 0.657,
"mean_token_accuracy": 0.8270199149847031,
"num_tokens": 91591.0,
"step": 130
},
{
"entropy": 0.5612503439188004,
"epoch": 3.361290322580645,
"grad_norm": 4.896009922027588,
"learning_rate": 7.782168677883206e-05,
"loss": 0.638,
"mean_token_accuracy": 0.8336956202983856,
"num_tokens": 92304.0,
"step": 131
},
{
"entropy": 0.47641437500715256,
"epoch": 3.3870967741935485,
"grad_norm": 5.059084415435791,
"learning_rate": 7.74754489035403e-05,
"loss": 0.516,
"mean_token_accuracy": 0.8493129163980484,
"num_tokens": 92920.0,
"step": 132
},
{
"entropy": 0.5311232656240463,
"epoch": 3.412903225806452,
"grad_norm": 3.7369489669799805,
"learning_rate": 7.712731319328798e-05,
"loss": 0.4084,
"mean_token_accuracy": 0.8949003219604492,
"num_tokens": 93468.0,
"step": 133
},
{
"entropy": 0.4599653482437134,
"epoch": 3.4387096774193546,
"grad_norm": 4.457752704620361,
"learning_rate": 7.677730369514793e-05,
"loss": 0.4303,
"mean_token_accuracy": 0.8998099863529205,
"num_tokens": 93952.0,
"step": 134
},
{
"entropy": 0.3341464288532734,
"epoch": 3.464516129032258,
"grad_norm": 2.74814772605896,
"learning_rate": 7.642544458562278e-05,
"loss": 0.2045,
"mean_token_accuracy": 0.9389902055263519,
"num_tokens": 94378.0,
"step": 135
},
{
"entropy": 0.7704500108957291,
"epoch": 3.490322580645161,
"grad_norm": 2.1899735927581787,
"learning_rate": 7.60717601689749e-05,
"loss": 0.7928,
"mean_token_accuracy": 0.7940146774053574,
"num_tokens": 96188.0,
"step": 136
},
{
"entropy": 0.8460464626550674,
"epoch": 3.5161290322580645,
"grad_norm": 2.439542531967163,
"learning_rate": 7.571627487554769e-05,
"loss": 0.7167,
"mean_token_accuracy": 0.7986479252576828,
"num_tokens": 97250.0,
"step": 137
},
{
"entropy": 0.683267816901207,
"epoch": 3.541935483870968,
"grad_norm": 3.4693028926849365,
"learning_rate": 7.535901326007795e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8488983660936356,
"num_tokens": 98028.0,
"step": 138
},
{
"entropy": 0.6665534228086472,
"epoch": 3.567741935483871,
"grad_norm": 3.313450336456299,
"learning_rate": 7.500000000000001e-05,
"loss": 0.4977,
"mean_token_accuracy": 0.8638099581003189,
"num_tokens": 98727.0,
"step": 139
},
{
"entropy": 0.6375805735588074,
"epoch": 3.5935483870967744,
"grad_norm": 3.621342897415161,
"learning_rate": 7.463925989374089e-05,
"loss": 0.521,
"mean_token_accuracy": 0.8624279350042343,
"num_tokens": 99329.0,
"step": 140
},
{
"entropy": 0.5712595283985138,
"epoch": 3.6193548387096772,
"grad_norm": 3.667834520339966,
"learning_rate": 7.427681785900761e-05,
"loss": 0.4579,
"mean_token_accuracy": 0.8609372973442078,
"num_tokens": 99866.0,
"step": 141
},
{
"entropy": 0.5664890855550766,
"epoch": 3.6451612903225805,
"grad_norm": 3.193061113357544,
"learning_rate": 7.391269893106592e-05,
"loss": 0.3498,
"mean_token_accuracy": 0.9016094356775284,
"num_tokens": 100358.0,
"step": 142
},
{
"entropy": 0.4809069186449051,
"epoch": 3.670967741935484,
"grad_norm": 2.9797909259796143,
"learning_rate": 7.354692826101102e-05,
"loss": 0.239,
"mean_token_accuracy": 0.937361553311348,
"num_tokens": 100810.0,
"step": 143
},
{
"entropy": 0.3825264722108841,
"epoch": 3.696774193548387,
"grad_norm": 2.5916123390197754,
"learning_rate": 7.317953111403029e-05,
"loss": 0.2293,
"mean_token_accuracy": 0.959057167172432,
"num_tokens": 101224.0,
"step": 144
},
{
"entropy": 1.0315645188093185,
"epoch": 3.7225806451612904,
"grad_norm": 2.4332456588745117,
"learning_rate": 7.281053286765815e-05,
"loss": 0.9734,
"mean_token_accuracy": 0.7563262432813644,
"num_tokens": 102666.0,
"step": 145
},
{
"entropy": 0.7324022054672241,
"epoch": 3.7483870967741937,
"grad_norm": 3.319155693054199,
"learning_rate": 7.243995901002312e-05,
"loss": 0.526,
"mean_token_accuracy": 0.862901970744133,
"num_tokens": 103560.0,
"step": 146
},
{
"entropy": 0.7977930456399918,
"epoch": 3.774193548387097,
"grad_norm": 3.708766460418701,
"learning_rate": 7.20678351380872e-05,
"loss": 0.5996,
"mean_token_accuracy": 0.8376729637384415,
"num_tokens": 104386.0,
"step": 147
},
{
"entropy": 0.67112597823143,
"epoch": 3.8,
"grad_norm": 3.474480152130127,
"learning_rate": 7.169418695587791e-05,
"loss": 0.5283,
"mean_token_accuracy": 0.8518707603216171,
"num_tokens": 105173.0,
"step": 148
},
{
"entropy": 0.6674353927373886,
"epoch": 3.825806451612903,
"grad_norm": 4.0479736328125,
"learning_rate": 7.13190402727127e-05,
"loss": 0.5836,
"mean_token_accuracy": 0.8252883553504944,
"num_tokens": 105827.0,
"step": 149
},
{
"entropy": 0.6601278185844421,
"epoch": 3.8516129032258064,
"grad_norm": 3.1081454753875732,
"learning_rate": 7.094242100141625e-05,
"loss": 0.4519,
"mean_token_accuracy": 0.8595046997070312,
"num_tokens": 106405.0,
"step": 150
},
{
"entropy": 0.397666834294796,
"epoch": 3.8774193548387097,
"grad_norm": 2.5936572551727295,
"learning_rate": 7.056435515653059e-05,
"loss": 0.2092,
"mean_token_accuracy": 0.9478294253349304,
"num_tokens": 106926.0,
"step": 151
},
{
"entropy": 0.5597369372844696,
"epoch": 3.903225806451613,
"grad_norm": 4.103569984436035,
"learning_rate": 7.018486885251812e-05,
"loss": 0.4531,
"mean_token_accuracy": 0.8746808618307114,
"num_tokens": 107392.0,
"step": 152
},
{
"entropy": 0.38467343896627426,
"epoch": 3.9290322580645163,
"grad_norm": 3.1950509548187256,
"learning_rate": 6.980398830195785e-05,
"loss": 0.212,
"mean_token_accuracy": 0.9444408565759659,
"num_tokens": 107827.0,
"step": 153
},
{
"entropy": 0.5698762461543083,
"epoch": 3.9548387096774196,
"grad_norm": 3.3116562366485596,
"learning_rate": 6.942173981373474e-05,
"loss": 0.4076,
"mean_token_accuracy": 0.8756328076124191,
"num_tokens": 108519.0,
"step": 154
},
{
"entropy": 0.5269991233944893,
"epoch": 3.9806451612903224,
"grad_norm": 3.074373483657837,
"learning_rate": 6.903814979122249e-05,
"loss": 0.3577,
"mean_token_accuracy": 0.9049306809902191,
"num_tokens": 109080.0,
"step": 155
},
{
"entropy": 0.411786029736201,
"epoch": 4.0,
"grad_norm": 3.1152896881103516,
"learning_rate": 6.86532447304597e-05,
"loss": 0.2401,
"mean_token_accuracy": 0.9342868526776632,
"num_tokens": 109412.0,
"step": 156
},
{
"entropy": 0.672158882021904,
"epoch": 4.025806451612903,
"grad_norm": 2.576361894607544,
"learning_rate": 6.826705121831976e-05,
"loss": 0.5307,
"mean_token_accuracy": 0.8603871315717697,
"num_tokens": 110911.0,
"step": 157
},
{
"entropy": 0.5033400803804398,
"epoch": 4.051612903225807,
"grad_norm": 2.3417139053344727,
"learning_rate": 6.78795959306743e-05,
"loss": 0.2862,
"mean_token_accuracy": 0.9291664808988571,
"num_tokens": 111773.0,
"step": 158
},
{
"entropy": 0.3818225935101509,
"epoch": 4.077419354838709,
"grad_norm": 2.526963233947754,
"learning_rate": 6.749090563055076e-05,
"loss": 0.204,
"mean_token_accuracy": 0.9366898983716965,
"num_tokens": 112552.0,
"step": 159
},
{
"entropy": 0.5210211500525475,
"epoch": 4.103225806451613,
"grad_norm": 3.331657648086548,
"learning_rate": 6.710100716628344e-05,
"loss": 0.3463,
"mean_token_accuracy": 0.9079622030258179,
"num_tokens": 113279.0,
"step": 160
},
{
"entropy": 0.4078049287199974,
"epoch": 4.129032258064516,
"grad_norm": 2.643353223800659,
"learning_rate": 6.670992746965938e-05,
"loss": 0.2458,
"mean_token_accuracy": 0.9378542304039001,
"num_tokens": 113927.0,
"step": 161
},
{
"entropy": 0.2958051636815071,
"epoch": 4.15483870967742,
"grad_norm": 2.6562397480010986,
"learning_rate": 6.63176935540578e-05,
"loss": 0.2228,
"mean_token_accuracy": 0.9389047920703888,
"num_tokens": 114535.0,
"step": 162
},
{
"entropy": 0.2642120160162449,
"epoch": 4.180645161290323,
"grad_norm": 3.720411539077759,
"learning_rate": 6.592433251258423e-05,
"loss": 0.1609,
"mean_token_accuracy": 0.9546155333518982,
"num_tokens": 115092.0,
"step": 163
},
{
"entropy": 0.2038814201951027,
"epoch": 4.2064516129032254,
"grad_norm": 3.742655038833618,
"learning_rate": 6.552987151619919e-05,
"loss": 0.1438,
"mean_token_accuracy": 0.9577045887708664,
"num_tokens": 115572.0,
"step": 164
},
{
"entropy": 0.21509704366326332,
"epoch": 4.232258064516129,
"grad_norm": 4.123962879180908,
"learning_rate": 6.51343378118413e-05,
"loss": 0.1326,
"mean_token_accuracy": 0.955599308013916,
"num_tokens": 116004.0,
"step": 165
},
{
"entropy": 0.5882035046815872,
"epoch": 4.258064516129032,
"grad_norm": 2.629396438598633,
"learning_rate": 6.473775872054521e-05,
"loss": 0.5174,
"mean_token_accuracy": 0.855495274066925,
"num_tokens": 117713.0,
"step": 166
},
{
"entropy": 0.4447134956717491,
"epoch": 4.283870967741936,
"grad_norm": 5.003028869628906,
"learning_rate": 6.434016163555452e-05,
"loss": 0.4682,
"mean_token_accuracy": 0.8714989423751831,
"num_tokens": 118624.0,
"step": 167
},
{
"entropy": 0.3722687065601349,
"epoch": 4.309677419354839,
"grad_norm": 3.819241762161255,
"learning_rate": 6.394157402042951e-05,
"loss": 0.3207,
"mean_token_accuracy": 0.9076657742261887,
"num_tokens": 119441.0,
"step": 168
},
{
"entropy": 0.2616325728595257,
"epoch": 4.335483870967742,
"grad_norm": 3.4206392765045166,
"learning_rate": 6.354202340715026e-05,
"loss": 0.205,
"mean_token_accuracy": 0.9454829543828964,
"num_tokens": 120187.0,
"step": 169
},
{
"entropy": 0.3457096070051193,
"epoch": 4.361290322580645,
"grad_norm": 3.556037425994873,
"learning_rate": 6.314153739421476e-05,
"loss": 0.2697,
"mean_token_accuracy": 0.9172067493200302,
"num_tokens": 120838.0,
"step": 170
},
{
"entropy": 0.2511453256011009,
"epoch": 4.387096774193548,
"grad_norm": 2.943145751953125,
"learning_rate": 6.274014364473274e-05,
"loss": 0.1491,
"mean_token_accuracy": 0.9682914614677429,
"num_tokens": 121408.0,
"step": 171
},
{
"entropy": 0.23977105692029,
"epoch": 4.412903225806452,
"grad_norm": 3.426252603530884,
"learning_rate": 6.233786988451468e-05,
"loss": 0.1645,
"mean_token_accuracy": 0.9556652754545212,
"num_tokens": 121915.0,
"step": 172
},
{
"entropy": 0.19089676067233086,
"epoch": 4.438709677419355,
"grad_norm": 2.1618521213531494,
"learning_rate": 6.19347439001569e-05,
"loss": 0.1059,
"mean_token_accuracy": 0.97336345911026,
"num_tokens": 122368.0,
"step": 173
},
{
"entropy": 0.19364609941840172,
"epoch": 4.464516129032258,
"grad_norm": 3.3634703159332275,
"learning_rate": 6.153079353712201e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9543762654066086,
"num_tokens": 122767.0,
"step": 174
},
{
"entropy": 0.6687990427017212,
"epoch": 4.490322580645161,
"grad_norm": 2.883437395095825,
"learning_rate": 6.112604669781572e-05,
"loss": 0.5348,
"mean_token_accuracy": 0.8638840764760971,
"num_tokens": 124288.0,
"step": 175
},
{
"entropy": 0.472368985414505,
"epoch": 4.516129032258064,
"grad_norm": 2.9869871139526367,
"learning_rate": 6.072053133965938e-05,
"loss": 0.2776,
"mean_token_accuracy": 0.9314542561769485,
"num_tokens": 125161.0,
"step": 176
},
{
"entropy": 0.4055846929550171,
"epoch": 4.541935483870968,
"grad_norm": 3.554269552230835,
"learning_rate": 6.031427547315889e-05,
"loss": 0.3152,
"mean_token_accuracy": 0.9113509654998779,
"num_tokens": 125955.0,
"step": 177
},
{
"entropy": 0.3913852721452713,
"epoch": 4.567741935483871,
"grad_norm": 3.3943800926208496,
"learning_rate": 5.9907307159969884e-05,
"loss": 0.2882,
"mean_token_accuracy": 0.9336675554513931,
"num_tokens": 126654.0,
"step": 178
},
{
"entropy": 0.2266981489956379,
"epoch": 4.593548387096774,
"grad_norm": 2.6177566051483154,
"learning_rate": 5.949965451095951e-05,
"loss": 0.1521,
"mean_token_accuracy": 0.9607619494199753,
"num_tokens": 127200.0,
"step": 179
},
{
"entropy": 0.2510114349424839,
"epoch": 4.619354838709677,
"grad_norm": 2.9274792671203613,
"learning_rate": 5.9091345684264546e-05,
"loss": 0.1527,
"mean_token_accuracy": 0.9545964151620865,
"num_tokens": 127710.0,
"step": 180
},
{
"entropy": 0.27408041059970856,
"epoch": 4.645161290322581,
"grad_norm": 3.970353841781616,
"learning_rate": 5.868240888334653e-05,
"loss": 0.2088,
"mean_token_accuracy": 0.9431939721107483,
"num_tokens": 128171.0,
"step": 181
},
{
"entropy": 0.21555104106664658,
"epoch": 4.670967741935484,
"grad_norm": 2.1485326290130615,
"learning_rate": 5.827287235504356e-05,
"loss": 0.1231,
"mean_token_accuracy": 0.9743186682462692,
"num_tokens": 128603.0,
"step": 182
},
{
"entropy": 0.1890631914138794,
"epoch": 4.6967741935483875,
"grad_norm": 3.0446012020111084,
"learning_rate": 5.786276438761927e-05,
"loss": 0.166,
"mean_token_accuracy": 0.9585428386926651,
"num_tokens": 129018.0,
"step": 183
},
{
"entropy": 0.4911561757326126,
"epoch": 4.72258064516129,
"grad_norm": 2.324612617492676,
"learning_rate": 5.745211330880872e-05,
"loss": 0.3596,
"mean_token_accuracy": 0.9241899400949478,
"num_tokens": 130189.0,
"step": 184
},
{
"entropy": 0.3451598323881626,
"epoch": 4.748387096774193,
"grad_norm": 3.1134896278381348,
"learning_rate": 5.704094748386184e-05,
"loss": 0.2163,
"mean_token_accuracy": 0.9265208840370178,
"num_tokens": 130996.0,
"step": 185
},
{
"entropy": 0.39382658153772354,
"epoch": 4.774193548387097,
"grad_norm": 3.3759310245513916,
"learning_rate": 5.6629295313583974e-05,
"loss": 0.266,
"mean_token_accuracy": 0.923931747674942,
"num_tokens": 131734.0,
"step": 186
},
{
"entropy": 0.362373985350132,
"epoch": 4.8,
"grad_norm": 3.549544095993042,
"learning_rate": 5.621718523237427e-05,
"loss": 0.2415,
"mean_token_accuracy": 0.9290976673364639,
"num_tokens": 132406.0,
"step": 187
},
{
"entropy": 0.33830052614212036,
"epoch": 4.825806451612904,
"grad_norm": 2.8866331577301025,
"learning_rate": 5.5804645706261514e-05,
"loss": 0.2333,
"mean_token_accuracy": 0.93567855656147,
"num_tokens": 133001.0,
"step": 188
},
{
"entropy": 0.2700263783335686,
"epoch": 4.851612903225806,
"grad_norm": 2.9685375690460205,
"learning_rate": 5.539170523093794e-05,
"loss": 0.1737,
"mean_token_accuracy": 0.9484844356775284,
"num_tokens": 133568.0,
"step": 189
},
{
"entropy": 0.2686317004263401,
"epoch": 4.877419354838709,
"grad_norm": 2.7458479404449463,
"learning_rate": 5.497839232979084e-05,
"loss": 0.1727,
"mean_token_accuracy": 0.9658856242895126,
"num_tokens": 134062.0,
"step": 190
},
{
"entropy": 0.2341674156486988,
"epoch": 4.903225806451613,
"grad_norm": 2.944103956222534,
"learning_rate": 5.456473555193242e-05,
"loss": 0.1788,
"mean_token_accuracy": 0.9528596550226212,
"num_tokens": 134514.0,
"step": 191
},
{
"entropy": 0.22099602594971657,
"epoch": 4.929032258064516,
"grad_norm": 3.862736940383911,
"learning_rate": 5.415076347022776e-05,
"loss": 0.1657,
"mean_token_accuracy": 0.9679511785507202,
"num_tokens": 134923.0,
"step": 192
},
{
"entropy": 0.5313196182250977,
"epoch": 4.95483870967742,
"grad_norm": 3.1668918132781982,
"learning_rate": 5.373650467932122e-05,
"loss": 0.5281,
"mean_token_accuracy": 0.8866761773824692,
"num_tokens": 135869.0,
"step": 193
},
{
"entropy": 0.2688843570649624,
"epoch": 4.980645161290322,
"grad_norm": 2.9400172233581543,
"learning_rate": 5.332198779366122e-05,
"loss": 0.1822,
"mean_token_accuracy": 0.9536565244197845,
"num_tokens": 136435.0,
"step": 194
},
{
"entropy": 0.34634942809740704,
"epoch": 5.0,
"grad_norm": 4.880941867828369,
"learning_rate": 5.290724144552379e-05,
"loss": 0.2718,
"mean_token_accuracy": 0.9203394254048666,
"num_tokens": 136765.0,
"step": 195
},
{
"entropy": 0.5787394121289253,
"epoch": 5.025806451612903,
"grad_norm": 2.429058313369751,
"learning_rate": 5.249229428303486e-05,
"loss": 0.3105,
"mean_token_accuracy": 0.9199163019657135,
"num_tokens": 138102.0,
"step": 196
},
{
"entropy": 0.3213765248656273,
"epoch": 5.051612903225807,
"grad_norm": 2.9777679443359375,
"learning_rate": 5.2077174968191346e-05,
"loss": 0.1813,
"mean_token_accuracy": 0.9481654316186905,
"num_tokens": 138950.0,
"step": 197
},
{
"entropy": 0.2601848617196083,
"epoch": 5.077419354838709,
"grad_norm": 2.173152446746826,
"learning_rate": 5.166191217488133e-05,
"loss": 0.1352,
"mean_token_accuracy": 0.9740329831838608,
"num_tokens": 139722.0,
"step": 198
},
{
"entropy": 0.27228355780243874,
"epoch": 5.103225806451613,
"grad_norm": 2.206040859222412,
"learning_rate": 5.124653458690365e-05,
"loss": 0.1203,
"mean_token_accuracy": 0.9656965136528015,
"num_tokens": 140396.0,
"step": 199
},
{
"entropy": 0.17130273580551147,
"epoch": 5.129032258064516,
"grad_norm": 2.000005006790161,
"learning_rate": 5.083107089598632e-05,
"loss": 0.0938,
"mean_token_accuracy": 0.9830586761236191,
"num_tokens": 140987.0,
"step": 200
},
{
"entropy": 0.19337046518921852,
"epoch": 5.15483870967742,
"grad_norm": 2.180755376815796,
"learning_rate": 5.041554979980486e-05,
"loss": 0.092,
"mean_token_accuracy": 0.9733314365148544,
"num_tokens": 141517.0,
"step": 201
},
{
"entropy": 0.16925612837076187,
"epoch": 5.180645161290323,
"grad_norm": 1.6496930122375488,
"learning_rate": 5e-05,
"loss": 0.0819,
"mean_token_accuracy": 0.9781141579151154,
"num_tokens": 142025.0,
"step": 202
},
{
"entropy": 0.20112577825784683,
"epoch": 5.2064516129032254,
"grad_norm": 2.5295193195343018,
"learning_rate": 4.9584450200195156e-05,
"loss": 0.1113,
"mean_token_accuracy": 0.972536712884903,
"num_tokens": 142501.0,
"step": 203
},
{
"entropy": 0.12446376867592335,
"epoch": 5.232258064516129,
"grad_norm": 1.8126459121704102,
"learning_rate": 4.9168929104013697e-05,
"loss": 0.1119,
"mean_token_accuracy": 0.9784018099308014,
"num_tokens": 142930.0,
"step": 204
},
{
"entropy": 0.3357328027486801,
"epoch": 5.258064516129032,
"grad_norm": 2.69579815864563,
"learning_rate": 4.875346541309637e-05,
"loss": 0.2933,
"mean_token_accuracy": 0.9279916733503342,
"num_tokens": 144619.0,
"step": 205
},
{
"entropy": 0.27347391098737717,
"epoch": 5.283870967741936,
"grad_norm": 3.0113985538482666,
"learning_rate": 4.8338087825118675e-05,
"loss": 0.2147,
"mean_token_accuracy": 0.9462355375289917,
"num_tokens": 145485.0,
"step": 206
},
{
"entropy": 0.18706193938851357,
"epoch": 5.309677419354839,
"grad_norm": 2.3350462913513184,
"learning_rate": 4.792282503180867e-05,
"loss": 0.1089,
"mean_token_accuracy": 0.9645346254110336,
"num_tokens": 146253.0,
"step": 207
},
{
"entropy": 0.23134352639317513,
"epoch": 5.335483870967742,
"grad_norm": 2.53825306892395,
"learning_rate": 4.750770571696514e-05,
"loss": 0.139,
"mean_token_accuracy": 0.9644808024168015,
"num_tokens": 146961.0,
"step": 208
},
{
"entropy": 0.18409648537635803,
"epoch": 5.361290322580645,
"grad_norm": 3.6751139163970947,
"learning_rate": 4.709275855447621e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9647018611431122,
"num_tokens": 147585.0,
"step": 209
},
{
"entropy": 0.13805431686341763,
"epoch": 5.387096774193548,
"grad_norm": 2.252584218978882,
"learning_rate": 4.6678012206338793e-05,
"loss": 0.11,
"mean_token_accuracy": 0.9786661118268967,
"num_tokens": 148137.0,
"step": 210
},
{
"entropy": 0.1293979026377201,
"epoch": 5.412903225806452,
"grad_norm": 3.228670358657837,
"learning_rate": 4.626349532067879e-05,
"loss": 0.1009,
"mean_token_accuracy": 0.9756647497415543,
"num_tokens": 148635.0,
"step": 211
},
{
"entropy": 0.15355101972818375,
"epoch": 5.438709677419355,
"grad_norm": 2.5168986320495605,
"learning_rate": 4.584923652977224e-05,
"loss": 0.0966,
"mean_token_accuracy": 0.9696203321218491,
"num_tokens": 149098.0,
"step": 212
},
{
"entropy": 0.12985192984342575,
"epoch": 5.464516129032258,
"grad_norm": 1.9614430665969849,
"learning_rate": 4.543526444806759e-05,
"loss": 0.0876,
"mean_token_accuracy": 0.9787871986627579,
"num_tokens": 149525.0,
"step": 213
},
{
"entropy": 0.41858533024787903,
"epoch": 5.490322580645161,
"grad_norm": 2.3210058212280273,
"learning_rate": 4.502160767020918e-05,
"loss": 0.3106,
"mean_token_accuracy": 0.9150111377239227,
"num_tokens": 151159.0,
"step": 214
},
{
"entropy": 0.23978786170482635,
"epoch": 5.516129032258064,
"grad_norm": 2.6100656986236572,
"learning_rate": 4.4608294769062075e-05,
"loss": 0.131,
"mean_token_accuracy": 0.969085082411766,
"num_tokens": 151972.0,
"step": 215
},
{
"entropy": 0.20062651857733727,
"epoch": 5.541935483870968,
"grad_norm": 2.6525464057922363,
"learning_rate": 4.4195354293738484e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.9647854268550873,
"num_tokens": 152742.0,
"step": 216
},
{
"entropy": 0.18283047527074814,
"epoch": 5.567741935483871,
"grad_norm": 1.9218651056289673,
"learning_rate": 4.378281476762576e-05,
"loss": 0.1113,
"mean_token_accuracy": 0.9758298695087433,
"num_tokens": 153456.0,
"step": 217
},
{
"entropy": 0.17359177768230438,
"epoch": 5.593548387096774,
"grad_norm": 2.074409008026123,
"learning_rate": 4.337070468641604e-05,
"loss": 0.1127,
"mean_token_accuracy": 0.9679757952690125,
"num_tokens": 154114.0,
"step": 218
},
{
"entropy": 0.15452994219958782,
"epoch": 5.619354838709677,
"grad_norm": 1.4686728715896606,
"learning_rate": 4.295905251613817e-05,
"loss": 0.083,
"mean_token_accuracy": 0.9716224670410156,
"num_tokens": 154710.0,
"step": 219
},
{
"entropy": 0.1461981236934662,
"epoch": 5.645161290322581,
"grad_norm": 2.090766191482544,
"learning_rate": 4.254788669119127e-05,
"loss": 0.0915,
"mean_token_accuracy": 0.9731487780809402,
"num_tokens": 155272.0,
"step": 220
},
{
"entropy": 0.14948130398988724,
"epoch": 5.670967741935484,
"grad_norm": 2.874465227127075,
"learning_rate": 4.213723561238074e-05,
"loss": 0.1213,
"mean_token_accuracy": 0.9657130539417267,
"num_tokens": 155765.0,
"step": 221
},
{
"entropy": 0.147341663017869,
"epoch": 5.6967741935483875,
"grad_norm": 2.8784825801849365,
"learning_rate": 4.172712764495644e-05,
"loss": 0.1131,
"mean_token_accuracy": 0.9677340090274811,
"num_tokens": 156170.0,
"step": 222
},
{
"entropy": 0.37899941951036453,
"epoch": 5.72258064516129,
"grad_norm": 2.1102116107940674,
"learning_rate": 4.131759111665349e-05,
"loss": 0.2919,
"mean_token_accuracy": 0.9289288818836212,
"num_tokens": 157544.0,
"step": 223
},
{
"entropy": 0.1955309621989727,
"epoch": 5.748387096774193,
"grad_norm": 2.2968599796295166,
"learning_rate": 4.0908654315735466e-05,
"loss": 0.1214,
"mean_token_accuracy": 0.9681131392717361,
"num_tokens": 158450.0,
"step": 224
},
{
"entropy": 0.19111444801092148,
"epoch": 5.774193548387097,
"grad_norm": 2.6387436389923096,
"learning_rate": 4.0500345489040515e-05,
"loss": 0.1412,
"mean_token_accuracy": 0.9579745233058929,
"num_tokens": 159264.0,
"step": 225
},
{
"entropy": 0.1776861809194088,
"epoch": 5.8,
"grad_norm": 2.6175966262817383,
"learning_rate": 4.0092692840030134e-05,
"loss": 0.1223,
"mean_token_accuracy": 0.9692755341529846,
"num_tokens": 159933.0,
"step": 226
},
{
"entropy": 0.15603690408170223,
"epoch": 5.825806451612904,
"grad_norm": 2.4090588092803955,
"learning_rate": 3.968572452684113e-05,
"loss": 0.1004,
"mean_token_accuracy": 0.9694436490535736,
"num_tokens": 160526.0,
"step": 227
},
{
"entropy": 0.14687431044876575,
"epoch": 5.851612903225806,
"grad_norm": 2.5552449226379395,
"learning_rate": 3.9279468660340626e-05,
"loss": 0.1015,
"mean_token_accuracy": 0.9700941145420074,
"num_tokens": 161001.0,
"step": 228
},
{
"entropy": 0.14584726840257645,
"epoch": 5.877419354838709,
"grad_norm": 2.417149782180786,
"learning_rate": 3.887395330218429e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.969669446349144,
"num_tokens": 161434.0,
"step": 229
},
{
"entropy": 0.12778180465102196,
"epoch": 5.903225806451613,
"grad_norm": 1.2179059982299805,
"learning_rate": 3.846920646287799e-05,
"loss": 0.0758,
"mean_token_accuracy": 0.9738518297672272,
"num_tokens": 161858.0,
"step": 230
},
{
"entropy": 0.1522289477288723,
"epoch": 5.929032258064516,
"grad_norm": 2.0130059719085693,
"learning_rate": 3.806525609984312e-05,
"loss": 0.1062,
"mean_token_accuracy": 0.9636791348457336,
"num_tokens": 162250.0,
"step": 231
},
{
"entropy": 0.23178323358297348,
"epoch": 5.95483870967742,
"grad_norm": 2.4759209156036377,
"learning_rate": 3.7662130115485314e-05,
"loss": 0.1228,
"mean_token_accuracy": 0.9636365175247192,
"num_tokens": 163108.0,
"step": 232
},
{
"entropy": 0.16584154963493347,
"epoch": 5.980645161290322,
"grad_norm": 2.447923421859741,
"learning_rate": 3.7259856355267273e-05,
"loss": 0.1304,
"mean_token_accuracy": 0.9603947103023529,
"num_tokens": 163768.0,
"step": 233
},
{
"entropy": 0.12077461183071136,
"epoch": 6.0,
"grad_norm": 3.7384884357452393,
"learning_rate": 3.685846260578524e-05,
"loss": 0.0966,
"mean_token_accuracy": 0.9680581092834473,
"num_tokens": 164118.0,
"step": 234
}
],
"logging_steps": 1,
"max_steps": 390,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7434558634475520.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}